In [4]:
!pip install pandas scikit-learn nltk



In [5]:
!pip install xgboost



In [47]:
# Import necessary libraries
import os
import random
import re
import time

import nltk
import numpy as np
import pandas as pd
import xgboost as xgb
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [48]:
# Path of the plain-text log that the cells below append their shapes and metrics to.
# The path is relative, so it resolves against the kernel's current working directory.
results_file = "results.txt"

# Optional reset (currently disabled): uncomment the block below to truncate the log
# before a fresh series of runs. While it stays commented out, every run appends to
# whatever the file already contains.
# with open(results_file, "w") as file:
#     pass  # opening in 'w' mode truncates the file

# print(f"Contents of '{results_file}' have been deleted.")

Contents of 'results.txt' have been deleted.


## Train-test split and TF-IDF vectorization

In [80]:
# Dataset selection: exactly one of the candidate CSV files is active at a time.
# dataset_path = '../final_datasets/articles_dataset.csv'
dataset_path = '../final_datasets/tweets_dataset.csv'
# dataset_path = '../final_datasets/combined_dataset.csv'

# Read the corpus; this cell uses its 'text' and 'label' columns.
df = pd.read_csv(dataset_path)

# Map the string labels to integer class ids.
# NOTE(review): presumably the labels are 'real' / 'fake' — confirm against the CSV.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    df['text'],
    y,
    test_size=0.2,
    random_state=42,
)

# The vectorizer is fitted on the training texts only and then reused, unchanged,
# to transform the held-out texts.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
# tfidf_vectorizer = TfidfVectorizer() #default features (max=None) is 112,486: this is when it takes all words into accounts, not the top 5 for example. Mean Accuracy CrossVal: 0.9928 ± 0.0020
X_train = tfidf_vectorizer.fit_transform(X_train_texts)
X_test = tfidf_vectorizer.transform(X_test_texts)

# Size of the learned vocabulary, i.e. the number of columns in the TF-IDF matrices.
num_features = len(tfidf_vectorizer.vocabulary_)
features_line = f"Number of features used by TfidfVectorizer: {num_features}"
print(features_line)

# Show the shapes of the four splits as a quick sanity check.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

# Append the dataset name, the split shapes and the feature count to the results log.
log_lines = [
    f"Training and Testing dataset: {os.path.basename(dataset_path)}\n",
    f"X_train.shape: {X_train.shape}\n",
    f"X_test.shape: {X_test.shape}\n",
    f"y_train.shape: {y_train.shape}\n",
    f"y_test.shape: {y_test.shape}\n",
    features_line + "\n\n",
]
with open(results_file, "a") as log:
    log.writelines(log_lines)



Number of features used by TfidfVectorizer: 1000
(2249, 1000)
(563, 1000)
(2249,)
(563,)


### SVM
- **Code source:** Support Vector Machine (SVM): https://www.kaggle.com/code/mehmetlaudatekman/text-classification-svm-explained

A Support Vector Machine (SVM) is a supervised machine learning algorithm that is widely used for classification problems such as fake news detection. It learns a maximum-margin decision boundary that separates real from fake news items in the feature space extracted from the Arabic text. Such features can include word frequencies, linguistic patterns, or word embeddings tailored for Arabic that capture contextual relationships in the text; in this notebook they are the TF-IDF term weights computed in the previous section.

In [81]:
# 5-fold cross-validation of a linear SVM on the TF-IDF training matrix.
# Purpose: confirm that the number of TF-IDF features is sufficient by checking how
# consistent the accuracy is across folds. The classifier created here is fitted on
# the full training split in the next cell.

# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# time; time.time() follows the wall clock and can jump if the system clock changes.
time_i = time.perf_counter()
svm_classifier = svm.SVC(kernel='linear', random_state=42)
cv_scores = cross_val_score(svm_classifier, X_train, y_train, cv=5)
time_f = time.perf_counter()

time_taken = time_f - time_i

# Compute the summary statistics once and reuse them for both stdout and the log.
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)
cv_report = "\n".join([
    f'Cross-Validation Scores: {cv_scores}',
    f'Mean Accuracy: {cv_mean:.4f} ± {cv_std:.4f}',
    f'Time taken: {time_taken}',
])

# Print cross-validation results
print(cv_report)

# Append the same report to the results log. The '±' sign is not ASCII, so the
# encoding is pinned to UTF-8 instead of relying on the platform default.
with open(results_file, "a", encoding="utf-8") as file:
    file.write(cv_report + "\n\n")


Cross-Validation Scores: [0.91777778 0.86666667 0.86666667 0.87555556 0.86636971]
Mean Accuracy: 0.8786 ± 0.0199
Time taken: 1.1211111545562744


In [82]:
# Fit the linear SVM on the whole training split, then predict the held-out split.
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

# Score the predictions with the four standard classification metrics
# (each called with its default settings).
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Build the report once so that stdout and the results log show the same text.
svm_report = "\n".join([
    "SVM Results:",
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1 Score: {f1:.4f}',
])
print(svm_report)

# Append the report to the results log, followed by blank lines as a separator.
with open(results_file, "a") as log:
    log.write(svm_report + "\n\n\n")

SVM Results:
Accuracy: 0.8686
Precision: 0.8750
Recall: 0.8561
F1 Score: 0.8655


### XGBoost
- **Code source:** eXtreme Gradient Boosting (XGBoost): https://www.kaggle.com/code/iamarjunchandra/text-classification-with-rnn-xgboost

In [83]:
# Train and evaluate an XGBoost classifier on the same TF-IDF train/test split that
# the SVM used (X_train, X_test, y_train, y_test come from the TF-IDF cell).

# Step 1: Initialize the XGBoost classifier.
# `use_label_encoder` is intentionally not passed: the installed XGBoost ignores it
# and only emits a 'Parameters: { "use_label_encoder" } are not used.' warning.
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
)

# Step 2: Train the classifier
xgb_classifier.fit(X_train, y_train)

# Step 3: Predict on the test set
y_pred_xgb = xgb_classifier.predict(X_test)

# Step 4: Evaluate the model (each metric is called with its default settings)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)

# Step 5: Build the report once so stdout and the results log show the same text
xgb_report = "\n".join([
    "XGBoost Results:",
    f'Accuracy: {accuracy_xgb:.4f}',
    f'Precision: {precision_xgb:.4f}',
    f'Recall: {recall_xgb:.4f}',
    f'F1 Score: {f1_xgb:.4f}',
])
print(xgb_report)

# Append the report to the results log, followed by blank lines as a separator.
with open(results_file, "a") as file:
    file.write(xgb_report + "\n\n\n\n")

Parameters: { "use_label_encoder" } are not used.



XGBoost Results:
Accuracy: 0.8455
Precision: 0.8930
Recall: 0.7806
F1 Score: 0.8330
