In [7]:
!pip install pandas scikit-learn nltk



In [8]:
!pip install xgboost



In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

from sklearn import svm
import xgboost as xgb

import random

## Preprocessing of the combined dataset.csv

In [2]:
# Download necessary NLTK data
nltk.download('stopwords')

# Arabic stopwords
arabic_stop_words = set(stopwords.words('arabic'))

# Load the dataset
df = pd.read_csv('compiled_real_fake_news_dataset.csv')

# Step 1: Text Preprocessing for Arabic
def preprocess_text_arabic(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove non-Arabic characters and digits
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)

    # Normalize text
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ة', 'ه', text)
    text = re.sub(r'ى', 'ي', text)

    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in arabic_stop_words])

    return text

# Apply the preprocessing function to the 'text' column
df['clean_text'] = df['text'].apply(preprocess_text_arabic)

# Step 2: Encode the labels (real/fake)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])  # Assuming 'label' column has 'real' and 'fake' values

# Step 3: Split the data into training and testing sets (80% train, 20% test)
X_train_texts, X_test_texts, y_train, y_test = train_test_split(df['clean_text'], y, test_size=0.2, random_state=42)

# Step 4: Fit the TF-IDF vectorizer on the training data only, then transform both sets
tfidf_vectorizer = TfidfVectorizer(max_features=50)
X_train = tfidf_vectorizer.fit_transform(X_train_texts)
X_test = tfidf_vectorizer.transform(X_test_texts)

# Check the shapes
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(5096, 50)
(1274, 50)
(5096,)
(1274,)


### SVM
Support Vector Machine (SVM) is a supervised machine learning technique commonly applied to classification problems, like fake news detection. In this context, SVM works by separating real and fake news articles using a decision boundary based on the features extracted from Arabic text data. For instance, these features might include word frequencies, linguistic patterns, or even word embeddings tailored for Arabic, which capture contextual relationships in the text.

In [3]:
from sklearn.model_selection import cross_val_score

# Train the SVM classifier
svm_classifier = svm.SVC(kernel='linear', random_state=42)

# Perform cross-validation to confirm that the number of tfidf features was enough by checking consistency of accuracy among folds
cv_scores = cross_val_score(svm_classifier, X_train, y_train, cv=5) 

# Print cross-validation results
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}')

Cross-Validation Scores: [0.96764706 0.95976447 0.9656526  0.96074583 0.96467125]
Mean Accuracy: 0.9637 ± 0.0030


In [4]:
svm_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation results
print("SVM Results:")
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

SVM Results:
Accuracy: 0.9560
Precision: 0.9643
Recall: 0.9530
F1 Score: 0.9586


### XGBOOST

In [5]:
# Split the data into training and testing sets (80% train, 20% test)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 1: Initialize the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss', random_state=42)

# Step 2: Train the classifier
xgb_classifier.fit(X_train, y_train)

# Step 3: Predict on the test set
y_pred_xgb = xgb_classifier.predict(X_test)

# Step 4: Evaluate the model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)

# Step 5: Print the evaluation results for XGBoost
print("XGBoost Results:")
print(f'Accuracy: {accuracy_xgb:.4f}')
print(f'Precision: {precision_xgb:.4f}')
print(f'Recall: {recall_xgb:.4f}')
print(f'F1 Score: {f1_xgb:.4f}')

# Comparison with SVM

Parameters: { "use_label_encoder" } are not used.



XGBoost Results:
Accuracy: 0.9757
Precision: 0.9924
Recall: 0.9618
F1 Score: 0.9769
