In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load datasets
data_true = pd.read_csv('datasets/True.csv')
data_fake = pd.read_csv('datasets/Fake.csv')

In [3]:
# Label datasets
data_true["label"] = 1
data_fake["label"] = 0

In [4]:
# Drop unnecessary columns
drop_true_data = data_true.drop(['title', 'subject', 'date'], axis=1)
drop_fake_data = data_fake.drop(['title', 'subject', 'date'], axis=1)

In [5]:
# Concatenate true and fake data
data = pd.concat([drop_true_data, drop_fake_data], axis=0).sample(frac=1).reset_index(drop=True)

In [6]:
# Define text preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [7]:
# Apply preprocessing to the dataset
data['text'] = data['text'].apply(preprocess_text)

In [8]:
# Define feature and target variables
x = data['text']
y = data['label']

In [9]:
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [10]:
# Vectorize text data using TF-IDF
vectorizer = TfidfVectorizer()
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

In [11]:
# Define and train classification models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

In [12]:
# Function to evaluate model performance
def evaluate_model(model, x_test, y_test):
    y_pred = model.predict(x_test)
    print(f"Model: {model.__class__.__name__}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n" + "="*60 + "\n")

In [13]:
# Train and evaluate each model
for name, model in models.items():
    model.fit(x_train_vectorized, y_train)
    evaluate_model(model, x_test_vectorized, y_test)

Model: LogisticRegression
Accuracy: 0.9878841870824053
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      5820
           1       0.99      0.99      0.99      5405

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225

Confusion Matrix:
 [[5753   67]
 [  69 5336]]


Model: DecisionTreeClassifier
Accuracy: 0.996792873051225
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5820
           1       1.00      1.00      1.00      5405

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225

Confusion Matrix:
 [[5803   17]
 [  19 5386]]


Model: RandomForestClassifier
Accuracy: 0.9897550111358575
Classification Report:
       

In [15]:
# Test the system with a sample news
sample_news = """
BREAKING: Scientists confirm that a giant asteroid will strike Earth next month, causing widespread destruction...
"""
cleaned_sample_news = preprocess_text(sample_news)
sample_vector = vectorizer.transform([cleaned_sample_news])

In [18]:
# Predict using Random Forest model as example
prediction = models["Random Forest"].predict(sample_vector)
print("Sample News Prediction (Logistic Regression):", "FAKE" if prediction[0] == 0 else "TRUE")

Sample News Prediction (Logistic Regression): FAKE
