# TruthLens Modelling

In [1]:
#imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

## Phase 1: Binary Classification

### Feature Extraction Using TF-IDF and n-grams

In [2]:
#load data
df = pd.read_csv('Data/final_clean.csv')
df = df.dropna(subset=['content'])
df = df.reset_index(drop=True)

In [3]:
start_time = time.time()
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X = vectorizer.fit_transform(df['content'])
y = df['label']
print("Feature extraction: {:.4f} seconds".format(time.time() - start_time))

Feature extraction: 166.8379 seconds


### Split dataset

In [4]:
#retain the indices as we need these for looking up explanations later
train_indices, test_indices = train_test_split(df.index, test_size=0.2, random_state=42)
# Split X and y using the train/test indices
X_train = X[train_indices]
X_test = X[test_indices]
y_train = y.iloc[train_indices]
y_test = y.iloc[test_indices]

### Logistic Regression

In [5]:
start_time = time.time()
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Fit Logistic Regression model: {:.4f} seconds".format(time.time() - start_time))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Fit Logistic Regression model: 1.7987 seconds
Accuracy: 0.9901960784313726
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4288
           1       0.99      0.99      0.99      4688

    accuracy                           0.99      8976
   macro avg       0.99      0.99      0.99      8976
weighted avg       0.99      0.99      0.99      8976



### Support Vector Machine (SVM)

In [6]:
# Train SVM
start_time = time.time()
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
print("Fit SVM model: {:.4f} seconds".format(time.time() - start_time))
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("SVM Classification Report:\n", classification_report(y_test, svm_pred))

Fit SVM model: 1474.6836 seconds
SVM Accuracy: 0.9954322638146168
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      4288
           1       1.00      0.99      1.00      4688

    accuracy                           1.00      8976
   macro avg       1.00      1.00      1.00      8976
weighted avg       1.00      1.00      1.00      8976



### Random Forest

In [7]:
start_time = time.time()
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("Fit Random Forest model: {:.4f} seconds".format(time.time() - start_time))
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("Random Forest Classification Report:\n", classification_report(y_test, rf_pred))

Fit Random Forest model: 141.1706 seconds
Random Forest Accuracy: 0.9953208556149733
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      4288
           1       1.00      0.99      1.00      4688

    accuracy                           1.00      8976
   macro avg       1.00      1.00      1.00      8976
weighted avg       1.00      1.00      1.00      8976



### Explain prediction

In [9]:
def explain_prediction(text):
    """
    Explains the prediction of the model by showing the most influential words for the prediction.

    Parameters:
    ----------
    text : str
        The input text to analyze.
    model : object
        The trained machine learning model.
    vectorizer : object
        The TF-IDF vectorizer used to transform the text.

    Returns:
    -------
    dict
        A dictionary containing the prediction ('label') and the top contributing words ('features').
    """
    # Transform the text using the vectorizer
    tfidf_text = vectorizer.transform([text])
    # Predict the label
    prediction = model.predict(tfidf_text)[0]
    # Get top contributing features (words)
    feature_importances = model.coef_[0]  # Logistic regression coefficients
    feature_names = vectorizer.get_feature_names_out()
    # Sort by importance
    top_indices = tfidf_text.toarray().argsort()[0][-5:]  # Top 5 features
    top_features = [feature_names[i] for i in top_indices]

    return {
        "label": prediction,
        "features": top_features
    }

# Create a DataFrame for test data
test_df = pd.DataFrame({
    'text': df.loc[test_indices, 'content'].reset_index(drop=True),
    'true_label': y_test.reset_index(drop=True),
    'predicted_label': y_pred
})

# Row predicted as Real (0)
real_example = test_df[test_df['predicted_label'] == 0].iloc[0]

# Row predicted as Fake (1)
fake_example = test_df[test_df['predicted_label'] == 1].iloc[0]

In [13]:
real_explanation = explain_prediction(real_example['text'])
fake_explanation = explain_prediction(fake_example['text'])

print("Real Example Prediction:")
print("Text:", real_example['text'])
print("Predicted Label:", real_explanation['label'])
print("Top Features:", real_explanation['features'])

print("\nFake Example Prediction:")
print("Text:", fake_example['text'])
print("Predicted Label:", fake_explanation['label'])
print("Top Features:", fake_explanation['features'])


Real Example Prediction:
Text: china say resolutely opposes deployment thaad south korea beijing china say friday resolutely oppose deployment u.s. anti-missile defense system south korea south korea say defense chief agree deploy system year chinese foreign ministry spokesman lu kang reiterate china 's opposition u.s. system daily news briefing beijing china consistently oppose decision deploy terminal high altitude area defense system say threaten china 's security nothing ease tension korean peninsula south korea united states say system intend defend north korean aggression
Predicted Label: 0
Top Features: ['south', 'korea', 'system', 'china', 'south korea']

Fake Example Prediction:
Text: anti-abortion republican cheated tried get mistress abortion text message send january u.s. rep. tim murphy staunch pro-life republican represent pennsylvania 18th district reveal text woman murphy relationship outside marriage take task anti-abortion statement post facebook office public account