In [1]:
# Base Imports
import pandas as pd
import matplotlib.pyplot as plt

#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# NLTK Imports
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment import util
from nltk import pos_tag

# Others
import re
import pickle

# Readability
import textstat
# Boxplot
import seaborn as sns

### Baseline Classifier

#### Official Communication vs. Fake News

In [None]:
# Candidate features: every column of the real-vs-fake table except
# 'label', 'id', 'length_preprocessed' and 'preprocessed'.
features_real_and_fake = feature_df_real_and_fake.columns.drop(
    ['label', 'id', 'length_preprocessed', 'preprocessed']
)
print(features_real_and_fake)

In [None]:
# Boxplot grid: one panel per candidate feature, comparing its distribution
# between the two classes (label == 1 is annotated as "Fake News",
# label == -1 as "Real News").
n = len(features_real_and_fake)
ncols = 2
nrows = n // ncols + (n % ncols > 0)  # ceil(n / ncols)

# Create the subplots
fig, axs = plt.subplots(nrows, ncols, figsize=(15, nrows*5))
axs = axs.flatten()  # Flatten to make indexing easier

# Split the rows by class once instead of re-filtering for every statistic
rows_fake = feature_df_real_and_fake[feature_df_real_and_fake['label'] == 1]
rows_real = feature_df_real_and_fake[feature_df_real_and_fake['label'] == -1]

for i, feature in enumerate(features_real_and_fake):
    # Per-class median and standard deviation of this feature
    median_fake = rows_fake[feature].median()
    std_dev_fake = rows_fake[feature].std()

    median_real = rows_real[feature].median()
    std_dev_real = rows_real[feature].std()

    # Create the boxplot
    sns.boxplot(x='label', y=feature, data=feature_df_real_and_fake, ax=axs[i])

    # The panel title is just the feature name
    axs[i].set_title(f'{feature}')

    # Annotate the panel with the per-class statistics (axes-relative coordinates)
    axs[i].text(0.5, 0.9, f'Fake News - Median: {median_fake:.2f}, Std Dev: {std_dev_fake:.2f}',
                transform=axs[i].transAxes)
    axs[i].text(0.5, 0.8, f'Real News - Median: {median_real:.2f}, Std Dev: {std_dev_real:.2f}',
                transform=axs[i].transAxes)

# Remove the unused trailing panels (present when n is odd).
# The bound must be the number of plotted features, i.e. len(features_real_and_fake);
# `features` is a different list that is only defined further down the notebook.
for i in range(n, len(axs)):
    fig.delaxes(axs[i])

plt.tight_layout()
plt.show()


In [None]:
# Randomly permute the rows of both feature tables (frac=1 keeps every row);
# the fixed seed makes the permutation repeatable on a fresh kernel.
# NOTE: each name is rebound to its own shuffled copy, so re-running this cell
# shuffles an already shuffled frame and yields a different row order.
feature_df, feature_df_real_and_fake = (
    frame.sample(frac=1, random_state=42)
    for frame in (feature_df, feature_df_real_and_fake)
)

In [None]:
### Linear SVM

# Candidate columns for the baseline model, plus the target column 'label'
features = [
    'sentiment_vader',
    'readability',
    'lexical_diversity',
    'superlatives',
    'exclamation_marks',
]
X, y = feature_df[features], feature_df['label']

print(X)

In [None]:
# Hold out 20% of the rows for testing. train_test_split shuffles by default,
# and the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Baseline estimator: linear SVM with balanced class weights
model = LinearSVC(class_weight='balanced', random_state=42)

# Sequential backward selection (forward=False, no floating) down to a single
# feature; every candidate subset is scored with 5-fold CV macro-F1.
sbs = SFS(
    model,
    k_features=1,
    forward=False,
    floating=False,
    scoring='f1_macro',
    cv=5,
).fit(X_train, y_train)

# Names of the features in the final (size k_features) subset
final_features = list(sbs.k_feature_names_)
print('Final features:', final_features)


In [None]:
# Inspect the selection path: for every subset size the selector visited,
# show the retained features and the cross-validated score.
for n_kept, subset in sbs.subsets_.items():
    print(f'Number of features: {n_kept}')
    print('Selected features:', subset['feature_names'])
    print('CV score:', subset['avg_score'])
    print('-' * 50)


In [None]:
# Refit the baseline on three hard-coded feature columns and predict the test set
baseline_columns = ['sentiment_vader', 'readability', 'lexical_diversity']
model.fit(X_train[baseline_columns], y_train)
y_pred = model.predict(X_test[baseline_columns])

In [None]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score

# Macro-averaged scores of the baseline predictions on the held-out test set
f1 = f1_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')

print(f'The F1 score of the Linear SVM model with selected features is: {f1:.2f}')
print(f'The precision of the Linear SVM model with selected features is: {precision:.2f}')
print(f'The recall of the Linear SVM model with selected features is: {recall:.2f}')

# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', cm, sep='\n')

#### Real vs. Fake News

In [None]:
# Candidate columns for the real-vs-fake task, plus the target column 'label'.
# Note that this rebinds `features` to a four-column list.
features = [
    'sentiment_vader',
    'readability',
    'lexical_diversity',
    'superlatives',
]
X_real_and_fake, y_real_and_fake = (
    feature_df_real_and_fake[features],
    feature_df_real_and_fake['label'],
)

print(X_real_and_fake)

In [None]:
# Hold out 20% of the real-vs-fake rows for testing (shuffled by default, fixed seed)
(
    X_train_real_and_fake,
    X_test_real_and_fake,
    y_train_real_and_fake,
    y_test_real_and_fake,
) = train_test_split(
    X_real_and_fake,
    y_real_and_fake,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline estimator for the real-vs-fake task: linear SVM (no class weighting)
model_real_and_fake = LinearSVC(random_state=42)

# Sequential backward selection (forward=False, no floating) down to a single
# feature; every candidate subset is scored with 5-fold CV accuracy.
sbs_real_and_fake = SFS(model_real_and_fake, 
           k_features=1, 
           forward=False, 
           floating=False, 
           scoring='accuracy',
           cv=5)

sbs_real_and_fake = sbs_real_and_fake.fit(X_train_real_and_fake, y_train_real_and_fake)

# Get the final set of features from THIS selector. Reading `sbs` here would
# report the result of the official-communication-vs-fake selection instead.
final_features = list(sbs_real_and_fake.k_feature_names_)
print('Final features:', final_features)


In [None]:
# Inspect the selection path of the real-vs-fake selector: for every subset
# size it visited, show the retained features and the cross-validated score.
for n_kept, subset in sbs_real_and_fake.subsets_.items():
    print(f'Number of features: {n_kept}')
    print('Selected features:', subset['feature_names'])
    print('CV score:', subset['avg_score'])
    print('-' * 50)


In [None]:
# Refit the real-vs-fake baseline on three hard-coded feature columns and
# predict the held-out test set
real_and_fake_columns = ['sentiment_vader', 'readability', 'lexical_diversity']
model_real_and_fake.fit(X_train_real_and_fake[real_and_fake_columns], y_train_real_and_fake)
y_pred_real_and_fake = model_real_and_fake.predict(X_test_real_and_fake[real_and_fake_columns])

In [None]:
# Accuracy of the real-vs-fake baseline on the held-out test set
acc_real_and_fake = accuracy_score(y_test_real_and_fake, y_pred_real_and_fake)
print(f'The Accuracy score of the Linear SVM model with selected features is: {acc_real_and_fake:.2f}')

# Confusion matrix of true vs. predicted labels
cm_real_and_fake = confusion_matrix(y_test_real_and_fake, y_pred_real_and_fake)
print('Confusion Matrix:', cm_real_and_fake, sep='\n')

### Advanced Model 

#### Official Communication vs. Fake News

In [None]:
# Restrict the advanced model to the three columns used for the baseline refit;
# this rebinds `features`, `X` and `y`.
features = [
    'sentiment_vader',
    'readability',
    'lexical_diversity',
]
X, y = feature_df[features], feature_df['label']

print(X)

In [None]:
# Hold out 20% of the rows for testing (shuffled by default, fixed seed);
# this rebinds the train/test variables of the baseline section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Hyper-parameter grid: 3 values of C x 2 values of gamma x 2 kernels = 12 candidates
param_grid = {'C': [0.1, 10, 1000], 
              'gamma': [0.1, 0.001],
              'kernel': ['rbf', 'poly']}

# Kernel SVC with balanced class weights. With probability=True the estimator
# fits an internal cross-validated calibration that shuffles the data, so it is
# seeded like every other stochastic step in this notebook.
svc = SVC(probability=True, class_weight='balanced', random_state=42)

# Stratified 5-fold CV with a fixed shuffle so the folds are reproducible
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by macro-F1, using all CPU cores
grid_search = GridSearchCV(svc, param_grid, cv=cv, verbose=3, n_jobs=-1, scoring='f1_macro')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Get the best (mean cross-validated macro-F1) score
best_score = grid_search.best_score_
print(f"Best score: {best_score}")

# Use the best estimator (refit on the full training set) for test predictions
best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(X_test)


In [None]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score

# Test-set metrics for the grid-searched kernel SVC (`best_svc`). The messages
# name that model explicitly so the output cannot be mistaken for the
# Linear SVM baseline results printed earlier in the notebook.

# Calculate and print F1 score
f1 = f1_score(y_test, y_pred, average='macro')
print(f'The F1 score of the grid-searched SVC model with selected features is: {f1:.2f}')

# Calculate and print precision
precision = precision_score(y_test, y_pred, average='macro')
print(f'The precision of the grid-searched SVC model with selected features is: {precision:.2f}')

# Calculate and print recall
recall = recall_score(y_test, y_pred, average='macro')
print(f'The recall of the grid-searched SVC model with selected features is: {recall:.2f}')

# Print confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)