In [26]:
# Base Imports
import pandas as pd
import matplotlib.pyplot as plt

#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# NLTK Imports
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment import util
from nltk import pos_tag

# Others
import re
import pickle

# Readability
import textstat
# Boxplot
import seaborn as sns

In [64]:
# load in the testing feature_df
feature_df = pd.read_csv('feature_df.csv')
feature_df

Unnamed: 0.1,Unnamed: 0,favourites,retweets,followers,source,label,preprocessed,length_preprocessed,sentiment_BERT,sentiment_RoBERTa,...,dominance,arousal,sentiment_vader,readability,exclamation_marks,question_marks,digits,hashtags,lexical_diversity,superlatives
0,0,50.0,20.0,691748.0,official,-1,"['the', 'drivethrough', 'coronavirus', 'testin...",157,4,1,...,0.630947,0.626204,0.0000,6.8,0.0,0.0,0.0,1.0,0.958333,0
1,1,58.0,25.0,12269236.0,official,-1,"['and', 'who', 'are', 'exploring', 'how', 'the...",224,5,1,...,0.634640,0.612494,0.4019,17.7,0.0,0.0,0.0,0.0,0.878788,1
2,2,555.0,236.0,12269233.0,official,-1,"['media', 'briefing', 'on', 'covid19', 'with']",32,4,1,...,0.618594,0.622606,0.0000,2.9,0.0,0.0,0.0,1.0,1.000000,0
3,3,0.0,3251.0,512848.0,official,-1,"['rt', 'the', 'new', 'nhscovid19app', 'now', '...",125,5,1,...,0.617799,0.632323,-0.2732,6.0,0.0,0.0,0.0,1.0,0.954545,1
4,4,,,,competition,1,"['dr', 'yan', 'presented', 'evidence', 'covid'...",190,1,1,...,0.621102,0.628665,0.8074,9.9,0.0,0.0,0.0,1.0,0.962963,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28604,28604,1.0,0.0,2509.0,truth_seeker,1,"['omicron', 'is', 'genetically', 'distant', 'e...",290,4,1,...,0.622211,0.626217,-0.3182,9.9,0.0,0.0,0.0,0.0,0.928571,0
28605,28605,123.0,15.0,691756.0,official,-1,"['update', 'on', 'covidー19', 'testing', 'in', ...",203,2,1,...,0.633660,0.632345,-0.1027,8.9,0.0,0.0,0.0,0.0,0.885714,1
28606,28606,39.0,33.0,691757.0,official,-1,"['the', 'pandemic', 'is', 'affecting', 'us', '...",244,5,1,...,0.621570,0.634196,-0.3400,5.2,0.0,0.0,0.0,0.0,0.846154,0
28607,28607,129.0,51.0,12269355.0,official,-1,"['the', 'interim', 'guideline', 'on', 'the', '...",168,4,1,...,0.618315,0.613622,0.0000,14.2,0.0,0.0,0.0,0.0,0.833333,0


### Baseline Classififer

#### Official Communication vs. Fake News

In [62]:
# shuffle the feature_df
feature_df = feature_df.sample(frac=1, random_state=42)
# feature_df_real_and_fake = feature_df_real_and_fake.sample(frac=1, random_state=42)

In [66]:
feature_df = feature_df[['label', 'sentiment_BERT', 'sentiment_RoBERTa', 'concreteness', 'valence', 'dominance', 'arousal', 'sentiment_vader', 'readability', 'exclamation_marks', 'question_marks', 'digits', 'hashtags', 'lexical_diversity', 'superlatives']]
feature_df

Unnamed: 0,label,sentiment_BERT,sentiment_RoBERTa,concreteness,valence,dominance,arousal,sentiment_vader,readability,exclamation_marks,question_marks,digits,hashtags,lexical_diversity,superlatives
0,-1,4,1,1.0,0.606417,0.630947,0.626204,0.0000,6.8,0.0,0.0,0.0,1.0,0.958333,0
1,-1,5,1,1.0,0.646490,0.634640,0.612494,0.4019,17.7,0.0,0.0,0.0,0.0,0.878788,1
2,-1,4,1,1.0,0.633763,0.618594,0.622606,0.0000,2.9,0.0,0.0,0.0,1.0,1.000000,0
3,-1,5,1,1.0,0.610125,0.617799,0.632323,-0.2732,6.0,0.0,0.0,0.0,1.0,0.954545,1
4,1,1,1,1.0,0.609986,0.621102,0.628665,0.8074,9.9,0.0,0.0,0.0,1.0,0.962963,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28604,1,4,1,1.0,0.603416,0.622211,0.626217,-0.3182,9.9,0.0,0.0,0.0,0.0,0.928571,0
28605,-1,2,1,1.0,0.642929,0.633660,0.632345,-0.1027,8.9,0.0,0.0,0.0,0.0,0.885714,1
28606,-1,5,1,0.0,0.564541,0.621570,0.634196,-0.3400,5.2,0.0,0.0,0.0,0.0,0.846154,0
28607,-1,4,1,1.0,0.617875,0.618315,0.613622,0.0000,14.2,0.0,0.0,0.0,0.0,0.833333,0


In [67]:
print(feature_df.isna().sum())
print(len(feature_df))

label                0
sentiment_BERT       0
sentiment_RoBERTa    0
concreteness         0
valence              0
dominance            0
arousal              0
sentiment_vader      0
readability          0
exclamation_marks    3
question_marks       3
digits               3
hashtags             3
lexical_diversity    0
superlatives         0
dtype: int64
28609


In [68]:
feature_df = feature_df.dropna()
len(feature_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_df.dropna(inplace=True)


28606

In [69]:
### Linear SVM

# Define your feature set and target variable
features = feature_df.columns.drop('label')
X = feature_df[features]
y = feature_df['label']

print(X)

       sentiment_BERT  sentiment_RoBERTa  concreteness   valence  dominance  \
0                   4                  1           1.0  0.606417   0.630947   
1                   5                  1           1.0  0.646490   0.634640   
2                   4                  1           1.0  0.633763   0.618594   
3                   5                  1           1.0  0.610125   0.617799   
4                   1                  1           1.0  0.609986   0.621102   
...               ...                ...           ...       ...        ...   
28604               4                  1           1.0  0.603416   0.622211   
28605               2                  1           1.0  0.642929   0.633660   
28606               5                  1           0.0  0.564541   0.621570   
28607               4                  1           1.0  0.617875   0.618315   
28608               4                  0           0.0  0.613032   0.613898   

        arousal  sentiment_vader  readability  excl

In [70]:
# Split the data into training and testing sets (shuffle by default)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

In [71]:
# Create and fit the model
model = LinearSVC(random_state=42, class_weight='balanced')

# Sequential backward selection
sbs = SFS(model, 
           k_features=1, 
           forward=False, 
           floating=False, 
           scoring='accuracy',
           cv=5)

sbs = sbs.fit(X_train, y_train)

# Get the final set of features
final_features = list(sbs.k_feature_names_)
print('Final features:', final_features)




Final features: ['sentiment_RoBERTa']


In [72]:
# So what is going on? 
for k in sbs.subsets_:
    print(f'Number of features: {k}')
    print('Selected features:', sbs.subsets_[k]['feature_names'])
    print('CV score:', sbs.subsets_[k]['avg_score'])
    print('-' * 50)


Number of features: 14
Selected features: ('sentiment_BERT', 'sentiment_RoBERTa', 'concreteness', 'valence', 'dominance', 'arousal', 'sentiment_vader', 'readability', 'exclamation_marks', 'question_marks', 'digits', 'hashtags', 'lexical_diversity', 'superlatives')
CV score: 0.7832985427288464
--------------------------------------------------
Number of features: 13
Selected features: ('sentiment_BERT', 'sentiment_RoBERTa', 'concreteness', 'valence', 'dominance', 'arousal', 'readability', 'exclamation_marks', 'question_marks', 'digits', 'hashtags', 'lexical_diversity', 'superlatives')
CV score: 0.783866524015639
--------------------------------------------------
Number of features: 12
Selected features: ('sentiment_BERT', 'sentiment_RoBERTa', 'concreteness', 'valence', 'dominance', 'arousal', 'readability', 'exclamation_marks', 'digits', 'hashtags', 'lexical_diversity', 'superlatives')
CV score: 0.7844346676373659
--------------------------------------------------
Number of features: 11

In [105]:
final_features = ['sentiment_BERT', 'sentiment_RoBERTa', 'dominance', 'arousal','superlatives', 'readability']

In [106]:
# Fit model with final features and get accuracy
model.fit(X_train[final_features], y_train)
y_pred = model.predict(X_test[final_features])



In [107]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score

# Calculate and print F1 score
acc = accuracy_score(y_test, y_pred)
print(f'The accuracy score of the Linear SVM model with selected features is: {acc:.4f}')

# Calculate and print F1 score
f1 = f1_score(y_test, y_pred, average='macro')
print(f'The F1 score of the Linear SVM model with selected features is: {f1:.4f}')

# Calculate and print precision
precision = precision_score(y_test, y_pred, average='macro')
print(f'The precision of the Linear SVM model with selected features is: {precision:.4f}')

# Calculate and print recall
recall = recall_score(y_test, y_pred, average='macro')
print(f'The recall of the Linear SVM model with selected features is: {recall:.4f}')

# Print confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('\nConfusion Matrix:')
print(cm)

The accuracy score of the Linear SVM model with selected features is: 0.7810
The F1 score of the Linear SVM model with selected features is: 0.7797
The precision of the Linear SVM model with selected features is: 0.7791
The recall of the Linear SVM model with selected features is: 0.7816

Confusion Matrix:
[[2457  710]
 [ 543 2012]]


In [113]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# define feature selection
fs = SelectKBest(score_func=f_classif, k=5)

# apply feature selection
X_selected = fs.fit_transform(X, y)
print(X_selected.shape)

# Get the mask of selected feature indices
mask = fs.get_support()

# Get the names of the selected features
selected_features = X.columns[mask]
print(selected_features)

(28606, 5)
Index(['sentiment_BERT', 'sentiment_RoBERTa', 'valence', 'dominance',
       'sentiment_vader'],
      dtype='object')


  f = msb / msw


In [115]:
# Tree-baed model

from sklearn.ensemble import RandomForestClassifier

# Initialize the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model
rf.fit(X_train, y_train)

# Get feature importances
importances = rf.feature_importances_

# Create a list of (feature, importance) tuples
feature_importances = [(feature, importance) for feature, importance in zip(X_train.columns, importances)]

# Sort the feature importances by most important first
feature_importances_sorted = sorted(feature_importances, key = lambda x: x[1], reverse=True)

# Print feature importances
for feature, importance in feature_importances_sorted:
    print(f"Feature: {feature}, Importance: {importance}")

Feature: sentiment_BERT, Importance: 0.15862658360851092
Feature: sentiment_RoBERTa, Importance: 0.1569556096962467
Feature: valence, Importance: 0.12564716497128292
Feature: dominance, Importance: 0.11212870570028369
Feature: arousal, Importance: 0.10964379508408235
Feature: sentiment_vader, Importance: 0.09769969475869023
Feature: readability, Importance: 0.0863843241540005
Feature: lexical_diversity, Importance: 0.07761687869982559
Feature: hashtags, Importance: 0.025909960718554767
Feature: superlatives, Importance: 0.016172739852404288
Feature: concreteness, Importance: 0.014910564657134496
Feature: question_marks, Importance: 0.010975058392375843
Feature: exclamation_marks, Importance: 0.00732891970660775
Feature: digits, Importance: 0.0


#### Real vs. Fake News

In [None]:
# # Define your feature set and target variable
# features = ['sentiment_vader', 'readability', 'lexical_diversity', 'superlatives']
# X_real_and_fake = feature_df_real_and_fake[features]
# y_real_and_fake = feature_df_real_and_fake['label']

# print(X_real_and_fake)

In [None]:
# # Split the data into training and testing sets (shuffle by default)
# X_train_real_and_fake, X_test_real_and_fake, y_train_real_and_fake, y_test_real_and_fake = train_test_split(X_real_and_fake,
#                                                     y_real_and_fake,
#                                                     test_size=0.2,
#                                                     random_state=42)


In [None]:
# # Create and fit the model
# model_real_and_fake = LinearSVC(random_state=42)

# # Sequential backward selection
# sbs_real_and_fake = SFS(model_real_and_fake, 
#            k_features=1, 
#            forward=False, 
#            floating=False, 
#            scoring='accuracy',
#            cv=5)

# sbs_real_and_fake = sbs_real_and_fake.fit(X_train_real_and_fake, y_train_real_and_fake)

# # Get the final set of features
# final_features = list(sbs.k_feature_names_)
# print('Final features:', final_features)


In [None]:
# # So what is going on? 
# for k in sbs_real_and_fake.subsets_:
#     print(f'Number of features: {k}')
#     print('Selected features:', sbs_real_and_fake.subsets_[k]['feature_names'])
#     print('CV score:', sbs_real_and_fake.subsets_[k]['avg_score'])
#     print('-' * 50)


In [None]:
# # Fit model with final features and get accuracy
# model_real_and_fake.fit(X_train_real_and_fake[['sentiment_vader','readability','lexical_diversity']], y_train_real_and_fake)
# y_pred_real_and_fake = model_real_and_fake.predict(X_test_real_and_fake[['sentiment_vader', 'readability', 'lexical_diversity']])

In [None]:
# # Calculate and print Accuracy
# acc_real_and_fake = accuracy_score(y_test_real_and_fake, y_pred_real_and_fake)
# print(f'The Accuracy score of the Linear SVM model with selected features is: {acc_real_and_fake:.2f}')

# # Print confusion matrix
# cm_real_and_fake = confusion_matrix(y_test_real_and_fake, y_pred_real_and_fake)
# print('Confusion Matrix:')
# print(cm_real_and_fake)

In [None]:
# Tree model 


### Advanced Models 

#### Official Communication vs. Fake News

In [None]:
# Define previously chosen features 
features = ['sentiment_vader','readability','lexical_diversity']
X = feature_df[features]
y = feature_df['label']

print(X)

In [None]:
# Split the data into training and testing sets (shuffle by default)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define the parameter grid
param_grid = {'C': [0.1, 10, 1000], 
              'gamma': [0.1, 0.001],
              'kernel': ['rbf', 'poly']}

# Create a SVC model
svc = SVC(probability=True, class_weight='balanced')

# Use Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create the GridSearchCV model
grid_search = GridSearchCV(svc, param_grid, cv=cv, verbose=3, n_jobs=-1, scoring='f1_macro')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Get the best score
best_score = grid_search.best_score_
print(f"Best score: {best_score}")

# Use the best estimator for predictions
best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(X_test)


In [None]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score

# Calculate and print F1 score
f1 = f1_score(y_test, y_pred, average='macro')
print(f'The F1 score of the Linear SVM model with selected features is: {f1:.2f}')

# Calculate and print precision
precision = precision_score(y_test, y_pred, average='macro')
print(f'The precision of the Linear SVM model with selected features is: {precision:.2f}')

# Calculate and print recall
recall = recall_score(y_test, y_pred, average='macro')
print(f'The recall of the Linear SVM model with selected features is: {recall:.2f}')

# Print confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)