In [26]:
# Base Imports
import pandas as pd
import matplotlib.pyplot as plt

#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# NLTK Imports
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment import util
from nltk import pos_tag

# Others
import re
import pickle

# Readability
import textstat
# Boxplot
import seaborn as sns

In [None]:
# Read the testing feature table from disk and display it
feature_csv_path = 'feature_df_test.csv'
feature_df = pd.read_csv(filepath_or_buffer=feature_csv_path)
feature_df

In [46]:
# feature_df = pd.read_csv('feature_df.csv')
# feature_df

Unnamed: 0.1,Unnamed: 0,favourites,retweets,followers,source,label,preprocessed,length_preprocessed,sentiment_BERT,sentiment_RoBERTa,sentiment_vader,readability,exclamation_marks,question_marks,digits,hashtags,lexical_diversity,superlatives
0,0,50.0,20.0,691748.0,official,-1,"['the', 'drivethrough', 'coronavirus', 'testin...",157,4,1,0.0000,17.8,0.0,0.0,0.0,1.0,0.958333,0
1,1,58.0,25.0,12269236.0,official,-1,"['and', 'who', 'are', 'exploring', 'how', 'the...",224,5,1,0.4951,10.7,0.0,0.0,0.0,0.0,0.878788,1
2,2,555.0,236.0,12269233.0,official,-1,"['media', 'briefing', 'on', 'covid19', 'with']",32,4,1,0.0000,14.2,0.0,0.0,0.0,1.0,1.000000,0
3,3,0.0,3251.0,512848.0,official,-1,"['rt', 'the', 'new', 'nhscovid19app', 'now', '...",125,5,1,-0.1901,10.7,0.0,0.0,0.0,1.0,0.954545,1
4,4,,,,competition,1,"['dr', 'yan', 'presented', 'evidence', 'covid'...",190,1,1,0.6249,5.0,0.0,0.0,0.0,1.0,0.962963,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28604,28604,1.0,0.0,2509.0,truth_seeker,1,"['omicron', 'is', 'genetically', 'distant', 'e...",290,4,1,-0.1779,3.1,0.0,0.0,0.0,0.0,0.928571,0
28605,28605,123.0,15.0,691756.0,official,-1,"['update', 'on', 'covidー19', 'testing', 'in', ...",203,2,1,-0.5267,4.7,0.0,0.0,0.0,0.0,0.885714,1
28606,28606,39.0,33.0,691757.0,official,-1,"['the', 'pandemic', 'is', 'affecting', 'us', '...",244,5,1,-0.9329,3.5,0.0,0.0,0.0,0.0,0.846154,0
28607,28607,129.0,51.0,12269355.0,official,-1,"['the', 'interim', 'guideline', 'on', 'the', '...",168,4,1,-0.7579,4.0,0.0,0.0,0.0,0.0,0.833333,0


### Baseline Classifier

#### Official Communication vs. Fake News

In [47]:
# Randomly permute all rows (frac=1 keeps every row); the fixed seed makes
# the resulting order reproducible.
feature_df = feature_df.sample(random_state=42, frac=1)
# feature_df_real_and_fake = feature_df_real_and_fake.sample(frac=1, random_state=42)

In [48]:
# Keep only the target column plus the engineered text features; every other
# column of the loaded table is dropped from here on.
model_columns = [
    'label',
    'sentiment_BERT',
    'sentiment_RoBERTa',
    'sentiment_vader',
    'readability',
    'exclamation_marks',
    'question_marks',
    'digits',
    'hashtags',
    'lexical_diversity',
    'superlatives',
]
feature_df = feature_df[model_columns]
feature_df

Unnamed: 0,label,sentiment_BERT,sentiment_RoBERTa,sentiment_vader,readability,exclamation_marks,question_marks,digits,hashtags,lexical_diversity,superlatives
10853,1,1,0,0.3995,15.4,0.0,0.0,0.0,2.0,0.900000,0
4034,1,1,0,0.6369,8.5,0.0,0.0,0.0,1.0,0.785714,0
3858,-1,5,2,0.0772,3.4,0.0,1.0,0.0,2.0,0.777778,0
11946,1,2,0,0.7783,6.8,0.0,0.0,0.0,1.0,0.961538,0
17497,1,1,0,-0.6908,15.4,0.0,0.0,0.0,0.0,0.755556,0
...,...,...,...,...,...,...,...,...,...,...,...
21575,1,1,0,-0.9034,4.1,0.0,2.0,0.0,0.0,0.972222,0
5390,-1,3,1,-0.3818,7.4,0.0,0.0,0.0,0.0,0.923077,0
860,-1,4,1,0.2023,7.5,0.0,0.0,0.0,2.0,0.756757,0
15795,-1,5,1,0.0000,8.0,0.0,0.0,0.0,1.0,0.884615,0


In [41]:
# Report the number of missing values per column, then the total row count
missing_per_column = feature_df.isna().sum()
print(missing_per_column)
print(feature_df.shape[0])

label                0
sentiment_BERT       0
sentiment_RoBERTa    0
sentiment_vader      3
readability          3
exclamation_marks    3
question_marks       3
digits               3
hashtags             3
lexical_diversity    0
superlatives         0
dtype: int64
28609


In [50]:
# Drop every row that contains at least one missing value.
# Rebinding the name (rather than dropna(inplace=True)) avoids mutating a
# frame that was derived from a column selection, which is what triggers
# pandas' SettingWithCopyWarning.
feature_df = feature_df.dropna()
len(feature_df)

28606

In [53]:
### Linear SVM

# Predictors are every column except the target; the target is 'label'
features = feature_df.columns.drop('label')
X = feature_df.loc[:, features]
y = feature_df.loc[:, 'label']

print(X)

       sentiment_BERT  sentiment_RoBERTa  sentiment_vader  readability  \
10853               1                  0           0.3995         15.4   
4034                1                  0           0.6369          8.5   
3858                5                  2           0.0772          3.4   
11946               2                  0           0.7783          6.8   
17497               1                  0          -0.6908         15.4   
...               ...                ...              ...          ...   
21575               1                  0          -0.9034          4.1   
5390                3                  1          -0.3818          7.4   
860                 4                  1           0.2023          7.5   
15795               5                  1           0.0000          8.0   
23654               5                  1           0.6116          8.0   

       exclamation_marks  question_marks  digits  hashtags  lexical_diversity  \
10853                0.0      

In [54]:
# Hold out 20% of the rows for testing (train_test_split shuffles by default);
# the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [58]:
# Baseline estimator: a linear SVM with class weights set to 'balanced'
model = LinearSVC(class_weight='balanced', random_state=42)

# Sequential backward selection (forward=False, no floating step): run the
# search down to a single feature, scoring candidate subsets by 5-fold
# cross-validated accuracy on the training split.
sbs = SFS(
    model,
    k_features=1,
    forward=False,
    floating=False,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Names of the features that remain at the end of the search
final_features = list(sbs.k_feature_names_)
print('Final features:', final_features)




Final features: ['sentiment_RoBERTa']


In [59]:
# Inspect the search: for every subset size recorded by the selector, show
# the feature names in that subset and its stored average CV score.
for n_features, subset in sbs.subsets_.items():
    print(f'Number of features: {n_features}')
    print('Selected features:', subset['feature_names'])
    print('CV score:', subset['avg_score'])
    print('-' * 50)


Number of features: 10
Selected features: ('sentiment_BERT', 'sentiment_RoBERTa', 'sentiment_vader', 'readability', 'exclamation_marks', 'question_marks', 'digits', 'hashtags', 'lexical_diversity', 'superlatives')
CV score: 0.7805454472881281
--------------------------------------------------
Number of features: 9
Selected features: ('sentiment_BERT', 'sentiment_RoBERTa', 'sentiment_vader', 'exclamation_marks', 'question_marks', 'digits', 'hashtags', 'lexical_diversity', 'superlatives')
CV score: 0.7809824242831671
--------------------------------------------------
Number of features: 8
Selected features: ('sentiment_BERT', 'sentiment_RoBERTa', 'sentiment_vader', 'question_marks', 'digits', 'hashtags', 'lexical_diversity', 'superlatives')
CV score: 0.7810261210277597
--------------------------------------------------
Number of features: 7
Selected features: ('sentiment_BERT', 'sentiment_RoBERTa', 'question_marks', 'digits', 'hashtags', 'lexical_diversity', 'superlatives')
CV score: 0.7

In [None]:
# Fit the baseline model on a hand-picked feature subset and predict on the
# held-out test split. The column list is defined once so that training and
# prediction can never use different columns.
# NOTE(review): this subset is hard-coded and is not the `final_features`
# list produced by the backward selection above — confirm it is the subset
# intended from the per-size CV scores.
baseline_features = ['sentiment_vader', 'readability', 'lexical_diversity']
model.fit(X_train[baseline_features], y_train)
y_pred = model.predict(X_test[baseline_features])

In [None]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score

# Accuracy on the held-out test split
acc = accuracy_score(y_test, y_pred)
print(f'The accuracy score of the Linear SVM model with selected features is: {acc:.2f}')

# Macro-averaged F1, precision and recall, computed in that order
f1, precision, recall = (
    scorer(y_test, y_pred, average='macro')
    for scorer in (f1_score, precision_score, recall_score)
)
for metric_name, metric_value in (('F1 score', f1),
                                  ('precision', precision),
                                  ('recall', recall)):
    print(f'The {metric_name} of the Linear SVM model with selected features is: {metric_value:.2f}')

# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)

#### Real vs. Fake News

In [None]:
# # Define your feature set and target variable
# features = ['sentiment_vader', 'readability', 'lexical_diversity', 'superlatives']
# X_real_and_fake = feature_df_real_and_fake[features]
# y_real_and_fake = feature_df_real_and_fake['label']

# print(X_real_and_fake)

In [None]:
# # Split the data into training and testing sets (shuffle by default)
# X_train_real_and_fake, X_test_real_and_fake, y_train_real_and_fake, y_test_real_and_fake = train_test_split(X_real_and_fake,
#                                                     y_real_and_fake,
#                                                     test_size=0.2,
#                                                     random_state=42)


In [None]:
# # Create and fit the model
# model_real_and_fake = LinearSVC(random_state=42)

# # Sequential backward selection
# sbs_real_and_fake = SFS(model_real_and_fake, 
#            k_features=1, 
#            forward=False, 
#            floating=False, 
#            scoring='accuracy',
#            cv=5)

# sbs_real_and_fake = sbs_real_and_fake.fit(X_train_real_and_fake, y_train_real_and_fake)

# # Get the final set of features
# final_features = list(sbs.k_feature_names_)
# print('Final features:', final_features)


In [None]:
# # So what is going on? 
# for k in sbs_real_and_fake.subsets_:
#     print(f'Number of features: {k}')
#     print('Selected features:', sbs_real_and_fake.subsets_[k]['feature_names'])
#     print('CV score:', sbs_real_and_fake.subsets_[k]['avg_score'])
#     print('-' * 50)


In [None]:
# # Fit model with final features and get accuracy
# model_real_and_fake.fit(X_train_real_and_fake[['sentiment_vader','readability','lexical_diversity']], y_train_real_and_fake)
# y_pred_real_and_fake = model_real_and_fake.predict(X_test_real_and_fake[['sentiment_vader', 'readability', 'lexical_diversity']])

In [None]:
# # Calculate and print Accuracy
# acc_real_and_fake = accuracy_score(y_test_real_and_fake, y_pred_real_and_fake)
# print(f'The Accuracy score of the Linear SVM model with selected features is: {acc_real_and_fake:.2f}')

# # Print confusion matrix
# cm_real_and_fake = confusion_matrix(y_test_real_and_fake, y_pred_real_and_fake)
# print('Confusion Matrix:')
# print(cm_real_and_fake)

### Advanced Model 

#### Official Communication vs. Fake News

In [None]:
# Restrict the predictors to the three previously chosen features
features = ['sentiment_vader', 'readability', 'lexical_diversity']
X = feature_df.loc[:, features]
y = feature_df.loc[:, 'label']

print(X)

In [None]:
# Hold out 20% of the rows for testing (train_test_split shuffles by default);
# the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Hyper-parameter search for a kernel SVM.
# GridSearchCV, SVC and StratifiedKFold are already imported in the
# notebook's import cell, so they are not re-imported here.

# Candidate settings: 3 values of C x 2 values of gamma x 2 kernels
param_grid = {'C': [0.1, 10, 1000],
              'gamma': [0.1, 0.001],
              'kernel': ['rbf', 'poly']}

# probability=True makes SVC fit an additional internal cross-validated
# calibration that relies on random shuffling; pinning random_state keeps
# the fitted probability estimates reproducible across runs.
# NOTE(review): X is passed to the SVM without any scaling step here —
# consider standardising the features; TODO confirm.
svc = SVC(probability=True, class_weight='balanced', random_state=42)

# Stratified, shuffled 5-fold CV with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored by macro-averaged F1,
# using all available cores
grid_search = GridSearchCV(svc, param_grid, cv=cv, verbose=3, n_jobs=-1, scoring='f1_macro')

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Best parameter combination found by the search
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Mean cross-validated score of the best combination
best_score = grid_search.best_score_
print(f"Best score: {best_score}")

# Predict on the held-out test split with the refitted best estimator
best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(X_test)


In [None]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score

# Evaluation of the grid-searched SVC on the held-out test split.
# The messages name the tuned SVC: y_pred here comes from the best estimator
# of the grid search (rbf/poly kernels), not from the linear baseline.

# Macro-averaged F1 score
f1 = f1_score(y_test, y_pred, average='macro')
print(f'The F1 score of the tuned SVC model with selected features is: {f1:.2f}')

# Macro-averaged precision
precision = precision_score(y_test, y_pred, average='macro')
print(f'The precision of the tuned SVC model with selected features is: {precision:.2f}')

# Macro-averaged recall
recall = recall_score(y_test, y_pred, average='macro')
print(f'The recall of the tuned SVC model with selected features is: {recall:.2f}')

# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)