## Claims Data: Relationships to Twitter Engagement, and Potential Indicators of Validity

## Imports

In [156]:
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import imblearn
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score


In [157]:
df = pd.read_excel('/Users/mfeene/Desktop/hackathon_2021/claims_topics_eng.xlsx')

# fill missing values
df = df.fillna(0)


df = df[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19, 'eval', 'tweet_counts', 'replies']]

df.columns = ['topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9', 
             'topic_10', 'topic_11', 'topic_12', 'topic_13', 'topic_14', 'topic_15', 'topic_16', 'topic_17',
             'topic_18', 'topic_19', 'topic_20', 'eval', 'tweet_counts', 'replies']


In [158]:
df.head()

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,eval,tweet_counts,replies
0,0.003132,0.001405,0.010406,0.999999,0.00462,0.011083,0.010577,0.00106,0.002529,0.000719,...,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.169631,real,1.0,2.0
1,0.003132,0.001405,0.999999,0.009513,0.00462,0.011083,0.010577,0.00106,0.002529,0.000719,...,0.00138,0.000522,0.001823,0.002395,0.002773,0.173437,0.16943,real,10.0,14.0
2,0.999999,0.001405,0.010406,0.009513,0.00462,0.011083,0.010577,0.00106,0.002529,0.000719,...,0.00138,0.001034,0.001228,0.002395,0.002773,0.175692,0.170225,real,0.0,0.0
3,0.005846,0.001405,0.010406,0.009513,0.999999,0.011083,0.010577,0.00106,0.002529,0.000719,...,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.169681,real,0.0,0.0
4,0.999999,0.001405,0.010406,0.009513,0.006246,0.011083,0.010577,0.00106,0.002529,0.000719,...,0.00138,0.000641,0.001823,0.002395,0.002773,0.174114,0.169673,real,0.0,0.0


## Topic and Twitter Engagement Correlations 

In [159]:
### Correlations with topic and engagement
# The more likely that a claim was to belong to a certain topic, the higher engagement

In [166]:
df = pd.read_excel('/Users/mfeene/Desktop/hackathon_2021/claims_topics_eng.xlsx')

# fill missing values
df = df.fillna(0)

tweet_eng = df[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19, 'tweet_counts']].corr().iloc[-1:]
#tweet_eng.to_csv('tweet_eng.csv')

tweet_eng


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,tweet_counts
tweet_counts,-0.081106,-0.053357,-0.050753,-0.06962,-0.071533,-0.067407,-0.038546,-0.003051,-0.039775,0.029378,...,0.057071,0.011244,-0.021774,-0.018677,0.031587,-0.000309,0.007993,0.05171,0.035636,1.0


In [167]:
df = pd.read_excel('/Users/mfeene/Desktop/hackathon_2021/claims_topics_eng.xlsx')

# fill missing values
df = df.fillna(0)

rep_eng = df[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19, 'replies']].corr().iloc[-1:]
#rep_eng.to_csv('rep_eng.csv')

rep_eng


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,replies
replies,-0.076354,-0.060291,-0.052998,-0.072363,-0.050655,-0.067874,-0.041973,0.052444,-0.033757,0.029216,...,0.059004,0.000938,-0.014005,-0.020968,0.024363,-0.016541,-0.008914,0.093077,0.035675,1.0


## Topics as Potential Indicators of Claim Validity

In [168]:
df = pd.read_excel('/Users/mfeene/Desktop/hackathon_2021/claims_topics_eng.xlsx')

# fill missing values
df = df.fillna(0)


df = df[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19, 'eval', 'tweet_counts', 'replies']]

df.columns = ['topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9', 
             'topic_10', 'topic_11', 'topic_12', 'topic_13', 'topic_14', 'topic_15', 'topic_16', 'topic_17',
             'topic_18', 'topic_19', 'topic_20', 'eval', 'tweet_counts', 'tweet_replies']


# Also encode the target variable
lb_make = LabelEncoder()
df['eval_encoded'] = lb_make.fit_transform(df['eval']) # 0 = fake, 1 = real
df = df.drop('eval', axis = 1)


In [169]:
# Serious class imbalance- way more real than fake claims in the data
df['eval_encoded'].value_counts()


1    490
0     28
Name: eval_encoded, dtype: int64

In [170]:
# Train test split
x = df.drop('eval_encoded', axis = 1)
y = df['eval_encoded']

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, test_size = 0.3, random_state = 123)

# Define oversampling strategy- SMOTE
sm = SMOTE(random_state = 1)

# Fit and apply the over sampling to the training data
x_train_res, y_train_res = sm.fit_sample(x_train, y_train)


### Random Forest

In [173]:
# Random Forest
rf = RandomForestClassifier(random_state = 123, 
                            n_jobs = -1, 
                            class_weight = 'balanced', 
                            n_estimators = 1000)
    
rf.fit(x_train_res, y_train_res)

y_pred = rf.predict(x_test)

## Accuracy score
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
print('')


# ROC AUC 
print('AUC:', metrics.roc_auc_score(y_test, y_pred))
print('')


# Confusion matrix
print('Confusion Matrix:', metrics.confusion_matrix(y_test, y_pred))



Accuracy: 0.9358974358974359

AUC: 0.6706081081081081

Confusion Matrix: [[  3   5]
 [  5 143]]


### Random Forest Feature Importance

In [174]:
# Set features variable with the names of the features in X
features = x.columns

feature_importances = rf.feature_importances_

features_df = pd.DataFrame({'Features': features, 'Importance Score': feature_importances})

features_df.sort_values('Importance Score', inplace = True, ascending = False)

features_df

Unnamed: 0,Features,Importance Score
20,tweet_counts,0.192403
13,topic_14,0.127279
21,tweet_replies,0.121006
6,topic_7,0.081235
12,topic_13,0.066782
0,topic_1,0.062765
2,topic_3,0.056334
5,topic_6,0.055789
4,topic_5,0.047987
11,topic_12,0.038198
