Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, recall_score

Using TensorFlow backend.


Load in dataframes

In [2]:
# Load the scraped Reddit comments (one row per comment: text, author, subreddit, post id)
df = pd.read_csv('combined_scrape_2.csv')

In [3]:
# Row/column count of the raw comment table
df.shape

(3543710, 4)

In [4]:
# Preview the first few comments
df.head()

Unnamed: 0,Comment Text,Author,Subreddit,Post ID
0,"Just curious, but what triggered your feelings...",PDXorax,YangForPresidentHQ,ew5ohva
1,Christmas time put up a light-up Yang sign!,PDXorax,YangForPresidentHQ,ew5o8vv
2,"During the general election, get in contact wi...",PDXorax,YangForPresidentHQ,ew42bdx
3,"We need to understand as a country, that the m...",PDXorax,YangForPresidentHQ,evva7l8
4,"Part of the reason for this, is because we nee...",PDXorax,YangForPresidentHQ,evoo179


In [5]:
# Load the training split of labelled users (username in the unnamed first column, plus the label)
df_target_train = pd.read_csv('fin_users_train.csv')

In [6]:
# Number of labelled users in the training split
df_target_train.shape

(2487, 2)

In [7]:
# Load the test split of labelled users (same layout as the training file)
df_target_test = pd.read_csv('fin_users_test.csv')

In [8]:
# Number of labelled users in the test split
df_target_test.shape

(830, 2)

In [9]:
# Stack both splits so features are built for every labelled user in one pass;
# the train/test separation is restored later by joining on username.
df_target = pd.concat([df_target_test,df_target_train])

In [10]:
# Total labelled users across both splits
df_target.shape

(3317, 2)

Create subreddit features

In [11]:
# Usernames of every labelled user, taken from the CSVs' unnamed first column
users = list(df_target['Unnamed: 0'])

In [12]:
def clean_df(row):
    """Flag a comment row by whether its author is a labelled user.

    Returns the string '1' when ``row['Author']`` is found in the
    notebook-level ``users`` collection, and the string '0' otherwise.
    """
    return '1' if row['Author'] in users else '0'


In [13]:
# Flag each comment whose author is a labelled user. The vectorised isin test
# yields the same '1'/'0' string flags as applying clean_df to every row, but
# avoids a Python-level call and a list scan for each of the ~3.5M comments.
df['keep'] = df['Author'].isin(users).map({True: '1', False: '0'})

In [14]:
# Retain only the comments flagged '1', i.e. those written by labelled users
is_labelled_author = df['keep'] == '1'
df = df[is_labelled_author]

In [15]:
#Count comments per (author, subreddit) pair; the result is a Series
#indexed by the two grouping keys, Author first and Subreddit second
sum_series = df.groupby(['Author','Subreddit']).size()

In [16]:
#Subreddits to build features on, chosen from the spreadsheet analysis.
#Subreddits found not to be predictive in the first model run have already been removed.
list_of_subs = ['ENLIGHTENEDCENTRISM','Fuckthealtright','LateStageCapitalism','ABoringDystopia','nba',
                'CollegeBasketball','lakers','bostonceltics','sixers','Conservative','tuesday',
                'CryptoCurrency','Bitcoin','nfl','CFB','CHIBears','buildapcsales','Amd','Games',
                'gaming','conspiracy','neoliberal','stupidpol','Android','anime','askgaybros','atheism',
                'aznidentity','BasicIncome','BigBrother','books','cars','Christianity','Documentaries',
                'Economics','environment','Futurology','hiphopheads','hockey','investing','JoeRogan',
                'JordanPeterson','keto','liberalgunowners','Libertarian','marvelstudios','MMA','movies',
                'Music','rupaulsdragrace','samharris','StarWars','technology','teenagers','television',
                'teslamotors','thebachelor','TheLastAirbender','Tinder','trees','TwoXChromosomes',
                'cats','Cricket','Fitness','gratefuldead','HomeImprovement']

In [17]:
#Create a dictionary of user -> {subreddit: comment count}, restricted to the
#subreddits in list_of_subs. Users with no comments in any of those subreddits
#are left out entirely.
features = {}
subs_of_interest = set(list_of_subs)  # set gives O(1) membership checks
#Authors that actually have comments; level 0 of the index is 'Author'
authors_with_comments = set(sum_series.index.get_level_values(0))
for user in users:
    #A labelled user with no scraped comments has no entry in sum_series;
    #skip them instead of raising KeyError
    if user not in authors_with_comments:
        continue
    user_counts = sum_series[user]  #Series of counts indexed by subreddit
    feature_dict = {sub: count for sub, count in user_counts.items()
                    if sub in subs_of_interest}
    if feature_dict:
        features[user] = feature_dict


In [18]:
# The dict of dicts becomes one column per user; transpose so each row is a user
# and each column a subreddit. Subreddits a user never commented in come out as NaN.
df_feature = pd.DataFrame(features).T

In [19]:
# Preview the user x subreddit count matrix
df_feature.head()

Unnamed: 0,ABoringDystopia,Amd,Android,BasicIncome,BigBrother,Bitcoin,CFB,CHIBears,Christianity,CollegeBasketball,...,samharris,sixers,stupidpol,technology,teenagers,television,teslamotors,thebachelor,trees,tuesday
Boomslangalang,,,,,,,,,,,...,,,,3.0,,,,,,
bczeon27,,,,2.0,,2.0,,,,,...,,,,,,,,,,
madosooki,,,,,,,,,,,...,,,,,,,,,5.0,
krom0025,,,,,,,,,,,...,,,,10.0,,,,,8.0,
Its_not_him,,,,,,,,,,,...,,,,,3.0,,,,1.0,


In [20]:
# Users with at least one feature, by number of subreddit columns
df_feature.shape

(2804, 66)

In [21]:
#A NaN here means the user has no comments in that subreddit, so treat it as a count of 0
df_feature = df_feature.fillna(0)

In [22]:
#Combine related subs into a single feature: each combined column is the
#sum of its member subreddit columns, added left to right
combined_feature_members = [
    ('anti_alt_right_comb', ['ENLIGHTENEDCENTRISM', 'Fuckthealtright']),
    ('bad_capitalism_comb', ['LateStageCapitalism', 'ABoringDystopia']),
    ('basketball_comb', ['nba', 'CollegeBasketball', 'lakers', 'bostonceltics', 'sixers']),
    ('conservative_comb', ['Conservative', 'tuesday']),
    ('crypto_comb', ['CryptoCurrency', 'Bitcoin']),
    ('football_comb', ['nfl', 'CFB', 'CHIBears']),
    ('pc_comb', ['buildapcsales', 'Amd']),
    ('gaming_comb', ['gaming', 'Games']),
]
for combined_name, member_subs in combined_feature_members:
    running_total = df_feature[member_subs[0]]
    for member in member_subs[1:]:
        running_total = running_total + df_feature[member]
    df_feature[combined_name] = running_total


In [23]:
#remove subreddits that were used in a combination feature, since their
#counts are now carried by the *_comb columns
subs_folded_into_combos = [
    'ENLIGHTENEDCENTRISM', 'Fuckthealtright', 'LateStageCapitalism', 'ABoringDystopia',
    'nba', 'CollegeBasketball', 'sixers', 'lakers', 'bostonceltics', 'Conservative', 'tuesday',
    'CryptoCurrency', 'Bitcoin', 'nfl', 'CFB', 'CHIBears',
    'buildapcsales', 'Amd', 'Games', 'gaming',
]
df_feature = df_feature.drop(columns=subs_folded_into_combos)

In [24]:
# Turn raw counts into proportions: divide every row by its own total so each
# user's values sum to 1 across the retained features
row_totals = df_feature.sum(axis=1)
df_feature = df_feature.div(row_totals, axis=0)

In [25]:
# Inspect the normalised feature matrix
df_feature.head(20)

Unnamed: 0,Android,BasicIncome,BigBrother,Christianity,Cricket,Documentaries,Economics,Fitness,Futurology,HomeImprovement,...,thebachelor,trees,anti_alt_right_comb,bad_capitalism_comb,basketball_comb,conservative_comb,crypto_comb,football_comb,pc_comb,gaming_comb
Boomslangalang,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.03125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125
bczeon27,0.0,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,0.0,0.0,0.0
madosooki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.166667,0.0,...,0.0,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
krom0025,0.0,0.0,0.0,0.0,0.0,0.0,0.017467,0.0,0.0,0.0,...,0.0,0.034934,0.0,0.0,0.0,0.0,0.0,0.004367,0.0,0.004367
Its_not_him,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,...,0.0,0.030303,0.0,0.0,0.060606,0.0,0.0,0.030303,0.0,0.030303
staiano,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.722222,0.0,0.0,0.25,0.0,0.0
bintherematthat,0.0,0.0,0.0,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.050847,0.101695,0.0,0.779661,0.0,0.0
Ghee_Guys,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01087,0.005435,0.027174,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.798913,0.0,0.005435
Zernin,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seedster5,0.0,0.0,0.0,0.0,0.046512,0.0,0.015504,0.0,0.0,0.0,...,0.0,0.0,0.015504,0.0,0.341085,0.0,0.0,0.0,0.0,0.015504


In [26]:
# Null count per column (first five columns shown), to check none were introduced upstream
df_feature.isnull().sum().head()

Android         0
BasicIncome     0
BigBrother      0
Christianity    0
Cricket         0
dtype: int64

Split features dataframe into test and training ones

In [27]:
# Index the training labels by username (the unnamed first column) so they can be
# aligned with df_feature, whose index is also the username.
# NOTE(review): this rebinds df_target_train, so re-running the cell fails because
# 'Unnamed: 0' is no longer a column after the first run.
df_target_train = df_target_train.set_index('Unnamed: 0')

In [28]:
# Index the test labels by username, mirroring the training labels.
# NOTE(review): not re-runnable for the same reason: the column is consumed by set_index.
df_target_test = df_target_test.set_index('Unnamed: 0')

In [29]:
# Feature matrix size after combining and dropping subreddit columns
df_feature.shape

(2804, 54)

In [30]:
# Attach the label to each training user's features. The inner join on the index
# keeps only users present in both frames, so labelled users without features drop out.
df_train = pd.concat([df_feature,df_target_train],axis=1,join='inner')

In [31]:
# Same inner join for the test users
df_test = pd.concat([df_feature,df_target_test],axis=1,join='inner')

In [32]:
# Training rows remaining after the join; columns are the features plus the label
df_train.shape

(2110, 55)

In [33]:
# Test rows remaining after the join; columns are the features plus the label
df_test.shape

(694, 55)

In [34]:
#Export the final training dataframe (features + label, indexed by username) for use by additional models
df_train.to_csv('fin_train_df.csv')

In [35]:
#Export the final test dataframe (features + label, indexed by username) for use by additional models
df_test.to_csv('fin_test_df.csv')

Second Logistic Regression Classifier

In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
# Baseline classifier; every setting other than the solver is left at its default
lr = LogisticRegression(solver='lbfgs')

In [38]:
# Training feature matrix: every column except the label
X_train = df_train.drop(columns='target')

In [39]:
# Training labels
y_train = df_train['target']

In [40]:
# Test feature matrix: every column except the label
X_test = df_test.drop(columns='target')

In [41]:
# Test labels
y_test = df_test['target']

In [42]:
# Fit the baseline model on the original (imbalanced) training data
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
# Training accuracy of the baseline model
lr.score(X_train,y_train)

0.7767772511848341

In [44]:
# Class proportions in the training labels; the majority-class share is the
# accuracy a constant majority prediction would achieve
y_train.value_counts(normalize=True)

0    0.736967
1    0.263033
Name: target, dtype: float64

In [45]:
# Test accuracy of the baseline model
lr.score(X_test,y_test)

0.7694524495677233

In [46]:
# Class proportions in the test labels, for comparison with the test accuracy
y_test.value_counts(normalize=True)

0    0.740634
1    0.259366
Name: target, dtype: float64

In [47]:
# Share of each predicted class on the test set, to compare against the true class balance
pd.Series(lr.predict(X_test)).value_counts(normalize=True)

0    0.913545
1    0.086455
dtype: float64

In [48]:
# Confusion matrix on the test set (rows: true class, columns: predicted class)
confusion_matrix(y_test,lr.predict(X_test))

array([[494,  20],
       [140,  40]])

In [49]:
# Absolute class counts in the test labels; these match the row totals of the confusion matrix
y_test.value_counts()

0    514
1    180
Name: target, dtype: int64

In [50]:
#Calculate sensitivity (recall on the positive class) of the baseline model on the test set
recall_score(y_test,lr.predict(X_test))

0.2222222222222222

LR model using SMOTE

In [51]:
# SMOTE oversampler with a fixed seed so the resampled training set is reproducible
smt = SMOTE(random_state=42)

In [52]:
# Resample the training data only; the test set is left untouched so that
# evaluation still reflects the original class balance.
# fit_resample is the supported method name; fit_sample was a deprecated alias
# that later imbalanced-learn releases removed.
X_train_2, y_train_2 = smt.fit_resample(X_train, y_train)

In [53]:
# Second classifier with the same settings as the baseline, to be trained on the resampled data
lrsmt = LogisticRegression(solver='lbfgs')

In [54]:
# Fit on the SMOTE-resampled training data
lrsmt.fit(X_train_2,y_train_2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [55]:
# Accuracy on the resampled training data
lrsmt.score(X_train_2,y_train_2)

0.7289389067524116

In [56]:
# Accuracy on the untouched test set
lrsmt.score(X_test,y_test)

0.6714697406340058

In [57]:
# Share of each predicted class on the test set for the SMOTE-trained model
pd.Series(lrsmt.predict(X_test)).value_counts(normalize=True)

0    0.631124
1    0.368876
dtype: float64

In [58]:
# Confusion matrix on the test set (rows: true class, columns: predicted class)
confusion_matrix(y_test,lrsmt.predict(X_test))

array([[362, 152],
       [ 76, 104]])

In [59]:
# Absolute class counts in the test labels; these match the row totals of the confusion matrix
y_test.value_counts()

0    514
1    180
Name: target, dtype: int64

In [60]:
#Calculate sensitivity (recall on the positive class) of the SMOTE-trained model on the test set
recall_score(y_test,lrsmt.predict(X_test))

0.5777777777777777

In [61]:
# One row per feature, holding that feature's coefficient from the SMOTE-trained
# model in a single column labelled 0
results = pd.DataFrame(data=lrsmt.coef_.T, index=X_test.columns)

In [62]:
#Identify features with the greatest predictive power: keep coefficients whose
#magnitude exceeds 1 and list them from largest to smallest
has_large_coefficient = results[0].abs() > 1
results[has_large_coefficient].sort_values(by=[0], ascending=False)

Unnamed: 0,0
BasicIncome,4.816385
aznidentity,3.623301
samharris,2.628694
crypto_comb,2.338303
Libertarian,2.170814
Futurology,2.074155
JordanPeterson,1.924864
hiphopheads,1.875131
Android,1.680891
teenagers,1.605866


These results are both informative and interesting. The subreddits or topics of interest most strongly associated with Yang supporters are "BasicIncome" (Yang's central policy proposal is universal basic income), "aznidentity" (Yang is Asian-American, so it makes sense that other Asian-Americans would identify with him) and "samharris" (one of Yang's first moments of widespread exposure was on the Sam Harris podcast). The less expected, and thus more informative, results point to areas where current Yang supporters, especially those already interested in the topic, can find other potential Yang supporters. Some of these topics/subreddits are cryptocurrency, hip hop fans, and marijuana users (r/trees is a subreddit for marijuana enthusiasts). Note that r/trees does not appear in the table above, which lists only features whose coefficient magnitude exceeds 1.