Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,recall_score
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


Load in dataframes

In [2]:
# Load the training users; per the preview below, each row carries an identifier column
# (read in as 'Unnamed: 0' — presumably the username) and a `target` label.
df_train = pd.read_csv('fin_users_train.csv')

In [3]:
# Preview the first rows to check which columns were read in.
df_train.head()

Unnamed: 0.1,Unnamed: 0,target
0,FjolnirFimbulvetr,0
1,Kazehaya,0
2,ClawedGiroux,0
3,MrKixs,0
4,icancubutucantcme,0


In [4]:
# Index by the 'Unnamed: 0' column so this frame can later be aligned with the text frame by index.
df_train = df_train.set_index('Unnamed: 0')

In [5]:
# Load the held-out test users (same layout as the training file, per the preview below).
df_test = pd.read_csv('fin_users_test.csv')

In [6]:
# Preview the first rows to check which columns were read in.
df_test.head()

Unnamed: 0.1,Unnamed: 0,target
0,jcwinny,0
1,lazigrdnr,0
2,Boomslangalang,0
3,bczeon27,1
4,madosooki,1


In [7]:
# Index by the 'Unnamed: 0' column so this frame can later be aligned with the text frame by index.
df_test = df_test.set_index('Unnamed: 0')

In [8]:
# Confirm the re-indexed frame: only the `target` column should remain.
df_train.head()

Unnamed: 0_level_0,target
Unnamed: 0,Unnamed: 1_level_1
FjolnirFimbulvetr,0
Kazehaya,0
ClawedGiroux,0
MrKixs,0
icancubutucantcme,0


In [9]:
# Load the per-user text data; per the preview below it has `User`, the raw `Text`
# and an already-normalized `Clean` column.
df_nlp = pd.read_csv('fin_cleaned_nlp.csv')

In [10]:
# Preview the first rows to check which columns were read in.
df_nlp.head()

Unnamed: 0.1,Unnamed: 0,User,Text,Clean
0,0,PDXorax,"The Lump of Labor doesn't apply in this case,...",the lump of labor doesn apply in this case we ...
1,1,PrincePizza1,"I think things ""getting better"" to people lik...",think things getting better to people like thi...
2,2,Marmar79,Thank you. This made my day He so edgy.\n\nEd...,thank you this made my day he so edgy edit don...
3,3,IRISHE3,I just wish politicians would put the money s...,just wish politicians would put the money shit...
4,4,RonZiggy,Climate change as well. Wow...just wow What i...,climate change as well wow just wow what is th...


In [11]:
# Index the text frame by user so it can be joined to the user/label frames by index.
df_nlp = df_nlp.set_index('User')

In [12]:
# Count missing values per column before any text processing.
df_nlp.isnull().sum()

Unnamed: 0    0
Text          0
Clean         3
dtype: int64

In [13]:
# Drop every row that contains a missing value (the null count above shows 3, all in `Clean`).
df_nlp.dropna(axis=0,inplace=True)

Remove stopwords and lemmatize the NLP dataframe

In [14]:
# Split on runs of whitespace; gaps=True means the pattern describes the separators, not the tokens.
# Raw string: '\s' inside a plain literal is an invalid escape sequence and triggers a
# DeprecationWarning/SyntaxWarning on current Python versions. The pattern itself is unchanged.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
lemmatizer = WordNetLemmatizer()
# Kept as a set so that membership checks are fast and entries can be added/removed below.
stops = set(stopwords.words('english'))

In [15]:
#Candidate names are treated as stop words: they would greatly skew the results and do not
## help us identify the topics the users talk about
stops |= {
    'yang', 'andrew',
    'pete', 'buttigieg', 'mayor',
    'sanders', 'bernie', 'bern',
    'biden',
    'kamala', 'harris',
    'warren', 'elizabeth',
    'tulsi', 'gabbard',
}

In [16]:
#"don" can be a nickname for Donald and has relevance for this project, so keep it in the text.
#discard() instead of remove(): re-running this cell must not raise KeyError once 'don' is gone.
stops.discard('don')

In [17]:
def stop_and_lemma(text):
    """Tokenize ``text``, drop stop words, lemmatize the rest and re-join with spaces.

    Uses the notebook-level ``tokenizer``, ``stops`` and ``lemmatizer`` objects.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    str
        The lemmatized tokens that are not in ``stops``, separated by single spaces.
    """
    kept_words = (word for word in tokenizer.tokenize(text) if word not in stops)
    return " ".join(lemmatizer.lemmatize(word) for word in kept_words)

In [18]:
# Build the text column used for modelling: `Clean` with stop words removed and tokens lemmatized.
df_nlp['Use'] = df_nlp['Clean'].map(stop_and_lemma)

In [19]:
# Keep only the processed `Use` column; the raw and intermediate columns are no longer needed.
df_nlp.drop(columns=['Unnamed: 0', 'Text', 'Clean'], inplace=True)

Create train and test dataframes

In [20]:
#Using inner joins here to drop users who did not have a post in r/politics

In [21]:
# Attach each user's processed text; the inner join keeps only users present in both frames.
df_train = pd.concat([df_train, df_nlp], axis=1, join='inner')
# Rename `target` to `predicted_target`, leaving it as the last column.
df_train['predicted_target'] = df_train.pop('target')

In [22]:
# Attach each user's processed text; the inner join keeps only users present in both frames.
df_test = pd.concat([df_test, df_nlp], axis=1, join='inner')
# Rename `target` to `predicted_target`, leaving it as the last column.
df_test['predicted_target'] = df_test.pop('target')

Applying TF-IDF vectorizer, Logistic Regression classifier and hyperparameter tuning

In [23]:
#Due to the multiple operations that need to be run on these dataframes between the vectorizing and
## the logistic regression, I opted to use a for loop to tune my hyperparameters as opposed to a grid search
#NOTE(review): the configurations are compared on the test-set score printed below, so the test
## score of whichever configuration gets picked is optimistically biased; a validation split or
## cross-validation on the training data would avoid that.
for n_grams in [(1,1),(1,2),(1,3)]:
    for features in [100,500,1000]:
        #initialize TF-IDF vectorizer
        tf = TfidfVectorizer(ngram_range = n_grams,max_features=features)

        #Learn the vocabulary on the training text only, then apply it to the test text
        sparseXtrain = tf.fit_transform(df_train['Use'])
        sparseXtest = tf.transform(df_test['Use'])

        #get_feature_names() no longer exists in recent scikit-learn releases (replaced by
        ## get_feature_names_out()); use the new name when available, else fall back to the old one
        if hasattr(tf,'get_feature_names_out'):
            feature_names = tf.get_feature_names_out()
        else:
            feature_names = tf.get_feature_names()

        #Turn the sparse matrices into dataframes and combine them with the label column
        sparseXtrain_df= pd.DataFrame(sparseXtrain.toarray(),
               columns=feature_names,index=df_train.index)
        df_train2 = pd.concat([df_train.drop('Use',axis=1),sparseXtrain_df],axis=1,join='inner')

        sparseXtest_df= pd.DataFrame(sparseXtest.toarray(),
               columns=feature_names,index=df_test.index)
        df_test2 = pd.concat([df_test.drop('Use',axis=1),sparseXtest_df],axis=1,join='inner')

        #Create X_train/y_train and X_test/y_test
        X_train = df_train2.drop('predicted_target',axis=1)
        X_test = df_test2.drop('predicted_target',axis=1)
        y_train = df_train2['predicted_target']
        y_test = df_test2['predicted_target']

        #Run logistic regression and report train accuracy, then test accuracy
        lr = LogisticRegression(solver='lbfgs')
        lr.fit(X_train,y_train)
        print('Scores For: ',n_grams,features)
        print(lr.score(X_train,y_train))
        print(lr.score(X_test,y_test))

Scores For:  (1, 1) 100
0.797289972899729
0.7850162866449512
Scores For:  (1, 1) 500
0.8509485094850948
0.8127035830618893
Scores For:  (1, 1) 1000
0.8612466124661247
0.8175895765472313
Scores For:  (1, 2) 100
0.797289972899729
0.7850162866449512
Scores For:  (1, 2) 500
0.8504065040650407
0.8127035830618893
Scores For:  (1, 2) 1000
0.8596205962059621
0.8208469055374593
Scores For:  (1, 3) 100
0.797289972899729
0.7850162866449512
Scores For:  (1, 3) 500
0.8504065040650407
0.8127035830618893
Scores For:  (1, 3) 1000
0.8596205962059621
0.8208469055374593


In [24]:
#count vectorizer model generally doesn't converge, scores look worse than TF-IDF so not using

# for n_grams in [(1,1),(1,2)]:
#     for features in [100,500]:
#         #initialize Count Vectorizer
#         cv = CountVectorizer(ngram_range = n_grams,max_features=features)
#         #Generate sparse matrixes and combine with previous features
#         sparseXtrain = cv.fit_transform(df_train['Use'])
#         sparseXtrain_df= pd.DataFrame(sparseXtrain.toarray(),
#                columns=cv.get_feature_names(),index=df_train.index)
#         df_train2 = pd.concat([df_train.drop('Use',axis=1),sparseXtrain_df],axis=1,join='inner')

#         sparseXtest = cv.transform(df_test['Use'])
#         sparseXtest_df= pd.DataFrame(sparseXtest.toarray(),
#                columns=cv.get_feature_names(),index=df_test.index)
#         df_test2 = pd.concat([df_test.drop('Use',axis=1),sparseXtest_df],axis=1,join='inner')

#         #Create X_Train/y_train
#         X_train = df_train2.drop('predicted_target',axis=1)
#         X_test = df_test2.drop('predicted_target',axis=1)
#         y_train = df_train2['predicted_target']
#         y_test = df_test2['predicted_target']

#         #Run logisitic Regression
#         lr = LogisticRegression(solver='lbfgs')
#         lr.fit(X_train,y_train)
#         print(lr.score(X_test,y_test),"variables: ",n_grams,features)

In [25]:
#Using the TF-IDF vectorizer that had the best score and ran in the shortest time
tf = TfidfVectorizer(ngram_range = (1,2),max_features=1000)

#Learn the vocabulary on the training text only, then apply it to the test text
sparseXtrain = tf.fit_transform(df_train['Use'])
sparseXtest = tf.transform(df_test['Use'])

#get_feature_names() no longer exists in recent scikit-learn releases (replaced by
## get_feature_names_out()); use the new name when available, else fall back to the old one
if hasattr(tf,'get_feature_names_out'):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()

#Turn the sparse matrices into dataframes and combine them with the label column
sparseXtrain_df= pd.DataFrame(sparseXtrain.toarray(),
       columns=feature_names,index=df_train.index)
df_train2 = pd.concat([df_train.drop('Use',axis=1),sparseXtrain_df],axis=1,join='inner')

sparseXtest_df= pd.DataFrame(sparseXtest.toarray(),
       columns=feature_names,index=df_test.index)
df_test2 = pd.concat([df_test.drop('Use',axis=1),sparseXtest_df],axis=1,join='inner')

#Create X_train/y_train and X_test/y_test
X_train = df_train2.drop('predicted_target',axis=1)
X_test = df_test2.drop('predicted_target',axis=1)
y_train = df_train2['predicted_target']
y_test = df_test2['predicted_target']

#Run logistic regression and report train accuracy, then test accuracy
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train,y_train)
print('Scores For: (1,2) & 1000 features')
print(lr.score(X_train,y_train))
print(lr.score(X_test,y_test))

Scores For: (1,2) & 1000 features
0.8596205962059621
0.8208469055374593


In [26]:
# Rows are the true classes and columns the predicted classes (the row sums match the
# class counts of y_test shown below).
confusion_matrix(y_test,lr.predict(X_test))

array([[456,  10],
       [100,  48]])

In [27]:
# Class counts in the test labels, for comparison with the confusion matrix.
y_test.value_counts()

0    466
1    148
Name: predicted_target, dtype: int64

In [28]:
# Same counts as proportions; the majority-class share is the accuracy of always predicting 0.
y_test.value_counts(normalize=True)

0    0.758958
1    0.241042
Name: predicted_target, dtype: float64

In [29]:
# Recall for the positive class (label 1): the share of actual positives the model identifies.
recall_score(y_test,lr.predict(X_test))

0.32432432432432434

In [30]:
#Unbalanced classes are causing us to underpredict the positive class, so let's use an oversampling technique

Apply Oversampling of Minority Class

In [31]:
# Oversampler for the minority class; the fixed random_state keeps the resampling reproducible.
smt = SMOTE(random_state=42)

In [32]:
# Resample the training data only; the test set is left untouched for evaluation.
# fit_resample() is the supported name: fit_sample() was a deprecated alias that has been
# removed from later imbalanced-learn releases.
X_train_bal,y_train_bal = smt.fit_resample(X_train,y_train)

In [33]:
# Separate classifier (same settings as `lr`) to be trained on the oversampled data.
lrsmt = LogisticRegression(solver='lbfgs')

In [34]:
# Fit on the balanced (oversampled) training data.
lrsmt.fit(X_train_bal,y_train_bal)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
# Accuracy on the oversampled training data.
lrsmt.score(X_train_bal,y_train_bal)

0.8600572655690766

In [36]:
# Accuracy on the original (not resampled) test set.
lrsmt.score(X_test,y_test)

0.737785016286645

In [37]:
# Confusion matrix for the oversampled model: rows are true classes, columns predicted classes.
confusion_matrix(y_test,lrsmt.predict(X_test))

array([[347, 119],
       [ 42, 106]])

In [38]:
# Recall for the positive class (label 1) after oversampling, to compare with the earlier model.
recall_score(y_test,lrsmt.predict(X_test))

0.7162162162162162

In [39]:
#Resulting dataframe showing words with the most predictive power:
## one row per feature, sorted so the largest coefficients come first
coef_by_feature = pd.DataFrame(lrsmt.coef_.T, index=X_train.columns)
coef_by_feature.sort_values(by=0, ascending=False)

Unnamed: 0,0
ubi,6.715239
welfare,2.733742
debate,2.643199
solution,2.155616
freedom,2.020890
job,1.861347
program,1.669680
business,1.608740
could,1.480669
polling,1.372441


These results aren't that surprising. The strongest word associated with Yang supporters is "ubi", which stands for universal basic income and is the central topic of Andrew Yang's platform. Other highly associated words are "welfare" (perhaps indicating Yang supporters discussing how ubi would interact with current welfare programs or how ubi isn't the same as welfare), "debate" (a big topic among Yang supporters over the last couple of months has been how he did in the July debates and whether he'll qualify for the Sept/Oct debates), and "freedom" (Andrew's ubi proposal is known as the "freedom dividend"). Words that are not associated with Yang supporters are a bit more interesting and harder to explain. The word that's least associated with Yang supporters is "f&ast;&ast;&ast;". One guess is that "f&ast;&ast;&ast;" is often used in conjunction with Trump. Many supporters of Yang are known to be more moderate or even former Trump supporters, so one would expect Yang supporters to be critical of the president and his politics but less outright hostile and less willing to drop blanket anti-Trump phrases such as "f&ast;&ast;&ast; Trump" or "f&ast;&ast;&ast; republicans". Another interesting word here is "corporate", perhaps indicating that Yang supporters are less likely to complain about corporations and corporate influence in politics than non-Yang supporters.