In [134]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Import CountVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer

In [135]:
# Load the cleaned posts and drop the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv -- confirm).
qcas_con = pd.read_csv('./data/total_cleaned.csv').drop(columns='Unnamed: 0')

In [136]:
# Sanity check: display the loaded frame to eyeball the title, selftext and is_conspiracy columns.
qcas_con

Unnamed: 0,title,selftext,is_conspiracy
0,Does tictock listen to you?,I’m not no expert on conspiracy theories but I...,1
1,Dr Seuss.. Looks like everyone is running with...,Seems some sort of cryptic messaging .. I dont...,1
2,Why Catholics thrive in the CIA. - Catholic He...,https://saidit.net/s/Jesuits/comments/70a1/why...,1
3,Sumerian texts speak of ancient kings and a my...,,1
4,Sumerian texts ppeak of ancient kings and a my...,[deleted],1
...,...,...,...
3971,Former Q confesses,Perhaps this article posted here already but i...,0
3972,I’m just exhausted at this point.,I don’t know that my family would consider the...,0
3973,A bit of Psychology that might be helpful to u...,"""First proposed by Sigmund Freud, this theory ...",0
3974,I hate it so much but it’s also what it is,TW: abuse / rape / Q\n\nSooooooo\n\nMy dad / s...,0


In [137]:
# Features are the post titles only; the target is the binary is_conspiracy flag.
X, y = qcas_con['title'], qcas_con['is_conspiracy']

In [138]:
# Class-balance check for the target.
y.value_counts()

1    1988
0    1988
Name: is_conspiracy, dtype: int64

### Logistic Regression

In [139]:
# Hold out a test set (default split size), stratified on the target so both
# partitions keep the same class balance; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=420
)


In [140]:
# Baseline model: bag-of-words counts (terms must occur in at least 2 titles)
# feeding a default logistic regression. The pipeline fits the vectorizer and
# the classifier together on whatever data is passed to fit().
pipe = Pipeline(steps=[
    ('cv', CountVectorizer(min_df=2)),
    ('lr', LogisticRegression()),
])
pipe.fit(X_train, y_train)

# (train accuracy, test accuracy)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.943326626425218, 0.8018108651911469)

In [141]:
# Grid search over the CountVectorizer settings of the pipeline.
# DSIR-Lancelot/5.04-lesson-nlp-ii
pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression()),
])

# Hyperparameter grid (keys follow the '<step>__<param>' pipeline convention):
#   stop_words   - drop English stop words, or keep every token
#   max_features - keep only the N most frequent terms (None = no cap)
#   min_df       - a term must occur in at least 1, 2 or 3 documents
#   max_df       - ignore terms occurring in more than 90%, 95% or 100% of documents
#   ngram_range  - individual tokens only, or individual tokens plus 2-grams
pipe_params = dict(
    cv__stop_words=['english', None],
    cv__max_features=[900, 1100, 2500, None],
    cv__min_df=[1, 2, 3],
    cv__max_df=[.90, .95, 1.0],
    cv__ngram_range=[(1, 1), (1, 2)],
)

In [142]:
# 5-fold cross-validated search of pipe_params over the
# vectorizer + logistic-regression pipeline.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)

In [143]:
# Fit every grid candidate on the training split; the trailing ';' hides the estimator repr.
gs.fit(X_train, y_train);


In [144]:
# Vectorizer settings of the best cross-validated candidate.
gs.best_params_

{'cv__max_df': 0.9,
 'cv__max_features': None,
 'cv__min_df': 1,
 'cv__ngram_range': (1, 2),
 'cv__stop_words': 'english'}

In [145]:
# (train score, test score) of the grid search's best pipeline.
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.9922870556673373, 0.8048289738430584)

In [146]:
# Stand-alone vectorizer used to read off per-term logistic-regression coefficients.
# It starts from the grid-search winner (max_df, 2-grams, English stop words) but,
# for interpretability, requires min_df=2 and caps the vocabulary at 2000 terms.
cvec = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    max_features=2000,
)

In [147]:
# Learn the vocabulary from the training titles only.
cvec.fit(X_train)

CountVectorizer(max_df=0.9, max_features=2000, min_df=2, ngram_range=(1, 2),
                stop_words='english')

In [148]:
# Encode both splits with the vocabulary learned from the training titles.
X_train_cv, X_test_cv = (cvec.transform(split) for split in (X_train, X_test))

In [149]:
# Peek at the first five rows of the document-term matrix. Only the sliced rows
# are densified, rather than materialising the whole sparse matrix to show five rows.
pd.DataFrame(X_train_cv[:5].toarray()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [150]:
# Spot-check a slice of the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out(); call whichever one the installed version provides.
vocab_terms = (
    cvec.get_feature_names_out()
    if hasattr(cvec, 'get_feature_names_out')
    else cvec.get_feature_names()
)
list(vocab_terms[1000:1015])

['lives',
 'living',
 'll',
 'lmao',
 'local',
 'logo',
 'logs',
 'lol',
 'long',
 'long time',
 'longer',
 'look',
 'looking',
 'looks',
 'looks like']

In [151]:
# Dense document-term frame with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out(); call whichever one the installed version provides.
# toarray() yields a plain ndarray instead of the legacy np.matrix from todense().
X_train_df = pd.DataFrame(
    X_train_cv.toarray(),
    columns=(
        cvec.get_feature_names_out()
        if hasattr(cvec, 'get_feature_names_out')
        else cvec.get_feature_names()
    ),
)

In [152]:
# Ten most frequent vocabulary terms across the training titles (column sums, descending).
X_train_df.sum().sort_values(ascending=False).head(10)

qanon         249
conspiracy    133
just          132
people        121
covid         113
mom            97
trump          82
like           81
help           80
lost           79
dtype: int64

In [153]:
# Fit a default logistic regression on the count features, keep the hard test-set
# predictions, and show the class probabilities for the first five test titles.
lgr = LogisticRegression().fit(X_train_cv, y_train)
y_preds = lgr.predict(X_test_cv)
lgr.predict_proba(X_test_cv)[:5]

array([[8.17837330e-01, 1.82162670e-01],
       [9.72123362e-01, 2.78766383e-02],
       [3.17056703e-02, 9.68294330e-01],
       [5.89662551e-01, 4.10337449e-01],
       [9.99994137e-01, 5.86332165e-06]])

In [154]:
# Accuracy of the logistic regression on the training and test splits.
print(f'The training score is: {lgr.score(X_train_cv, y_train)}')
print(f'The testing score is: {lgr.score(X_test_cv, y_test)}')

The training score is: 0.9299128101945003
The testing score is: 0.7917505030181087


In [155]:
# Raw logistic-regression coefficients (log-odds scale), one per column of X_train_cv.
lgr.coef_

array([[ 0.80341061,  0.02220649,  0.04422409, ...,  0.63812937,
         0.30881324, -0.11239196]])

In [156]:
# https://towardsdatascience.com/interpreting-coefficients-in-linear-and-logistic-regression-6ddf1295f6f1
# Exponentiate the log-odds coefficients to obtain one odds ratio per term.
odds = np.exp(lgr.coef_[0])

# One row per vocabulary term, largest odds ratio first. Note that the
# 'Importance' column holds exp(coefficient), not the raw coefficient.
lgr_coefs = (
    pd.DataFrame({'Variable': X_train_df.columns, 'Importance': odds})
    .sort_values('Importance', ascending=False)
)

In [157]:
# Show terms ordered by odds ratio, largest first (lgr_coefs was already sorted this way when built).
lgr_coefs.sort_values(by='Importance', ascending=False)

Unnamed: 0,Variable,Importance
759,government,5.424091
1052,mask,4.945194
1729,texas,4.291027
6,11,4.174726
884,interesting,4.058942
...,...,...
1362,qmom,0.143785
1127,mother,0.141933
459,dad,0.117979
1116,mom,0.050232


https://towardsdatascience.com/interpreting-coefficients-in-linear-and-logistic-regression-6ddf1295f6f1

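“For every one-unit increase in the count of a given word, the odds that the observation is in r/Conspiracy are multiplied by **exp(coefficient)** (the odds ratio shown in the `Importance` column above), when all other variables are held constant.” A value above 1 therefore pushes a title towards r/Conspiracy, and a value below 1 pushes it towards the other class.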

### Random Forests

In [158]:
# Tree-ensemble estimators with (mostly) default hyperparameters.
# ExtraTreesClassifier must be imported from sklearn.ensemble alongside
# RandomForestClassifier, otherwise this cell fails on a fresh kernel.
# Both are seeded with the same value as the train/test split so repeated runs agree.
rfc = RandomForestClassifier(random_state=420)
etc = ExtraTreesClassifier(n_estimators=100, random_state=420)

In [159]:
# Tune the random forest on the count features over forest size and tree depth
# (max_depth=None places no depth limit on the trees).
rfc_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5, 10]
}

# NOTE: this rebinds `gs`, which until now held the logistic-regression pipeline search.
# The forest is seeded so the search, and the parameters it selects, are repeatable
# across runs; an unseeded forest gives slightly different scores every time.
gs = GridSearchCV(RandomForestClassifier(random_state=420), param_grid=rfc_params, cv=3, verbose=1)
gs.fit(X_train_cv, y_train)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  63 out of  63 | elapsed:   43.9s finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 1, 2, 3, 4, 5, 10],
                         'n_estimators': [100, 150, 200]},
             verbose=1)

In [164]:
# Forest settings of the best cross-validated candidate.
gs.best_params_

{'max_depth': None, 'n_estimators': 150}

In [165]:
# Mean cross-validated score of the best parameter combination.
gs.best_score_

0.7682763246143528

In [166]:
# (train score, test score) of the grid search's best forest.
gs.score(X_train_cv, y_train), gs.score(X_test_cv, y_test)

(0.9761904761904762, 0.7605633802816901)

In [167]:
# Refit the forest with the parameters selected by the grid search rather than values
# copied by hand from an earlier run, and seed it so the scores (and the feature
# importances read from it later) are reproducible.
rfc = RandomForestClassifier(**gs.best_params_, random_state=420)
rfc.fit(X_train_cv, y_train)

# (train accuracy, test accuracy)
rfc.score(X_train_cv, y_train), rfc.score(X_test_cv, y_test)

(0.9761904761904762, 0.7645875251509054)

In [168]:
# Pair each vocabulary term (column 0) with its forest importance (column 1).
# Building the frame column-wise keeps the importances numeric; stacking the two
# sequences as rows and transposing coerces both columns to object dtype.
rfc_importance = pd.DataFrame({0: X_train_df.columns, 1: rfc.feature_importances_})

In [169]:
# https://towardsdatascience.com/interpreting-random-forest-and-other-black-box-models-like-xgboost-80f9cc4a3c38

# Let pandas display up to 1000 rows before truncating.
pd.set_option('display.max_rows', 1000)

# Vocabulary terms ranked by the forest's feature importance, highest first.
rfc_importance = (
    pd.DataFrame({'Variable': X_train_df.columns, 'Importance': rfc.feature_importances_})
    .sort_values('Importance', ascending=False)
)

In [170]:
# Persist the forest importance table (row index omitted).
rfc_importance.to_csv('./data/Random_forest_classifier_variables.csv', index=False)

In [173]:
# Persist the logistic-regression odds-ratio table (row index omitted).
lgr_coefs.to_csv('./data/logistic_regression_variables.csv', index=False)

In [172]:
# First five rows of the forest importance table.
rfc_importance.head(5)

Unnamed: 0,Variable,Importance
1346,qanon,0.057916
1116,mom,0.024471
1020,lost,0.013568
644,family,0.012905
459,dad,0.010589


In [175]:
# First five rows of the logistic-regression odds-ratio table.
lgr_coefs.head(5)

Unnamed: 0,Variable,Importance
759,government,5.424091
1052,mask,4.945194
1729,texas,4.291027
6,11,4.174726
884,interesting,4.058942
