In [111]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
#read in data
qcon_cas = pd.read_csv('./data/total_cleaned.csv')
qcon_cas.drop(columns= 'Unnamed: 0', inplace=True)

In [11]:
#sanity check
qcon_cas

Unnamed: 0,title,selftext,is_conspiracy,title_char_length,title_word_count
0,Russia using TASK FORCE to break into peoples ...,,1,93,16
1,Who do you think this guy is? Could it have an...,https://youtu.be/tWdgAMYjYSs\n\n anyone have a...,1,82,17
2,There is a reason why Holocaust survivors neve...,"So basically it’s ok to marginalize, blackball...",1,89,16
3,Sophia The Robot Has Saudi Arabian Citizenship!,,1,47,7
4,Infinity War – Gliding Through The Many Worlds,,1,46,8
...,...,...,...,...,...
1995,On the connection between financial instabilit...,[A majority of the people arrested for Capitol...,0,65,9
1996,My 3 y/o niece has nightmares about being cann...,So most of my in-laws are pretty deep into the...,0,54,9
1997,In Laws are brainwashed,"Hello,\n\n Over the last 4 years my wife’s pa...",0,23,4
1998,"Is this part of the QAnon culture? ""The New No...",[https://muse.ai/v/CRFPmJ1-The-New-Normal-Docu...,0,85,15


In [12]:
X= qcon_cas['title']
y= qcon_cas['is_conspiracy']

In [41]:
y.value_counts()

1    1000
0    1000
Name: is_conspiracy, dtype: int64

### Logistic Regression

In [127]:
#DSIR-Lancelot/5.04-lesson-nlp-ii
#instantiate Count Vectorizer and set up train test split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=420)


In [128]:
#DSIR-Lancelot/5.04-lesson-nlp-ii
cv = CountVectorizer(stop_words='english')
cv.fit(X_train)

# Transform the corpus.
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

In [129]:
X_train_cv.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [130]:
pd.DataFrame(X_train_cv.todense()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3842,3843,3844,3845,3846,3847,3848,3849,3850,3851
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [135]:
cv.get_feature_names()[1000:1015]

['directed',
 'directly',
 'directors',
 'disable',
 'disagree',
 'disappears',
 'disaster',
 'disasters',
 'discipline',
 'discontinued',
 'discount',
 'discover',
 'discuss',
 'discusses',
 'discussion']

In [136]:
X_train_df = pd.DataFrame(X_train_cv.todense(), columns=cv.get_feature_names())

In [140]:
X_train_df.shape

(1500, 3852)

In [141]:
X_train_df.sum().sort_values(ascending=False).head(10)

qanon         97
just          68
people        65
covid         62
conspiracy    56
lost          51
family        49
mom           46
trump         45
vaccine       40
dtype: int64

In [151]:
#instantiate Logistic Regressions, fit the transformed data and generate predictions
lgr = LogisticRegression()
lgr.fit(X_train_cv, y_train)
y_preds = lgr.predict(X_test_cv)
lgr.predict_proba(X_test_cv)[:5]

array([[0.57047088, 0.42952912],
       [0.95667039, 0.04332961],
       [0.47796763, 0.52203237],
       [0.01787447, 0.98212553],
       [0.50681263, 0.49318737]])

In [152]:
#Accuracy Scores
print(f'The training score is: {lgr.score(X_train_cv, y_train)}'),print(f'The testing score is: {lgr.score(X_test_cv, y_test)}');

The training score is: 0.986
The testing score is: 0.754


In [153]:
# Get the coefficients
lgr.coef_

array([[ 0.80031116,  0.06451176, -0.18416131, ...,  0.46012003,
         0.12192909, -0.24640408]])

In [164]:
# https://towardsdatascience.com/interpreting-coefficients-in-linear-and-logistic-regression-6ddf1295f6f1
# Get the coefficients + odds
lgr.coef_
odds = np.exp(lgr.coef_[0])

#create coefs table w/ vectorized features
lgr_coefs = pd.DataFrame(odds,  
             X_train_df.columns, 
             columns = ['coef'])

#sort values by coefficient
lgr_coefs.sort_values(by='coef', ascending=False).head(15)

Unnamed: 0,coef
world,4.466087
mask,3.65536
gates,3.387256
coming,3.372987
biden,3.327956
death,3.237388
yeah,3.178368
syria,3.080565
card,3.024374
banned,2.973434


https://towardsdatascience.com/interpreting-coefficients-in-linear-and-logistic-regression-6ddf1295f6f1

“For every one-unit increase in respective word, the odds that the observation are in r/Conspiracy are **the coefficient** times as large as the odds that the observation is not in (y class) when all other variables are held constant.”

### Random Forests

In [148]:
rfc_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5, 10]
}

gs = GridSearchCV(RandomForestClassifier(), param_grid=rfc_params, cv=3, verbose=1)
gs.fit(X_train_cv, y_train)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  63 out of  63 | elapsed:   34.8s finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 1, 2, 3, 4, 5, 10],
                         'n_estimators': [100, 150, 200]},
             verbose=1)

In [149]:
gs.best_params_

{'max_depth': 10, 'n_estimators': 200}

In [150]:
gs.best_score_

0.7046666666666667