In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Let's read in our scraped data.  There should be just two columns: the 'post' title text and the 'is_conspiracy' binary classification.

In [2]:
# Location of the scraped post titles and their subreddit labels.
RAW_CSV = "../data/raw.csv"

# Load the data and preview the first few rows.
df = pd.read_csv(RAW_CSV)
df.head()

Unnamed: 0,post,is_conspiracy
0,Warrants issued for arrest of Breonna Taylor’s...,0
1,Hurricane Laura: storm to bring 'unsurvivable ...,0
2,Jacob Blake's family attorney says he did not ...,0
3,NBA postpones all games Wednesday due to boyco...,0
4,Facebook warns that iPhone software changes wi...,0


In [5]:
# Summarize column names, dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2976 entries, 0 to 2975
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   post           2976 non-null   object
 1   is_conspiracy  2976 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 46.6+ KB


Let's get our baseline score.

In [3]:
# Class proportions: always predicting the larger class gives the baseline accuracy.
df['is_conspiracy'].value_counts(normalize=True)

1    0.513777
0    0.486223
Name: is_conspiracy, dtype: float64

The baseline score is 0.51.  Our goal is to improve on this as much as possible while optimizing for sensitivity.

Let's see how much better a Logistic Regression trained on count-vectorized features performs.

In [4]:
# Features are the raw post titles; the target is the binary subreddit label.
X, y = df['post'], df['is_conspiracy']

# Stratified hold-out split (default test size) with a fixed seed so the
# reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [5]:
# Raw term counts (min_df=3 drops terms seen in fewer than 3 training titles)
# feeding a logistic regression with default settings.
vect_pipe = Pipeline(steps=[
    ('vect', CountVectorizer(max_features=None, min_df=3)),
    ('logreg', LogisticRegression()),
]).fit(X_train, y_train)

# Accuracy on the held-out test titles.
vect_pipe.score(X_test, y_test)

0.8803763440860215

Logistic Regression scored 0.88 accuracy.  This is good. It gives us a sense that r/news and r/conspiracy can be differentiated with a machine learning classifier.

In order to build a vocabulary for each class, let's compile all of the post titles into one string value for each class.

In [38]:
# Concatenate every post title of a class into one document so that each class
# can be vectorized as a single "mega" string. A boolean mask plus str.join
# replaces the row-by-row .loc lookups and repeated string concatenation.
is_consp = df["is_conspiracy"] == 1
consp_mega = " ".join(df.loc[is_consp, "post"])
news_mega = " ".join(df.loc[~is_consp, "post"])

# Row order matters: get_vector_df labels its columns ["conspiracy", "news"]
# positionally, so the conspiracy document must come first.
df_megas = pd.DataFrame({"mega_string": [consp_mega, news_mega],
                         "is_conspiracy": [1, 0]})

# NOTE: this rebinds X_train / y_train (previously the train split of the
# individual titles) to the two mega strings and their labels.
X_train = df_megas["mega_string"]
y_train = df_megas["is_conspiracy"]

The function 'get_vector_df' generates a dataframe displaying the word count per class, plus a final column giving the net difference in word count between the classes (conspiracy minus news).

In [38]:
def get_vector_df(X_train, ngram = 1):
    """Count n-grams per class and rank them by how class-specific they are.

    Parameters
    ----------
    X_train : iterable of str
        Exactly two documents, ordered conspiracy first and news second
        (the column labels are assigned positionally).
    ngram : int, default 1
        n-gram size; only n-grams of exactly this length are counted.
        English stop words are removed.

    Returns
    -------
    pandas.DataFrame
        One row per n-gram with columns 'conspiracy', 'news' and 'net_count'
        (conspiracy count minus news count), sorted by 'net_count' ascending.
    """
    cv = CountVectorizer(ngram_range=(ngram, ngram), stop_words='english')
    counts = cv.fit_transform(X_train).toarray()

    # get_feature_names() is removed in newer scikit-learn releases; use its
    # replacement when the installed version provides it.
    if hasattr(cv, "get_feature_names_out"):
        vocab = cv.get_feature_names_out()
    else:
        vocab = cv.get_feature_names()

    # Transpose so that each n-gram is a row and each document a column.
    cv_df = pd.DataFrame(counts, columns=vocab).T
    cv_df.columns = ["conspiracy", "news"]

    # Previously read from `cvect_df`, a name never defined in this function,
    # which raises NameError on a fresh kernel.
    cv_df['net_count'] = cv_df['conspiracy'] - cv_df['news']

    return cv_df.sort_values(by="net_count", ascending=True)

It makes sense that these words are more associated with conspiracy.  They will help us a lot in differentiating classes.

In [41]:
# Ten unigrams whose counts lean most heavily toward the conspiracy class.
(
    get_vector_df(X_train, ngram=1)
    .sort_values(by="net_count", ascending=False)
    .head(10)
)

Unnamed: 0,conspiracy,news,net_count
trump,154,19,135
just,89,8,81
conspiracy,80,4,76
biden,57,0,57
people,84,27,57
like,57,6,51
think,50,0,50
qanon,50,8,42
media,45,4,41
know,40,3,37


In [44]:
# Unigrams at the bottom of the descending ranking, i.e. leaning most toward news.
(
    get_vector_df(X_train, ngram=1)
    .sort_values(by="net_count", ascending=False)
    .tail(11)
)

Unnamed: 0,conspiracy,news,net_count
postal,3,32,-29
mail,5,36,-31
19,41,72,-31
students,0,33,-33
arrested,5,39,-34
portland,9,47,-38
california,6,63,-57
coronavirus,28,92,-64
says,22,88,-66
man,27,99,-72


Before we move on to modeling, let's check how our model scores using a TfidfVectorizer compared to a CountVectorizer.

In [36]:
# X_train / y_train were rebound above to the two per-class mega strings, and
# two documents cannot satisfy min_df=3. Rebuild the same stratified, seeded
# split of the individual titles so this cell is a like-for-like comparison
# with the CountVectorizer pipeline and survives a top-to-bottom run.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

# Same model as before, with TF-IDF weights instead of raw counts.
tfidf_pipe = Pipeline([('tfidf', TfidfVectorizer(max_features=None,min_df=3)),
                 ('logreg', LogisticRegression())])

tfidf_pipe.fit(X_train, y_train)
tfidf_pipe.score(X_test, y_test)

0.8629032258064516

Slightly worse: 0.86 accuracy with TF-IDF features versus 0.88 with raw counts.