In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB 
from nltk.stem import PorterStemmer

In [3]:
# Set max columns and rows displayed
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

In [4]:
data = pd.read_csv('../data/posts.csv')

In [5]:
# Time for some EDA - checking for nulls
data.isnull().sum()

subreddit         0
selftext       8548
title             0
created_utc       0
score             0
dtype: int64

In [6]:
# Looks like dropping nulls won't unbalance the classes too badly
data.dropna(subset=['selftext'])['subreddit'].value_counts(normalize=True)

latterdaysaints    0.564006
exmormon           0.435994
Name: subreddit, dtype: float64

In [7]:
data.dropna(subset=['selftext'], inplace=True)

In [8]:
data.head()

Unnamed: 0,subreddit,selftext,title,created_utc,score
0,latterdaysaints,There were a few people there that I never eve...,I went to the homecoming talk for someone from...,1580247795,3
1,latterdaysaints,&amp;#x200B;\n\n# How do I show my family them...,Father of four whos older kids falling away fr...,1580243384,4
2,latterdaysaints,I moved into a ward with my family about a yea...,How to care about a ward that doesn’t care abo...,1580242175,0
4,latterdaysaints,The 25th of January 2020 I was baptized in my ...,It turns out my Baptism was a historical momen...,1580240396,20
6,latterdaysaints,My friend whose not a part of the church menti...,I’ve been pondering this for awhile. What is t...,1580238666,1


In [9]:
# Looks good to drop [removed] posts as well
data.drop(data[data['selftext'] == '[removed]'].index, 0)['subreddit'].value_counts(normalize=True)

latterdaysaints    0.540942
exmormon           0.459058
Name: subreddit, dtype: float64

In [10]:
data.drop(data[data['selftext'] == '[removed]'].index, 0, inplace=True)

In [11]:
# also dropping [deleted] posts
data.drop(data[data['selftext'] == '[deleted]'].index, 0, inplace=True)

In [12]:
# have just over 10k rows left now
data.shape

(10232, 5)

In [13]:
# all as they should be
data.dtypes

subreddit      object
selftext       object
title          object
created_utc     int64
score           int64
dtype: object

In [14]:
# converting "subreddit" column to a dummy
data = pd.get_dummies(data, columns=['subreddit'], drop_first=True)

In [15]:
# Looking good
data.head()

Unnamed: 0,selftext,title,created_utc,score,subreddit_latterdaysaints
0,There were a few people there that I never eve...,I went to the homecoming talk for someone from...,1580247795,3,1
1,&amp;#x200B;\n\n# How do I show my family them...,Father of four whos older kids falling away fr...,1580243384,4,1
2,I moved into a ward with my family about a yea...,How to care about a ward that doesn’t care abo...,1580242175,0,1
4,The 25th of January 2020 I was baptized in my ...,It turns out my Baptism was a historical momen...,1580240396,20,1
6,My friend whose not a part of the church menti...,I’ve been pondering this for awhile. What is t...,1580238666,1,1


In [16]:
# Combining title and content of posts into a single block of text
data['all_text'] = data['title'] + " " + data['selftext']

In [17]:
# reset index to clean up after all those drops
data.reset_index(inplace=True)

In [18]:
# removing formatting strings
data['all_text'] = [text.replace("x200B", "") for text in data['all_text']]

In [19]:
# removing text which appears after &
data['all_text'] = [text.replace("amp;", "") for text in data['all_text']]

In [20]:
# removing single quote/apostrophes so contractions and possessives tokenize correctly
data['all_text'] = [text.replace("'", "") for text in data['all_text']]

In [21]:
# Define features and target
#features = ['all_text']
X = data['all_text']  #[data['subreddit_latterdaysaints'] == 1]
y = data['subreddit_latterdaysaints']  #[data['subreddit_latterdaysaints'] == 1]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

In [23]:
# Creating first pipeline - cvec with knn
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])
pipe_params = {
    'knn__n_neighbors': [3, 5, 7],
    'cvec__ngram_range': [(1,1)],
    'cvec__max_features': [1000, 2000, 5000],
    'cvec__stop_words': ['english', None]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5)

In [24]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [25]:
gs_model = gs.best_estimator_

In [26]:
# well, that could definitely be better
gs_model.score(X_test, y_test)

0.627443315089914

In [27]:
# it took the largest max features I gave it, so next time I'll included none as an option
gs.best_params_

{'cvec__max_features': 5000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'knn__n_neighbors': 3}

In [28]:
# creating second pipeline, cvec with nbm
pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words='english')),
    ('nbm', MultinomialNB())
])
pipe_params = {
    'nbm__alpha': [0.1, 0.5, 1, 5],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__max_features': [1000, 2000, 5000, None],
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5)

In [29]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [30]:
gs_model = gs.best_estimator_

In [31]:
# Definitely overfit
gs_model.score(X_train, y_train)

0.997393797237425

In [32]:
# But all in all, pretty good! 
gs_model.score(X_test, y_test)

0.8537920250195465

In [33]:
gs.best_params_

{'cvec__max_features': None, 'cvec__ngram_range': (1, 2), 'nbm__alpha': 0.5}

In [34]:
# lets see if we can look at the predictiveness of different features
feature_list = gs_model.named_steps['cvec'].get_feature_names()
coefs = gs_model.named_steps['nbm'].coef_

In [35]:
coefs = coefs.tolist()[0]

In [36]:
thingy = zip(feature_list, coefs)

In [37]:
thingy = dict(thingy)

In [39]:
# I'm not sure how to sort a dictionary on values and am running out of time, so I'm just going to spot
# check some terms for now - not all words checked are present, some have been checked and then overwritten
thingy['church']

-5.379770716427862

In [40]:
thingy['shit']

-13.300459352322259

In [41]:
thingy['mormon']

-6.688866446557835

In [42]:
thingy['coffee']

-8.857808095831942

In [43]:
thingy['left']

-7.860931817222697

In [44]:
thingy['fucking']

-14.399071640990368

In [45]:
thingy['tscc']

-13.300459352322259

In [46]:
thingy['cult']

-10.843723579500953

In [47]:
thingy['testimony']

-7.627136085150766

In [48]:
thingy['blessing']

-8.375624048029334

In [49]:
thingy['gospel']

-7.125979044990846

In [50]:
thingy['spirit']

-7.277011759361225

In [51]:
thingy['gt']

-7.47642774951448

In [52]:
thingy['ces']

-10.391738455757896

In [61]:
thingy['tapir']

-14.399071640990368

In [62]:
thingy['true']

-7.3969156865867465

In [63]:
# Looks like all the coefficients are negative, and some are much stronger than others. I would love to
# spend more time on this and bring in sentiment and score, as well as look at the strength of individual
# features in greater detail, but for now I'm pleased to conclude that cvec and nbm were able to predict
# subreddit reasonably well even with the strong shared vocabulary seen in the CommonWords notebook. 