In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import PorterStemmer

In [2]:
# Raise pandas' display limits so wide or long frames are rendered in full
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 10000)

In [3]:
# Load the posts table from the project's data directory
data = pd.read_csv(filepath_or_buffer='../data/posts.csv')

In [4]:
# EDA, step one: count the missing values in each column
data.isna().sum()

subreddit         0
selftext       8548
title             0
created_utc       0
score             0
dtype: int64

In [5]:
# Preview the class balance that would remain if rows with a missing body were
# dropped (nothing is modified here) - the classes stay reasonably balanced
has_body = data['selftext'].notna()
data.loc[has_body, 'subreddit'].value_counts(normalize=True)

latterdaysaints    0.564006
exmormon           0.435994
Name: subreddit, dtype: float64

In [6]:
# Discard every row whose post body is missing; `data` is modified in place
data.dropna(axis='index', subset=['selftext'], inplace=True)

In [7]:
# Peek at the first five rows that remain after removing posts with no body
data.head(5)

Unnamed: 0,subreddit,selftext,title,created_utc,score
0,latterdaysaints,There were a few people there that I never eve...,I went to the homecoming talk for someone from...,1580247795,3
1,latterdaysaints,&amp;#x200B;\n\n# How do I show my family them...,Father of four whos older kids falling away fr...,1580243384,4
2,latterdaysaints,I moved into a ward with my family about a yea...,How to care about a ward that doesn’t care abo...,1580242175,0
4,latterdaysaints,The 25th of January 2020 I was baptized in my ...,It turns out my Baptism was a historical momen...,1580240396,20
6,latterdaysaints,My friend whose not a part of the church menti...,I’ve been pondering this for awhile. What is t...,1580238666,1


In [8]:
# Preview the class balance that would remain if '[removed]' posts were dropped
# as well (nothing is modified here).
# The labels are passed through the `index=` keyword: handing the axis to
# DataFrame.drop as a second positional argument was deprecated in pandas 1.x
# and raises a TypeError from pandas 2.0 onwards.
removed_labels = data[data['selftext'] == '[removed]'].index
data.drop(index=removed_labels)['subreddit'].value_counts(normalize=True)

latterdaysaints    0.540942
exmormon           0.459058
Name: subreddit, dtype: float64

In [9]:
# Drop, in place, every post whose body is the '[removed]' placeholder.
# `index=` is used because pandas 2.0+ rejects a positional axis argument.
data.drop(index=data[data['selftext'] == '[removed]'].index, inplace=True)

In [10]:
# Also drop, in place, every post whose body is the '[deleted]' placeholder.
# `index=` is used because pandas 2.0+ rejects a positional axis argument.
data.drop(index=data[data['selftext'] == '[deleted]'].index, inplace=True)

In [11]:
# Row/column count after removing null, '[removed]' and '[deleted]' posts
# (just over 10k rows are left)
data.shape

(10232, 5)

In [12]:
# Sanity-check the column dtypes: the text columns should be object and
# created_utc / score should be int64
data.dtypes

subreddit      object
selftext       object
title          object
created_utc     int64
score           int64
dtype: object

In [13]:
# Convert "subreddit" into a dummy. With two classes and drop_first=True this
# leaves a single indicator column, subreddit_latterdaysaints.
# dtype is pinned to uint8 (the default before pandas 2.0) so the indicator
# stays 0/1 rather than becoming True/False on pandas >= 2.0.
data = pd.get_dummies(data, columns=['subreddit'], drop_first=True, dtype='uint8')

In [14]:
# Confirm that "subreddit" was replaced by the subreddit_latterdaysaints indicator
data.head(5)

Unnamed: 0,selftext,title,created_utc,score,subreddit_latterdaysaints
0,There were a few people there that I never eve...,I went to the homecoming talk for someone from...,1580247795,3,1
1,&amp;#x200B;\n\n# How do I show my family them...,Father of four whos older kids falling away fr...,1580243384,4,1
2,I moved into a ward with my family about a yea...,How to care about a ward that doesn’t care abo...,1580242175,0,1
4,The 25th of January 2020 I was baptized in my ...,It turns out my Baptism was a historical momen...,1580240396,20,1
6,My friend whose not a part of the church menti...,I’ve been pondering this for awhile. What is t...,1580238666,1,1


In [15]:
# Join each post's title and body, separated by one space, into a single
# text field that the vectorizer can consume
data['all_text'] = data['title'].str.cat(data['selftext'], sep=" ")

In [16]:
# Renumber the rows 0..n-1 in place. drop=False (the default) keeps the old
# row labels as a regular column instead of discarding them.
data.reset_index(drop=False, inplace=True)

In [17]:
# Strip the "x200B" remnant of HTML-escaped zero-width spaces ("&amp;#x200B;").
# The vectorised .str accessor replaces the Python-level loop; regex=False makes
# this a literal substring replacement, exactly like str.replace.
data['all_text'] = data['all_text'].str.replace("x200B", "", regex=False)

In [18]:
# Strip the "amp;" left behind by HTML-escaped ampersands ("&amp;").
# The vectorised .str accessor replaces the Python-level loop; regex=False makes
# this a literal substring replacement, exactly like str.replace.
data['all_text'] = data['all_text'].str.replace("amp;", "", regex=False)

In [19]:
# Remove ASCII apostrophes so a contraction stays a single token (e.g. "dont").
# The vectorised .str accessor replaces the Python-level loop; regex=False makes
# this a literal substring replacement, exactly like str.replace.
data['all_text'] = data['all_text'].str.replace("'", "", regex=False)

In [45]:
# Define features and target, restricted to a single subreddit.
# Change 1 to 0 in the line below to see the 100 top words in r/exmormon, or
# replace the mask with an all-True one to see the top words overall.
in_selected_subreddit = data['subreddit_latterdaysaints'] == 1

X = data.loc[in_selected_subreddit, 'all_text']
y = data.loc[in_selected_subreddit, 'subreddit_latterdaysaints']

In [46]:
# Split into train and test partitions, stratified on the target; the fixed
# random_state makes the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [47]:
# Spot-check the first five training documents
X_train.head(5)

3825    Why not women? Im just wondering if there is a...
4732    R Rated Movie Exceptions There was an interest...
3043    Parents, how do you get something out of the s...
3784    What I heard is that I need to do a better job...
3035    The Indians of South America The Indians of So...
Name: all_text, dtype: object

In [48]:
# Count the 100 most frequent tokens in the training posts, ignoring English
# stop words and stripping accents down to ASCII.
cvec = CountVectorizer(stop_words='english', max_features=100, strip_accents="ascii")
X_train_2 = cvec.fit_transform(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old accessor only on
# versions that predate the new one.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# One column per vocabulary term, one row per training document
X_train_df = pd.DataFrame(X_train_2.toarray(), columns=feature_names)

# Total occurrences of each term across the training set, most frequent first
X_train_df.sum().sort_values(ascending=False)

church        4089
im            3229
just          2812
like          2636
know          2339
dont          1992
people        1755
god           1725
time          1720
ive           1705
feel          1612
https         1398
really        1356
want          1290
think         1267
things        1229
day           1206
mormon        1198
temple        1172
book          1146
life          1144
love          1042
help          1028
lds           1002
family         991
www            975
good           971
christ         953
way            935
going          923
years          922
new            906
mission        891
did            889
come           885
said           882
faith          832
joseph         829
does           819
com            801
believe        795
jesus          792
make           788
ward           782
need           764
question       760
say            749
gospel         724
members        693
talk           690
conference     675
org            656
right       