# Do necessary imports and read in data

In [12]:
import numpy as np, pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [13]:
astronomy_posts = pd.read_csv("../data/astronomy_posts.csv")

In [14]:
space_posts = pd.read_csv("../data/space_posts.csv")

# Drop Unnecessary Features

In [15]:
astronomy_posts.columns # see which feature corresponds to the text of the post

Index(['Unnamed: 0', 'all_awardings', 'associated_award', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'body', 'collapsed_because_crowd_control', 'created_utc', 'gildings',
       'id', 'is_submitter', 'link_id', 'locked', 'no_follow', 'parent_id',
       'permalink', 'retrieved_on', 'score', 'send_replies', 'stickied',
       'subreddit', 'subreddit_id', 'total_awards_received', 'edited',
       'author_cakeday', 'steward_reports', 'distinguished'],
      dtype='object')

In [16]:
astronomy_posts = astronomy_posts[["body", "subreddit"]] # cut down the Astronomy DataFrame to just text and subreddit

In [17]:
astronomy_posts.head()

Unnamed: 0,body,subreddit
0,The majesty of the Cosmos knows no bounds.,Astronomy
1,That's awesome man. I live in Boston and you c...,Astronomy
2,I sure did! [This is a wider image](https://i....,Astronomy
3,I was almost expecting M104 - Hubble's image o...,Astronomy
4,Were you able to see any of the Milky Way with...,Astronomy


In [18]:
space_posts = space_posts[["body", "subreddit"]] # same thing for Space DataFrame

In [19]:
space_posts.head()

Unnamed: 0,body,subreddit
0,"they will speed to mach 6 only, not exit velocity",space
1,"Like, you'll be on the plane this telescope's ...",space
2,It's because of the incredible expense of gett...,space
3,So it's a bad thing they want to redo a test t...,space
4,[removed],space


# Check for removed/deleted posts and drop them

In [20]:
len(space_posts[(space_posts["body"] == "[deleted]") | (space_posts["body"] == "[removed]")])

2478

In [21]:
len(astronomy_posts[(astronomy_posts["body"] == "[deleted]") | (astronomy_posts["body"] == "[removed]")])

551

In [22]:
space_posts = space_posts[(space_posts["body"] != "[deleted]") & (space_posts["body"] != "[removed]")]

In [23]:
astronomy_posts = astronomy_posts[(astronomy_posts["body"] != "[deleted]") & (astronomy_posts["body"] != "[removed]")]

In [24]:
len(space_posts), len(astronomy_posts) # check the numbers to make sure it worked

(7522, 9449)

# Combine posts from both subreddits into a single DataFrame

In [25]:
all_posts = pd.concat([space_posts, astronomy_posts])

In [26]:
all_posts.head()

Unnamed: 0,body,subreddit
0,"they will speed to mach 6 only, not exit velocity",space
1,"Like, you'll be on the plane this telescope's ...",space
2,It's because of the incredible expense of gett...,space
3,So it's a bad thing they want to redo a test t...,space
5,What is wrong with the lunar gateway project? ...,space


In [27]:
space_posts.head()

Unnamed: 0,body,subreddit
0,"they will speed to mach 6 only, not exit velocity",space
1,"Like, you'll be on the plane this telescope's ...",space
2,It's because of the incredible expense of gett...,space
3,So it's a bad thing they want to redo a test t...,space
5,What is wrong with the lunar gateway project? ...,space


# Drop duplicate posts and map subreddit column to integer values

In [28]:
all_posts = all_posts.drop_duplicates(); all_posts.reset_index(inplace=True) # drop duplicate posts

In [29]:
all_posts["subreddit"] = all_posts["subreddit"].apply(lambda x: 0 if x == "space" else 1) # binarize subreddit names

In [30]:
all_posts.drop(columns="index", inplace=True) # drop extraneous index column

In [31]:
all_posts.head()

Unnamed: 0,body,subreddit
0,"they will speed to mach 6 only, not exit velocity",0
1,"Like, you'll be on the plane this telescope's ...",0
2,It's because of the incredible expense of gett...,0
3,So it's a bad thing they want to redo a test t...,0
4,What is wrong with the lunar gateway project? ...,0


# Export cleaned DataFrame to a CSV file

In [32]:
all_posts.to_csv("../data/all_posts.csv", index=False)

# Find a baseline score for classifiers

In [33]:
all_posts["subreddit"].value_counts(normalize=True)

1    0.556502
0    0.443498
Name: subreddit, dtype: float64

### The baseline accuracy that models must beat is 55.6%.

# Set up feature matrix and target vector for modeling

In [34]:
X = all_posts["body"]
y = all_posts["subreddit"]

# Convert all posts to lower case

In [35]:
X = X.apply(lambda x: x.lower())

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Fit and test a multinomial Bayes classifier

In [37]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("mnb", MultinomialNB())
])

In [38]:
pipe_params = {
    "tfidf__max_features": list(range(900, 1_000, 20)),
    "tfidf__max_df": [.35, .4, .45],
}
# best ngram range is (1,1)
# analyzer="word"/"char"/"char_wb" has no effect
# stop_words=None/"english" has almost no effect
# tuning over max_df but not min_df gives the best results

In [39]:
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params)

In [40]:
gs.fit(X_train, y_train);

In [41]:
gs.score(X_train, y_train)

0.7756907922197178

In [42]:
gs.score(X_test, y_test)

0.7666499749624437

In [43]:
gs.best_params_ # use these parameters for TFIDF vectorizer in the rest of the models and tune over model-specific parameters

{'tfidf__max_df': 0.4, 'tfidf__max_features': 980}

# Fit and test a logistic regression model

In [44]:
pipe2 = Pipeline([
    ("tfidf", TfidfVectorizer(max_df=0.4, max_features=980)),
    ("logreg", LogisticRegression())
])

In [45]:
pipe2_params = {
    "logreg__C": [1, 1e9],
    "logreg__max_iter": [5000]
}

In [46]:
gs2 = GridSearchCV(estimator=pipe2, param_grid=pipe2_params)

In [47]:
gs2.fit(X_train, y_train);

In [48]:
gs2.score(X_train, y_train)

0.7867100759662743

In [49]:
gs2.score(X_test, y_test)

0.757135703555333

In [50]:
gs2.best_params_ # use these

{'logreg__C': 1, 'logreg__max_iter': 5000}

# Fit and test a random forest classifier

In [51]:
pipe3 = Pipeline([
    ("tfidf", TfidfVectorizer(max_df=0.4, max_features=980)),
    ("rfc", RandomForestClassifier())
])

In [52]:
pipe3_params = {
    "rfc__n_estimators": [100, 150],
    "rfc__max_depth": [20, 30, 40]
}

In [53]:
gs3 = GridSearchCV(estimator=pipe3, param_grid=pipe3_params)

In [54]:
gs3.fit(X_train, y_train);

In [55]:
gs3.score(X_train, y_train)

0.9035812672176309

In [56]:
gs3.score(X_test, y_test)

0.7403605408112168

In [57]:
gs3.best_params_

{'rfc__max_depth': 40, 'rfc__n_estimators': 100}

### These are not necessarily the best possible parameters, but they are a reasonable compromise to avoid overfitting and/or spending hours running a GridSearch. Now we can fit a model with these parameters and use it to learn about the most important words.

In [58]:
rfc = RandomForestClassifier(n_estimators=150, max_depth=40)

In [59]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(X_train);
X_train_vec = tfidf.fit_transform(X_train)
rfc.fit(X_train_vec, y_train);

In [60]:
rfc.score(X_train_vec, y_train)

0.7934719091743885

In [61]:
X_test_vec = tfidf.transform(X_test)

In [62]:
rfc.score(X_test_vec, y_test)

0.7208312468703054

# Grab the most predictive words across both subreddits

In [67]:
# this method of associating feature importance to feature names comes from SCB on StackOverflow: https://stackoverflow.com/a/48896026

cols = tfidf.get_feature_names()
rows = rfc.feature_importances_
important_words = pd.DataFrame(zip(rows, cols)).sort_values(by=0, ascending=False)[:40] # make a DF of the 40 most predictive words
for word in important_words[1]:
    print(word)

mars
boeing
planet
telescope
greek
stars
thanks
nebula
nasa
rover
star
space
curiosity
sky
astronomy
light
beautiful
mount
scope
atmosphere
selfie
rocket
image
named
thank
god
congress
earth
night
orion
uranus
pluto
amazing
water
names
galaxy
humans
arm
andromeda
years


# Summarize findings and make technical recommendations

- Multinomial Bayes and logistic regression models had similar performance, but Bayes classifier was slightly faster to fit
- These two models weren't overfit so we can probably expect similar accuracy on unseen data
- Multinomial Bayes classifier is the best tradeoff between compute time and accuracy but doesn't provide the words we need for qualitative analysis
- The random forest classifier was slower and overfit but still did reasonably well on the test data, so we can pull some useful features (words) from it
- Future steps: try different types of models and techniques like tokenizing with regular expressions and custom stopword lists to try to improve accuracy
- Create more complex models and run them on AWS to get better scores without waiting days for results