# **Preprocessing and Modeling**

#### *Imports and Read in Data*

In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, f1_score, precision_score 
from sklearn.preprocessing import StandardScaler

from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [177]:
reddit = pd.read_csv('../data/reddit_cleaned.csv')
reddit.head(2)

Unnamed: 0,subreddit,author,selftext,created_utc,author_premium,is_video,score,title,upvote_ratio,num_comments,post_char_length,post_word_count
0,0,Jay_Subabove,"This batter is hitting .191 with 3HR, 12 RBI, ...",1655862592,False,False,1,Am I crazy?,1.0,0,263,53
1,0,Stress_Factor,"Not a Yankees fan, but modern day record looki...",1655856890,False,False,1,MLB Record (Wins),1.0,0,96,19


------
## Preprocessing

In [3]:
# using a function to remove html which seemed to be scattered throughout based on EDA
# this code was adapted from the breakfast hour NLP practice for week 5

def remove_html(post):
    '''function to remove html and lowercase all text'''
    post = post.lower()
    no_html = BeautifulSoup(post).text
    
    return no_html

In [4]:
reddit['clean_text'] = reddit['selftext'].apply(remove_html)
reddit.head(2)



Unnamed: 0,subreddit,author,selftext,created_utc,author_premium,is_video,score,title,upvote_ratio,num_comments,post_char_length,post_word_count,clean_text
0,0,Jay_Subabove,"This batter is hitting .191 with 3HR, 12 RBI, ...",1655862592,False,False,1,Am I crazy?,1.0,0,263,53,"this batter is hitting .191 with 3hr, 12 rbi, ..."
1,0,Stress_Factor,"Not a Yankees fan, but modern day record looki...",1655856890,False,False,1,MLB Record (Wins),1.0,0,96,19,"not a yankees fan, but modern day record looki..."


In [65]:
# creating functions that stem and lemmatize text - to be used as hyperparameters
# this code was adapted from the breakfast hour NLP practice for week 5
# lemmatize first

def lemmatize_post(post):
    '''
    Function splits the text data,
    lemmatizes it, and rejoins
    '''
    post_split = post.split()
    lemmatizer = WordNetLemmatizer()
    
    return " ".join([lemmatizer.lemmatize(word) for word in post_split])

In [64]:
# now for stemming

def stem_post(post):
    '''Same framework applied as lemmatize'''
    post_split = post.split()
    p_stemmer = PorterStemmer()
    
    return " ".join([p_stemmer.stem(word) for word in post_split])

------
## **Modeling**

In [7]:
# starting with simple models - just text column and default hyperparameters
# will tune once there appears to be a pipeline that works best

X = reddit['clean_text']
y = reddit['subreddit']

### **Defining the baseline accuracy**

In [8]:
y.value_counts(normalize=True)

0    0.510562
1    0.489438
Name: subreddit, dtype: float64

*hoping to beat the 51% baseline accuracy* (and ideally hit > 80% as defined in problem statement)

In [10]:
# splitting the training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
print(X_train.shape)
print(X_test.shape)

(1668,)
(557,)


#### 1) Count Vectorizer and Naive Bayes (defaults)

In [14]:
# build the pipeline
pipe_cnb = Pipeline([
    ('cvec',CountVectorizer()),
    ('nb',MultinomialNB())])

In [15]:
pipe_cnb.fit(X_train, y_train)

Pipeline(steps=[('cvec', CountVectorizer()), ('nb', MultinomialNB())])

In [16]:
print(f'Training accuracy: {pipe_cnb.score(X_train, y_train)}')
print(f'Test accuracy: {pipe_cnb.score(X_test, y_test)}')

Training accuracy: 0.8435251798561151
Test accuracy: 0.7468581687612208


In [17]:
# heavily overfit

In [20]:
preds = pipe_cnb.predict(X_test)

# print the classification report after generating predictions
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.81      0.66      0.73       284
           1       0.70      0.84      0.76       273

    accuracy                           0.75       557
   macro avg       0.76      0.75      0.75       557
weighted avg       0.76      0.75      0.75       557



In [22]:
# looking at F1 Score
f1_score(y_test,preds)

0.7638190954773868

#### 2) Tfidf Vectorizer and Naive Bayes (defaults)

In [23]:
# repeat the same process to train additional models (again with default parameters)
pipe_tnb = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

pipe_tnb.fit(X_train, y_train)

print(f'Training accuracy: {pipe_tnb.score(X_train, y_train)}')
print(f'Test accuracy: {pipe_tnb.score(X_test, y_test)}')

Training accuracy: 0.8039568345323741
Test accuracy: 0.7019748653500898


In [25]:
# slightly worse accuracy and heavily overfit like before

In [26]:
preds = pipe_tnb.predict(X_test)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.70      0.74      0.72       284
           1       0.71      0.66      0.69       273

    accuracy                           0.70       557
   macro avg       0.70      0.70      0.70       557
weighted avg       0.70      0.70      0.70       557



In [27]:
f1_score(y_test, preds)

0.6856060606060606

#### 3) Count Vectorizer and Logistic Regression (defaults)

In [29]:
# repeat process with logistic regression
pipe_clog = Pipeline([
    ('cvec', CountVectorizer()),
    ('log', LogisticRegression())
])

pipe_clog.fit(X_train, y_train)

print(f'Training accuracy: {pipe_clog.score(X_train, y_train)}')
print(f'Test accuracy: {pipe_clog.score(X_test, y_test)}')

Training accuracy: 0.8729016786570744
Test accuracy: 0.7378815080789947


In [33]:
# Still heavily overfit

In [43]:
preds = pipe_clog.predict(X_test)

print(f'F1 Score: {f1_score(y_test, preds)}')
print(f'Precision Score: {precision_score(y_test, preds)}')
print(' ')
print(classification_report(y_test, preds))

F1 Score: 0.7533783783783783
Precision Score: 0.6990595611285266
 
              precision    recall  f1-score   support

           0       0.79      0.66      0.72       284
           1       0.70      0.82      0.75       273

    accuracy                           0.74       557
   macro avg       0.74      0.74      0.74       557
weighted avg       0.75      0.74      0.74       557



#### 4) Tfidf  Vectorizer and Logistic Regression (defaults)

In [40]:
# repeat process with logistic regression
pipe_tlog = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('log', LogisticRegression())
])

pipe_tlog.fit(X_train, y_train)

print(f'Training accuracy: {pipe_tlog.score(X_train, y_train)}')
print(f'Test accuracy: {pipe_tlog.score(X_test, y_test)}')

Training accuracy: 0.8009592326139089
Test accuracy: 0.7163375224416517


In [44]:
# slightly less overfit but still less than ideal
preds = pipe_tlog.predict(X_test)

print(f'F1 Score: {f1_score(y_test, preds)}')
print(f'Precision Score: {precision_score(y_test, preds)}')
print(' ')
print(classification_report(y_test, preds))

F1 Score: 0.7084870848708487
Precision Score: 0.7137546468401487
 
              precision    recall  f1-score   support

           0       0.72      0.73      0.72       284
           1       0.71      0.70      0.71       273

    accuracy                           0.72       557
   macro avg       0.72      0.72      0.72       557
weighted avg       0.72      0.72      0.72       557



#### 1a) Count Vectorizer and Naive Bayes (fine tune w GridSearch)

In [71]:
# build the pipeline
params_cnb = {
    'cvec__max_df': [0.9, 0.95],
    'cvec__min_df': [1,3,5,7,9],
    'cvec__ngram_range': [(1,1), (1, 2)],
    'cvec__stop_words': [None, 'english'],
}

gs = GridSearchCV(pipe_cnb, param_grid=params_cnb, cv =3)
gs.fit(X_train, y_train)

# print out best score and best params
print(f'Best score: {gs.best_score_}')
print(f'Best Parameters: {gs.best_params_}')

Best score: 0.7404076738609112
Best Parameters: {'cvec__max_df': 0.9, 'cvec__min_df': 1, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': None}


In [51]:
gs.score(X_test, y_test)
#also try stopwords, preprocessor (functions created earlier), tokenizer

0.7396768402154399

#### 1b)

In [73]:
# pipe_cnb = Pipeline([
#     ('cvec',CountVectorizer(max_df=.9, min_df=1, ngram_range=(1,2))),
#     ('nb',MultinomialNB())])

# params_cnb = {
#     'cvec__max_features': [2_000, 3_000, 4_000, 5_000],
#     'cvec__preprocessor': [None, lemmatize_post, stem_post] # incorporating functions created earlier
# }

# gs = GridSearchCV(pipe_cnb, params_cnb, cv =5)
# gs.fit(X_train, y_train)

# # print out best score and best params
# print(f'Best score: {gs.best_score_}')
# print(f'Best Parameters: {gs.best_params_}')

Best score: 0.740400280520041
Best Parameters: {'cvec__max_features': 5000, 'cvec__preprocessor': <function stem_post at 0x7fa25604c3a0>}


In [74]:
# No significant improvement from tuning

#### 2a) Tfidf Vectorizer and Naive Bayes (fine tune w GridSearch)

In [75]:
# repeat the same process to train additional models (again with default parameters)
pipe_tnb = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

grid_params = {
    'tvec__max_df': [0.9, 0.95],
    'tvec__min_df': [1,3,5,7,9],
    'tvec__ngram_range': [(1,1), (1, 2)],
    'tvec__stop_words': [None, 'english'],
}

gs = GridSearchCV(pipe_tnb, param_grid = grid_params, cv = 3)
gs.fit(X_train, y_train)

print(f'Best Score: {gs.best_score_}')
print(f'Best Parameters: {gs.best_params_}')

Best Score: 0.7434052757793764
Best Parameters: {'tvec__max_df': 0.9, 'tvec__min_df': 3, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}


In [76]:
# minor improvement

#### 3a) Count Vectorizer and Logistic Regression (fine tune w GridSearch)

In [141]:
# repeat process with logistic regression
pipe_clog = Pipeline([
    ('cvec', CountVectorizer()),
    ('log', LogisticRegression())
])

clog_params = {
    'cvec__max_df': [0.9, 0.95],
    'cvec__min_df': [1,3,5,7,9],
    'cvec__ngram_range': [(1,1), (1, 2)],
    'cvec__stop_words': [None, 'english'],
}

gs = GridSearchCV(pipe_clog,
                 param_grid= clog_params,
                 cv = 5)
gs.fit(X_train, y_train)

print(f'Best Score: {gs.best_score_}')
print(f'Best Parameters: {gs.best_params_}')

Best Score: 0.7493841146535758
Best Parameters: {'cvec__max_df': 0.9, 'cvec__min_df': 1, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}


In [142]:
gs.score(X_test, y_test)

0.7504488330341114

In [139]:
# try running again
pipe_clog = Pipeline([
    ('cvec', CountVectorizer(max_df=.9,min_df=1,ngram_range=(1,2), stop_words = 'english')), 
    ('log', LogisticRegression())
])

clog_params = {
   'cvec__max_features': [2_000, 3_000, 4_000, 5_000],
}

gs = GridSearchCV(pipe_clog,
                 param_grid= clog_params,
                 cv = 5)
gs.fit(X_train, y_train)

print(f'Best Score: {gs.best_score_}')
print(f'Best Parameters: {gs.best_params_}')

Best Score: 0.7481937026847206
Best Parameters: {'cvec__max_features': 5000}


In [140]:
gs.score(X_test, y_test)

0.7450628366247756

#### 3b)

In [92]:
#consider different stopwords to use 
# Print English stopwords.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [94]:
stopword_list = stopwords.words('english')

In [96]:
new_words = ['removed', 'poll','https','com','www','reddit']
for i in new_words:
    stopword_list.append(i)

print(stopword_list)
stopword_list = stopwords.words('english')

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [100]:
# try running again
pipe_clog = Pipeline([
    ('cvec', CountVectorizer(max_df=.9,min_df=1,ngram_range=(1,2))), 
    ('log', LogisticRegression())
])

clog_params = {
   'cvec__max_features': [2_000, 3_000, 4_000, 5_000],
    'cvec__stop_words': [None, stopword_list]
}

gs = GridSearchCV(pipe_clog,
                 param_grid= clog_params,
                 cv = 5)
gs.fit(X_train, y_train)

print(f'Best Score: {gs.best_score_}')

Best Score: 0.7451889014763265


In [101]:
# still not getting close to the 80% threshold

#### 4a) Tfidf  Vectorizer and Logistic Regression (fine tune w GridSearch)

In [90]:
pipe_tlog = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('log', LogisticRegression())
])

tlog_params = {
    'tvec__max_df': [0.9, 0.95],
    'tvec__min_df': [1,3,5,7,9],
    'tvec__ngram_range': [(1,1), (1, 2)],
    'tvec__stop_words': [None, 'english'],
}

gs = GridSearchCV(pipe_tlog,
                 param_grid=tlog_params,
                 cv = 3)
gs.fit(X_train, y_train)

print(f'Best Score: {gs.best_score_}')
print(f'Best Parameters: {gs.best_params_}')

Best Score: 0.7398081534772182
Best Parameters: {'tvec__max_df': 0.9, 'tvec__min_df': 3, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': 'english'}


In [91]:
# still not getting close to the 80% threshold

------
## Changing Gears to Random Forest

In [113]:
# first attempt with CountVectorizer
pipe_cvrf = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=100))
]) 

rf_params = {
    'cvec__max_df': [0.9, 0.95],
    'cvec__min_df': [1,3,5,7,9],
    'rf__n_estimators': [100, 150, 200, 250],
    'rf__max_depth': [1, 2, 3, 4, 5]}

gs = GridSearchCV(pipe_cvrf, rf_params, cv = 3)
gs.fit(X_train, y_train)

print(gs.best_score_)
gs.best_params_

0.7110311750599521


{'cvec__max_df': 0.9,
 'cvec__min_df': 9,
 'rf__max_depth': 5,
 'rf__n_estimators': 200}

In [111]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [121]:
pipe_cvrf = Pipeline([
    ('cvec', CountVectorizer(max_df = 0.9, min_df = 9)),
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=5))
]) 

rf_params = {
    'cvec__ngram_range': [(1,1), (1, 2)],
    'cvec__stop_words': [None, 'english']
}

gs = GridSearchCV(pipe_cvrf, rf_params, cv = 3)
gs.fit(X_train, y_train)

print(gs.best_score_)
gs.best_params_

0.723021582733813


{'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}

In [122]:
#try the same setup with tfidf
pipe_tvrf = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=100))
]) 

rf_params = {
    'tvec__max_df': [0.9, 0.95],
    'tvec__min_df': [1,3,5,7,9],
    'rf__n_estimators': [100, 150, 200, 250],
    'rf__max_depth': [1, 2, 3, 4, 5]}

gs = GridSearchCV(pipe_tvrf, rf_params, cv = 3)
gs.fit(X_train, y_train)

print(gs.best_score_)
gs.best_params_

0.7062350119904077


{'rf__max_depth': 5,
 'rf__n_estimators': 150,
 'tvec__max_df': 0.95,
 'tvec__min_df': 5}

In [145]:
# attempt with ExtremelyRandomizeTrees
# first attempt with CountVectorizer
pipe_cvet = Pipeline([
    ('cvec', CountVectorizer()),
    ('et', ExtraTreesClassifier(n_estimators=100))
]) 

et_params = {
    'cvec__max_df': [0.9, 0.95],
    'cvec__min_df': [1,3,5,7,9],
    'et__n_estimators': [100, 150, 200, 250],
    'et__max_depth': [1, 2, 3, 4, 5]}

gs = GridSearchCV(pipe_cvet, et_params, cv = 3)
gs.fit(X_train, y_train)

print(gs.best_score_)
gs.best_params_

0.7134292565947242


{'cvec__max_df': 0.9,
 'cvec__min_df': 9,
 'et__max_depth': 5,
 'et__n_estimators': 150}

------
## Boosting

*AdaBoostClassifier*

In [134]:
# Try with CountVectorizer
ada_cpipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier()))
])

ada_params = {
    'cvec__max_df': [0.9, 0.95],
    'cvec__ngram_range': [(1,1), (1, 2)],
    'cvec__stop_words': [None, 'english'],
    'ada__n_estimators': [50, 100],
    'ada__base_estimator__max_depth': [1, 2],
    'ada__learning_rate': [.9, 1.0],
}

gs = GridSearchCV(ada_cpipe, param_grid = ada_params, cv = 3)
gs.fit(X_train, y_train)

print(gs.best_score_)
gs.best_params_

0.7260191846522782


{'ada__base_estimator__max_depth': 1,
 'ada__learning_rate': 1.0,
 'ada__n_estimators': 100,
 'cvec__max_df': 0.95,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [135]:
# Try with Tfidf Vectorizer
ada_tpipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier()))
])

ada_params = {
    'tvec__max_df': [0.9, 0.95],
    'tvec__ngram_range': [(1,1), (1, 2)],
    'tvec__stop_words': [None, 'english'],
    'ada__n_estimators': [50, 100],
    'ada__base_estimator__max_depth': [1, 2],
    'ada__learning_rate': [.9, 1.0],
}

gs = GridSearchCV(ada_tpipe, param_grid = ada_params, cv = 3)
gs.fit(X_train, y_train)

print(gs.best_score_)
gs.best_params_

0.7182254196642686


{'ada__base_estimator__max_depth': 1,
 'ada__learning_rate': 1.0,
 'ada__n_estimators': 50,
 'tvec__max_df': 0.9,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

*GradientBoostClassifier*

In [137]:
# Try with CountVectorizer
g_cpipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('g', GradientBoostingClassifier())
])

ada_params = {
    'cvec__max_df': [0.9, 0.95],
    'cvec__ngram_range': [(1,1), (1, 2)],
    'cvec__stop_words': [None, 'english'],
    'g__n_estimators': [50, 100],
    'g__learning_rate': [.9, 1.0],
}

gs = GridSearchCV(g_cpipe, param_grid = ada_params, cv = 3)
gs.fit(X_train, y_train)

print(gs.best_score_)
gs.best_params_

0.7398081534772182


{'cvec__max_df': 0.9,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'g__learning_rate': 0.9,
 'g__n_estimators': 50}

In [138]:
# Try with Tfidf Vectorizer
g_tpipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('g', GradientBoostingClassifier())
])

ada_params = {
    'tvec__max_df': [0.9, 0.95],
    'tvec__ngram_range': [(1,1), (1, 2)],
    'tvec__stop_words': [None, 'english'],
    'g__n_estimators': [50, 100],
    'g__learning_rate': [.9, 1.0],
}

gs = GridSearchCV(g_tpipe, param_grid = ada_params, cv = 3)
gs.fit(X_train, y_train)

print(gs.best_score_)
gs.best_params_

0.7122302158273381


{'g__learning_rate': 1.0,
 'g__n_estimators': 100,
 'tvec__max_df': 0.9,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

*VotingClassifier*


-------
## Conclusions and next steps

We will plan to cast a wider net in this case by advertising on both the r/mlb and r/redsox, given we were not able to create a model that  accurately predict the correct subreddit at 80% (i.e. difficult to train a model that is able to discern between the two - so we'd rather cast a wider net across the .) Despite not hitting the target accuracy and F1 scores of 80%, ran a sentiment analysis on the two subreddits. 

In [182]:
# Creat an instance of the Vader Sentiment Intensity Analyzer
sent = SentimentIntensityAnalyzer()

# slice the df to just include the specific subreddits
redsox = reddit[reddit['subreddit'] == 1]
mlb = reddit[reddit['subreddit'] == 0]

# calculate the polarity score for each post and create a list of the compound scores
compound_sox_list = [scores_list[num]['compound'] for num in range(0,len([sent.polarity_scores(post) for post in redsox['selftext']]))]
compound_mlb_list = [scores_list[num]['compound'] for num in range(0,len([sent.polarity_scores(post) for post in mlb['selftext']]))]

In [183]:
# checking the sentiment across all posts in the self text column
print(f'r/redsox sentiment: {np.mean(compound_sox_list)}')
print(f'r/mlb sentiment: {np.mean(compound_mlb_list)}')

r/redsox sentiment: 0.19929403122130396
r/mlb sentiment: 0.19729119718309857


Both positive which is good to know - may be more receptice to ads - and given how similar the sentiments are, makes the case to cast wider net by advertising 