# 3). Modeling

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier


# Shared seed: passed as random_state to every train/test split and to the
# tree ensembles inside the pipelines below, so reruns give the same results.
r = 42

In [2]:
# Load the dataset; the cells below use its 'title' and 'subreddit' columns.
df = pd.read_csv('./data/askmen_askwomen.csv')

In [3]:
# Features (independent variable): the post title text.
# Target (dependent variable): the subreddit label.
X, y = df['title'], df['subreddit']

### Baseline Model

In [4]:
# Class proportions of the target; the larger share is the majority-class baseline accuracy.
y.value_counts(normalize = True)

0    0.500764
1    0.499236
Name: subreddit, dtype: float64

> The classes are almost perfectly balanced (about 50.1% vs. 49.9%), so the baseline accuracy from always predicting the majority class is roughly 0.50. Any model has to beat that to be worth using.

#### Instantiate CountVectorizer and TfidfVectorizer

In [5]:
# Stand-alone vectorizers with default settings.
# NOTE(review): none of the visible cells reference `cvec` or `tf`; every pipeline
# below constructs its own vectorizer instance instead — confirm these are still needed.
cvec = CountVectorizer()
tf = TfidfVectorizer()

> For the modeling below, I have kept only the models I ended up using, together with their pipelines and grid searches.

### Logistic Regression

#### With Count Vectorizer

In [6]:
# Stratified train/test split (default 75/25 proportions); the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=r,
)

In [7]:
# Two-step pipeline: token counts from the titles, then logistic regression
# (both steps with library defaults).
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('logreg', LogisticRegression()),
])

In [8]:
# Empty grid: the search below evaluates a single candidate, i.e. the pipeline
# with its default settings.
# Disabled grid, kept for reference:
#     'cvec__max_features' : [1000, 2000, 3000],
#     'cvec__min_df' : [2, 3],
#     'cvec__max_df' : [.6, .7],
#     'cvec__ngram_range' : [(1,1)],
#     'cvec__stop_words': [None]
pipe_params = {}

In [9]:
# Cross-validated search: 5 folds over every combination in pipe_params.
gs = GridSearchCV(
    estimator=pipe,          # the pipeline being tuned
    param_grid=pipe_params,  # hyperparameter values to try
    cv=5,                    # number of cross-validation folds
)

In [10]:
# Run the search on the training split only; with the default refit=True the best
# candidate is then refit on the whole training split.
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [11]:
# Winning hyperparameters; an empty dict here because the grid has no entries.
gs.best_params_

{}

In [12]:
# Mean cross-validated score of the best candidate (accuracy, the default for classifiers).
gs.best_score_

0.6834267266228525

In [13]:
# Accuracy of the refit pipeline on the data it was trained on (optimistic by construction).
gs.score(X_train, y_train)

0.9551630434782609

In [14]:
# Accuracy on the held-out split. In the recorded run this is ~0.71 against ~0.96 on
# training, so the model overfits noticeably.
gs.score(X_test, y_test)

0.7087576374745418

#### With TF-IDF Vectorizer

In [15]:
# Same arguments and seed as the earlier split, so this reproduces the identical partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=r,
)

In [16]:
# Two-step pipeline: TF-IDF weights from the titles, then logistic regression.
pipe = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('logreg', LogisticRegression()),
])

In [17]:
# Search space for the TF-IDF step: 3 x 2 x 1 x 3 x 2 = 36 combinations.
# Keys use the '<step name>__<parameter>' convention, so all of them target 'tvec'.
pipe_params = dict(
    tvec__max_features=[1000, 2000, 3000],
    tvec__min_df=[2, 3],
    tvec__max_df=[.9],
    tvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tvec__stop_words=[None, 'english'],
)

In [18]:
# Cross-validated search: 5 folds over every combination in pipe_params.
gs = GridSearchCV(
    estimator=pipe,          # the pipeline being tuned
    param_grid=pipe_params,  # hyperparameter values to try
    cv=5,                    # number of cross-validation folds
)

In [19]:
# Fit every grid candidate with 5-fold CV on the training split, then refit the best one
# on the whole training split (refit=True is the default).
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

In [20]:
# Winning combination. In the recorded run: the smallest vocabulary (1000 features),
# unigrams only, and no stop-word removal.
gs.best_params_

{'tvec__max_df': 0.9,
 'tvec__max_features': 1000,
 'tvec__min_df': 2,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': None}

In [21]:
# Mean cross-validated accuracy of the winning combination.
gs.best_score_

0.6949567623659634

In [22]:
# Training-split accuracy of the refit best pipeline.
gs.score(X_train, y_train)

0.8396739130434783

In [23]:
# Held-out accuracy. Recorded run: ~0.70 against ~0.84 on training, so there is still
# a clear train/test gap.
gs.score(X_test, y_test)

0.7026476578411406

### Multinomial Naive Bayes

#### With Count Vectorizer

In [24]:
# Same arguments and seed as the earlier splits, so this reproduces the identical partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=r,
)

In [25]:
# Two-step pipeline: token counts from the titles, then multinomial naive Bayes.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [26]:
# Empty grid: the search below evaluates a single candidate, i.e. the pipeline
# with its default settings.
# Disabled grid, kept for reference:
#     'cvec__max_features' : [1000, 2000, 3000],
#     'cvec__min_df' : [2, 3],
#     'cvec__max_df' : [.6, .7],
#     'cvec__ngram_range' : [(1,1)],
#     'cvec__stop_words': [None]
pipe_params = {}

In [27]:
# Cross-validated search: 5 folds over every combination in pipe_params.
gs = GridSearchCV(
    estimator=pipe,          # the pipeline being tuned
    param_grid=pipe_params,  # hyperparameter values to try
    cv=5,                    # number of cross-validation folds
)

In [28]:
# Cross-validate on the training split; the pipeline is then refit on all of it
# (refit=True is the default).
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [29]:
# Winning hyperparameters; an empty dict here because the grid has no entries.
gs.best_params_

{}

In [30]:
# Mean 5-fold cross-validated accuracy of the default pipeline.
gs.best_score_

0.6868119451170298

In [31]:
# Training-split accuracy of the refit pipeline.
gs.score(X_train, y_train)


0.9171195652173914

In [32]:
# Held-out accuracy. Recorded run: ~0.68 against ~0.92 on training, a large gap.
gs.score(X_test, y_test)

0.6761710794297352

#### With TF-IDF Vectorizer

In [33]:
# Same arguments and seed as the earlier splits, so this reproduces the identical partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=r,
)

In [34]:
# Two-step pipeline: TF-IDF weights from the titles, then multinomial naive Bayes.
pipe = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

In [35]:
# Empty grid: the search below evaluates a single candidate, i.e. the pipeline
# with its default settings.
# Disabled grid, kept for reference:
#     'tvec__max_features' : [1000, 2000, 3000],
#     'tvec__stop_words' : [None, 'english'],
#     'tvec__max_df' : [.5, .6, .7,.8, .9, .95],
#     'tvec__min_df' : [1,2,3],
#     'tvec__ngram_range' : [(1,1), (1,2), (1,3), (2,2), (3,3)]
pipe_params = {}

In [36]:
# Cross-validated search: 5 folds over every combination in pipe_params.
gs = GridSearchCV(
    estimator=pipe,          # the pipeline being tuned
    param_grid=pipe_params,  # hyperparameter values to try
    cv=5,                    # number of cross-validation folds
)

In [37]:
# Cross-validate on the training split; the pipeline is then refit on all of it
# (refit=True is the default).
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

In [38]:
# Winning hyperparameters; an empty dict here because the grid has no entries.
gs.best_params_

{}

In [39]:
# Mean 5-fold cross-validated accuracy of the default pipeline.
gs.best_score_

0.6711956647065607

In [40]:
# Training-split accuracy of the refit pipeline.
gs.score(X_train, y_train)


0.9300271739130435

In [41]:
# Held-out accuracy. Recorded run: ~0.66 against ~0.93 on training, a large gap.
gs.score(X_test, y_test)

0.6639511201629328

### Random Forests and Extra Trees

In [42]:
# Stand-alone tree-ensemble instances. Both estimators are randomized, so they are
# seeded with the shared random state `r`, matching the ensembles built inside the
# pipelines below; without a seed each rerun would grow different trees.
rf = RandomForestClassifier(random_state = r)
et = ExtraTreesClassifier(random_state = r)

#### With Random Forest Classifier and Count Vectorizer

In [43]:
# Same arguments and seed as the earlier splits, so this reproduces the identical partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=r,
)

In [44]:
# Two-step pipeline: token counts from the titles, then a seeded random forest.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier(random_state=r)),
])

In [45]:
# Empty grid: the search below evaluates a single candidate, i.e. the pipeline
# with its default settings.
# Disabled grid, kept for reference. If it is re-enabled on scikit-learn >= 1.3,
# replace 'auto' with 'sqrt': the 'auto' alias was removed there.
#     'cvec__max_features' : [1000, 2000, 3000, 4000],
#     # 'cvec__min_df' : [2, 3],
#     # 'cvec__max_df' : [.9, .95],
#     'cvec__stop_words' : [None, 'english'],
# #     'cvec__ngram_range' : [(1,1), (2,2), (3,3)],
#     'rf__max_depth': [3, 4],
#     'rf__max_features' : [None, 'auto']
pipe_params = {}

In [46]:
# Cross-validated search: 5 folds over every combination in pipe_params.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=pipe_params,
    cv=5,
)


In [47]:
# Cross-validate on the training split; the pipeline is then refit on all of it
# (refit=True is the default).
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [48]:
# Winning hyperparameters; an empty dict here because the grid has no entries.
gs.best_params_

{}

In [49]:
# Mean 5-fold cross-validated accuracy of the default pipeline.
gs.best_score_

0.6800046120142973

In [50]:
# Training-split accuracy; the recorded ~0.99 shows the forest nearly memorizes the training titles.
gs.score(X_train, y_train)

0.9904891304347826

In [51]:
# Held-out accuracy. Recorded run: ~0.70 against ~0.99 on training, i.e. heavy overfitting.
gs.score(X_test, y_test)

0.6965376782077393

#### With Extra Trees Classifier and Count Vectorizer

In [52]:
# Same arguments and seed as the earlier splits, so this reproduces the identical partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=r,
)

In [53]:
# Two-step pipeline: token counts from the titles, then a seeded extra-trees ensemble.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('et', ExtraTreesClassifier(random_state=r)),
])

In [54]:
# Empty grid: the search below evaluates a single candidate, i.e. the pipeline
# with its default settings.
# Disabled grid, kept for reference. If it is re-enabled on scikit-learn >= 1.3,
# replace 'auto' with 'sqrt': the 'auto' alias was removed there.
#     'cvec__max_features' : [1000, 2000, 3000, 4000],
#     # 'cvec__min_df' : [2, 3],
#     # 'cvec__max_df' : [.9, .95],
#     'cvec__stop_words' : [None, 'english'],
#     'cvec__ngram_range' : [(1,1), (2,2), (3,3)],
#     'et__max_depth': [3, 4],
#     'et__max_features' : [None, 'auto']
pipe_params = {}

In [55]:
# Cross-validated search: 5 folds over every combination in pipe_params.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=pipe_params,
    cv=5,
)


In [56]:
# Cross-validate on the training split; the pipeline is then refit on all of it
# (refit=True is the default).
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [57]:
# Winning hyperparameters; an empty dict here because the grid has no entries.
gs.best_params_

{}

In [58]:
# Mean 5-fold cross-validated accuracy of the default pipeline.
gs.best_score_

0.6793427879626427

In [59]:
# Training-split accuracy; the recorded ~0.99 shows the ensemble nearly memorizes the training titles.
gs.score(X_train, y_train)

0.9904891304347826

In [60]:
# Held-out accuracy. Recorded run: ~0.68 against ~0.99 on training, i.e. heavy overfitting.
gs.score(X_test, y_test)

0.6822810590631364

#### With Random Forest Classifier and TF-IDF Vectorizer

In [61]:
# Same arguments and seed as the earlier splits, so this reproduces the identical partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=r,
)

In [62]:
# Two-step pipeline: TF-IDF weights from the titles, then a seeded random forest.
pipe = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=r)),
])

In [63]:
# Empty grid: the search below evaluates a single candidate, i.e. the pipeline
# with its default settings.
# Disabled grid, kept for reference. If it is re-enabled on scikit-learn >= 1.3,
# replace 'auto' with 'sqrt': the 'auto' alias was removed there.
#     'tvec__max_features' : [1000, 2000, 3000, 4000],
#     # 'tvec__min_df' : [2, 3],
#     # 'tvec__max_df' : [.9, .95],
#     'tvec__stop_words' : [None, 'english'],
#     'tvec__ngram_range' : [(1,1), (2,2), (3,3)],
#     'rf__max_depth': [3, 4],
#     'rf__max_features' : [None, 'auto']
pipe_params = {}

In [64]:
# Cross-validated search: 5 folds over every combination in pipe_params.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=pipe_params,
    cv=5,
)


In [65]:
# Cross-validate on the training split; the pipeline is then refit on all of it
# (refit=True is the default).
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

In [66]:
# Winning hyperparameters; an empty dict here because the grid has no entries.
gs.best_params_

{}

In [67]:
# Mean 5-fold cross-validated accuracy of the default pipeline.
gs.best_score_

0.6691571543871785

In [68]:
# Training-split accuracy; the recorded ~0.99 shows the forest nearly memorizes the training titles.
gs.score(X_train, y_train)

0.9904891304347826

In [69]:
# Held-out accuracy. Recorded run: ~0.70 against ~0.99 on training, i.e. heavy overfitting.
gs.score(X_test, y_test)

0.7006109979633401

#### With Extra Trees Classifier and TF-IDF Vectorizer

In [70]:
# Same arguments and seed as the earlier splits, so this reproduces the identical partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=r,
)

In [71]:
# Two-step pipeline: TF-IDF weights from the titles, then a seeded extra-trees ensemble.
pipe = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('et', ExtraTreesClassifier(random_state=r)),
])

In [72]:
# Search space for the TF-IDF + Extra Trees pipeline:
# 3 x 1 x 1 x 2 x 2 x 2 = 24 combinations.
# Keys use the '<step name>__<parameter>' convention ('tvec' = vectorizer, 'et' = classifier).
pipe_params = {
    'tvec__max_features' : [500, 1000, 2000],
    'tvec__stop_words' : [None],
    'tvec__ngram_range' : [(1,1)],
    'et__max_depth': [4, 5],
    # 'sqrt' replaces the former 'auto' alias. For classifiers both mean
    # sqrt(n_features), but 'auto' was deprecated in scikit-learn 1.1 and removed
    # in 1.3, where it is rejected as an invalid parameter value at fit time.
    # Outputs recorded with the old alias will still show 'auto'.
    'et__max_features' : [None, 'sqrt'],
    'et__n_estimators' : [125, 200]
}

In [73]:
# Cross-validated search: 5 folds over every combination in pipe_params.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=pipe_params,
    cv=5,
)


In [74]:
# Fit every grid candidate with 5-fold CV on the training split, then refit the best one
# on the whole training split (refit=True is the default).
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

In [75]:
# Winning combination. In the recorded run: the shallower depth (4), 200 trees, and the
# smallest vocabulary (500 features).
gs.best_params_

{'et__max_depth': 4,
 'et__max_features': 'auto',
 'et__n_estimators': 200,
 'tvec__max_features': 500,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': None}

In [76]:
# Mean cross-validated accuracy of the winning combination.
gs.best_score_

0.6833967485299205

In [77]:
# Training-split accuracy of the refit best pipeline.
gs.score(X_train, y_train)

0.8158967391304348

In [78]:
# Held-out accuracy. Recorded run: ~0.70 against ~0.82 on training, the smallest
# train/test gap among the tree-based models in this notebook.
gs.score(X_test, y_test)

0.6965376782077393