# Mulitnomial Naive Bayes

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer 
import pickle
import warnings; warnings.simplefilter('ignore')
np.random.RandomState(42)



<mtrand.RandomState at 0x233871aa708>

In [2]:
X_train = pd.read_csv("../Data/X_train.csv",index_col=0)
y_train = pd.read_csv("../Data/y_train.csv",index_col=0)
with open('../Assets/custom_stop_words.pkl','rb') as f:
    custom_stop_words = pickle.load(f)

This cell instantiates a Pipeline, which will process the data, and  determines which model I will use in this notebook, which is the Multinomial Naive Bayes model.

In [3]:
pipe = Pipeline([
    ("count", TfidfVectorizer()),
    ("MultiNB", MultinomialNB())
])

This dictionary lists the hyper parameters that I want my Gridsearch to test, which will then pick a single model. Gridsearch will then use the model that had the best accuracy.

In [4]:
params = {
    "count__stop_words":[custom_stop_words],
    "count__min_df":[5],
    'MultiNB__alpha': [1,.5,.1]
}

The alpha value in the Multinomial Naive Bayes model dictates how strong the regularization is. The min_df parameter is saying that in order for a word to be considered in the model, it must show up in at least 5 documents. This is done in order to reduce the overall number of feature. Most is not all of the words being eliminated should have no importance in determining each class if it shows up in less than 5 posts.

In [5]:
gs_MultiNB = GridSearchCV(pipe,param_grid=params)

In [6]:
gs_MultiNB.fit(X_train['Total_text'],y_train['subreddit'])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('count', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...
        vocabulary=None)), ('MultiNB', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'count__stop_words': [['bill', 'however', 'move', 'whoever', 'becoming', 'detail', 'here', 'she', 'its', 'forty', 'sometimes', 'yours', 'describe', 'seem', 'thereby', 'may', 'and', 'these', 'along', 'still', 'yourselves', 'three', 'whether', 'none', 'themselves', 'first', 'two', 'beforeh...', 'us', 'few', 'her', 'com', 've', 'just']], 'count__min_df': [5], 'MultiNB__alpha': [1, 0.5, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, scoring=N

In [7]:
gs_MultiNB.best_params_['MultiNB__alpha']

0.5

The alpha that performed the best in the model was .5

###### Saving our  Multinomial Naive Bayes Model to be evaluated later

In [8]:
with open('../Assets/MultiNB.pkl','wb+') as f:
    pickle.dump(gs_MultiNB,f)