## Setup

In [1]:
reset -fs

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [3]:
%matplotlib inline

In [4]:
RANDOM_STATE = 28

## Load Data and Filter

In [5]:
# Comments File
comments = '../Data/combined_comments.csv'

# Read in File
comm = pd.read_csv(comments).drop('Unnamed: 0', axis=1)

In [6]:
comm.columns

Index(['commentType', 'commentBody', 'sectionName'], dtype='object')

In [7]:
comm.head()

Unnamed: 0,commentType,commentBody,sectionName
0,comment,The snake-filled heads comment made me think o...,Unknown
1,comment,She-devil reporting for duty!,Unknown
2,comment,XX is the new mark of the devil.,Unknown
3,comment,"""Courtland Sykes"" should be writing for The On...",Unknown
4,comment,"I happen to descend for a few of them, because...",Unknown


In [8]:
comm.sectionName.unique()

array(['Unknown', 'Politics', 'Television', 'Europe', 'Middle East',
       'Pro Football', 'Asia Pacific', 'Live', 'The Daily', 'Mind',
       'Art & Design', 'Wine, Beer & Cocktails', 'Americas',
       'Sunday Review', 'Economy', 'Family', 'Dance', 'Africa',
       'Energy & Environment ', 'DealBook', 'Book Review', 'Music',
       'Olympics', 'Move', 'Lesson Plans', 'Entrepreneurship', 'Baseball',
       'Media', 'Entertainment', 'Opinion | Politics', 'Weddings',
       'Real Estate', 'Eat', 'College Basketball', 'Australia',
       'Retirement', 'Neighborhoods', 'Personal Tech', 'Canada', 'Hockey',
       'Tennis', 'Cycling', 'Pro Basketball', 'Learning', 'Golf',
       'Soccer', "401(k)'s and Similar Plans", 'Rugby',
       'College Football', 'Paying for College', 'Insider Events',
       'Editorials', 'Fashion & Beauty', 'World Cup', nan, 'Automobiles',
       'Food', 'Art', 'Opinion | The World', 'Cricket', 'Room For Debate',
       'Education Life', 'Student Loans', 'Auto Rac

In [9]:
# Filter out Unknown & comment commentTypes & Non-NAs
filt = (comm.sectionName != "Unknown") & (comm.commentType == "comment") & comm.sectionName.notna()
data = comm.commentBody[filt]
sections = comm.sectionName[filt]

## Train/Test Split and Training

In [10]:
train_data, test_data, train_target, test_target = train_test_split(data, sections, random_state=RANDOM_STATE)

In [11]:
vectorizer = CountVectorizer(decode_error='ignore',
                             stop_words='english')
nb_model = Pipeline([('vec', vectorizer),
                    ('clf', MultinomialNB())])

In [12]:
nb_model.fit(train_data, train_target)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## Predictions and Metric Evaluation

In [13]:
nb_model.score(test_data, test_target)

0.6761591873201686

<br>
<br>
<br>

## Grid Search

In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
grid_params = dict(vec__ngram_range=[(1,1), (1,2), (1,3)],
                   clf__alpha=np.arange(0, 1, 0.1))

In [16]:
nb_model = Pipeline([('vec', vectorizer),
                    ('clf', MultinomialNB())])
gs = GridSearchCV(estimator=nb_model,
                 param_grid=grid_params,
                 scoring='accuracy',
                 cv=5,
                 n_jobs=-1)

In [17]:
gs.fit(train_data, train_target)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
Process ForkPoolWorker-3:
Traceback (most recent call last):
  File "/home/vietpride12/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/vietpride12/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vietpride12/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/vietpride12/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/pool.py", line 362, in get
    return recv()
  File "/home/vietpride12/anaconda3/lib/python3.7/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/home/vietpride12/anaconda3/lib/python3.7/multiprocessing/conn

AttributeError: 'NoneType' object has no attribute 'terminate'

In [None]:
gs.best_params_

In [None]:
gs.best_estimator_

In [None]:
gs.score(test_data, test_target)

<br>
<br>
<br>

## Optimized Model with Classification Report

In [None]:
from sklearn.metrics import classification_report

In [None]:
nb_model = gs.best_estimator_
predictions = nb_model.predict(test_data)
print(classification_report(test_target, predictions))

<br>
<br>
<br>

## Save Model

In [None]:
from joblib import dump, load
dump(nb_model, '../Models/naive_bayes_small.joblib') 

## Save Notebook

In [None]:
import dill
dill.dump_session('../Notebook_Saves/Naive_Bayes_Comments_Total.db')