## Setup

In [1]:
reset -fs

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [3]:
%matplotlib inline

In [4]:
RANDOM_STATE = 28

## Load Data and Filter

In [5]:
# Comments File
comments = 'Data/CommentsApril2018.csv'

In [6]:
# Read in File
comm = pd.read_csv(comments); comm.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1524594282,5adf6684068401528a2aa69b,781.0,How could the league possibly refuse this offe...,26853969.0,26853969.0,<br/>,comment,1524594011,1.0,...,approved,1,0,News,1524594282,Christopher Rillo,46566740.0,San Francisco,,
1,1524594252,5adf6684068401528a2aa69b,781.0,"So then the execs can be like ""yeah...we will ...",26853699.0,26853699.0,<br/>,comment,1524593146,1.0,...,approved,1,0,News,1524594252,Matt Brand,64324866.0,"Williamsburg, Brooklyn",,
2,1524594250,5adf6684068401528a2aa69b,781.0,I would not want to play chess against these c...,26853677.0,26853677.0,<br/>,comment,1524593032,1.0,...,approved,1,0,News,1524594250,Joseph,78105093.0,"Fayetteville, AR",,
3,1524593431,5adf6684068401528a2aa69b,781.0,Could the cheerleaders join the Actors' Equity...,26853784.0,26853784.0,<br/>,comment,1524593426,1.0,...,approved,0,0,News,1524593431,Stephen,81939618.0,"Phoenix, AZ",,
4,1524595048,5adf653f068401528a2aa697,656.0,Seeking conclusions which support preconceived...,26854236.0,26854236.0,<br/>,comment,1524595043,1.0,...,approved,1,0,News,1524595048,Paul Zorsky,58642997.0,Texas,,


In [7]:
comm.columns

Index(['approveDate', 'articleID', 'articleWordCount', 'commentBody',
       'commentID', 'commentSequence', 'commentTitle', 'commentType',
       'createDate', 'depth', 'editorsSelection', 'inReplyTo', 'newDesk',
       'parentID', 'parentUserDisplayName', 'permID', 'picURL', 'printPage',
       'recommendations', 'recommendedFlag', 'replyCount', 'reportAbuseFlag',
       'sectionName', 'sharing', 'status', 'timespeople', 'trusted',
       'typeOfMaterial', 'updateDate', 'userDisplayName', 'userID',
       'userLocation', 'userTitle', 'userURL'],
      dtype='object')

In [8]:
filt = (comm.sectionName != "Unknown") & (comm.commentType == "comment")
data = comm.commentBody[filt]
sections = comm.sectionName[filt]

## Train/Test Split and Training

In [9]:
train_data, test_data, train_target, test_target = train_test_split(data, sections, random_state=RANDOM_STATE)

In [10]:
vectorizer = CountVectorizer(decode_error='ignore',
                             stop_words='english')
nb_model = Pipeline([('vec', vectorizer),
                    ('clf', MultinomialNB())])

In [11]:
nb_model.fit(train_data, train_target)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## Predictions and Metric Evaluation

In [12]:
predictions = nb_model.predict(test_data)
accuracy_score(predictions, test_target)

0.708839422933915

## Grid Search

In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
grid_params = dict(vec__ngram_range=[(1,1), (1,2), (1,3)],
                   clf__alpha=np.arange(0, 1, 0.1))

In [15]:
gs = GridSearchCV(estimator=nb_model,
                 param_grid=grid_params,
                 scoring='accuracy',
                 cv=5,
                 n_jobs=-1)

In [16]:
gs.fit(train_data, train_target)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=...nizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vec__ngram_range': [(1, 1), (1, 2), (1, 3)], 'clf__alpha': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [17]:
gs.best_params_

{'clf__alpha': 0.30000000000000004, 'vec__ngram_range': (1, 1)}

In [18]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=0.30000000000000004, class_prior=None, fit_prior=True))])

In [19]:
gs.score(test_data, test_target)

0.7232164989341133

## Optimized Model with Classification Report

In [20]:
from sklearn.metrics import classification_report

In [21]:
vectorizer = CountVectorizer(decode_error='ignore',
                             stop_words='english')
nb_model = Pipeline([('vec', vectorizer),
                    ('clf', MultinomialNB(alpha=0.30000000000000004))])

In [22]:
nb_model.fit(train_data, train_target)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=0.30000000000000004, class_prior=None, fit_prior=True))])

In [23]:
predictions = nb_model.predict(test_data)
print(classification_report(test_target, predictions))

                        precision    recall  f1-score   support

                Africa       0.92      0.30      0.46        76
              Americas       0.70      0.30      0.42       232
          Art & Design       0.95      0.25      0.40        80
          Asia Pacific       0.57      0.59      0.58       943
             Australia       1.00      0.13      0.23        23
           Book Review       0.43      0.10      0.16       180
                Canada       0.81      0.54      0.65       133
    College Basketball       1.00      0.06      0.11        18
               Cycling       1.00      0.39      0.56        28
              DealBook       0.41      0.21      0.27        73
                   Eat       0.80      0.61      0.69       119
               Economy       0.51      0.60      0.55       294
                Europe       0.55      0.33      0.42       733
                Family       0.63      0.66      0.65       327
                  Golf       0.00      

  'precision', 'predicted', average, warn_for)


## Save Model

In [24]:
from joblib import dump, load
dump(nb_model, 'Models/naive_bayes_small.joblib') 

['Models/naive_bayes_small.joblib']

## Save Notebook

In [25]:
import dill
dill.dump_session('Notebook_Saves/Naive_Bayes_Exploration.db')