## Setup

In [1]:
# Soft-reset the namespace without a confirmation prompt so the run starts clean.
# The explicit % prefix does not depend on automagic being enabled.
%reset -fs

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Perceptron

In [3]:
# Seed passed to the train/test split and to every Perceptron for repeatable runs.
RANDOM_STATE: int = 28

## Load Data and Filter

In [4]:
# Relative path of the comments CSV that the next cell loads.
comments = "Data/CommentsApril2018.csv"

In [5]:
# Load the comments CSV into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-type warning on load.
comm = pd.read_csv(comments, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Show the column labels of the loaded frame.
comm.columns

Index(['approveDate', 'articleID', 'articleWordCount', 'commentBody',
       'commentID', 'commentSequence', 'commentTitle', 'commentType',
       'createDate', 'depth', 'editorsSelection', 'inReplyTo', 'newDesk',
       'parentID', 'parentUserDisplayName', 'permID', 'picURL', 'printPage',
       'recommendations', 'recommendedFlag', 'replyCount', 'reportAbuseFlag',
       'sectionName', 'sharing', 'status', 'timespeople', 'trusted',
       'typeOfMaterial', 'updateDate', 'userDisplayName', 'userID',
       'userLocation', 'userTitle', 'userURL'],
      dtype='object')

Column "userTitle" has both NaN and string data types. Replace NaN with "Unknown" to have one uniform data type in the column.

In [7]:
# Fill missing user titles, addressing the column by name instead of by position
# so the step keeps working if the column order of the CSV changes.
comm['userTitle'] = comm['userTitle'].fillna('Unknown')
comm.head()

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1524594282,5adf6684068401528a2aa69b,781.0,How could the league possibly refuse this offe...,26853969.0,26853969.0,<br/>,comment,1524594011,1.0,...,approved,1,0,News,1524594282,Christopher Rillo,46566740.0,San Francisco,Unknown,
1,1524594252,5adf6684068401528a2aa69b,781.0,"So then the execs can be like ""yeah...we will ...",26853699.0,26853699.0,<br/>,comment,1524593146,1.0,...,approved,1,0,News,1524594252,Matt Brand,64324866.0,"Williamsburg, Brooklyn",Unknown,
2,1524594250,5adf6684068401528a2aa69b,781.0,I would not want to play chess against these c...,26853677.0,26853677.0,<br/>,comment,1524593032,1.0,...,approved,1,0,News,1524594250,Joseph,78105093.0,"Fayetteville, AR",Unknown,
3,1524593431,5adf6684068401528a2aa69b,781.0,Could the cheerleaders join the Actors' Equity...,26853784.0,26853784.0,<br/>,comment,1524593426,1.0,...,approved,0,0,News,1524593431,Stephen,81939618.0,"Phoenix, AZ",Unknown,
4,1524595048,5adf653f068401528a2aa697,656.0,Seeking conclusions which support preconceived...,26854236.0,26854236.0,<br/>,comment,1524595043,1.0,...,approved,1,0,News,1524595048,Paul Zorsky,58642997.0,Texas,Unknown,


In [8]:
# Keep rows whose section is not "Unknown" and whose commentType is "comment",
# then pull out the comment text (features) and the section name (labels).
has_section = comm["sectionName"].ne("Unknown")
is_comment_type = comm["commentType"].eq("comment")
filt = has_section & is_comment_type
data = comm.loc[filt, "commentBody"]
sections = comm.loc[filt, "sectionName"]

## Train/Test Split and Training

In [9]:
# Split comment text and section labels into train/test parts with a fixed seed.
train_data, test_data, train_target, test_target = train_test_split(
    data,
    sections,
    random_state=RANDOM_STATE,
)

In [14]:
# Word-count features (English stop words removed) feeding a Perceptron classifier.
vectorizer = CountVectorizer(stop_words='english', decode_error='ignore')
perceptron = Perceptron(random_state=RANDOM_STATE)
pet_model = Pipeline(steps=[('vec', vectorizer), ('pet', perceptron)])

# Train on the training split; the fitted pipeline is displayed as the cell output.
pet_model.fit(train_data, train_target)



Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        s..._jobs=1, penalty=None, random_state=28,
      shuffle=True, tol=None, verbose=0, warm_start=False))])

## Predictions and Metric Evaluation

In [15]:
# Score the fitted count-vector pipeline on the held-out test split.
pet_model.score(test_data, test_target)

0.6618412572505081

## Grid Search

In [12]:
from sklearn.model_selection import GridSearchCV

In [29]:
# Candidate n-gram ranges for the 'vec' step: unigrams, up to bigrams, up to trigrams.
grid_params = {'vec__ngram_range': [(1, 1), (1, 2), (1, 3)]}

In [30]:
# Rebuild the pipeline with a parallel Perceptron under the step name 'clf'.
# `vectorizer` is whatever an earlier cell bound it to (the CountVectorizer
# when the notebook is run top to bottom).
pet_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', Perceptron(random_state=RANDOM_STATE, n_jobs=-1)),
])

# 5-fold cross-validated search over `grid_params`, ranked by accuracy.
gs = GridSearchCV(
    pet_model,
    grid_params,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

In [31]:
# Run the grid search on the training split.
gs.fit(train_data, train_target)



GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        s...jobs=-1, penalty=None, random_state=28,
      shuffle=True, tol=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vec__ngram_range': [(1, 1), (1, 2), (1, 3)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [26]:
# Best parameter combination found by the search.
# NOTE(review): the saved output shows 'clf__penalty', which is not a key of the
# current param grid (only 'vec__ngram_range'), and this cell's execution count
# precedes the fit cell's — the output looks stale; re-run after gs.fit.
gs.best_params_

{'clf__penalty': 'l1'}

In [27]:
# Pipeline selected by the search.
# NOTE(review): this cell's execution count precedes the gs.fit cell's, so the
# saved output may come from an earlier search — re-run to confirm.
gs.best_estimator_

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        s...jobs=-1, penalty='l1', random_state=28,
      shuffle=True, tol=None, verbose=0, warm_start=False))])

In [28]:
# Score the search's selected pipeline on the held-out test split.
# NOTE(review): this cell's execution count precedes the gs.fit cell's, so the
# saved value may come from an earlier search — re-run to confirm.
gs.score(test_data, test_target)

0.6170244410292003

## With TfidfVectorizer

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
# Swap the raw word counts for TF-IDF weighted word features.
vectorizer = TfidfVectorizer(
    analyzer='word',
    input='content',
    decode_error='ignore',
    stop_words='english',
)

# Same two-step layout as before: vectorizer, then a parallel Perceptron.
pet_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', Perceptron(random_state=RANDOM_STATE, n_jobs=-1)),
])

In [34]:
# Fit the TF-IDF + Perceptron pipeline on the training split.
pet_model.fit(train_data, train_target)



Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...jobs=-1, penalty=None, random_state=28,
      shuffle=True, tol=None, verbose=0, warm_start=False))])

In [35]:
# Score the fitted TF-IDF pipeline on the held-out test split.
pet_model.score(test_data, test_target)

0.6701700461057954

In [36]:
# Candidate n-gram ranges for the 'vec' step, each starting at unigrams.
grid_params = {'vec__ngram_range': [(1, upper) for upper in (1, 2, 3)]}

In [37]:
# Fresh pipeline for the second search. `vectorizer` is whatever an earlier cell
# bound it to (the TfidfVectorizer when the notebook is run top to bottom).
pet_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', Perceptron(random_state=RANDOM_STATE, n_jobs=-1)),
])

# 5-fold cross-validated search over `grid_params`, ranked by accuracy.
gs2 = GridSearchCV(
    pet_model,
    grid_params,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

In [38]:
# Run the second grid search on the training split.
gs2.fit(train_data, train_target)



GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...jobs=-1, penalty=None, random_state=28,
      shuffle=True, tol=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vec__ngram_range': [(1, 1), (1, 2), (1, 3)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [39]:
# Best parameter combination found by the second search.
gs2.best_params_

{'vec__ngram_range': (1, 3)}

In [40]:
# Pipeline selected by the second search.
gs2.best_estimator_

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
   ...jobs=-1, penalty=None, random_state=28,
      shuffle=True, tol=None, verbose=0, warm_start=False))])

In [41]:
# Score the second search's selected pipeline on the held-out test split.
gs2.score(test_data, test_target)

0.7141440682167468

## Optimized Model with Classification Report

In [42]:
from sklearn.metrics import classification_report

In [46]:
# Use the pipeline selected by the second grid search as the final model.
pet_model = gs2.best_estimator_

In [47]:
# Predict a section for every test comment, then print per-section metrics.
predictions = pet_model.predict(test_data)
report = classification_report(test_target, predictions)
print(report)

                        precision    recall  f1-score   support

                Africa       0.53      0.49      0.51        76
              Americas       0.61      0.50      0.55       232
          Art & Design       0.53      0.50      0.52        80
          Asia Pacific       0.59      0.56      0.58       943
             Australia       0.37      0.43      0.40        23
           Book Review       0.26      0.13      0.18       180
                Canada       0.57      0.71      0.64       133
    College Basketball       0.34      0.56      0.43        18
               Cycling       0.51      0.75      0.61        28
              DealBook       0.50      0.29      0.37        73
                   Eat       0.74      0.69      0.71       119
               Economy       0.56      0.49      0.52       294
                Europe       0.54      0.46      0.50       733
                Family       0.65      0.59      0.62       327
                  Golf       0.53      

  'recall', 'true', average, warn_for)


## Save Model

In [48]:
from joblib import dump, load

# Write the final pipeline to disk; the cell output is the list of files written.
dump(value=pet_model, filename='Models/perceptron_small.joblib')

['Models/perceptron_small.joblib']

## Save Notebook

In [49]:
import dill

# Serialize the current interpreter session to a file under Notebook_Saves/.
dill.dump_session(filename='Notebook_Saves/Perceptron_Exploration.db')