## Setup

In [1]:
reset -fs

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [3]:
# Single seed reused by train_test_split and every LinearSVC below so runs are repeatable.
RANDOM_STATE = 28

## Load Data and Filter

In [4]:
# Location of the comments CSV (relative path, resolved against the working directory)
comments = '../Data/CommentsApril2018.csv'

In [5]:
# Read the comments CSV into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass rather than chunk by chunk, which keeps dtypes consistent and
# avoids the mixed-type DtypeWarning that chunked inference can raise.
comm = pd.read_csv(comments, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# List the columns available in the loaded comments frame.
comm.columns

Index(['approveDate', 'articleID', 'articleWordCount', 'commentBody',
       'commentID', 'commentSequence', 'commentTitle', 'commentType',
       'createDate', 'depth', 'editorsSelection', 'inReplyTo', 'newDesk',
       'parentID', 'parentUserDisplayName', 'permID', 'picURL', 'printPage',
       'recommendations', 'recommendedFlag', 'replyCount', 'reportAbuseFlag',
       'sectionName', 'sharing', 'status', 'timespeople', 'trusted',
       'typeOfMaterial', 'updateDate', 'userDisplayName', 'userID',
       'userLocation', 'userTitle', 'userURL'],
      dtype='object')

In [7]:
# Keep only rows whose sectionName is present and not "Unknown" and whose
# commentType is "comment"; the comment text becomes the feature and the
# section name becomes the label.
filt = (
    comm.sectionName.notna()
    & comm.sectionName.ne("Unknown")
    & comm.commentType.eq("comment")
)
data = comm.loc[filt, "commentBody"]
sections = comm.loc[filt, "sectionName"]

## Train/Test Split and Training

In [8]:
# Hold out a test split (no test_size given, so scikit-learn's default
# proportions apply) using the shared seed.
train_data, test_data, train_target, test_target = train_test_split(
    data,
    sections,
    random_state=RANDOM_STATE,
)

In [9]:
# Baseline pipeline: bag-of-words counts (English stop words removed,
# undecodable bytes ignored) feeding a seeded linear SVM.
vectorizer = CountVectorizer(stop_words='english', decode_error='ignore')
lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', LinearSVC(random_state=RANDOM_STATE)),
])

In [10]:
# Fit the baseline count-vector pipeline on the training split.
lsvc_model.fit(train_data, train_target)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        s... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))])

## Predictions and Metric Evaluation

In [11]:
# Evaluate the fitted baseline pipeline on the held-out test split.
lsvc_model.score(test_data, test_target)

0.70472460463041

## Grid Search

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Search space: n-gram range of the 'vec' step crossed with the loss of the 'clf' step.
grid_params = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__loss': ['hinge', 'squared_hinge'],
}

In [14]:
# Rebuild an unfitted count-vector pipeline and wrap it in a 5-fold,
# accuracy-scored grid search over grid_params (n_jobs=-1 for parallel fits).
lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', LinearSVC(random_state=RANDOM_STATE)),
])
gs = GridSearchCV(
    lsvc_model,
    grid_params,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

In [15]:
# Run the grid search over the count-vector pipeline using the training split only.
gs.fit(train_data, train_target)



GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        s... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vec__ngram_range': [(1, 1), (1, 2), (1, 3)], 'clf__loss': ['hinge', 'squared_hinge']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [16]:
# Parameter combination selected by the count-vector grid search.
gs.best_params_

{'clf__loss': 'squared_hinge', 'vec__ngram_range': (1, 2)}

In [17]:
# Pipeline configured with the selected parameters.
gs.best_estimator_

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        s... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))])

In [18]:
# Score the grid search's best pipeline on the held-out test split (scoring='accuracy').
gs.score(test_data, test_target)

0.7359079867135987

## With TfidfVectorizer

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# Swap the count features for TF-IDF weights; the classifier step is unchanged.
vectorizer = TfidfVectorizer(
    decode_error='ignore',
    stop_words='english',
    analyzer='word',
    input='content',
)

lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', LinearSVC(random_state=RANDOM_STATE)),
])

In [21]:
# Fit the TF-IDF pipeline on the training split.
lsvc_model.fit(train_data, train_target)

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))])

In [23]:
# Evaluate the fitted TF-IDF pipeline on the held-out test split.
lsvc_model.score(test_data, test_target)

0.7550443706311041

In [24]:
# Search space for the TF-IDF pipeline: n-gram range of 'vec' crossed with the loss of 'clf'.
grid_params = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__loss': ['hinge', 'squared_hinge'],
}

In [25]:
# Rebuild an unfitted TF-IDF pipeline and wrap it in a 5-fold,
# accuracy-scored grid search over grid_params (n_jobs=-1 for parallel fits).
lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', LinearSVC(random_state=RANDOM_STATE)),
])
gs2 = GridSearchCV(
    lsvc_model,
    grid_params,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

In [26]:
# Run the grid search over the TF-IDF pipeline using the training split only.
gs2.fit(train_data, train_target)



GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vec__ngram_range': [(1, 1), (1, 2), (1, 3)], 'clf__loss': ['hinge', 'squared_hinge']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [27]:
# Parameter combination selected by the TF-IDF grid search.
gs2.best_params_

{'clf__loss': 'hinge', 'vec__ngram_range': (1, 2)}

In [28]:
# TF-IDF pipeline configured with the selected parameters.
gs2.best_estimator_

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
   ...nge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=28, tol=0.0001, verbose=0))])

In [29]:
# Score the TF-IDF grid search's best pipeline on the held-out test split (scoring='accuracy').
gs2.score(test_data, test_target)

0.769718903376134

## Optimized Model with Classification Report

In [30]:
from sklearn.metrics import classification_report

In [31]:
# Take the pipeline selected by gs2 (TF-IDF features + LinearSVC with the best
# cross-validated parameters) as the final model. No separate vectorizer is
# constructed here: the selected pipeline carries its own 'vec' step, so a
# freshly built TfidfVectorizer would never be used.
lsvc_model = gs2.best_estimator_

In [32]:
# Fit the selected pipeline on the training split.
# NOTE(review): gs2 reports refit=True, so best_estimator_ is presumably
# already fitted on this same data — confirm whether this extra fit is needed.
lsvc_model.fit(train_data, train_target)

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
   ...nge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=28, tol=0.0001, verbose=0))])

In [33]:
# Predict a section for each held-out comment and print the per-class report.
predictions = lsvc_model.predict(test_data)
print(classification_report(y_true=test_target, y_pred=predictions))

                        precision    recall  f1-score   support

                Africa       0.95      0.47      0.63        76
              Americas       0.84      0.42      0.56       232
          Art & Design       0.93      0.34      0.50        80
          Asia Pacific       0.70      0.57      0.63       943
             Australia       1.00      0.39      0.56        23
           Book Review       0.83      0.06      0.10       180
                Canada       0.79      0.63      0.70       133
    College Basketball       1.00      0.39      0.56        18
               Cycling       0.95      0.64      0.77        28
              DealBook       0.62      0.29      0.39        73
                   Eat       0.93      0.71      0.80       119
               Economy       0.73      0.49      0.58       294
                Europe       0.72      0.41      0.52       733
                Family       0.76      0.61      0.67       327
                  Golf       1.00      

  'precision', 'predicted', average, warn_for)


## Save Model

In [34]:
from joblib import dump, load
# Serialize the final pipeline to disk; dump returns the list of written
# filenames, which is what the cell displays.
dump(value=lsvc_model, filename='../Models/svm_small.joblib')

['../Models/svm_small.joblib']

## Save Notebook

In [35]:
import dill
# Save the current interpreter session to disk via dill.
dill.dump_session('../Notebook_Saves/LinearSVM_Exploration.db')