## Setup

In [1]:
# Start from a clean namespace: -f skips the confirmation prompt, -s is a
# "soft" reset that clears variables but keeps the input history.
# The explicit % prefix means this does not depend on IPython's automagic,
# which is skipped when automagic is off or a variable named `reset` exists.
%reset -fs

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import LinearSVC

In [3]:
# Single seed passed to train_test_split and to every LinearSVC in this
# notebook so the split and the fits are repeatable.
RANDOM_STATE = 28

## Load Data

In [4]:
# Location of the comments CSV, relative to the notebook's working directory.
comments = '../Data/combined_everything.csv'

In [5]:
# Load the raw comments table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference can leave mixed types inside a
# single column and emits a DtypeWarning when it does.
comm = pd.read_csv(comments, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Preview the first rows to sanity-check the load.
comm.head()

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1517529462,5a7258e410f40f00018bed7d,835.0,The snake-filled heads comment made me think o...,25791250.0,25791250.0,<br/>,comment,1517508000.0,1.0,...,approved,1.0,0.0,Op-Ed,1517529462,Jennie,79172841.0,WA,,
1,1517529428,5a7258e410f40f00018bed7d,835.0,She-devil reporting for duty!,25795675.0,25795675.0,<br/>,comment,1517527000.0,1.0,...,approved,1.0,0.0,Op-Ed,1517529428,Nice White Lady,66376882.0,Seattle,,
2,1517529427,5a7258e410f40f00018bed7d,835.0,XX is the new mark of the devil.,25792078.0,25792078.0,<br/>,comment,1517511000.0,1.0,...,approved,1.0,0.0,Op-Ed,1517529427,Stan Sutton,27924638.0,"Westchester County, NY",,
3,1517529425,5a7258e410f40f00018bed7d,835.0,"""Courtland Sykes"" should be writing for The On...",25791604.0,25791604.0,<br/>,comment,1517510000.0,1.0,...,approved,1.0,0.0,Op-Ed,1517529425,Phil Carson,31909107.0,Denver,,
4,1517529418,5a7258e410f40f00018bed7d,835.0,"I happen to descend for a few of them, because...",25795241.0,25795241.0,<br/>,comment,1517525000.0,1.0,...,approved,1.0,0.0,Op-Ed,1517529418,Abby,55271958.0,Tucson,,


In [7]:
# (rows, columns) of the raw frame.
comm.shape

(2176364, 34)

#### One-hot encoding features after removing 'Unknown' section names and rows that are not of type 'comment'

In [8]:
# Keep only rows whose commentType is "comment" and whose sectionName is not
# the "Unknown" placeholder.
# NOTE(review): `!=` evaluates to True for NaN, so rows with a missing
# sectionName (if any exist) pass this filter — confirm that is intended.
filt = (comm.sectionName != "Unknown") & (comm.commentType == "comment")

# reset_index already returns a new frame, so no separate .copy() is needed.
# drop=True discards the old row labels instead of inserting them as an extra
# 'index' column. The fresh 0..n-1 index matters: the one-hot frames built
# further down get a default index and are joined to this frame by pd.concat,
# which aligns on row labels.
comm_filt = comm[filt].reset_index(drop=True)

In [9]:
# transform and map newDesk categories 
newDesk_le = LabelEncoder()
newDesk_labels = newDesk_le.fit_transform(comm_filt['newDesk'])
comm_filt['newDesk_label'] = newDesk_labels

# encoding newDesk
newDesk_ohe = OneHotEncoder()
newDesk_feature_arr = newDesk_ohe.fit_transform(comm_filt[['newDesk_label']]).toarray()
newDesk_feature_labels = list(newDesk_le.classes_)
newDesk_features = pd.DataFrame(newDesk_feature_arr, columns=newDesk_feature_labels)

In [10]:
# transform and map typeOfMaterial categories 
material_le = LabelEncoder()
material_labels = material_le.fit_transform(comm_filt['typeOfMaterial'])
comm_filt['material_label'] = material_labels

# encoding typeOfMaterial
material_ohe = OneHotEncoder()
material_feature_arr = material_ohe.fit_transform(comm_filt[['material_label']]).toarray()
material_feature_labels = list(material_le.classes_)
material_features = pd.DataFrame(material_feature_arr, columns=material_feature_labels)

In [11]:
# subsetting comment data frame and concatenating with new features
comm_filt_sub = comm_filt.loc[:,['commentBody', 'sectionName']]
comm_df_ohe = pd.concat([comm_filt_sub, newDesk_features, material_features], axis=1)

In [12]:
# Model inputs: the newDesk indicator columns only. Target: the section name.
# NOTE(review): the typeOfMaterial indicators were concatenated into
# comm_df_ohe but are not selected here — confirm they are meant to be left
# out of the feature matrix.
data = comm_df_ohe[newDesk_feature_labels]
sections = comm_df_ohe['sectionName']

In [13]:
# Seeded random train/test partition (train_test_split's default split size).
(
    train_data_no_unknown,
    test_data_no_unknown,
    train_target_no_unknown,
    test_target_no_unknown,
) = train_test_split(data, sections, random_state=RANDOM_STATE)

# Cast both targets to str so the classifier sees one uniform label type.
# NOTE(review): a missing sectionName would become the literal label 'nan'
# here — confirm none remain after filtering.
train_target_no_unknown = train_target_no_unknown.astype(str)
test_target_no_unknown = test_target_no_unknown.astype(str)

In [14]:
# Baseline: LinearSVC with default hyper-parameters. fit() returns the
# estimator itself, so construction and fitting are chained.
lsvc_model = LinearSVC(random_state=RANDOM_STATE).fit(
    train_data_no_unknown, train_target_no_unknown
)

# Mean accuracy on the held-out split (the target was already cast to str
# when the split was made; the cast here leaves it unchanged).
lsvc_model.score(test_data_no_unknown, test_target_no_unknown.astype(str))

0.7233396723229959

## Grid Search

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
# Compare LinearSVC's two loss functions with 5-fold cross-validation on the
# training split, ranking candidates by accuracy.
grid_params = {'loss': ['hinge', 'squared_hinge']}

# Fresh, unfitted estimator for the search; this rebinds `lsvc_model`, so the
# fitted baseline from the previous section is no longer reachable by name.
lsvc_model = LinearSVC(random_state=RANDOM_STATE)

gs = GridSearchCV(
    lsvc_model,
    grid_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # -1: run the fits in parallel on all available cores
)
gs.fit(train_data_no_unknown, train_target_no_unknown)



GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'loss': ['hinge', 'squared_hinge']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [17]:
# Accuracy (the search's `scoring`) of the refit best estimator on the
# held-out split.
gs.score(test_data_no_unknown, test_target_no_unknown)

0.7233396723229959

<br>
<br>
<br>

## Save Model

In [18]:
# `load` is imported alongside `dump` but is not used in this cell.
from joblib import dump, load
# Persist only the refit best estimator from the grid search (not the search
# object itself); dump returns the list of file names it wrote.
dump(gs.best_estimator_, '../Models/ohe_linearsvm_full.joblib') 

['../Models/ohe_linearsvm_full.joblib']

## Save Notebook

In [19]:
import dill
# Pickle the whole interactive session (every variable currently defined in
# the notebook) to a single file so it can be restored later without
# re-running the cells above. Only restore session files from a trusted
# source: unpickling can execute arbitrary code.
dill.dump_session('../Notebook_Saves/LinearSVM_OHE_Everything.db')