## Setup

In [1]:
# Clear the interactive namespace before anything else runs so the notebook
# starts from a clean state. Relies on IPython automagic resolving `reset`.
reset -fs

In [2]:
import os
import re
import string

import numpy as np
import pandas as pd

from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [3]:
# Single seed passed to every train_test_split and LinearSVC in this notebook
# so the splits and fitted models are reproducible between runs.
RANDOM_STATE = 28

In [4]:
def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text. Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3, strip digits.

    Punctuation, digits and CR/TAB/LF are first replaced by a space; the
    result is then split on runs of *any* whitespace, so separators such as
    form feeds or non-breaking spaces cannot glue two words into one token.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lowercased tokens in order of appearance (duplicates kept), each
        longer than two characters and not in ENGLISH_STOP_WORDS.
    """
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    # Substitute a space (not the empty string) so neighbouring words stay apart.
    nopunct = regex.sub(" ", text)
    # The length filter is applied before lowercasing, as in the original order
    # of operations; it drops a, an, to, at, be, ...
    lowered = (w.lower() for w in nopunct.split() if len(w) > 2)
    return [w for w in lowered if w not in ENGLISH_STOP_WORDS]


def stemwords(words):
    """
    Stem every token in `words` with a PorterStemmer.

    A fresh stemmer is created on each call. The input list is not modified;
    a new list holding one stem per input token, in the same order, is
    returned.
    """
    stem = PorterStemmer().stem
    return list(map(stem, words))


def tokenizer(text):
    """Tokenize `text` with tokenize() and return the stemmed tokens from stemwords()."""
    tokens = tokenize(text)
    return stemwords(tokens)

## Loading Data 

In [5]:
# Location of the NYT comments CSV. Set the NYT_COMMENTS_CSV environment
# variable to point at your own copy (e.g. 'Data/CommentsApril2018.csv');
# the fallback is the path this notebook was originally run with.
comments = os.environ.get(
    'NYT_COMMENTS_CSV',
    '/Users/Nina/Documents/classes/msds621/nina_nyt/nyt-comments/CommentsApril2018.csv',
)
# Read in File
comm = pd.read_csv(comments)

  interactivity=interactivity, compiler=compiler, result=result)


Column "userTitle" contains both NaN and string values. Replace NaN with "Unknown" so that the column has one uniform data type.

In [6]:
# Fill missing user titles. The column is addressed by name instead of by
# position (32) so the cell keeps working if the column order changes, and
# fillna replaces the NaN-with-regex replace() call.
comm['userTitle'] = comm['userTitle'].fillna('Unknown')
comm.head()

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1524594282,5adf6684068401528a2aa69b,781.0,How could the league possibly refuse this offe...,26853969.0,26853969.0,<br/>,comment,1524594011,1.0,...,approved,1,0,News,1524594282,Christopher Rillo,46566740.0,San Francisco,Unknown,
1,1524594252,5adf6684068401528a2aa69b,781.0,"So then the execs can be like ""yeah...we will ...",26853699.0,26853699.0,<br/>,comment,1524593146,1.0,...,approved,1,0,News,1524594252,Matt Brand,64324866.0,"Williamsburg, Brooklyn",Unknown,
2,1524594250,5adf6684068401528a2aa69b,781.0,I would not want to play chess against these c...,26853677.0,26853677.0,<br/>,comment,1524593032,1.0,...,approved,1,0,News,1524594250,Joseph,78105093.0,"Fayetteville, AR",Unknown,
3,1524593431,5adf6684068401528a2aa69b,781.0,Could the cheerleaders join the Actors' Equity...,26853784.0,26853784.0,<br/>,comment,1524593426,1.0,...,approved,0,0,News,1524593431,Stephen,81939618.0,"Phoenix, AZ",Unknown,
4,1524595048,5adf653f068401528a2aa697,656.0,Seeking conclusions which support preconceived...,26854236.0,26854236.0,<br/>,comment,1524595043,1.0,...,approved,1,0,News,1524595048,Paul Zorsky,58642997.0,Texas,Unknown,


## Feature Engineering, Training, and Testing

Transforming categorical variables 'newDesk' and 'typeOfMaterial' using one-hot encoding

#### One-Hot Encoding features after removing 'Unknown' Section Names 

In [7]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Keep only rows whose sectionName is known and whose commentType is "comment".
filt = (comm['sectionName'] != "Unknown") & (comm['commentType'] == "comment")
# Work on an independent copy; reset_index() renumbers the rows from 0 and
# keeps the previous row labels in an 'index' column.
comm_filt = comm.loc[filt].copy().reset_index()

In [8]:
# transform and map newDesk categories
# LabelEncoder assigns an integer code to each distinct newDesk value;
# classes_ lists the original values in code order.
newDesk_le = LabelEncoder()
newDesk_labels = newDesk_le.fit_transform(comm_filt['newDesk'])
comm_filt['newDesk_label'] = newDesk_labels

# encoding newDesk
# categories='auto' derives the one-hot columns from the distinct codes present
# in the data and avoids the FutureWarning about integer-input handling.
newDesk_ohe = OneHotEncoder(categories='auto')
newDesk_feature_arr = newDesk_ohe.fit_transform(comm_filt[['newDesk_label']]).toarray()
newDesk_feature_labels = list(newDesk_le.classes_)
# Reuse comm_filt's index so the later column-wise concat aligns row for row.
newDesk_features = pd.DataFrame(
    newDesk_feature_arr,
    columns=newDesk_feature_labels,
    index=comm_filt.index,
)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [9]:
# transform and map typeOfMaterial categories
# LabelEncoder assigns an integer code to each distinct typeOfMaterial value;
# classes_ lists the original values in code order.
material_le = LabelEncoder()
material_labels = material_le.fit_transform(comm_filt['typeOfMaterial'])
comm_filt['material_label'] = material_labels

# encoding typeOfMaterial
# categories='auto' derives the one-hot columns from the distinct codes present
# in the data and avoids the FutureWarning about integer-input handling.
material_ohe = OneHotEncoder(categories='auto')
material_feature_arr = material_ohe.fit_transform(comm_filt[['material_label']]).toarray()
material_feature_labels = list(material_le.classes_)
# Reuse comm_filt's index so the later column-wise concat aligns row for row.
material_features = pd.DataFrame(
    material_feature_arr,
    columns=material_feature_labels,
    index=comm_filt.index,
)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [10]:
# Keep the identifier columns, the comment text and the target (sectionName),
# then append the one-hot newDesk and typeOfMaterial columns side by side.
comm_filt_sub = comm_filt[['articleID', 'commentID', 'commentBody', 'sectionName']]
comm_df_ohe = pd.concat(
    [comm_filt_sub, newDesk_features, material_features],
    axis='columns',
)

In [11]:
# Features: only the one-hot newDesk columns are selected here (the
# typeOfMaterial columns are not used). Target: the article's section name.
# NOTE(review): the selection is by column label, so a typeOfMaterial category
# named the same as a newDesk category would be pulled in as well - confirm the
# two label sets are disjoint.
data = comm_df_ohe[newDesk_feature_labels]
sections = comm_df_ohe['sectionName']

In [12]:
# Seeded train/test split of the filtered data ('Unknown' sections removed),
# using train_test_split's default split proportions.
(train_data_no_unknown,
 test_data_no_unknown,
 train_target_no_unknown,
 test_target_no_unknown) = train_test_split(data, sections, random_state=RANDOM_STATE)

In [13]:
# Baseline: linear SVM predicting sectionName from the one-hot newDesk
# features, with 'Unknown' sections removed.
lsvc_model = LinearSVC(random_state=RANDOM_STATE)
lsvc_model.fit(train_data_no_unknown, train_target_no_unknown)
predicted = lsvc_model.predict(test_data_no_unknown)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but the documented argument order is now respected.
accuracy_score(test_target_no_unknown, predicted)

0.79336671459025332

#### One-Hot Encoding features before removing 'Unknown' Section Names 

In [14]:
# transform and map newDesk categories
# Same encoding as for the filtered data, but fitted on the full frame, so
# rows with an 'Unknown' section are included. Adds a column to `comm` in place.
newDesk_le = LabelEncoder()
newDesk_labels = newDesk_le.fit_transform(comm['newDesk'])
comm['newDesk_label'] = newDesk_labels

# encoding newDesk
# categories='auto' derives the one-hot columns from the distinct codes present
# in the data and avoids the FutureWarning about integer-input handling.
newDesk_ohe = OneHotEncoder(categories='auto')
newDesk_feature_arr = newDesk_ohe.fit_transform(comm[['newDesk_label']]).toarray()
newDesk_feature_labels = list(newDesk_le.classes_)
# Reuse comm's index so the later column-wise concat aligns row for row.
newDesk_features = pd.DataFrame(
    newDesk_feature_arr,
    columns=newDesk_feature_labels,
    index=comm.index,
)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [15]:
# transform and map typeOfMaterial categories
# Same encoding as for the filtered data, but fitted on the full frame, so
# rows with an 'Unknown' section are included. Adds a column to `comm` in place.
material_le = LabelEncoder()
material_labels = material_le.fit_transform(comm['typeOfMaterial'])
comm['material_label'] = material_labels

# encoding typeOfMaterial
# categories='auto' derives the one-hot columns from the distinct codes present
# in the data and avoids the FutureWarning about integer-input handling.
material_ohe = OneHotEncoder(categories='auto')
material_feature_arr = material_ohe.fit_transform(comm[['material_label']]).toarray()
material_feature_labels = list(material_le.classes_)
# Reuse comm's index so the later column-wise concat aligns row for row.
material_features = pd.DataFrame(
    material_feature_arr,
    columns=material_feature_labels,
    index=comm.index,
)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [16]:
# Keep the identifier columns, the comment text and the target (sectionName)
# from the unfiltered frame, then append the one-hot columns side by side.
# This rebinds comm_df_ohe, which previously held the filtered variant.
comm_sub = comm[['articleID', 'commentID', 'commentBody', 'sectionName']]
comm_df_ohe = pd.concat(
    [comm_sub, newDesk_features, material_features],
    axis='columns',
)

In [17]:
# Features: only the one-hot newDesk columns are selected here (the
# typeOfMaterial columns are not used). Target: the article's section name.
# NOTE(review): the selection is by column label, so a typeOfMaterial category
# named the same as a newDesk category would be pulled in as well - confirm the
# two label sets are disjoint.
data = comm_df_ohe[newDesk_feature_labels]
sections = comm_df_ohe['sectionName']

In [19]:
# Seeded train/test split of the unfiltered data ('Unknown' sections kept),
# using train_test_split's default split proportions.
(train_data_w_unknown,
 test_data_w_unknown,
 train_target_w_unknown,
 test_target_w_unknown) = train_test_split(data, sections, random_state=RANDOM_STATE)

In [21]:
# Same baseline model, trained and evaluated on the data that still contains
# 'Unknown' section names.
lsvc_model = LinearSVC(random_state=RANDOM_STATE)
lsvc_model.fit(train_data_w_unknown, train_target_w_unknown)
predicted = lsvc_model.predict(test_data_w_unknown)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but the documented argument order is now respected.
accuracy_score(test_target_w_unknown, predicted)

0.79018888435928791

## Grid Search

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
# Grid-search setup: compare the two LinearSVC loss settings listed below,
# scored by accuracy under 5-fold cross-validation.
grid_params = dict(loss=['hinge', 'squared_hinge'])
# Fresh, unfitted estimator with the same seed as the baseline models.
lsvc_model = LinearSVC(random_state=RANDOM_STATE)
gs = GridSearchCV(estimator=lsvc_model,
                 param_grid=grid_params,
                 scoring='accuracy',
                 cv=5,
                 n_jobs=-1)  # -1: run the candidate fits in parallel on all available cores

In [24]:
# Run the grid search on the training split of the filtered data
# ('Unknown' section names removed).
gs.fit(train_data_no_unknown, train_target_no_unknown)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'loss': ['hinge', 'squared_hinge']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [25]:
# Parameter setting from grid_params that the search selected as best.
gs.best_params_

{'loss': 'squared_hinge'}

In [26]:
# The LinearSVC configured with the selected parameters.
gs.best_estimator_

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0)

In [28]:
# Score the grid search's selected model on the held-out test split of the
# filtered data (the search was configured with scoring='accuracy').
gs.score(test_data_no_unknown, test_target_no_unknown)

0.79336671459025332