## Setup

In [1]:
reset -fs

In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
RANDOM_STATE = 28

## Load Data

In [20]:
comments = 'Data/combined_everything.csv'
# comments = '/Users/Nina/Downloads/combined_everything.csv'

In [21]:
comm = pd.read_csv(comments).drop('Unnamed: 0', axis=1)

Column "userTitle" has both NaN and string data types. Replace Nan with "Unknown" to have one uniform data type in the column

In [22]:
comm.head()

Unnamed: 0,commentType,commentBody,sectionName
0,comment,The snake-filled heads comment made me think o...,Unknown
1,comment,She-devil reporting for duty!,Unknown
2,comment,XX is the new mark of the devil.,Unknown
3,comment,"""Courtland Sykes"" should be writing for The On...",Unknown
4,comment,"I happen to descend for a few of them, because...",Unknown


In [23]:
comm.shape

(54657755, 3)

In [9]:
comm.iloc[:, 32] = comm.iloc[:, 32].replace(np.nan, 'Unknown', regex=True); comm.head()

IndexError: single positional indexer is out-of-bounds

#### One-Hot Encoding features after removing 'Unknown' Section Names 

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
filt = (comm.sectionName != "Unknown") & (comm.commentType == "comment")
comm_filt = comm[filt].copy().reset_index()

In [None]:
# transform and map newDesk categories 
newDesk_le = LabelEncoder()
newDesk_labels = newDesk_le.fit_transform(comm_filt['newDesk'])
comm_filt['newDesk_label'] = newDesk_labels

# encoding newDesk
newDesk_ohe = OneHotEncoder()
newDesk_feature_arr = newDesk_ohe.fit_transform(comm_filt[['newDesk_label']]).toarray()
newDesk_feature_labels = list(newDesk_le.classes_)
newDesk_features = pd.DataFrame(newDesk_feature_arr, columns=newDesk_feature_labels)

In [None]:
# transform and map typeOfMaterial categories 
material_le = LabelEncoder()
material_labels = material_le.fit_transform(comm_filt['typeOfMaterial'])
comm_filt['material_label'] = material_labels

# encoding typeOfMaterial
material_ohe = OneHotEncoder()
material_feature_arr = material_ohe.fit_transform(comm_filt[['material_label']]).toarray()
material_feature_labels = list(material_le.classes_)
material_features = pd.DataFrame(material_feature_arr, columns=material_feature_labels)

In [None]:
# subsetting comment data frame and concatenating with new features
comm_filt_sub = comm_filt.loc[:,['articleID','commentID', 'commentBody', 'sectionName']]
comm_df_ohe = pd.concat([comm_filt_sub, newDesk_features, material_features], axis=1)

In [None]:
data = comm_df_ohe.loc[:,newDesk_feature_labels]
sections = comm_df_ohe.sectionName

In [None]:
train_data_no_unknown, test_data_no_unknown, train_target_no_unknown, test_target_no_unknown = train_test_split(data, sections, random_state=RANDOM_STATE)

In [None]:
lsvc_model = LinearSVC(random_state=RANDOM_STATE)
lsvc_model.fit(train_data_no_unknown, train_target_no_unknown)
predicted = lsvc_model.predict(test_data_no_unknown)
accuracy_score(predicted, test_target_no_unknown)

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
grid_params = dict(loss=['hinge', 'squared_hinge'])
lsvc_model = LinearSVC(random_state=RANDOM_STATE)
gs = GridSearchCV(estimator=lsvc_model,
                 param_grid=grid_params,
                 scoring='accuracy',
                 cv=5,
                 n_jobs=-1)
gs.fit(train_data_no_unknown, train_target_no_unknown)

In [None]:
gs.score(test_data_no_unknown, test_target_no_unknown)

In [None]:
# from sklearn.metrics import classification_report