## Setup

In [1]:
# Clear the interactive namespace without prompting (-f) using a soft reset (-s).
# The magic is invoked explicitly instead of as a bare `reset -fs` line: the bare
# form depends on IPython's automagic, which is only applied to single-line cells
# and can be switched off, so it silently turns into a NameError otherwise.
get_ipython().run_line_magic('reset', '-fs')

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [3]:
# Single seed shared by the train/test split and every LinearSVC below,
# so repeated runs of the notebook produce the same split and model.
RANDOM_STATE: int = 28

## Load Data and Filter

In [4]:
# Relative path of the CSV that holds the combined comments.
comments: str = "Data/combined_comments.csv"

In [5]:
# Load the comments CSV and discard the 'Unnamed: 0' column it carries.
comm = pd.read_csv(comments).drop(columns='Unnamed: 0')

In [6]:
# Show which columns are left in the loaded frame.
comm.columns

Index(['commentType', 'commentBody', 'sectionName'], dtype='object')

Column "userTitle" contains both NaN and string values. Replace NaN with "Unknown" so that the column has a single, uniform data type.

In [7]:
# Preview the first rows of the loaded comments.
comm.head()

Unnamed: 0,commentType,commentBody,sectionName
0,comment,The snake-filled heads comment made me think o...,Unknown
1,comment,She-devil reporting for duty!,Unknown
2,comment,XX is the new mark of the devil.,Unknown
3,comment,"""Courtland Sykes"" should be writing for The On...",Unknown
4,comment,"I happen to descend for a few of them, because...",Unknown


In [8]:
# Distinct section labels present in the data (these become the class labels).
comm["sectionName"].unique()

array(['Unknown', 'Politics', 'Television', 'Europe', 'Middle East',
       'Pro Football', 'Asia Pacific', 'Live', 'The Daily', 'Mind',
       'Art & Design', 'Wine, Beer & Cocktails', 'Americas',
       'Sunday Review', 'Economy', 'Family', 'Dance', 'Africa',
       'Energy & Environment ', 'DealBook', 'Book Review', 'Music',
       'Olympics', 'Move', 'Lesson Plans', 'Entrepreneurship', 'Baseball',
       'Media', 'Entertainment', 'Opinion | Politics', 'Weddings',
       'Real Estate', 'Eat', 'College Basketball', 'Australia',
       'Retirement', 'Neighborhoods', 'Personal Tech', 'Canada', 'Hockey',
       'Tennis', 'Cycling', 'Pro Basketball', 'Learning', 'Golf',
       'Soccer', "401(k)'s and Similar Plans", 'Rugby',
       'College Football', 'Paying for College', 'Insider Events',
       'Editorials', 'Fashion & Beauty', 'World Cup', nan, 'Automobiles',
       'Food', 'Art', 'Opinion | The World', 'Cricket', 'Room For Debate',
       'Education Life', 'Student Loans', 'Auto Rac

In [9]:
# Keep only rows that are of type "comment" and carry a usable section label:
# the label must be present (not NaN) and must not be the "Unknown" placeholder.
has_section = comm["sectionName"].notna() & (comm["sectionName"] != "Unknown")
is_comment = comm["commentType"] == "comment"
filt = has_section & is_comment

# Model inputs (comment text) and targets (section name) for the kept rows.
data = comm.loc[filt, "commentBody"]
sections = comm.loc[filt, "sectionName"]

## Train/Test Split and Training

In [10]:
# Split comment text and section labels into train and test partitions.
# Seeding the shuffle with RANDOM_STATE keeps the partitions identical across runs.
(train_data, test_data,
 train_target, test_target) = train_test_split(
    data,
    sections,
    random_state=RANDOM_STATE,
)

In [11]:
# Bag-of-words features (English stop words removed, undecodable bytes ignored)
# feeding a linear SVM classifier.
vectorizer = CountVectorizer(stop_words='english', decode_error='ignore')
lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', LinearSVC(random_state=RANDOM_STATE)),
])

In [12]:
# Fit the vectorizer and the classifier on the training partition.
lsvc_model.fit(X=train_data, y=train_target)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        s... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))])

## Predictions and Metric Evaluation

In [13]:
# Score the fitted pipeline on the held-out partition.
lsvc_model.score(X=test_data, y=test_target)

0.6897999957790175

<br>
<br>
<br>

## With TfidfVectorizer

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# TF-IDF weighted word features (English stop words removed, undecodable bytes
# ignored) feeding a linear SVM classifier.
vectorizer = TfidfVectorizer(
    input='content',
    analyzer='word',
    stop_words='english',
    decode_error='ignore',
)
lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', LinearSVC(random_state=RANDOM_STATE)),
])

In [16]:
# Fit the TF-IDF pipeline on the training partition.
lsvc_model.fit(X=train_data, y=train_target)

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))])

In [17]:
# Score the TF-IDF pipeline on the held-out partition.
lsvc_model.score(X=test_data, y=test_target)

0.7234060514819166

## Optimized Model 

In [18]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)), with English
# stop words removed, feeding a linear SVM that uses the squared hinge loss.
vectorizer = TfidfVectorizer(
    input='content',
    analyzer='word',
    stop_words='english',
    decode_error='ignore',
    ngram_range=(1, 2),
)
classifier = LinearSVC(loss='squared_hinge', random_state=RANDOM_STATE)
lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', classifier),
])

In [19]:
# Fit the unigram+bigram TF-IDF pipeline on the training partition.
lsvc_model.fit(X=train_data, y=train_target)

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
   ... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))])

In [20]:
# Score the unigram+bigram TF-IDF pipeline on the held-out partition.
lsvc_model.score(X=test_data, y=test_target)

0.7364559223902017

## Optimized Model with Classification Report

In [21]:
from sklearn.metrics import classification_report

In [22]:
# Predict a section for every held-out comment, then print the overall accuracy
# followed by the per-section precision / recall / F1 table.
predictions = lsvc_model.predict(test_data)
print(accuracy_score(y_true=test_target, y_pred=predictions))
print(classification_report(y_true=test_target, y_pred=predictions))

0.7364559223902017
                            precision    recall  f1-score   support

401(k)'s and Similar Plans       1.00      0.25      0.40        12
                    Africa       0.72      0.26      0.38       244
                  Americas       0.74      0.31      0.44      1267
                       Art       0.00      0.00      0.00        13
              Art & Design       0.86      0.64      0.73       860
              Asia Pacific       0.66      0.46      0.54      4510
                 Australia       0.75      0.17      0.28        89
               Auto Racing       1.00      0.71      0.83        17
               Automobiles       1.00      0.11      0.20         9
                  Baseball       0.87      0.78      0.82       562
               Book Review       0.74      0.16      0.26       459
                    Canada       0.78      0.36      0.50       373
        College Basketball       0.80      0.54      0.65       174
          College Football  

  'precision', 'predicted', average, warn_for)


<br>
<br>
<br>

## Save Model

In [23]:
from pathlib import Path

from joblib import dump, load

# Persist the fitted pipeline (vectorizer + classifier) so it can be reloaded
# later without retraining.
model_path = 'Models/svm_full.joblib'

# dump() does not create missing directories; on a fresh checkout without a
# Models/ folder it would fail only after all the training above has run.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

dump(lsvc_model, model_path)

['Models/svm_full.joblib']

## Save Notebook

In [24]:
from pathlib import Path

import dill

# Snapshot the whole interpreter session to disk so the notebook state can be
# restored later.
session_path = 'Notebook_Saves/LinearSVM_Total_Data.db'

# The target file is opened for writing directly, so make sure the folder
# exists first instead of failing at the very last cell of the notebook.
Path(session_path).parent.mkdir(parents=True, exist_ok=True)

dill.dump_session(session_path)