## Setup

In [1]:
reset -fs

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [3]:
# Single seed passed to train_test_split and to the LinearSVC models so runs are repeatable.
RANDOM_STATE = 28

## Load Data and Filter

In [5]:
# Comments file: relative path to the combined comments CSV that is read with pd.read_csv.
comments = 'Data/combined_comments.csv'

In [6]:
# Load the comments CSV and discard the 'Unnamed: 0' column
# (presumably an index column written out when the CSV was saved -- confirm).
comm = (
    pd.read_csv(comments)
    .drop(columns='Unnamed: 0')
)

In [7]:
comm.columns

Index(['commentType', 'commentBody', 'sectionName'], dtype='object')

Column "userTitle" has both NaN and string data types. Replace NaN with "Unknown" to have one uniform data type in the column.

In [8]:
comm.head()

Unnamed: 0,commentType,commentBody,sectionName
0,comment,ANY anti Trump propaganda from Gaga and my TV ...,Pro Football
1,comment,"I'll not watch the SB, nor the grammys or osca...",Pro Football
2,comment,"NFL's going to do another ""in-your-face, Ameri...",Pro Football
3,comment,I'm continually amazed at the ill-placed crede...,Pro Football
4,comment,"Personally, I do not want to see any politics ...",Pro Football


In [9]:
# Keep rows whose commentType is "comment" and whose sectionName is not "Unknown";
# the comment text becomes the model input and the section name the label.
filt = comm["commentType"].eq("comment") & comm["sectionName"].ne("Unknown")
data = comm.loc[filt, "commentBody"]
sections = comm.loc[filt, "sectionName"]

## Train/Test Split and Training

In [10]:
# Hold out a test split (library-default proportions); seeded for repeatability.
train_data, test_data, train_target, test_target = train_test_split(
    data,
    sections,
    random_state=RANDOM_STATE,
)

In [11]:
# Baseline: raw token counts (English stop words removed, undecodable bytes
# ignored) feeding a linear SVM.
vectorizer = CountVectorizer(
    decode_error='ignore',
    stop_words='english',
)
lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', LinearSVC(random_state=RANDOM_STATE)),
])

In [12]:
# Fit the CountVectorizer + LinearSVC pipeline on the training split only.
lsvc_model.fit(train_data, train_target)



Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        s... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))])

## Predictions and Metric Evaluation

In [13]:
# Predict a section for every held-out comment and report plain accuracy.
predictions = lsvc_model.predict(test_data)
# accuracy_score expects (y_true, y_pred). The original line also had an
# unbalanced closing parenthesis, which made the cell a SyntaxError.
accuracy_score(test_target, predictions)

0.7792628171661272


## Grid Search

In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
# Search space: unigrams up to trigrams for the vectorizer step ('vec') and
# both LinearSVC loss functions for the classifier step ('clf').
grid_params = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__loss': ['hinge', 'squared_hinge'],
}

In [16]:
# Fresh pipeline (reusing the current `vectorizer`) wrapped in a 5-fold,
# accuracy-scored grid search over `grid_params`.
lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', LinearSVC(random_state=RANDOM_STATE)),
])
gs = GridSearchCV(
    estimator=lsvc_model,
    param_grid=grid_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

In [17]:
# Run the grid search on the training split: 3 n-gram ranges x 2 losses, 5 folds each.
gs.fit(train_data, train_target)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        s... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vec__ngram_range': [(1, 1), (1, 2), (1, 3)], 'clf__loss': ['hinge', 'squared_hinge']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [18]:
gs.best_params_

{'clf__loss': 'squared_hinge', 'vec__ngram_range': (1, 2)}

In [19]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        s... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))])

In [20]:
# Evaluate the search on the held-out test split; `gs` was configured with scoring='accuracy'.
gs.score(test_data, test_target)

0.8038529811005534


## With TfidfVectorizer

In [21]:
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.porter import PorterStemmer

In [22]:
def tokenize(text):
    """
    Split `text` into a list of lowercase word tokens (duplicates kept).

    Punctuation, digits, carriage returns, tabs and newlines are each
    replaced by a space, and the result is split on single spaces. Pieces
    of two characters or fewer are dropped, the rest are lowercased, and
    anything found in ENGLISH_STOP_WORDS is discarded.
    """
    strip_pattern = '[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]'
    # Substitute a space (not the empty string) so neighbouring words don't fuse.
    spaced = re.sub(strip_pattern, " ", text)

    tokens = []
    for piece in spaced.split(" "):
        # The length check runs on the original-case piece, before lowercasing.
        if len(piece) <= 2:
            continue
        lowered = piece.lower()
        if lowered in ENGLISH_STOP_WORDS:
            continue
        tokens.append(lowered)
    return tokens


def stemwords(words):
    """
    Return a new list holding the Porter stem of every token in `words`,
    in the same order as the input.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, words))


def tokenizer(text):
    """Tokenize `text` with tokenize() and stem the resulting tokens with stemwords()."""
    tokens = tokenize(text)
    return stemwords(tokens)

In [23]:
# Same linear-SVM pipeline as before, but with TF-IDF weighted features.
# The custom stemming `tokenizer` is deliberately left switched off here.
vectorizer = TfidfVectorizer(
    input='content',
    analyzer='word',
    # tokenizer=tokenizer,
    stop_words='english',
    decode_error='ignore',
)

lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', LinearSVC(random_state=RANDOM_STATE)),
])

In [24]:
# Fit the TfidfVectorizer + LinearSVC pipeline on the training split only.
lsvc_model.fit(train_data, train_target)

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))])

In [25]:
# Score the fitted TF-IDF pipeline on the held-out test split.
lsvc_model.score(test_data, test_target)

0.8159131251957815


## Grid Search with TfidfVectorizer

In [27]:
# Search space for the TF-IDF pipeline: n-gram range of the 'vec' step and
# loss function of the 'clf' step.
grid_params = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__loss': ['hinge', 'squared_hinge'],
}

In [28]:
# Fresh TF-IDF pipeline (reusing the current `vectorizer`) wrapped in a 5-fold,
# accuracy-scored grid search over `grid_params`.
lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', LinearSVC(random_state=RANDOM_STATE)),
])
gs2 = GridSearchCV(
    estimator=lsvc_model,
    param_grid=grid_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

In [29]:
# Run the TF-IDF grid search on the training split: 3 n-gram ranges x 2 losses, 5 folds each.
gs2.fit(train_data, train_target)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=28, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vec__ngram_range': [(1, 1), (1, 2), (1, 3)], 'clf__loss': ['hinge', 'squared_hinge']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [30]:
gs2.best_params_

{'clf__loss': 'hinge', 'vec__ngram_range': (1, 2)}

In [31]:
gs2.best_estimator_

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
 ...nge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=28, tol=0.0001, verbose=0))])

In [37]:
# Evaluate the TF-IDF search on the held-out test split; `gs2` was configured with scoring='accuracy'.
gs2.score(test_data, test_target)

0.8284431450349796


## Optimized Model with Classification Report

In [44]:
from sklearn.metrics import classification_report

In [47]:
# NOTE(review): this vectorizer is built with the winning ngram_range but is not
# wired into the pipeline assigned below -- `lsvc_model` is taken straight from
# the grid search. Confirm whether a later cell relies on `vectorizer`.
vectorizer = TfidfVectorizer(
    input='content',
    analyzer='word',
    # tokenizer=tokenizer,
    stop_words='english',
    decode_error='ignore',
    ngram_range=(1, 2),
)

# Reuse the best pipeline found by the TF-IDF grid search.
lsvc_model = gs2.best_estimator_

In [48]:
# NOTE(review): `lsvc_model` is gs2.best_estimator_, and the GridSearchCV repr shows
# refit=True, so it was already fitted on this same training split by gs2.fit;
# this call fits it again.
lsvc_model.fit(train_data, train_target)

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
 ...nge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=28, tol=0.0001, verbose=0))])

In [49]:
# Predict on the held-out split, then print overall accuracy followed by the
# per-section precision / recall / F1 table.
predictions = lsvc_model.predict(test_data)
print(accuracy_score(y_true=test_target, y_pred=predictions))
print(classification_report(y_true=test_target, y_pred=predictions))

0.8284431450349796
                            precision    recall  f1-score   support

401(k)'s and Similar Plans       0.00      0.00      0.00         5
                    Africa       1.00      0.29      0.44        21
                  Americas       0.00      0.00      0.00         8
              Asia Pacific       0.70      0.57      0.63       339
                  Baseball       0.89      0.88      0.88       156
               Book Review       0.38      0.18      0.24        28
        College Basketball       0.96      0.47      0.63        53
                  DealBook       0.68      0.29      0.41       212
                       Eat       0.92      0.84      0.88        56
                   Economy       0.67      0.07      0.13        27
            Education Life       0.89      0.28      0.42        29
     Energy & Environment        0.70      0.58      0.64        24
                    Europe       0.83      0.68      0.75       355
                    Family  