## Setup

In [1]:
# Start from a clean namespace: -f skips the confirmation prompt, -s is a soft
# reset that keeps history. Written with the explicit % prefix so the cell does
# not depend on IPython's automagic setting.
%reset -fs

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Single seed passed to the train/test split and to every classifier in this
# notebook so that repeated runs give the same results.
RANDOM_STATE = 28

## Load Data and Filter

In [4]:
# Path (relative to the notebook's working directory) of the comments CSV
# that is loaded with pandas below.
comments = '../Data/CommentsApril2018.csv'

In [5]:
# Load the raw comments table into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the truncated stderr recorded under this cell looks like the
# tail of a pandas DtypeWarning — confirm it disappears on re-run.
comm = pd.read_csv(comments, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# List the column names to see which fields the comments table provides.
comm.columns

Index(['approveDate', 'articleID', 'articleWordCount', 'commentBody',
       'commentID', 'commentSequence', 'commentTitle', 'commentType',
       'createDate', 'depth', 'editorsSelection', 'inReplyTo', 'newDesk',
       'parentID', 'parentUserDisplayName', 'permID', 'picURL', 'printPage',
       'recommendations', 'recommendedFlag', 'replyCount', 'reportAbuseFlag',
       'sectionName', 'sharing', 'status', 'timespeople', 'trusted',
       'typeOfMaterial', 'updateDate', 'userDisplayName', 'userID',
       'userLocation', 'userTitle', 'userURL'],
      dtype='object')

In [7]:
# Keep only rows whose commentType is "comment" and whose sectionName is
# neither missing nor "Unknown"; the comment text becomes the model input and
# the section name becomes the label.
filt = (
    comm['sectionName'].notna()
    & comm['sectionName'].ne("Unknown")
    & comm['commentType'].eq("comment")
)
data = comm.loc[filt, 'commentBody']
sections = comm.loc[filt, 'sectionName']

## Train/Test Split and Training

In [8]:
# Split comments and their section labels into training and held-out parts
# (library-default test size); seeded so the split is repeatable.
(train_data, test_data,
 train_target, test_target) = train_test_split(
    data,
    sections,
    random_state=RANDOM_STATE,
)

In [9]:
# Bag-of-words token counts (English stop words dropped, undecodable bytes
# ignored) feeding a seeded decision tree, chained as one pipeline.
vectorizer = CountVectorizer(stop_words='english', decode_error='ignore')
dt_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', DecisionTreeClassifier(random_state=RANDOM_STATE)),
])

In [10]:
# Fit the vectorizer and the tree on the training split in one call.
dt_model.fit(train_data, train_target)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        s...        min_weight_fraction_leaf=0.0, presort=False, random_state=28,
            splitter='best'))])

## Predictions and Metric Evaluation

In [11]:
# Score the fitted pipeline on the held-out split (for scikit-learn
# classifiers, score() reports mean accuracy).
dt_model.score(test_data, test_target)

0.6114223390015369

## Grid Search

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Search space: n-gram span for the 'vec' step and split criterion for the
# 'clf' step (keys follow the pipeline's <step>__<param> naming).
grid_params = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__criterion': ['gini', 'entropy'],
}

In [14]:
# 5-fold cross-validated accuracy search over grid_params for the pipeline;
# n_jobs=-1 asks scikit-learn to use every available core.
gs = GridSearchCV(dt_model, grid_params, cv=5, scoring='accuracy', n_jobs=-1)

In [15]:
# Run the grid search on the training split.
# NOTE(review): the output recorded for this cell is a KeyboardInterrupt, so
# this run was stopped before it finished.
gs.fit(train_data, train_target)



KeyboardInterrupt: 

In [None]:
# Show the parameter combination the search selected.
gs.best_params_

In [None]:
# Show the pipeline configured with the selected parameters.
gs.best_estimator_

In [None]:
# Evaluate the search result on the held-out split (scoring was set to 'accuracy').
gs.score(test_data, test_target)

## With TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Same decision-tree pipeline as before, but with TF-IDF weighted word
# features in place of raw counts. Rebinds `vectorizer` and `dt_model`.
vectorizer = TfidfVectorizer(
    input='content',
    analyzer='word',
    stop_words='english',
    decode_error='ignore',
)

dt_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', DecisionTreeClassifier(random_state=RANDOM_STATE)),
])

In [None]:
# Fit the TF-IDF + decision-tree pipeline on the training split.
dt_model.fit(train_data, train_target)

In [None]:
# Predict a section for every held-out comment and report plain accuracy.
# accuracy_score is documented as (y_true, y_pred), so the true labels go first.
predicted = dt_model.predict(test_data)
accuracy_score(test_target, predicted)

In [None]:
# Search space for the TF-IDF pipeline: n-gram span for the 'vec' step and
# split criterion for the 'clf' step.
grid_params = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__criterion': ['gini', 'entropy'],
}

In [None]:
# 5-fold cross-validated accuracy search over grid_params for the current
# dt_model pipeline; n_jobs=-1 asks scikit-learn to use every available core.
gs2 = GridSearchCV(dt_model, grid_params, cv=5, scoring='accuracy', n_jobs=-1)

In [None]:
# Run the second grid search on the training split.
gs2.fit(train_data, train_target)

In [None]:
# Show the parameter combination the second search selected.
gs2.best_params_

In [None]:
# Show the pipeline configured with the selected parameters.
gs2.best_estimator_

In [None]:
# Evaluate the second search result on the held-out split (scoring was set to 'accuracy').
gs2.score(test_data, test_target)

## Optimized Model with Classification Report

In [None]:
from sklearn.metrics import classification_report

In [None]:
# TF-IDF word features (English stop words dropped, undecodable bytes ignored)
# feeding a linear support-vector classifier trained with hinge loss and the
# notebook-wide seed. LinearSVC comes from sklearn.svm and must be imported
# with the other scikit-learn imports for this cell to run on a fresh kernel.
vectorizer = TfidfVectorizer(
    input='content',
    analyzer='word',
    stop_words='english',
    decode_error='ignore',
)

lsvc_model = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', LinearSVC(random_state=RANDOM_STATE, loss='hinge')),
])

In [None]:
# Fit the TF-IDF + LinearSVC pipeline on the training split.
lsvc_model.fit(train_data, train_target)

In [None]:
# Label the held-out comments with the fitted pipeline, then print the
# per-class metrics table produced by classification_report.
predictions = lsvc_model.predict(test_data)
print(classification_report(y_true=test_target, y_pred=predictions))