# SVM

This notebook was run on an Amazon SageMaker ml.c5.4xlarge instance.  

In [6]:
import pandas as pd
import string
import re
import string
import numpy as np
import datetime

import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.stem import LancasterStemmer 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [7]:
import pickle_functions as pf

In [8]:
import model_functions as mf

### Load and shuffle data

Read in test.csv and train.csv

In [10]:
train = pf.read_pickle(bucket_name='advancedml-koch-mathur-hinkson', filename='sub_train_df14_preprocessed')

In [12]:
drop_cols = ['split', 'cleaned_w_stopwords', 'cleaned_no_stem', 'cleaned_porter', 'cleaned_lancaster', 'bigrams_unstemmed',
       'perc_upper', 'num_exclam', 'num_words', 'perc_stopwords',
       'num_upper_words']

In [13]:
train = train.drop(drop_cols, axis = 1)

In [14]:
train.shape

(100000, 49)

In [15]:
train.columns

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str'],
      dtype='object')

Create a new column called "toxicity_category" in the train data frame categorizing comments as toxic ("1") or non-toxic ("0").

In [16]:
train['toxicity_category'] = train.target.apply(lambda x: 1 if x > 0.5 else 0)

Split train.csv into training (80%) and hold out sets (20%).

In [17]:
# https://stackoverflow.com/questions/24147278/how-do-i-create-test-and-train-samples-from-one-dataframe-with-pandas
msk = np.random.rand(len(train)) < 0.8
train_set = train[msk]
hold_out_set = train[~msk]

In [18]:
print(train_set.toxicity_category.value_counts())

0    74909
1     5332
Name: toxicity_category, dtype: int64


In [19]:
print(hold_out_set.toxicity_category.value_counts())

0    18435
1     1324
Name: toxicity_category, dtype: int64


In [20]:
print(train_set.toxicity_category.value_counts())

0    74909
1     5332
Name: toxicity_category, dtype: int64


In [21]:
toxic = train_set[train_set.toxicity_category == 1]
nontoxic = train_set[train_set.toxicity_category == 0]

In [22]:
train_set.shape, toxic.shape, nontoxic.shape

((80241, 50), (5332, 50), (74909, 50))

Reshaping the dataset to be include an equal number of toxic and nontoxic samples

In [23]:
quarter = len(toxic)

In [24]:
random_df = train_set.sample(quarter*4)

Create a data set of 25% toxic and 75% nontoxic comments, and shuffle the data such that you do not have a data set of grouped toxic and grouped nontoxic comments

In [25]:
prepared_25 = toxic.append(nontoxic.sample(len(toxic)*3))
prepared_25 = prepared_25.sample(frac=1).reset_index(drop=True)
print(prepared_25.toxicity_category.value_counts())

0    15996
1     5332
Name: toxicity_category, dtype: int64


### SVM - weighted 25% toxic and 75% nontoxic

In [26]:
classifier, output, fitted_vectorizer = mf.run_model(model_df=prepared_25, 
                                                     model_type="SVM", 
                                                     comments = "cleaned_no_stem_str", 
                                                     train_perc=0.95, 
                                                     target="toxicity_category", 
                                                     see_inside=False)

fitting model now


In [27]:
mf.get_metrics(output=output, detailed=True, should_print=True, round_to=3)

Overall Accuracy: 0.875234521575985
Overall Precision: 0.835
Overall Recall: 0.6254681647940075
Overall F1 Score: 0.715203426124197
ROC_AUC: 0.792

Target Accuracy: 0.6254681647940075
Target Precision: 1.0
Target Recall: 0.6254681647940075
Target F1 Score: 0.7695852534562212

Non-Target Accuracy: 0.9586983729662077
Non-Target Precision: 1.0
Non-Target Recall: 0.9586983729662077
Non-Target F1 Score: 0.9789137380191694

Strong Identity Accuracy: 0.45
Strong Identity Precision: 1.0
Strong Identity Recall: 0.45
Strong Identity F1 Score: 1.0

Obscenity Accuracy: 0.7272727272727273
Obscenity Precision: 1.0
Obscenity Recall: 0.7272727272727273
Obscenity F1 Score: 1.0

Insults Accuracy: 0.7128712871287128
Insults Precision: 1.0
Insults Recall: 0.7114427860696517
Insults F1 Score: 1.0

Threats Accuracy: 0.5
Threats Precision: 1.0
Threats Recall: 0.5
Threats F1 Score: 1.0



{'Overall': {'Accuracy': 0.875234521575985,
  'Precision': 0.835,
  'Recall': 0.6254681647940075,
  'F1': 0.715203426124197,
  'ROC_AUC': 0.792},
 'Target': {'Accuracy': 0.6254681647940075,
  'Precision': 1.0,
  'Recall': 0.6254681647940075,
  'F1': 0.7695852534562212},
 'Non-Target': {'Accuracy': 0.5, 'Precision': 1.0, 'Recall': 0.5, 'F1': 1.0}}

In [28]:
hold_out_results = mf.run_model_test(model_df=hold_out_set, 
                                     clf=classifier, 
                                     vectorizer=fitted_vectorizer, 
                                     comments="cleaned_no_stem_str", target="toxicity_category")

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str', 'toxicity_category', 'predicted', 'y_test'],
      dtype='object')


In [29]:
hold_out_results.to_csv("holdout_results", sep='|')

In [32]:
hold_out_metrics = mf.get_metrics(output=hold_out_results, detailed=True, should_print=True, round_to=3)

Overall Accuracy: 0.9436712384229972
Overall Precision: 0.5737246680642907
Overall Recall: 0.6200906344410876
Overall F1 Score: 0.5960072595281306
ROC_AUC: 0.794

Target Accuracy: 0.6200906344410876
Target Precision: 1.0
Target Recall: 0.6200906344410876
Target F1 Score: 0.7655011655011655

Non-Target Accuracy: 0.9669107675617032
Non-Target Precision: 1.0
Non-Target Recall: 0.9669107675617032
Non-Target F1 Score: 0.983177054605626

Strong Identity Accuracy: 0.5433070866141733
Strong Identity Precision: 0.9565217391304348
Strong Identity Recall: 0.5454545454545454
Strong Identity F1 Score: 1.0

Obscenity Accuracy: 0.7761194029850746
Obscenity Precision: 1.0
Obscenity Recall: 0.7761194029850746
Obscenity F1 Score: 1.0

Insults Accuracy: 0.6963562753036437
Insults Precision: 0.9896296296296296
Insults Recall: 0.6951092611862643
Insults F1 Score: 1.0

Threats Accuracy: 0.2413793103448276
Threats Precision: 1.0
Threats Recall: 0.21428571428571427
Threats F1 Score: 0.0



### SVM - 1/3 toxic, 2/3 nontoxic

In [36]:
prepared_33 = toxic.append(nontoxic.sample(len(toxic)*2))
prepared_33 = prepared_33.sample(frac=1).reset_index(drop=True)
print(prepared_33.toxicity_category.value_counts())

0    10664
1     5332
Name: toxicity_category, dtype: int64


In [37]:
classifier, output, fitted_vectorizer = mf.run_model(model_df=prepared_33, 
                                                     model_type="SVM", 
                                                     comments = "cleaned_no_stem_str", 
                                                     train_perc=0.95, 
                                                     target="toxicity_category", 
                                                     see_inside=False)

fitting model now


In [38]:
hold_out_results = mf.run_model_test(model_df=hold_out_set, 
                                     clf=classifier, 
                                     vectorizer=fitted_vectorizer, 
                                     comments="cleaned_no_stem_str", target="toxicity_category")

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str', 'toxicity_category', 'predicted', 'y_test'],
      dtype='object')


In [39]:
hold_out_metrics = mf.get_metrics(output=hold_out_results, detailed=True, should_print=True, round_to=3)

Overall Accuracy: 0.9248949845639961
Overall Precision: 0.45987963891675027
Overall Recall: 0.6925981873111783
Overall F1 Score: 0.5527426160337553
ROC_AUC: 0.817

Target Accuracy: 0.6925981873111783
Target Precision: 1.0
Target Recall: 0.6925981873111783
Target F1 Score: 0.8183846497099508

Non-Target Accuracy: 0.9415785191212368
Non-Target Precision: 1.0
Non-Target Recall: 0.9415785191212368
Non-Target F1 Score: 0.9699103176598776

Strong Identity Accuracy: 0.6535433070866141
Strong Identity Precision: 0.9425287356321839
Strong Identity Recall: 0.6776859504132231
Strong Identity F1 Score: 1.0

Obscenity Accuracy: 0.7611940298507462
Obscenity Precision: 1.0
Obscenity Recall: 0.7611940298507462
Obscenity F1 Score: 1.0

Insults Accuracy: 0.742914979757085
Insults Precision: 0.9849108367626886
Insults Recall: 0.7471383975026015
Insults F1 Score: 1.0

Threats Accuracy: 0.3793103448275862
Threats Precision: 1.0
Threats Recall: 0.35714285714285715
Threats F1 Score: 1.0

