# SVM

This notebook was run on an Amazon SageMaker ml.c5.4xlarge instance.  

In [1]:
import pandas as pd
import string
import re
import string
import numpy as np
import datetime

import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.stem import LancasterStemmer 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
import pickle_functions as pf

In [3]:
import model_functions as mf

### Load and shuffle data

Read in test.csv and train.csv

In [4]:
train = pf.read_pickle(bucket_name='advancedml-koch-mathur-hinkson', filename='sub_train_df14_preprocessed')

In [5]:
drop_cols = ['split', 'cleaned_w_stopwords', 'cleaned_no_stem', 'cleaned_porter', 'cleaned_lancaster', 'bigrams_unstemmed',
       'perc_upper', 'num_exclam', 'num_words', 'perc_stopwords',
       'num_upper_words']

In [6]:
train = train.drop(drop_cols, axis = 1)

In [7]:
train.shape

(100000, 49)

In [8]:
train.columns

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str'],
      dtype='object')

Create a new column called "toxicity_category" in the train data frame categorizing comments as toxic ("1") or non-toxic ("0").

In [9]:
train['toxicity_category'] = train.target.apply(lambda x: 1 if x > 0.5 else 0)

Split train.csv into training (80%) and hold out sets (20%).

In [10]:
#Citation: https://stackoverflow.com/questions/24147278/how-do-i-create-test-and-train-samples-from-one-dataframe-with-pandas
msk = np.random.rand(len(train)) < 0.8
train_set = train[msk]
hold_out_set = train[~msk]

In [11]:
print(train_set.toxicity_category.value_counts())

0    74767
1     5391
Name: toxicity_category, dtype: int64


In [12]:
print(hold_out_set.toxicity_category.value_counts())

0    18577
1     1265
Name: toxicity_category, dtype: int64


In [13]:
print(train_set.toxicity_category.value_counts())

0    74767
1     5391
Name: toxicity_category, dtype: int64


In [14]:
toxic = train_set[train_set.toxicity_category == 1]
nontoxic = train_set[train_set.toxicity_category == 0]

In [15]:
train_set.shape, toxic.shape, nontoxic.shape

((80158, 50), (5391, 50), (74767, 50))

Reshaping the dataset to be include an equal number of toxic and nontoxic samples

In [16]:
quarter = len(toxic)

In [17]:
random_df = train_set.sample(quarter*4)

Create a data set of 25% toxic and 75% nontoxic comments, and shuffle the data such that you do not have a data set of grouped toxic and grouped nontoxic comments

In [18]:
prepared_25 = toxic.append(nontoxic.sample(len(toxic)*3))
prepared_25 = prepared_25.sample(frac=1).reset_index(drop=True)
print(prepared_25.toxicity_category.value_counts())

0    16173
1     5391
Name: toxicity_category, dtype: int64


### SVM - weighted 25% toxic, 75% nontoxic

In [19]:
classifier, output, fitted_vectorizer = mf.run_model(model_df=prepared_25, 
                                                     model_type="SVM", 
                                                     comments = "cleaned_no_stem_str", 
                                                     train_perc=0.95, 
                                                     target="toxicity_category", 
                                                     see_inside=False)

fitting model now


In [20]:
mf.get_metrics(output=output, detailed=True, should_print=True, round_to=3)

Overall Accuracy: 0.8608534322820037
Overall Precision: 0.8604651162790697
Overall Recall: 0.5401459854014599
Overall F1 Score: 0.6636771300448431
ROC_AUC: 0.755

Target Accuracy: 0.5401459854014599
Target Precision: 1.0
Target Recall: 0.5401459854014599
Target F1 Score: 0.7014218009478673

Non-Target Accuracy: 0.9701492537313433
Non-Target Precision: 1.0
Non-Target Recall: 0.9701492537313433
Non-Target F1 Score: 0.9848484848484849

Strong Identity Accuracy: 0.5
Strong Identity Precision: 1.0
Strong Identity Recall: 0.5
Strong Identity F1 Score: 1.0

Obscenity Accuracy: 0.5
Obscenity Precision: 1.0
Obscenity Recall: 0.5
Obscenity F1 Score: 1.0

Insults Accuracy: 0.6021505376344086
Insults Precision: 1.0
Insults Recall: 0.6
Insults F1 Score: 1.0

Threats Accuracy: 0.2
Threats Precision: 1.0
Threats Recall: 0.2
Threats F1 Score: 0.0



{'Overall': {'Accuracy': 0.8608534322820037,
  'Precision': 0.8604651162790697,
  'Recall': 0.5401459854014599,
  'F1': 0.6636771300448431,
  'ROC_AUC': 0.755},
 'Target': {'Accuracy': 0.5401459854014599,
  'Precision': 1.0,
  'Recall': 0.5401459854014599,
  'F1': 0.7014218009478673},
 'Non-Target': {'Accuracy': 0.2, 'Precision': 1.0, 'Recall': 0.2, 'F1': 0.0}}

In [21]:
hold_out_results = mf.run_model_test(model_df=hold_out_set, 
                                     clf=classifier, 
                                     vectorizer=fitted_vectorizer, 
                                     comments="cleaned_no_stem_str", target="toxicity_category")

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str', 'toxicity_category', 'predicted', 'y_test'],
      dtype='object')


In [22]:
hold_out_results.to_csv("holdout_results", sep='|')

In [23]:
hold_out_metrics = mf.get_metrics(output=hold_out_results, detailed=True, should_print=True, round_to=3)

Overall Accuracy: 0.9417397439774217
Overall Precision: 0.5376121463077985
Overall Recall: 0.6158102766798419
Overall F1 Score: 0.574060427413412
ROC_AUC: 0.79

Target Accuracy: 0.6158102766798419
Target Precision: 1.0
Target Recall: 0.6158102766798419
Target F1 Score: 0.7622309197651663

Non-Target Accuracy: 0.9639338967540507
Non-Target Precision: 1.0
Non-Target Recall: 0.9639338967540507
Non-Target F1 Score: 0.9816357855498301

Strong Identity Accuracy: 0.5619834710743802
Strong Identity Precision: 0.9384615384615385
Strong Identity Recall: 0.5545454545454546
Strong Identity F1 Score: 1.0

Obscenity Accuracy: 0.7058823529411765
Obscenity Precision: 1.0
Obscenity Recall: 0.7014925373134329
Obscenity F1 Score: 1.0

Insults Accuracy: 0.7034339229968782
Insults Precision: 0.9819819819819819
Insults Recall: 0.7055016181229773
Insults F1 Score: 1.0

Threats Accuracy: 0.1875
Threats Precision: 1.0
Threats Recall: 0.16129032258064516
Threats F1 Score: 0.0



In [40]:
# Citation:  https://medium.com/@aneesha/visualising-top-features-in-linear-svm-with-scikit-learn-and-matplotlib-3454ab18a14d

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt

def plot_coefficients(classifier, feature_names, top_features=20):
    coef = np.ravel(classifier.coef_)
    top_positive_coefficients = np.argsort(coef)[-top_features:]
    top_negative_coefficients = np.argsort(coef)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    # create plot
    plt.figure(figsize=(15, 5))
    colors = ["red" if c < 0 else "blue" for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha="right")
    plt.show()
    

plot_coefficients(classifier, fitted_vectorizer.get_feature_names())

ValueError: WRITEBACKIFCOPY base is read-only

<Figure size 1080x360 with 0 Axes>

### SVM - weighted 1/3 toxic, 2/3 nontoxic

In [36]:
prepared_33 = toxic.append(nontoxic.sample(len(toxic)*2))
prepared_33 = prepared_33.sample(frac=1).reset_index(drop=True)
print(prepared_33.toxicity_category.value_counts())

0    10664
1     5332
Name: toxicity_category, dtype: int64


In [37]:
classifier, output, fitted_vectorizer = mf.run_model(model_df=prepared_33, 
                                                     model_type="SVM", 
                                                     comments = "cleaned_no_stem_str", 
                                                     train_perc=0.95, 
                                                     target="toxicity_category", 
                                                     see_inside=False)

fitting model now


In [38]:
hold_out_results = mf.run_model_test(model_df=hold_out_set, 
                                     clf=classifier, 
                                     vectorizer=fitted_vectorizer, 
                                     comments="cleaned_no_stem_str", target="toxicity_category")

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str', 'toxicity_category', 'predicted', 'y_test'],
      dtype='object')


In [39]:
hold_out_metrics = mf.get_metrics(output=hold_out_results, detailed=True, should_print=True, round_to=3)

Overall Accuracy: 0.9248949845639961
Overall Precision: 0.45987963891675027
Overall Recall: 0.6925981873111783
Overall F1 Score: 0.5527426160337553
ROC_AUC: 0.817

Target Accuracy: 0.6925981873111783
Target Precision: 1.0
Target Recall: 0.6925981873111783
Target F1 Score: 0.8183846497099508

Non-Target Accuracy: 0.9415785191212368
Non-Target Precision: 1.0
Non-Target Recall: 0.9415785191212368
Non-Target F1 Score: 0.9699103176598776

Strong Identity Accuracy: 0.6535433070866141
Strong Identity Precision: 0.9425287356321839
Strong Identity Recall: 0.6776859504132231
Strong Identity F1 Score: 1.0

Obscenity Accuracy: 0.7611940298507462
Obscenity Precision: 1.0
Obscenity Recall: 0.7611940298507462
Obscenity F1 Score: 1.0

Insults Accuracy: 0.742914979757085
Insults Precision: 0.9849108367626886
Insults Recall: 0.7471383975026015
Insults F1 Score: 1.0

Threats Accuracy: 0.3793103448275862
Threats Precision: 1.0
Threats Recall: 0.35714285714285715
Threats F1 Score: 1.0

