# SVM

This notebook was run on an Amazon SageMaker ml.c5.4xlarge instance.  

In [1]:
import pandas as pd
import string
import re
import string
import numpy as np
import datetime

import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.stem import LancasterStemmer 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
import pickle_functions as pf

In [3]:
import model_functions as mf

### Load and shuffle data

Read in test.csv and train.csv

In [4]:
train = pf.read_pickle(bucket_name='advancedml-koch-mathur-hinkson', filename='sub_train_df14_preprocessed')

In [5]:
drop_cols = ['split', 'cleaned_w_stopwords', 'cleaned_no_stem', 'cleaned_porter', 'cleaned_lancaster', 'bigrams_unstemmed',
       'perc_upper', 'num_exclam', 'num_words', 'perc_stopwords',
       'num_upper_words']

In [6]:
train = train.drop(drop_cols, axis = 1)

In [7]:
train.shape

(100000, 49)

In [8]:
train.columns

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str'],
      dtype='object')

Create a new column called "toxicity_category" in the train data frame categorizing comments as toxic ("1") or non-toxic ("0").

In [9]:
train['toxicity_category'] = train.target.apply(lambda x: 1 if x > 0.5 else 0)

Split train.csv into training (80%) and hold out sets (20%).

In [10]:
#Citation: https://stackoverflow.com/questions/24147278/how-do-i-create-test-and-train-samples-from-one-dataframe-with-pandas
msk = np.random.rand(len(train)) < 0.8
train_set = train[msk]
hold_out_set = train[~msk]

In [11]:
print(train_set.toxicity_category.value_counts())

0    74628
1     5269
Name: toxicity_category, dtype: int64


In [12]:
print(hold_out_set.toxicity_category.value_counts())

0    18716
1     1387
Name: toxicity_category, dtype: int64


In [13]:
print(train_set.toxicity_category.value_counts())

0    74628
1     5269
Name: toxicity_category, dtype: int64


In [14]:
toxic = train_set[train_set.toxicity_category == 1]
nontoxic = train_set[train_set.toxicity_category == 0]

In [15]:
train_set.shape, toxic.shape, nontoxic.shape

((79897, 50), (5269, 50), (74628, 50))

Reshaping the dataset to be include an equal number of toxic and nontoxic samples

In [16]:
quarter = len(toxic)

In [17]:
random_df = train_set.sample(quarter*4)

Create a data set of 25% toxic and 75% nontoxic comments, and shuffle the data such that you do not have a data set of grouped toxic and grouped nontoxic comments

In [18]:
prepared_25 = toxic.append(nontoxic.sample(len(toxic)*3))
prepared_25 = prepared_25.sample(frac=1).reset_index(drop=True)
print(prepared_25.toxicity_category.value_counts())

0    15807
1     5269
Name: toxicity_category, dtype: int64


### SVM - weighted 25% toxic, 75% nontoxic

In [19]:
classifier, output, fitted_vectorizer = mf.run_model(model_df=prepared_25, 
                                                     model_type="SVM", 
                                                     comments = "cleaned_no_stem_str", 
                                                     train_perc=0.95, 
                                                     target="toxicity_category", 
                                                     see_inside=False)

fitting model now


In [20]:
mf.get_metrics(output=output, detailed=True, should_print=True, round_to=3)

Overall Accuracy: 0.8671726755218216
Overall Precision: 0.8484848484848485
Overall Recall: 0.60431654676259
Overall F1 Score: 0.7058823529411765
ROC_AUC: 0.783

Target Accuracy: 0.60431654676259
Target Precision: 1.0
Target Recall: 0.60431654676259
Target F1 Score: 0.7533632286995516

Non-Target Accuracy: 0.961340206185567
Non-Target Precision: 1.0
Non-Target Recall: 0.961340206185567
Non-Target F1 Score: 0.9802890932982917

Strong Identity Accuracy: 0.48484848484848486
Strong Identity Precision: 1.0
Strong Identity Recall: 0.48484848484848486
Strong Identity F1 Score: 1.0

Obscenity Accuracy: 0.5
Obscenity Precision: 1.0
Obscenity Recall: 0.5
Obscenity F1 Score: 1.0

Insults Accuracy: 0.6439024390243903
Insults Precision: 1.0
Insults Recall: 0.6439024390243903
Insults F1 Score: 1.0

Threats Accuracy: 0.16666666666666666
Threats Precision: 1.0
Threats Recall: 0.16666666666666666
Threats F1 Score: 0.0



{'Overall': {'Accuracy': 0.8671726755218216,
  'Precision': 0.8484848484848485,
  'Recall': 0.60431654676259,
  'F1': 0.7058823529411765,
  'ROC_AUC': 0.783},
 'Target': {'Accuracy': 0.60431654676259,
  'Precision': 1.0,
  'Recall': 0.60431654676259,
  'F1': 0.7533632286995516},
 'Non-Target': {'Accuracy': 0.16666666666666666,
  'Precision': 1.0,
  'Recall': 0.16666666666666666,
  'F1': 0.0}}

In [21]:
hold_out_results = mf.run_model_test(model_df=hold_out_set, 
                                     clf=classifier, 
                                     vectorizer=fitted_vectorizer, 
                                     comments="cleaned_no_stem_str", target="toxicity_category")

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str', 'toxicity_category', 'predicted', 'y_test'],
      dtype='object')


In [22]:
hold_out_results.to_csv("holdout_results", sep='|')

In [23]:
hold_out_metrics = mf.get_metrics(output=hold_out_results, detailed=True, should_print=True, round_to=3)

Overall Accuracy: 0.9407551111774362
Overall Precision: 0.56703146374829
Overall Recall: 0.5976928622927181
Overall F1 Score: 0.5819585819585821
ROC_AUC: 0.782

Target Accuracy: 0.5976928622927181
Target Precision: 1.0
Target Recall: 0.5976928622927181
Target F1 Score: 0.7481949458483755

Non-Target Accuracy: 0.9661786706561231
Non-Target Precision: 1.0
Non-Target Recall: 0.9661786706561231
Non-Target F1 Score: 0.982798445609935

Strong Identity Accuracy: 0.6333333333333333
Strong Identity Precision: 0.9861111111111112
Strong Identity Recall: 0.6228070175438597
Strong Identity F1 Score: 1.0

Obscenity Accuracy: 0.6
Obscenity Precision: 0.9836065573770492
Obscenity Recall: 0.6060606060606061
Obscenity F1 Score: 1.0

Insults Accuracy: 0.6837944664031621
Insults Precision: 0.9912408759124087
Insults Recall: 0.6837865055387714
Insults F1 Score: 1.0

Threats Accuracy: 0.2222222222222222
Threats Precision: 1.0
Threats Recall: 0.17647058823529413
Threats F1 Score: 0.0



In [24]:
fitted_vectorizer.get_feature_names()

['00',
 '000',
 '0001',
 '000s',
 '000to',
 '001',
 '006',
 '007',
 '00875cbc5c58',
 '00am',
 '01',
 '02',
 '023',
 '029',
 '03',
 '04',
 '045',
 '05',
 '050',
 '05pm',
 '06',
 '07',
 '079',
 '07townhall',
 '08',
 '08_news_archive',
 '08aa626eb493',
 '08th',
 '09',
 '093',
 '0bama',
 '0bamacare',
 '0c3fea8550',
 '0f4bf3d4fa57fd9684baa65758cb86f0cb311a081a009e5be625f6f44fecc828',
 '0h',
 '0int',
 '10',
 '100',
 '1000',
 '10000',
 '10000km',
 '1000ft',
 '1000kg',
 '1000kgs',
 '1000s',
 '100b',
 '100k',
 '100kg',
 '100kgs',
 '100km',
 '100kph',
 '100m',
 '100mg',
 '100million',
 '100s',
 '100th',
 '100x',
 '101',
 '102',
 '103',
 '10356',
 '104',
 '105',
 '10505',
 '105cedaravefoundation',
 '106',
 '107',
 '108',
 '10_gwrwsg',
 '10b',
 '10k',
 '10kph',
 '10m',
 '10mph',
 '10s',
 '10th',
 '11',
 '110',
 '1100',
 '1100sqft',
 '110kmh',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '11680',
 '1174',
 '11am',
 '11e7',
 '11th',
 '12',
 '120',
 '1200',
 '120m',
 '121',
 '124',
 '1240',
 '125

In [55]:
classifier.coef_.todense()

matrix([[ 0.53246021, -1.31120159,  0.        , ...,  0.25839697,
          0.25839697,  0.25839697]])

In [29]:
import numpy as np

In [43]:
np.ravel(classifier.coef_).view().reshape(1)

array([<1x36158 sparse matrix of type '<class 'numpy.float64'>'
	with 24022 stored elements in Compressed Sparse Row format>],
      dtype=object)

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt


def plot_coefficients(classifier, feature_names, top_features=20):
    coef = classifier.coef_.todense()
    top_positive_coefficients = np.argsort(coef)[-top_features:]
    top_negative_coefficients = np.argsort(coef)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    # create plot
    plt.figure(figsize=(15, 5))
    colors = ["red" if c < 0 else "blue" for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha="right")
    plt.show()



plot_coefficients(classifier, fitted_vectorizer.get_feature_names())


IndexError: index 31661 is out of bounds for axis 0 with size 1

<Figure size 1080x360 with 0 Axes>

### SVM - weighted 1/3 toxic, 2/3 nontoxic

In [36]:
prepared_33 = toxic.append(nontoxic.sample(len(toxic)*2))
prepared_33 = prepared_33.sample(frac=1).reset_index(drop=True)
print(prepared_33.toxicity_category.value_counts())

0    10664
1     5332
Name: toxicity_category, dtype: int64


In [37]:
classifier, output, fitted_vectorizer = mf.run_model(model_df=prepared_33, 
                                                     model_type="SVM", 
                                                     comments = "cleaned_no_stem_str", 
                                                     train_perc=0.95, 
                                                     target="toxicity_category", 
                                                     see_inside=False)

fitting model now


In [38]:
hold_out_results = mf.run_model_test(model_df=hold_out_set, 
                                     clf=classifier, 
                                     vectorizer=fitted_vectorizer, 
                                     comments="cleaned_no_stem_str", target="toxicity_category")

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str', 'toxicity_category', 'predicted', 'y_test'],
      dtype='object')


In [39]:
hold_out_metrics = mf.get_metrics(output=hold_out_results, detailed=True, should_print=True, round_to=3)

Overall Accuracy: 0.9248949845639961
Overall Precision: 0.45987963891675027
Overall Recall: 0.6925981873111783
Overall F1 Score: 0.5527426160337553
ROC_AUC: 0.817

Target Accuracy: 0.6925981873111783
Target Precision: 1.0
Target Recall: 0.6925981873111783
Target F1 Score: 0.8183846497099508

Non-Target Accuracy: 0.9415785191212368
Non-Target Precision: 1.0
Non-Target Recall: 0.9415785191212368
Non-Target F1 Score: 0.9699103176598776

Strong Identity Accuracy: 0.6535433070866141
Strong Identity Precision: 0.9425287356321839
Strong Identity Recall: 0.6776859504132231
Strong Identity F1 Score: 1.0

Obscenity Accuracy: 0.7611940298507462
Obscenity Precision: 1.0
Obscenity Recall: 0.7611940298507462
Obscenity F1 Score: 1.0

Insults Accuracy: 0.742914979757085
Insults Precision: 0.9849108367626886
Insults Recall: 0.7471383975026015
Insults F1 Score: 1.0

Threats Accuracy: 0.3793103448275862
Threats Precision: 1.0
Threats Recall: 0.35714285714285715
Threats F1 Score: 1.0

