# Naive Bayes Model

In this notebook, a Naive Bayes model is run on a iid sampled data set of approximately 670K rows of data.  This notebook was run on an AWS SageMaker ml.c5.4xlarge instance.

#### Import modules

In [1]:
import pandas as pd
import string
import re
import string
import numpy as np
import datetime

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.stem import LancasterStemmer 

import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
import feature_generation_functions as fgf

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import model_functions as mf

In [4]:
import pickle_functions as pf

### Load and shuffle data

In [5]:
train = pd.read_csv("s3://advancedml-koch-mathur-hinkson/train.csv")

In [6]:
train.shape

(1804874, 45)

Label comments as toxic ("1") or nontoxic ("0") using 0.5 threshold

In [7]:
train['toxicity_category'] = train.target.apply(lambda x: 1 if x > 0.5 else 0)

In [8]:
train.shape

(1804874, 46)

Split into train_set and validation_set

In [9]:
#Citation: https://stackoverflow.com/questions/24147278/how-do-i-create-test-and-train-samples-from-one-dataframe-with-pandas
msk = np.random.rand(len(train)) < 0.8
train_set = train[msk]
validation_set = train[~msk]

In [14]:
print(train_set.toxicity_category.value_counts())

0    1358850
1      85018
Name: toxicity_category, dtype: int64


In [15]:
print(validation_set.toxicity_category.value_counts())

0    339586
1     21420
Name: toxicity_category, dtype: int64


Randomly sample train_set to create a smaller data frame (train_sample) to run SVM on

In [16]:
train_sample = train_set.sample(frac=0.5, replace=True)

In [17]:
print(train_sample.toxicity_category.value_counts())

0    679613
1     42321
Name: toxicity_category, dtype: int64


### Generate features

In [18]:
train_df = fgf.generate_NB_SVM_features(train_sample)

Cleaned with stopwords...Elapsed Time:  0.177 minutes
Cleaned without stopwords...Elapsed Time:  0.245 minutes
Stemmed (Porter)...Elapsed Time:  5.661 minutes
Stemmed (Lancaster)...Elapsed Time:  4.641 minutes

DONE GENERATING FEATURES


In [20]:
pf.write_pickle_to_s3bucket(filename='NB_final_720K', 
                            bucket_name='advancedml-koch-mathur-hinkson', 
                            df=train_df)

Pickled and sent to bucket!


In [35]:
validation_df = fgf.generate_NB_SVM_features(validation_set)

Cleaned with stopwords...Elapsed Time:  0.088 minutes
Cleaned without stopwords...Elapsed Time:  0.122 minutes
Stemmed (Porter)...Elapsed Time:  2.797 minutes
Stemmed (Lancaster)...Elapsed Time:  2.332 minutes

DONE GENERATING FEATURES


In [None]:
pf.write_pickle_to_s3bucket(filename='NB_final_720K', 
                            bucket_name='advancedml-koch-mathur-hinkson', 
                            df=validation_df)

### Reshaping

In [5]:
# train_df = pf.read_pickle(filename='NB_final_720K', bucket_name='advancedml-koch-mathur-hinkson')

In [6]:
toxic = train_df[train_df.toxicity_category == 1]
nontoxic = train_df[train_df.toxicity_category == 0]

In [7]:
train_df.shape, toxic.shape, nontoxic.shape

((721934, 50), (42321, 50), (679613, 50))

Reshape the dataset to include an equal number of toxic and nontoxic samples

In [8]:
quarter = len(toxic)

In [9]:
random_df = train_df.sample(quarter*4)

In [10]:
prepared_25 = toxic.append(nontoxic.sample(len(toxic)*3))
prepared_25 = prepared_25.sample(frac=1).reset_index(drop=True)
print(prepared_25.toxicity_category.value_counts())

prepared_50 = toxic.append(toxic).append(nontoxic.sample(len(toxic)*2))
prepared_50 = prepared_50.sample(frac=1).reset_index(drop=True)
print(prepared_50.toxicity_category.value_counts())

prepared_75 = toxic.append(toxic).append(toxic).append(nontoxic.sample(len(toxic)))
prepared_75 = prepared_75.sample(frac=1).reset_index(drop=True)
print(prepared_75.toxicity_category.value_counts())

0    126963
1     42321
Name: toxicity_category, dtype: int64
1    84642
0    84642
Name: toxicity_category, dtype: int64
1    126963
0     42321
Name: toxicity_category, dtype: int64


### Naive Bayes

In [11]:
best_metric = 0
metric_dict = ''
model_factors = []

SUBSET_OF_INTEREST = "Target"
METRIC_OF_INTEREST = "F1"

dfs = [random_df, prepared_50]
label = ["random_df", "prepared_50"]

mn = 0

for i in range(len(dfs)):
    for text in ['cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str']:

        factors = [label[i], text]
        mn += 1
        print("{}. {}".format(mn, datetime.datetime.now()))
        print(factors)

        clf, output = mf.run_model(dfs[i], comments = text, model_type = "MultiNB")
        metrics = mf.get_metrics(output, should_print=False)
        metric_of_interest = metrics[SUBSET_OF_INTEREST][METRIC_OF_INTEREST]
        
        print("Overall Accuracy: {}, Target Accuracy: {}, Non-Target Accuracy: {}".format(metrics["Overall"]["Accuracy"], metrics["Target"]["Accuracy"], metrics["Non-Target"]["Accuracy"]))
        print() 
        
        if (metric_of_interest > best_metric) and metric_of_interest < 0.95:
            best_metric = metric_of_interest
            
            model_factors = factors
            metric_dict = metrics

1. 2019-05-30 19:17:34.605432
['random_df', 'cleaned_w_stopwords_str']
fitting model now
Overall Accuracy: 0.941, Target Accuracy: 0.004, Non-Target Accuracy: 0.999

2. 2019-05-30 19:17:50.552724
['random_df', 'cleaned_no_stem_str']
fitting model now
Overall Accuracy: 0.941, Target Accuracy: 0.005, Non-Target Accuracy: 0.999

3. 2019-05-30 19:18:01.751315
['random_df', 'cleaned_porter_str']
fitting model now
Overall Accuracy: 0.941, Target Accuracy: 0.005, Non-Target Accuracy: 0.999

4. 2019-05-30 19:18:12.192631
['random_df', 'cleaned_lancaster_str']
fitting model now
Overall Accuracy: 0.941, Target Accuracy: 0.004, Non-Target Accuracy: 0.999

5. 2019-05-30 19:18:22.405352
['prepared_50', 'cleaned_w_stopwords_str']
fitting model now
Overall Accuracy: 0.822, Target Accuracy: 0.89, Non-Target Accuracy: 0.756

6. 2019-05-30 19:18:36.693717
['prepared_50', 'cleaned_no_stem_str']
fitting model now
Overall Accuracy: 0.821, Target Accuracy: 0.89, Non-Target Accuracy: 0.752

7. 2019-05-30 19:

In [12]:
train_df.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,toxicity_category,cleaned_w_stopwords_str,cleaned_no_stem_str,cleaned_porter_str,cleaned_lancaster_str
111477,378509,0.2,"For the small minded GOP, politics are more im...",0.0,0.0,0.0,0.2,0.0,0.0,0.0,...,8,0,0.0,4,10,0,For the small minded GOP politics are more imp...,for small minded gop politics important welfar...,for small mind gop polit import welfar countri...,for smal mind gop polit import welf country sh...
826004,5131799,0.728571,What a bunch of idiots the religious right are...,0.128571,0.128571,0.485714,0.714286,0.0,0.0,0.0,...,0,0,0.0,5,70,1,What a bunch of idiots the religious right are...,what bunch idiots religious right these fools ...,what bunch idiot religi right these fool belie...,what bunch idiot religy right thes fool believ...
576164,947214,0.166667,Brinksmanhip might be okay for high school stu...,0.0,0.0,0.0,0.166667,0.0,,,...,3,0,0.0,0,6,0,Brinksmanhip might be okay for high school stu...,brinksmanhip might okay high school student co...,brinksmanhip might okay high school student co...,brinksmanhip might okay high school stud counc...
1655937,6151798,0.0,"Ahhh yes, another no taxes less government st...",0.0,0.0,0.0,0.0,0.0,,,...,1,1,0.0,0,4,0,Ahhh yes another no taxes less government sto...,ahhh yes another taxes less government story w...,ahhh ye anoth tax less govern stori wow i neve...,ahhh ye anoth tax less govern story wow i nev ...
1465266,5912126,0.0,"""Perfect"" in what regard?",0.0,0.0,0.0,0.0,0.0,,,...,0,0,0.0,0,4,0,Perfect in what regard,perfect regard,perfect regard,perfect regard


In [13]:
model_factors, best_metric

(['prepared_50', 'cleaned_w_stopwords_str'], 0.942)

In [14]:
metric_dict

{'Overall': {'Accuracy': 0.822,
  'Precision': 0.781,
  'Recall': 0.89,
  'F1': 0.832,
  'ROC_AUC': 0.823},
 'Target': {'Accuracy': 0.89, 'Precision': 1.0, 'Recall': 0.89, 'F1': 0.942},
 'Non-Target': {'Accuracy': 0.756,
  'Precision': 1.0,
  'Recall': 0.756,
  'F1': 0.861}}