In [56]:
import pandas as pd

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.stem import LancasterStemmer 

nltk.download("stopwords")

import string
import re

import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

from sklearn import svm

from sklearn.linear_model import SGDClassifier

import numpy as np

from scipy import sparse
import datetime

import s3fs

from sklearn.feature_extraction.text import TfidfVectorizer

import io

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load and shuffle data

Read in test.csv and train.csv

In [6]:
test = pd.read_csv("s3://advancedml-koch-mathur-hinkson/test.csv")

In [7]:
train = pd.read_csv("s3://advancedml-koch-mathur-hinkson/train.csv")

Create a new column called "toxicity_category" in the train data frame categorizing comments as toxic ("1") or non-toxic ("0").

In [23]:
train['toxicity_category'] = train.target.apply(lambda x: 1 if x > 0.5 else 0)

Split train.csv into training (80%) and validation sets (20%).

In [24]:
msk = np.random.rand(len(train)) < 0.8
train_set = train[msk]
validation_set = train[~msk]

In [26]:
print(train_set.toxicity_category.value_counts())

0    1359168
1      85107
Name: toxicity_category, dtype: int64


In [27]:
print(validation_set.toxicity_category.value_counts())

0    339268
1     21331
Name: toxicity_category, dtype: int64


Create small sample ("train_sample1") from the train_set on which to run models.  Ensure that samples are iid by replacing after each draw.

In [28]:
train_sample1 = train_set.sample(frac=0.05, replace=True)

In [29]:
print(train_sample1.toxicity_category.value_counts())

0    67882
1     4332
Name: toxicity_category, dtype: int64


### Generate features

In [30]:
ls = LancasterStemmer()
ps = PorterStemmer() 

sw = set(stopwords.words('english'))
sw.add('')

def clean_text(text, stemming=None, remove_sw = True):
    '''
    This auxiliary function cleans text.
    
    Methods used for cleaning are: 
        (1) transform string of text to list of words,
        (2) cleaned (lowercase, remove punctuation) and remove stop words,
        (3) Porter stemming of cleaned (lowercase, remove punctuation) text, 
        (4) Lancaster stemming of cleaned (lowercase, remove punctuation), 
        (5) cleaned (lowercase, remove punctuation) without removing stop words.
    
    Inputs:
        text (string) - A string of text.
        stemming (parameter) - either Porter or Lancaster stemming method
        remove_sw (boolean) - True/False remove stop words
    
    Outputs:
        Cleaned text per the input parameters.
    '''

    t = text.replace("-", " ").split(" ")
    
    t = [w.lower() for w in t]
    
    if remove_sw == True:
        t = [w for w in t if w not in sw]
    
    if stemming == None:
        pass;
    elif stemming == "Porter":
        t = [ps.stem(w) for w in t]
    elif stemming == "Lancaster":
        t = [ls.stem(w) for w in t]
    else:
        print("Please enter a valid stemming type")
        
    t = [w.strip(string.punctuation) for w in t]

    return ' '.join(t)

In [31]:
def add_text_cleaning_cols(df):
    '''
    This function generates features.
    
    Input:
        dataframe with raw text strings
        
    Output:
        dataframe with added columns:
            (1) 'split' - (list) Transforms the string of text into a list of words
            (2) 'cleaned_w_stopwords' - (string) A string of text where words have been lowercased, 
                                        punctuation is removed, and stop words are removed
            (3) 'cleaned_no_stem' - (string) A string of text where words have been lowercased, and 
                                        punctuation is removed (stop words remain in text).
                                        
            
            (4) 'cleaned_porter' - A striPorter stemming of cleaned (lowercase, remove punctuation) text. 
        (4) Lancaster stemming of cleaned (lowercase, remove punctuation), 
        (5) cleaned (lowercase, remove punctuation) without removing stop words.
    '''
    print(datetime.datetime.now())
    
    df['split'] = df["comment_text"].apply(lambda x: x.split(" "))
    df['cleaned_w_stopwords'] = df["comment_text"].apply(clean_text,args=(None,False),)

    print(datetime.datetime.now())
    df['cleaned_no_stem'] = df["comment_text"].apply(clean_text,)
    df['cleaned_porter'] = df["comment_text"].apply(clean_text,args=("Porter",),)
    df['cleaned_lancaster'] = df["comment_text"].apply(clean_text,args=("Lancaster",),)

    print(datetime.datetime.now())

    df['perc_upper'] = df["comment_text"].apply(lambda x: round((len(re.findall(r'[A-Z]',x)) / len(x)), 3))

    df['num_exclam'] = df["comment_text"].apply(lambda x:(len(re.findall(r'!',x))))
    
    df['num_words'] = df["split"].apply(lambda x: len(x))
    print(datetime.datetime.now())

    df['perc_stopwords'] = round((df.num_words - df['cleaned_no_stem'].apply(lambda x: len(x)))/df.num_words,3) 
    
    df['num_upper_words'] = df["split"].apply(lambda x: sum(map(str.isupper, x)))
    
    print(datetime.datetime.now())

    
    

In [32]:
add_text_cleaning_cols(train_sample1)

2019-05-26 18:57:36.255069
2019-05-26 18:57:38.055402
2019-05-26 18:58:36.011581
2019-05-26 18:58:36.478395
2019-05-26 18:58:36.803464


In [33]:
train_sample1.columns

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'category_toxicity', 'toxicity_category', 'split',
       'cleaned_w_stopwords', 'cleaned_no_stem', 'cleaned_porter',
       'cleaned_lancaster', 'perc_upper', 'num_exclam', 'num_words',
       'perc_stopwords', 'num_upper_words'

Pickle the dataset and send to s3 bucket:

In [34]:
train_sample1.shape

(72214, 57)

In [35]:
train_sample1.head(5)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,split,cleaned_w_stopwords,cleaned_no_stem,cleaned_porter,cleaned_lancaster,perc_upper,num_exclam,num_words,perc_stopwords,num_upper_words
375600,702857,0.0,You completely misstate how the Bulletin of t...,0.0,0.0,0.0,0.0,0.0,,,...,"[You, completely, misstate, how, the, , Bullet...",you completely misstate how the bulletin of t...,completely misstate bulletin atomic scientists...,complet misstat bulletin atom scientist move d...,complet misst bulletin atom sci mov doomsday c...,0.03,0,29,-2.69,0
1566597,6038615,0.3,The good thing about boondoggles is that they ...,0.0,0.0,0.0,0.3,0.0,,,...,"[The, good, thing, about, boondoggles, is, tha...",the good thing about boondoggles is that they ...,good thing boondoggles usually collapse weight...,good thing boondoggl usual collaps weight corr...,good thing boondoggl us collaps weight corruption,0.01,0,17,-2.353,0
275683,579993,0.0,Throwin your votes away eh,0.0,0.0,0.0,0.0,0.0,,,...,"[Throwin, your, votes, away, eh]",throwin your votes away eh,throwin votes away eh,throwin vote away eh,throwin vot away eh,0.038,0,5,-3.2,0
89607,352281,0.0,Mr. Sayre trunk line fibers typically carry 40...,0.0,0.0,0.0,0.0,0.0,,,...,"[Mr., Sayre, trunk, line, fibers, typically, c...",mr sayre trunk line fibers typically carry 40 ...,mr sayre trunk line fibers typically carry 40 ...,mr sayr trunk line fiber typic carri 40 separ ...,mr sayr trunk lin fib typ carry 40 sep 10 gb/s...,0.025,0,183,-2.902,3
1347516,5762238,0.0,MSNBC and CNN often mention facts ..............,0.0,0.0,0.0,0.0,0.0,,,...,"[MSNBC, and, CNN, often, mention, facts, , ......",msnbc and cnn often mention facts \n althoug...,msnbc cnn often mention facts \n although sto...,msnbc cnn often mention fact \n although stop...,msnbc cnn oft ment fact \n although stop watc...,0.085,0,24,-3.833,3


Due to memory issues, we needed to use a smaller training set.  We used a random iid sample of half of the train_sample1 frame to train NB model:

In [36]:
toxic = train_sample1[train_sample1.toxicity_category == 1]
nontoxic = train_sample1[train_sample1.toxicity_category == 0]

In [37]:
train_sample1.shape, toxic.shape, nontoxic.shape

((72214, 57), (4332, 57), (67882, 57))

Reshaping the dataset to be include an equal number of toxic and nontoxic samples

In [40]:
prepared_df = toxic.append(toxic).append(nontoxic.sample(len(toxic)*2))
prepared_df = prepared_df.sample(frac=1).reset_index(drop=True)

print(prepared_df.toxicity_category.value_counts())


1    8664
0    8664
Name: toxicity_category, dtype: int64


Because we are unable to train an NB model on categorical (text) and continuous (numerical) data at the same time, our action plan changed to running two independent models for each type of data and then running a thrid NB model on the resulting predict_proba from the other two trained models.

In [57]:
def run_model(model_df, train_perc=.80, addtl_feats =[''], model_type = "Multi", 
              num_iter = 10, should_print=False, see_inside=False, comments="comment_text",
             target='toxicity_category'):
    
    train_start = 0
    train_end = round(model_df.shape[0]*train_perc) 

    test_start = train_end
    test_end = model_df.shape[0]
    
    X_all = model_df[comments].values
    y_all = model_df[target].values
    
    # tokenizing text
#     count_vect = CountVectorizer()
#     X_all_counts = count_vect.fit_transform(X_all.astype('U'))
    #print(X_all_counts.shape)

    # calculating frequencies
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    fitted_vectorizer=tfidf_vectorizer.fit(model_df[comments])
    X_all_tfidf =  fitted_vectorizer.transform(model_df[comments])


    print(X_all_tfidf.shape)
    
    if addtl_feats != ['']: # combine non-text and text features if necessary
        print("here")
#         others_all = model_df[addtl_feats].values.reshape(-1,1)

        others_all = model_df[addtl_feats].values.reshape(-1,len(addtl_feats))
        #print(others_all)
        newfeatures_all = sparse.hstack((X_all_tfidf, others_all.astype(float))).tocsr()
    else:
        newfeatures_all = X_all_tfidf
    
    
    X_train = newfeatures_all[train_start:train_end]
    y_train = model_df[train_start:train_end][target].values
    y_train=y_train.astype('int')
    

    X_test = newfeatures_all[test_start:test_end]
    y_test = model_df[test_start:test_end][target].values
    
    
    
    
    if model_type == 'Multi':
        clf = MultinomialNB().fit(X_train, y_train)
    if model_type == "Gauss":
        clf = GaussianNB().fit(X_train, y_train) 
    if model_type == "SVM":
        clf = svm.SVC(kernel='linear', probability=True, random_state=1008).fit(X_train, y_train) 
        
    preds_for_train = clf.predict(X_train)
    
    
   
    predicted = clf.predict(X_test)
    accuracy = np.mean(predicted == y_test)
    
    output = model_df[test_start:test_end]
    output['predicted'] = predicted
    output['y_test'] = y_test
    output['accuracy'] = output.predicted == output.y_test
    

#     y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
#     precision = precision_score(y_true_sorted, preds)


    if should_print == True:

        print("The accuracy on the test set is {}%.".format(round(accuracy*100,2)))    
    
    if see_inside == True:
        return clf, accuracy, X_all_counts, X_all_tfidf
    else:
        return clf, accuracy, preds_for_train, predicted, output


In [44]:
clf1, accuracy, preds_for_train, predicted , output = run_model(prepared_df, comments = "cleaned_lancaster", should_print=False)

print("The unique values predicted in the training set include :" + str(np.unique(preds_for_train)))
print("The unique values predicted in the test set include :" + str(np.unique(predicted)))

(17328, 25803)
The unique values predicted in the training set include :[0 1]
The unique values predicted in the test set include :[0 1]


In [50]:
output.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,cleaned_porter,cleaned_lancaster,perc_upper,num_exclam,num_words,perc_stopwords,num_upper_words,predicted,y_test,accuracy
13862,5047544,0.226415,Anne McLellan a true waste of space then and n...,0.0,0.0,0.0,0.245283,0.0,,,...,ann mclellan true wast space like ilk,an mclellan tru wast spac lik ilk,0.043,0,15,-1.6,0,1,0,False
13863,1082637,0.6,Thieves.,0.0,0.1,0.1,0.6,0.0,,,...,thieves,thieves,0.125,0,1,-6.0,0,1,1,True
13864,802459,0.555556,The third world can't ever become the first wo...,0.041667,0.111111,0.027778,0.527778,0.0,,,...,third world can't ever becom first world keep ...,third world can't ev becom first world keep tr...,0.016,0,87,-3.333,0,0,1,False
13865,1072184,0.0,"Well, your post responded to a comment about t...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,well post respond comment this everi stori i'v...,well post respond com this every story i've re...,0.055,0,37,-2.324,2,0,0,True
13866,5180644,0.903226,This guy is an idiot. He was all in for shorti...,0.0,0.193548,0.048387,0.887097,0.016129,,,...,guy idiot short canadian bank probabl took man...,guy idiot short canad bank prob took many road...,0.028,0,35,-1.971,0,1,1,True


In [45]:
output.y_test.value_counts()

1    1735
0    1731
Name: y_test, dtype: int64

In [46]:
output.predicted.value_counts()

1    2060
0    1406
Name: predicted, dtype: int64

In [47]:
accuracy

0.811598384304674

In [48]:
targets = output[output.y_test == 1]
targets[targets.accuracy == True].shape[0] / targets.shape[0]

0.9054755043227666

In [49]:
output[output.y_test == 0].accuracy.value_counts()

True     1242
False     489
Name: accuracy, dtype: int64

In [54]:
best_accuracy = 0
model_factors = []

for text in ['cleaned_w_stopwords', 'cleaned_no_stem', 'cleaned_porter',
    'cleaned_lancaster']:
    for tp in [0.6, 0.7, 0.8]:
  
        factors = [text, tp]
        print(factors)

        clf, accuracy, preds_for_train, predicted, output = run_model(prepared_df, train_perc = tp, comments = text, should_print=False)
        
        print("The unique values predicted for the training set include :" + str(np.unique(preds_for_train)))
        print("The unique values predicted for the test set include :" + str(np.unique(predicted)))
        
        targets = output[output.y_test == 1]
        target_accuracy = targets[targets.accuracy == True].shape[0] / targets.shape[0]
        
        print("Accuracy: {} , Target Accuracy: {}".format(accuracy, target_accuracy))

        if target_accuracy > best_accuracy:
            model_factors = factors
            best_accuracy = target_accuracy

        print()



['cleaned_w_stopwords', 0.6]
(17328, 30033)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy: 0.8094070119751839 , Target Accuracy: 0.8924050632911392

['cleaned_w_stopwords', 0.7]
(17328, 30033)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy: 0.812043093497499 , Target Accuracy: 0.9043377226955848

['cleaned_w_stopwords', 0.8]
(17328, 30033)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy: 0.8228505481823427 , Target Accuracy: 0.9135446685878963

['cleaned_no_stem', 0.6]
(17328, 30032)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy: 0.8117154811715481 , Target Accuracy: 0.8975834292289988

['cleaned_no_stem', 0.7]
(17328, 30032)
The unique values pre

In [58]:
best_accuracy_svm = 0
model_factors_svm = []

for text in ['cleaned_w_stopwords', 'cleaned_no_stem', 'cleaned_porter',
    'cleaned_lancaster']:
    for tp in [0.6, 0.7, 0.8]:
  
        factors = [text, tp]
        print(factors)

        clf, accuracy, preds_for_train, predicted, output = run_model(prepared_df, model_type="SVM", train_perc = tp, comments = text, should_print=False)
        
        print("The unique values predicted for the training set include :" + str(np.unique(preds_for_train)))
        print("The unique values predicted for the test set include :" + str(np.unique(predicted)))
        
        targets = output[output.y_test == 1]
        target_accuracy = targets[targets.accuracy == True].shape[0] / targets.shape[0]
        
        print("Accuracy: {} , Target Accuracy: {}".format(accuracy, target_accuracy))

        if target_accuracy > best_accuracy:
            model_factors = factors
            best_accuracy = target_accuracy

        print()

['cleaned_w_stopwords', 0.6]
(17328, 30033)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy: 0.874909825422017 , Target Accuracy: 0.8627733026467204

['cleaned_w_stopwords', 0.7]
(17328, 30033)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy: 0.8855328972681801 , Target Accuracy: 0.8772269558481797

['cleaned_w_stopwords', 0.8]
(17328, 30033)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy: 0.8993075591459896 , Target Accuracy: 0.906628242074928

['cleaned_no_stem', 0.6]
(17328, 30032)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy: 0.875486942721108 , Target Accuracy: 0.8593210586881473

['cleaned_no_stem', 0.7]
(17328, 30032)
The unique values predi

In [59]:
best_accuracy_svm, model_factors_svm

(0, [])