In [1]:
import pandas as pd
import string
import re
import string
import numpy as np
import datetime

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.stem import LancasterStemmer 

import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB,GaussianNB

from sklearn import svm

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

### Load and split data

Read in test.csv and train.csv

In [6]:
test = pd.read_csv("s3://advancedml-koch-mathur-hinkson/test.csv")

In [2]:
train = pd.read_csv("s3://advancedml-koch-mathur-hinkson/train.csv")

In [3]:
train = pd.read_csv("train.csv")

Create a new column called "toxicity_category" in the train data frame categorizing comments as toxic ("1") or non-toxic ("0").

In [4]:
train['toxicity_category'] = train.target.apply(lambda x: 1 if x > 0.5 else 0)

Split train.csv into training (80%) and validation sets (20%).

In [5]:
msk = np.random.rand(len(train)) < 0.8
train_set = train[msk]
validation_set = train[~msk]

In [6]:
print(train_set.toxicity_category.value_counts())

0    1358961
1      84890
Name: toxicity_category, dtype: int64


In [7]:
print(validation_set.toxicity_category.value_counts())

0    339475
1     21548
Name: toxicity_category, dtype: int64


Create small sample ("train_sample1") from the train_set on which to run models.  Ensure that samples are iid by replacing after each draw.

In [8]:
train_sample = train_set.sample(frac=0.1, replace=True)

In [9]:
print(train_sample.toxicity_category.value_counts())

0    135745
1      8640
Name: toxicity_category, dtype: int64


### Generate features

In [10]:
ls = LancasterStemmer()
ps = PorterStemmer() 

sw = set(stopwords.words('english'))
sw.add('')

def clean_text(text, stemming=None, remove_sw = True):
    '''
    This auxiliary function cleans text.
    
    Methods used for cleaning are: 
        (1) transform string of text to list of words,
        (2) cleaned (lowercase, remove punctuation) and remove stop words,
        (3) Porter stemming of cleaned (lowercase, remove punctuation) text, 
        (4) Lancaster stemming of cleaned (lowercase, remove punctuation), 
        (5) cleaned (lowercase, remove punctuation) without removing stop words.
    
    Inputs:
        text (string) - A string of text.
        stemming (parameter) - either Porter or Lancaster stemming method
        remove_sw (boolean) - True/False remove stop words
    
    Outputs:
        Cleaned text per the input parameters.
    '''

    t = text.replace("-", " ").split(" ")
    
    t = [w.lower() for w in t]
    
    if remove_sw == True:
        t = [w for w in t if w not in sw]
    
    if stemming == None:
        pass;
    elif stemming == "Porter":
        t = [ps.stem(w) for w in t]
    elif stemming == "Lancaster":
        t = [ls.stem(w) for w in t]
    else:
        print("Please enter a valid stemming type")
        
    t = [w.strip(string.punctuation) for w in t]

    return ' '.join(t)

In [11]:
def add_text_cleaning_cols(df):
    '''
    This function generates features and adds them to the data frame.
    
    Input:
        Data frame with raw text strings.
        
    Output:
        Data frame with added columns:
            (1) 'split' - (list) Transforms the string of text into a list of words
            (2) 'cleaned_w_stopwords' - (string) A string of text where words have been lowercased, 
                                        punctuation is removed, and stop words are removed
            (3) 'cleaned_no_stem' - (string) A string of text where words have been lowercased, and 
                                        punctuation is removed (stop words remain in text).
                                        
            
            (4) 'cleaned_porter' - (string) A string of text where words have been stemmed using the 
                                        Porter method on cleaned (lowercase, remove punctuation) text. 
            (5) 'cleaned_lancaster' - (string) A string of text where words have been stemmed using the
                                        Lancaster method on cleaned (lowercase, remove punctuation) text.
            (6) 'perc_upper' - (float) Percent of uppercase letters in the string of text.
            (7) 'num_exclam' - (integer) Number of times an exclamation point appears in text.
            (8) 'num_words' - (integer) Number of words in text.
            
    '''
    print(datetime.datetime.now())
    
    df['split'] = df["comment_text"].apply(lambda x: x.split(" "))
    df['cleaned_w_stopwords'] = df["comment_text"].apply(clean_text,args=(None,False),)

    print(datetime.datetime.now())
    df['cleaned_no_stem'] = df["comment_text"].apply(clean_text,)
    df['cleaned_porter'] = df["comment_text"].apply(clean_text,args=("Porter",),)
    df['cleaned_lancaster'] = df["comment_text"].apply(clean_text,args=("Lancaster",),)

    print(datetime.datetime.now())

    df['perc_upper'] = df["comment_text"].apply(lambda x: round((len(re.findall(r'[A-Z]',x)) / len(x)), 3))

    df['num_exclam'] = df["comment_text"].apply(lambda x:(len(re.findall(r'!',x))))
    
    df['num_words'] = df["split"].apply(lambda x: len(x))
    print("DONE")
        

    
    

In [14]:
add_text_cleaning_cols(train_sample)

2019-05-27 19:52:24.841432
2019-05-27 19:52:29.656106
2019-05-27 19:55:03.282368
DONE


In [15]:
train_sample.columns

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'toxicity_category', 'split', 'cleaned_w_stopwords', 'cleaned_no_stem',
       'cleaned_porter', 'cleaned_lancaster', 'perc_upper', 'num_exclam',
       'num_words'],
      dtype='object')

Pickle the dataset and send to s3 bucket:

In [17]:
toxic = train_sample[train_sample.toxicity_category == 1]
nontoxic = train_sample[train_sample.toxicity_category == 0]

In [18]:
train_sample.shape, toxic.shape, nontoxic.shape

((144385, 54), (8640, 54), (135745, 54))

Reshaping the dataset to be include an equal number of toxic and nontoxic samples

In [19]:
quarter = len(toxic)

In [20]:
random_df = train_sample.sample(quarter*4)

In [23]:
prepared_25 = toxic.append(nontoxic.sample(len(toxic)*3))
prepared_25 = prepared_25.sample(frac=1).reset_index(drop=True)
print(prepared_25.toxicity_category.value_counts())

prepared_50 = toxic.append(toxic).append(nontoxic.sample(len(toxic)*2))
prepared_50 = prepared_50.sample(frac=1).reset_index(drop=True)
print(prepared_50.toxicity_category.value_counts())

prepared_75 = toxic.append(toxic).append(toxic).append(nontoxic.sample(len(toxic)))
prepared_75 = prepared_75.sample(frac=1).reset_index(drop=True)
print(prepared_75.toxicity_category.value_counts())


0    25920
1     8640
Name: toxicity_category, dtype: int64
1    17280
0    17280
Name: toxicity_category, dtype: int64
1    25920
0     8640
Name: toxicity_category, dtype: int64


In [16]:
def run_model(model_df, train_perc=.80, addtl_feats =[''], model_type = "Multi", 
             see_inside=False, comments="comment_text",
             target='toxicity_category'):
    '''
    This function runs a single machine learning model as per the specified parameters.
    
    Input(s):
        model_df: source data frame
        train_perc: percentage that should be used for training set
        addtl_feats: (list) list of non text columns to include
        model_type: which machine learning model to use
        see_inside: returns the intermediate tokenized and vectorized arrays
        comments: source column for text data
        target: source column for y values
        
    Output(s):
    
    '''
    
    train_start = 0
    train_end = round(model_df.shape[0]*train_perc) 

    test_start = train_end
    test_end = model_df.shape[0]
    
    X_all = model_df[comments].values
    y_all = model_df[target].values

    # calculating frequencies
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    fitted_vectorizer=tfidf_vectorizer.fit(model_df[comments])
    X_all_tfidf =  fitted_vectorizer.transform(model_df[comments])
    
    
    X_train = X_all_tfidf[train_start:train_end]
    y_train = model_df[train_start:train_end][target].values
    y_train=y_train.astype('int')
    

    X_test = X_all_tfidf[test_start:test_end]
    y_test = model_df[test_start:test_end][target].values
    
    
    model_dict = {}
    model_dict["Mutli"] = MultinomialNB()
    model_dict["Gauss"] = GaussianNB()
    model_dict['SVM'] = svm.SVC(kernel='linear', probability=True, random_state=1008)
    model_dict["LR"] = LogisticRegression(penalty="l1",C=1e5)
        
    clf = model_dict[model_type].fit(X_train, y_train)
    
    predicted = clf.predict(X_test)
    
    output = model_df[test_start:test_end]
    output['predicted'] = predicted
    output['y_test'] = y_test
    output['accuracy'] = output.predicted == output.y_test
    
    if see_inside == True:
        return clf, output, X_all_counts, X_all_tfidf
    else:
        return clf, output


In [None]:
def get_metrics(output, should_print=True, round_to=3):
    metrics = {}
    targets = output[output.y_test == 1]
    nontargets = output[output.y_test == 0]
    
    dfs = [output, target, nontarget]
    label = ["Overall", "Target", "Non-Target"]
    
    for i in range(len(dfs)):
        
        df, label = dfs[i], label[i]
        
        metrics[label] = {}
        
        accuracy = round(accuracy_score(y_test, predicted), round_to)
        metrics[label]['Accuracy'] = accuracy
        
        precision = round(precision_score(y_test, predicted), round_to)
        metrics[label]['Precision'] = precision

        recall = round(recall_score(y_test, predicted), round_to)
        metrics[label]['Recall'] = recall
        
        f1 = round(f1_score(y_test, predicted), round_to)
        metrics[label]['F1'] = f1

        if label == "Overall":
            roc_auc = round(roc_auc_score(y_test, predicted), round_to)
            metrics[label]['ROC_AUC'] = roc_auc
            
        if should_print == True:
            print("{} Accuracy: {}".format(label, accuracy))
            print("{} Precision: {}".format(label, precision))
            print("{} Recall: {}".format(label, recall))
            if label == "Overall":
                print("ROC_AUC: {}".format(label, roc_auc))
   

In [17]:
clf, output = run_model(prepared_df, comments = "cleaned_lancaster", should_print=False)

(17092, 25851)
The unique values predicted in the training set include :[0 1]
The unique values predicted in the test set include :[0 1]


In [18]:
get_metrics(output)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,cleaned_w_stopwords,cleaned_no_stem,cleaned_porter,cleaned_lancaster,perc_upper,num_exclam,num_words,predicted,y_test,accuracy
13674,5293834,0.0,Preeeecisely.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,preeeecisely,preeeecisely,preeeecisely,preeeecisely,0.077,0,1,1,0,False
13675,6043087,0.166667,tol·er·ance\nˈtäl(ə)rəns/Submit\nnoun\n1.\nthe...,0.0,0.0,0.0,0.0,0.0,,,...,tol·er·ance\nˈtäl(ə)rəns/submit\nnoun\n1.\nthe...,tol·er·ance\nˈtäl(ə)rəns/submit\nnoun\n1.\nthe...,tol·er·ance\nˈtäl(ə)rəns/submit\nnoun\n1.\nth ...,tol·er·ance\nˈtäl(ə)rəns/submit\nnoun\n1.\nthe...,0.012,0,67,0,0,True
13676,5415503,0.550725,"Oh sure, put a black guy in the role of ""Caesa...",0.014493,0.014493,0.565217,0.246377,0.0,0.0,0.0,...,oh sure put a black guy in the role of caesar ...,oh sure put black guy role caesar youd crying ...,oh sure put black guy role caesar youd cri any...,oh sure put black guy rol caesar youd cry anyo...,0.023,0,19,1,1,True
13677,5936191,0.685714,Great work getting more scum off the streets.,0.071429,0.114286,0.014286,0.628571,0.014286,,,...,great work getting more scum off the streets,great work getting scum streets,great work get scum streets,gre work get scum streets,0.022,0,8,1,1,True
13678,744227,0.8,There is an error there.. Something U can not...,0.0,0.0,0.0,0.8,0.0,,,...,there is an error there something u can not s...,error there something u see smart,error there someth u see smart,er there someth u see smart,0.038,0,16,0,1,False


### Naive Bayes

In [27]:
best_metric = 0
metric_dict = ''
model_factors = []

SUBSET_OF_INTEREST = "Target"
METRIC_OF_INTEREST = "F1"

dfs = [random_df, prepared_25, prepared_50, prepared_75]
label = ["random_df", "prepared_25", "prepared_50", "prepared_75"]

for i in range(len(dfs)):
    for text in ['cleaned_w_stopwords', 'cleaned_no_stem', 'cleaned_porter', 'cleaned_lancaster']:

        factors = [label[i], text, tp]
        print(factors)

        clf, output = run_model(dfs[i], comments = text, should_print=False)
        metrics = get_metrics(model)
        metric_of_interest = metrics[SUBSET_OF_INTEREST][METRIC_OF_INTEREST]

        if metric_of_intest > best_metric:
            best_metric = metric_of_interest
            
            model_factors = factors
            metric_dict = metrics

['cleaned_w_stopwords', 0.6]
(17092, 30051)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy: 0.8221442152991078 , Target Accuracy: 0.8695023148148148, Nontarget Accuracy: 0.7737355811889973

['cleaned_w_stopwords', 0.7]
(17092, 30051)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy: 0.8260530421216848 , Target Accuracy: 0.8893617021276595, Nontarget Accuracy: 0.7616987809673614

['cleaned_w_stopwords', 0.8]
(17092, 30051)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy: 0.8224107665301346 , Target Accuracy: 0.9002347417840375, Nontarget Accuracy: 0.7450408401400234

['cleaned_no_stem', 0.6]
(17092, 30048)
The unique values predicted for the training set include :[0 1]
The unique values predicted for the test set include :[0 1]
Accuracy

In [28]:
model_factors, best_metric

(0.9019953051643192, ['cleaned_no_stem', 0.8])

In [29]:
metric_dict

(17092, 30048)
