In [1]:
import pandas as pd
import string
import re
import string
import numpy as np
import datetime

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.stem import LancasterStemmer 

import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
import pickle_functions as pf

In [3]:
files = ['sub_train_df1_preprocessed',
        'sub_train_df2_preprocessed',
        'sub_train_df3_preprocessed',
        'sub_train_df4_preprocessed']

In [4]:
dfs = []
for file in files:
    data = pf.read_pickle(bucket_name='advancedml-koch-mathur-hinkson', filename=file)
    dfs.append(data)

In [5]:
dfs[0].columns

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count', 'split',
       'cleaned_w_stopwords_str', 'cleaned_w_stopwords', 'cleaned_no_stem_str',
       'cleaned_no_stem', 'cleaned_porter_str', 'cleaned_porter',
       'cleaned_lancaster_str', 'cleaned_lancaster', 'bigrams_unstemmed',
       

In [8]:
drop_cols = ['split', 'cleaned_w_stopwords', 'cleaned_no_stem', 'cleaned_porter', 'cleaned_lancaster', 'bigrams_unstemmed',
       'perc_upper', 'num_exclam', 'num_words', 'perc_stopwords',
       'num_upper_words']

In [11]:
prep_dfs = []
for df in dfs:
    df = df.drop(drop_cols, axis = 1)
    prep_dfs.append(df)

In [12]:
prep_dfs[0].columns

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str'],
      dtype='object')

In [14]:
data = pd.concat(prep_dfs)

In [15]:
data.shape

(400000, 49)

In [16]:
train = data

### Load and split data

Read in test.csv and train.csv

Create a new column called "toxicity_category" in the train data frame categorizing comments as toxic ("1") or non-toxic ("0").

In [17]:
train['toxicity_category'] = train.target.apply(lambda x: 1 if x > 0.5 else 0)

Split train.csv into training (80%) and validation sets (20%).

In [18]:
msk = np.random.rand(len(train)) < 0.8
train_set = train[msk]
validation_set = train[~msk]

In [19]:
print(train_set.toxicity_category.value_counts())

0    300520
1     18833
Name: toxicity_category, dtype: int64


In [20]:
print(validation_set.toxicity_category.value_counts())

0    75905
1     4742
Name: toxicity_category, dtype: int64


In [21]:
toxic = train[train.toxicity_category == 1]
nontoxic = train[train.toxicity_category == 0]

In [22]:
train.shape, toxic.shape, nontoxic.shape

((400000, 50), (23575, 50), (376425, 50))

Reshaping the dataset to be include an equal number of toxic and nontoxic samples

In [23]:
quarter = len(toxic)

In [24]:
random_df = train.sample(quarter*4)

In [25]:
prepared_25 = toxic.append(nontoxic.sample(len(toxic)*3))
prepared_25 = prepared_25.sample(frac=1).reset_index(drop=True)
print(prepared_25.toxicity_category.value_counts())

prepared_50 = toxic.append(toxic).append(nontoxic.sample(len(toxic)*2))
prepared_50 = prepared_50.sample(frac=1).reset_index(drop=True)
print(prepared_50.toxicity_category.value_counts())

prepared_75 = toxic.append(toxic).append(toxic).append(nontoxic.sample(len(toxic)))
prepared_75 = prepared_75.sample(frac=1).reset_index(drop=True)
print(prepared_75.toxicity_category.value_counts())


0    70725
1    23575
Name: toxicity_category, dtype: int64
1    47150
0    47150
Name: toxicity_category, dtype: int64
1    70725
0    23575
Name: toxicity_category, dtype: int64


In [26]:
def run_model(model_df, train_perc=.80,  model_type = "MultiNB", 
             see_inside=False, comments="comment_text",
             target='toxicity_category'):
    '''
    This function runs a single machine learning model as per the specified parameters.
    
    Input(s):
        model_df: source data frame
        train_perc: percentage that should be used for training set
        addtl_feats: (list) list of non text columns to include
        model_type: which machine learning model to use
        see_inside: returns the intermediate tokenized and vectorized arrays
        comments: source column for text data
        target: source column for y values
        
    Output(s):
    
    '''
    
    train_start = 0
    train_end = round(model_df.shape[0]*train_perc) 

    test_start = train_end
    test_end = model_df.shape[0]
    
    X_all = model_df[comments].values
    y_all = model_df[target].values

    # calculating frequencies
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    fitted_vectorizer=tfidf_vectorizer.fit(model_df[comments].astype('U'))
    X_all_tfidf =  fitted_vectorizer.transform(model_df[comments].astype('U'))
    
    
    X_train = X_all_tfidf[train_start:train_end]
    y_train = model_df[train_start:train_end][target].values
    y_train=y_train.astype('int')
    

    X_test = X_all_tfidf[test_start:test_end]
    y_test = model_df[test_start:test_end][target].values
    
    
    model_dict = {}
    model_dict["MultiNB"] = MultinomialNB()
    model_dict['SVM'] = svm.SVC(kernel='linear', probability=True, random_state=1008)
    model_dict["LR"] = LogisticRegression(penalty="l1",C=1e5)
        
    clf = model_dict[model_type].fit(X_train.toarray(), y_train)
    
    predicted = clf.predict(X_test)
    
    output = model_df[test_start:test_end]
    output['predicted'] = predicted
    output['y_test'] = y_test
    output['accuracy'] = output.predicted == output.y_test
    
    if see_inside == True:
        return clf, output, X_all_counts, X_all_tfidf
    else:
        return clf, output


In [27]:
def get_metrics(output, should_print=True, round_to=3, detailed = False):
    metrics = {}
    targets = output[output.y_test == 1]
    nontargets = output[output.y_test == 0]
                
    dfs = [output, targets, nontargets]
    labels = ["Overall", "Target", "Non-Target"]
    
    for i in range(len(dfs)):

        df, label = dfs[i], labels[i]
        if label == "Non-Target":
            pos_label = 0
        else:
            pos_label = 1
        
        metrics[label] = {}
        
        
        accuracy = round(accuracy_score(df.y_test, df.predicted), round_to)
        metrics[label]['Accuracy'] = accuracy
        
        precision = round(precision_score(df.y_test, df.predicted, pos_label=pos_label), round_to)
        metrics[label]['Precision'] = precision

        recall = round(recall_score(df.y_test, df.predicted, pos_label=pos_label), round_to)
        metrics[label]['Recall'] = recall
        
        f1 = round(f1_score(df.y_test, df.predicted, pos_label=pos_label), round_to)
        metrics[label]['F1'] = f1

        if label == "Overall":
            roc_auc = round(roc_auc_score(df.y_test, df.predicted), round_to)
            metrics[label]['ROC_AUC'] = roc_auc
            
        if should_print == True:
            print("{} Accuracy: {}".format(label, accuracy))
            print("{} Precision: {}".format(label, precision))
            print("{} Recall: {}".format(label, recall))
            print("{} F1 Score: {}".format(label, f1))
            if label == "Overall":
                print("ROC_AUC: {}".format(roc_auc))
            print()
            
    if detailed == True:
        
        identities = output[output.identity_attack > .5]
        obscenity = output[output.obscene > .5]
        insults = output[output.insult > .5]
        threats = output[output.threat > .5]

        detail_dfs = [identities, obscenity, insults, threats]
        detail_labels = ["Strong Identity", "Obscenity", "Insults", "Threats"]
        
        for i in range(len(detail_dfs)):
            df, label = detail_dfs[i], detail_labels[i]

            metrics[label] = {}
        
            f1 = round(f1_score(df.y_test, df.predicted, pos_label=pos_label), round_to)
            metrics[label]['F1'] = f1
            
            if should_print == True:
           
                print("{} F1 Score: {}".format(label, f1))
            
    return metrics
   

### Highlighted Model

In [54]:
clf, output = run_model(prepared_50, comments = "cleaned_no_stem")

KeyboardInterrupt: 

In [None]:
get_metrics(output, detailed=True)

In [28]:
toxic.columns

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str', 'toxicity_category'],
      dtype='object')

### Naive Bayes

In [29]:
best_metric = 0
metric_dict = ''
model_factors = []

SUBSET_OF_INTEREST = "Target"
METRIC_OF_INTEREST = "F1"

dfs = [random_df, prepared_50]
label = ["random_df", "prepared_50"]

mn = 0

for i in range(len(dfs)):
    for text in ['cleaned_w_stopwords_str', 'cleaned_no_stem_str', 'cleaned_porter_str',
       'cleaned_lancaster_str']:

        factors = [label[i], text]
        mn += 1
        print("{}. {}".format(mn, datetime.datetime.now()))
        print(factors)

        clf, output = run_model(dfs[i], comments = text, model_type = "MultiNB")
        metrics = get_metrics(output, should_print=False)
        metric_of_interest = metrics[SUBSET_OF_INTEREST][METRIC_OF_INTEREST]
        
        print("Overall Accuracy: {}, Target Accuracy: {}, Non-Target Accuracy: {}".format(metrics["Overall"]["Accuracy"], metrics["Target"]["Accuracy"], metrics["Non-Target"]["Accuracy"]))
        print() 
        
        if (metric_of_interest > best_metric) and metric_of_interest < 0.95:
            best_metric = metric_of_interest
            
            model_factors = factors
            metric_dict = metrics

1. 2019-05-29 18:35:48.226749
['random_df', 'cleaned_w_stopwords_str']


MemoryError: 

In [None]:
model_factors, best_metric

In [None]:
metric_dict