In [None]:
import pandas as pd
import string
import re
import string
import numpy as np
import datetime

import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.stem import LancasterStemmer 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

### Load and split data

Read in test.csv and train.csv

In [None]:
# Load test.csv from the project's S3 bucket into a DataFrame.
# NOTE: pandas needs the optional `s3fs` package (and valid AWS credentials) to read s3:// URLs.
test = pd.read_csv("s3://advancedml-koch-mathur-hinkson/test.csv")

In [None]:
# Load train.csv from the project's S3 bucket into a DataFrame.
# NOTE(review): the following cell reassigns `train` from a local train.csv, so on a
# top-to-bottom run this result is overwritten; only one of the two loads is needed.
train = pd.read_csv("s3://advancedml-koch-mathur-hinkson/train.csv")

In [None]:
# Load train.csv from the current working directory.
# NOTE(review): this replaces the `train` frame read from S3 in the previous cell;
# keep whichever source is intended and skip the other.
train = pd.read_csv("train.csv")

Create a new column called "toxicity_category" in the train data frame categorizing comments as toxic (1, when `target` is greater than 0.5) or non-toxic (0).

In [None]:
# Binary label: 1 where the `target` score is strictly greater than 0.5, otherwise 0
# (missing scores compare as False and therefore become 0).
train['toxicity_category'] = (train['target'] > 0.5).astype('int64')

Split train.csv into training (80%) and validation sets (20%).

In [None]:
# https://stackoverflow.com/questions/24147278/how-do-i-create-test-and-train-samples-from-one-dataframe-with-pandas
# Draw one uniform number per row and send rows below 0.8 to the training set, so the
# split is roughly (not exactly) 80/20.
# A dedicated, seeded generator keeps the split identical across kernel restarts;
# the seed matches the random_state already used for the SVC in run_model.
msk = np.random.RandomState(1008).rand(len(train)) < 0.8
train_set = train[msk]
validation_set = train[~msk]

In [None]:
# Class balance of the training split (rows per toxicity_category value).
print(train_set['toxicity_category'].value_counts())

In [None]:
# Class balance of the validation split (rows per toxicity_category value).
print(validation_set['toxicity_category'].value_counts())

Create a small sample ("train_sample") from train_set on which to run models: 20% of its rows, drawn with replacement so that the draws are iid.

In [None]:
# 20% sample of the training split, drawn with replacement (so the same row can appear
# more than once). random_state makes the draw reproducible across kernel restarts.
train_sample = train_set.sample(frac=0.2, replace=True, random_state=1008)

In [None]:
# Class balance of the sampled training rows (rows per toxicity_category value).
print(train_sample['toxicity_category'].value_counts())

### Generate features

In [None]:
# Stemmer instances shared by clean_text (created once instead of per call).
ps = PorterStemmer()
ls = LancasterStemmer()

# English stop words plus the empty string, so empty tokens are filtered out together
# with the stop words. Requires the NLTK 'stopwords' corpus to be installed
# (nltk.download('stopwords')); otherwise this line raises LookupError.
sw = set(stopwords.words('english')) | {''}

def clean_text(text, stemming=None, remove_sw = True):
    '''
    Normalize a raw comment into a single space-separated string of tokens.

    Steps, in order:
        (1) replace hyphens with spaces, lowercase, and split on any whitespace,
        (2) strip leading/trailing punctuation from every token and drop tokens
            that are empty afterwards,
        (3) optionally remove English stop words (module-level set `sw`),
        (4) optionally stem the remaining tokens with the module-level Porter
            (`ps`) or Lancaster (`ls`) stemmer.

    Punctuation is stripped before stop-word removal and stemming so that tokens
    such as "the," or "running!" are compared and stemmed as "the" and "running".

    Inputs:
        text (string) - A string of text.
        stemming (None or string) - None (no stemming), "Porter" or "Lancaster".
        remove_sw (boolean) - True/False remove stop words.

    Outputs:
        (string) Cleaned text per the input parameters.

    Raises:
        ValueError - if stemming is not None, "Porter" or "Lancaster".
    '''
    # Reject a bad option up front instead of printing a message for every row
    # and silently returning unstemmed text.
    if stemming is not None and stemming not in ("Porter", "Lancaster"):
        raise ValueError(
            "Please enter a valid stemming type (None, 'Porter' or 'Lancaster'); "
            "got {!r}".format(stemming))

    # split() with no argument splits on runs of any whitespace (spaces, tabs,
    # newlines), so it never yields empty tokens by itself.
    tokens = text.replace("-", " ").lower().split()

    # Only leading/trailing punctuation is removed; inner apostrophes ("don't") stay,
    # which keeps such tokens comparable with the stop-word list.
    tokens = [w.strip(string.punctuation) for w in tokens]
    tokens = [w for w in tokens if w]  # tokens that were pure punctuation are now ''

    if remove_sw:
        tokens = [w for w in tokens if w not in sw]

    if stemming == "Porter":
        tokens = [ps.stem(w) for w in tokens]
    elif stemming == "Lancaster":
        tokens = [ls.stem(w) for w in tokens]

    return ' '.join(tokens)

In [None]:
def add_text_cleaning_cols(df):
    '''
    Add cleaned-text and simple count features to `df` in place.

    Timestamps are printed between stages as a rough progress indicator.

    Input:
        df - Data frame with raw text strings in a 'comment_text' column.

    Output:
        None. The data frame passed in gains these columns:
            (1) 'split' - (list) The raw text split on single spaces.
            (2) 'cleaned_w_stopwords' - (string) Text lowercased with punctuation
                                        removed; stop words are kept.
            (3) 'cleaned_no_stem' - (string) Text lowercased with punctuation and
                                        stop words removed; no stemming.
            (4) 'cleaned_porter' - (string) As 'cleaned_no_stem', then stemmed with
                                        the Porter method.
            (5) 'cleaned_lancaster' - (string) As 'cleaned_no_stem', then stemmed with
                                        the Lancaster method.
            (6) 'perc_upper' - (float) Fraction (0-1, rounded to 3 decimals) of the
                                        characters that are ASCII uppercase letters;
                                        0.0 for an empty string.
            (7) 'num_exclam' - (integer) Number of exclamation points in the text.
            (8) 'num_words' - (integer) Number of items in 'split'.
    '''
    print(datetime.datetime.now())

    comments = df["comment_text"]

    df['split'] = comments.apply(lambda x: x.split(" "))
    # args=(None, False): no stemming, keep stop words.
    df['cleaned_w_stopwords'] = comments.apply(clean_text, args=(None, False))

    print(datetime.datetime.now())
    df['cleaned_no_stem'] = comments.apply(clean_text)
    df['cleaned_porter'] = comments.apply(clean_text, args=("Porter",))
    df['cleaned_lancaster'] = comments.apply(clean_text, args=("Lancaster",))

    print(datetime.datetime.now())

    # Guard against empty comments, which would otherwise divide by zero.
    df['perc_upper'] = comments.apply(
        lambda x: round(len(re.findall(r'[A-Z]', x)) / len(x), 3) if x else 0.0)

    df['num_exclam'] = comments.apply(lambda x: x.count('!'))

    df['num_words'] = df['split'].apply(len)

    # format() converts the datetime to text; concatenating it to a str raises TypeError.
    print("DONE @ {}".format(datetime.datetime.now()))
        

    
    

In [None]:
# Add the cleaned-text and count feature columns to train_sample.
# The function assigns the new columns on the frame it is given and returns nothing.
add_text_cleaning_cols(train_sample)

In [None]:
# Display the column labels of train_sample to confirm which feature columns are present.
train_sample.columns

In [None]:
# Disabled: would write the processed sample to 'processed_sample_20_perc.csv'.
# NOTE(review): a later cell reads that file, so on a fresh run it must already exist
# on disk, or this line has to be re-enabled first.
#train_sample.to_csv('processed_sample_20_perc.csv')

Pickle the dataset and send to s3 bucket:

In [2]:
# Reload the previously processed sample from disk, then keep a random half of it.
# random_state makes the subsample (and everything derived from it below)
# reproducible across kernel restarts.
train_sample = pd.read_csv('processed_sample_20_perc.csv')
train_sample = train_sample.sample(frac=0.5, random_state=1008)

In [3]:
# Partition the sample by label: rows flagged toxic (1) and rows flagged non-toxic (0).
toxic = train_sample.loc[train_sample['toxicity_category'] == 1]
nontoxic = train_sample.loc[train_sample['toxicity_category'] == 0]

In [4]:
# (rows, columns) of the full sample, the toxic subset and the non-toxic subset.
tuple(frame.shape for frame in (train_sample, toxic, nontoxic))

((144346, 55), (8466, 55), (135880, 55))

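Reshape the dataset into samples with fixed class mixes: 25%, 50% (an equal number of toxic and nontoxic samples), and 75% toxic, plus a random sample of the same size for comparison.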

In [7]:
# Number of toxic rows; each prepared data set below holds four times this many rows.
quarter = toxic.shape[0]

In [8]:
# Random sample with the natural class mix, four times the number of toxic rows.
# random_state makes the draw reproducible across kernel restarts.
random_df = train_sample.sample(n=quarter*4, random_state=1008)

In [9]:
# Build three data sets of 4 * len(toxic) rows with 25%, 50% and 75% toxic rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed
# in pandas 2.0. random_state makes the draws and shuffles reproducible.
# NOTE(review): the 50% and 75% mixes repeat the toxic rows verbatim; run_model splits
# a frame by row position after this shuffle, so copies of the same comment can end up
# in both its training and evaluation slices.

# 25% toxic: every toxic row once, plus three times as many non-toxic rows.
prepared_25 = pd.concat([toxic, nontoxic.sample(len(toxic)*3, random_state=1008)])
prepared_25 = prepared_25.sample(frac=1, random_state=1008).reset_index(drop=True)
print(prepared_25.toxicity_category.value_counts())

# 50% toxic: every toxic row twice, plus twice as many non-toxic rows.
prepared_50 = pd.concat([toxic, toxic, nontoxic.sample(len(toxic)*2, random_state=1008)])
prepared_50 = prepared_50.sample(frac=1, random_state=1008).reset_index(drop=True)
print(prepared_50.toxicity_category.value_counts())

# 75% toxic: every toxic row three times, plus an equal count of non-toxic rows.
prepared_75 = pd.concat([toxic, toxic, toxic, nontoxic.sample(len(toxic), random_state=1008)])
prepared_75 = prepared_75.sample(frac=1, random_state=1008).reset_index(drop=True)
print(prepared_75.toxicity_category.value_counts())


0    25398
1     8466
Name: toxicity_category, dtype: int64
1    16932
0    16932
Name: toxicity_category, dtype: int64
1    25398
0     8466
Name: toxicity_category, dtype: int64


In [None]:
def run_model(model_df, train_perc=.80,  model_type = "SVM",
             see_inside=False, comments="comment_text",
             target='toxicity_category'):
    '''
    Fit one text classifier on TF-IDF features and predict the held-out rows.

    The first `train_perc` share of rows (by position) is used for training and the
    remaining rows for evaluation, so `model_df` should already be shuffled.

    Input(s):
        model_df: source data frame
        train_perc: fraction of rows (0-1) that should be used for the training set
        model_type: which machine learning model to use; one of "MultiNB",
            "GaussNB", "SVM" or "LR"
        see_inside: if True, also return the intermediate count and TF-IDF matrices
        comments: source column for text data
        target: source column for y values

    Output(s):
        (clf, output) where clf is the fitted classifier and output is a copy of the
        evaluation rows with added columns 'predicted', 'y_test' and 'accuracy'
        (True where the prediction equals the label).
        If see_inside is True: (clf, output, X_all_counts, X_all_tfidf), the raw term
        counts and TF-IDF weights for every row of model_df.

    Raises:
        KeyError: if model_type is not one of the supported names.
    '''
    # Factories rather than instances: only the requested estimator is constructed.
    model_factories = {
        "MultiNB": lambda: MultinomialNB(),
        "GaussNB": lambda: GaussianNB(),
        "SVM": lambda: svm.SVC(kernel='linear', probability=True, random_state=1008),
        # The l1 penalty needs a solver that supports it; the current scikit-learn
        # default (lbfgs) rejects it, liblinear accepts it.
        "LR": lambda: LogisticRegression(penalty="l1", C=1e5, solver="liblinear"),
    }
    # Fail before the expensive vectorization when the model name is unknown.
    if model_type not in model_factories:
        raise KeyError("Unknown model_type {!r}; expected one of {}".format(
            model_type, sorted(model_factories)))

    train_end = round(model_df.shape[0] * train_perc)

    # astype('U') turns every entry (including missing values) into a unicode string.
    texts = model_df[comments].values.astype('U')
    labels = model_df[target].values.astype('int')

    # Learn vocabulary and IDF weights from the training rows only, so nothing about
    # the evaluation rows leaks into the features; then transform every row.
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    tfidf_vectorizer.fit(texts[:train_end])
    X_all_tfidf = tfidf_vectorizer.transform(texts)

    X_train, y_train = X_all_tfidf[:train_end], labels[:train_end]
    X_test, y_test = X_all_tfidf[train_end:], labels[train_end:]

    if model_type == "GaussNB":
        # GaussianNB does not accept sparse input. The dense matrix is
        # rows x vocabulary in size, so this can need a lot of memory.
        X_train, X_test = X_train.toarray(), X_test.toarray()

    print("fitting model now")
    clf = model_factories[model_type]().fit(X_train, y_train)

    predicted = clf.predict(X_test)

    # Copy so the new columns are not written onto a slice of the caller's frame.
    output = model_df.iloc[train_end:].copy()
    output['predicted'] = predicted
    output['y_test'] = y_test
    output['accuracy'] = output.predicted == output.y_test

    if see_inside:
        # Raw term counts over the same vocabulary as the TF-IDF matrix.
        count_vectorizer = CountVectorizer(vocabulary=tfidf_vectorizer.vocabulary_)
        X_all_counts = count_vectorizer.transform(texts)
        return clf, output, X_all_counts, X_all_tfidf
    return clf, output


In [11]:
def get_metrics(output, should_print=True, round_to=3):
    '''
    Compute classification metrics overall and per true class.

    Input(s):
        output: data frame with a 'y_test' column (true labels, 0 or 1) and a
            'predicted' column (model predictions)
        should_print: if True, print every metric as it is computed
        round_to: number of decimal places to round each metric to

    Output(s):
        Dictionary keyed by "Overall", "Target" (rows where y_test == 1) and
        "Non-Target" (rows where y_test == 0). Each value is a dictionary with
        'Accuracy', 'Precision', 'Recall' and 'F1'; "Overall" also has 'ROC_AUC'.

    Note:
        The "Target" and "Non-Target" subsets contain a single true class, so within
        them recall equals accuracy and precision cannot be lowered by false
        positives; accuracy is the informative number there.
    '''
    # (label, rows to score, class treated as positive for precision/recall/F1)
    subsets = [
        ("Overall", output, 1),
        ("Target", output[output.y_test == 1], 1),
        ("Non-Target", output[output.y_test == 0], 0),
    ]

    metrics = {}
    for label, df, pos_label in subsets:
        scores = {
            'Accuracy': round(accuracy_score(df.y_test, df.predicted), round_to),
            'Precision': round(precision_score(df.y_test, df.predicted, pos_label=pos_label), round_to),
            'Recall': round(recall_score(df.y_test, df.predicted, pos_label=pos_label), round_to),
            'F1': round(f1_score(df.y_test, df.predicted, pos_label=pos_label), round_to),
        }

        if label == "Overall":
            # ROC AUC needs both classes present, so it is only computed on the full set.
            # It is calculated from the 'predicted' column as given.
            scores['ROC_AUC'] = round(roc_auc_score(df.y_test, df.predicted), round_to)

        metrics[label] = scores

        if should_print:
            print("{} Accuracy: {}".format(label, scores['Accuracy']))
            print("{} Precision: {}".format(label, scores['Precision']))
            print("{} Recall: {}".format(label, scores['Recall']))
            print("{} F1 Score: {}".format(label, scores['F1']))
            if label == "Overall":
                # Print the score itself; the old format string printed the label instead.
                print("ROC_AUC: {}".format(scores['ROC_AUC']))
            print()

    return metrics
   

### Highlighted Model

In [None]:
# Fit the default model type ("SVM") on the 50/50 data set, using the
# 'cleaned_no_stem' text column as the model input.
clf, output = run_model(prepared_50, comments = "cleaned_no_stem")

In [None]:
# Print the metrics for the overall, target and non-target rows; the returned
# dictionary is displayed as the cell output.
get_metrics(output)

### SVM

In [None]:
# Best configuration seen so far: its score, its metrics and its [data set, text column].
best_metric = 0
metric_dict = ''
model_factors = []

# Which entry of the get_metrics result is used to rank configurations.
SUBSET_OF_INTEREST = "Target"
METRIC_OF_INTEREST = "F1"

dfs = [random_df,  prepared_50]
label = ["random_df", "prepared_50"]

# Running count of the models fitted, used only in the progress output.
mn = 0

# Try every combination of data set and text-cleaning variant with the default model.
for frame_name, frame in zip(label, dfs):
    for text in ['cleaned_w_stopwords', 'cleaned_no_stem', 'cleaned_porter', 'cleaned_lancaster']:

        mn += 1
        factors = [frame_name, text]
        print("{}. {}".format(mn, datetime.datetime.now()))
        print(factors)

        clf, output = run_model(frame, comments = text)
        metrics = get_metrics(output, should_print=False)
        metric_of_interest = metrics[SUBSET_OF_INTEREST][METRIC_OF_INTEREST]

        accuracy_by_subset = [metrics[subset]["Accuracy"] for subset in ("Overall", "Target", "Non-Target")]
        print("Overall Accuracy: {}, Target Accuracy: {}, Non-Target Accuracy: {}".format(*accuracy_by_subset))
        print()

        # Keep a configuration only if it beats the current best and scores below 0.95.
        # NOTE(review): the 0.95 ceiling presumably screens out suspiciously high scores; confirm the intent.
        if best_metric < metric_of_interest < 0.95:
            best_metric = metric_of_interest

            model_factors = factors
            metric_dict = metrics

In [None]:
# Show the best configuration found ([data set label, text column]) and its score.
model_factors, best_metric

In [None]:
# Show the full metrics dictionary of the best configuration
# (still the initial '' if no run satisfied the selection condition).
metric_dict