In [1]:
import numpy as np
import pandas as pd

# which model are we going to use
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# text vectorizing stuff
from sklearn.feature_extraction.text import TfidfVectorizer

# things to enable scoring and cross validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack

import time

import matplotlib.pyplot as plt
%matplotlib inline

#### some helper stuff for tracking performance over the duration of the project

In [2]:
def write_model_timestamp(model_type, kfolds, scores, note):
    '''
    Parameters:
    model_type = string description of the model(s) used to make these scores
    kfolds     = how many folds in kfold cross validation used
    scores     = list of ROC AUC avg scores of models for each class, floats should be like 0.9784
    note       = string, whatever is of note about the model, made a change or whatever
    
    Returns:
    None, but writes (appends) a line to scores.txt in the root directory so that progress can be tracked
    The format is:
            time(s)~model_type~kfold~avg_roc_auc~toxic_auc~s_toxic_auc~obscene_auc~threat_auc~insult_auc~i_hate_auc~notes
            
    scores.txt is a tilde '~' seperated CSV like:
        time~model_type~kfold~avg_roc_auc~toxic_auc~s_toxic_auc~obscene_auc~threat_auc~insult_auc~i_hate_auc~notes
        1520303252~0.9794005980274005~note something
    '''

    out_text = "{:10.0f}~{:}~{:2d}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:0.8f}~{:}\n".format(time.time(), 
                                             model_type, 
                                             kfolds, 
                                             np.mean(scores),
                                             scores[0],
                                             scores[1],
                                             scores[2],
                                             scores[3],
                                             scores[4],
                                             scores[5],                                                
                                             note)
    
    with open("../scores.txt", 'a') as out_file:
        out_file.write(out_text)
        
        print("wrote:")
        print(out_text)
        print("to file")

#### Load data and light processing

In [3]:
train = pd.read_csv('../data/train.csv').fillna(' ')
test = pd.read_csv('../data/test.csv').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

#### Vectorize words from both corpuses (corpi?)

In [4]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=50000)    # 10k was initial, 50k seemed to work well in initial testing

word_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [5]:
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

print("train shape:", train_word_features.shape)
print("test shape:", test_word_features.shape)

train shape: (159571, 50000)
test shape: (153164, 50000)


#### Attempt to tune on a single split instead of 10 kfold since that will take forever

In [9]:
# LogisticRegression:
#   B-)
# Multinomial NB:
#   alpha=0.03
#   fit_prior=False
# RandomForests:
#   n_estimators=32
#   max_depth=512 seems to be the best with simple hparam testing

class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for _class in class_names:
    x_train, x_test, y_train, y_test = train_test_split(train_word_features, train[_class], test_size=0.1, random_state=1337)


    results = []
    
#     print("Class: {:}".format(_class))
#     for nest in nests:
    model1 = RandomForestClassifier(n_estimators=32, n_jobs=-1, max_depth=512)
    model2 = MultinomialNB(alpha=0.03, fit_prior=False)
    model3 = LogisticRegression(solver='sag')
    
    meta_model = VotingClassifier(estimators=[('rf', model1), ('mnb', model2), ('lr', model3)],
                                  weights=[1.0, 1.0, 1.5],
                                  voting='soft',
                                  n_jobs=-1)
    
    meta_model.fit(x_train, y_train)

    preds = meta_model.predict(x_test)

    result = roc_auc_score(y_test, preds)
    results.append(result)

    print("Class: {: <14}  ROC AUC: {:0.4f}".format(_class, result))
    

Class: toxic           ROC AUC: 0.8265
Class: severe_toxic    ROC AUC: 0.6872
Class: obscene         ROC AUC: 0.8670
Class: threat          ROC AUC: 0.5671
Class: insult          ROC AUC: 0.8053
Class: identity_hate   ROC AUC: 0.6244


In [None]:
# voting ensemble hard voting
# Class: toxic           ROC AUC: 0.8192
# Class: severe_toxic    ROC AUC: 0.6612
# Class: obscene         ROC AUC: 0.8622
# Class: threat          ROC AUC: 0.5766
# Class: insult          ROC AUC: 0.8028
# Class: identity_hate   ROC AUC: 0.6099

# voting ensemble soft voting
# Class: toxic           ROC AUC: 0.8360
# Class: severe_toxic    ROC AUC: 0.7132
# Class: obscene         ROC AUC: 0.8777
# Class: threat          ROC AUC: 0.5863
# Class: insult          ROC AUC: 0.8261
# Class: identity_hate   ROC AUC: 0.6647

# voting ensemble soft voting with weights [1.0, 1.0, 1.5]
# Class: toxic           ROC AUC: 0.8265
# Class: severe_toxic    ROC AUC: 0.6872
# Class: obscene         ROC AUC: 0.8670
# Class: threat          ROC AUC: 0.5671
# Class: insult          ROC AUC: 0.8053
# Class: identity_hate   ROC AUC: 0.6244

In [6]:
scores = []

NUM_FOLDS = 10

train_features = train_word_features.copy()

# submission = pd.DataFrame.from_dict({'id': test['id']})

class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for class_name in class_names:
    train_target = train[class_name]
    
    model1 = RandomForestClassifier(n_estimators=32, n_jobs=-1, max_depth=512)
    model2 = MultinomialNB(alpha=0.03, fit_prior=False)
    model3 = LogisticRegression(solver='sag')
    
    classifier = VotingClassifier(estimators=[('rf', model1), ('mnb', model2), ('lr', model3)],
                                  voting='soft',
                                  n_jobs=-1)

    kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=1337)
    
#     results = cross_val_score(classifier, train_features, train_target, cv=5, n_jobs=-1, scoring='roc_auc')
    results = cross_val_score(classifier, train_features, train_target, cv=kfold, scoring='roc_auc')
    
    print('CV Spread for class "{}":'.format(class_name))
    for result in results:
        print("    {:0.4f}".format(result), end=" ")
        
    print(" ")
        
    cv_score = np.mean(results)
    scores.append(cv_score)
    
    print('    CV score for class "{}" is {:0.4}\n'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
#     submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {:0.4f}'.format(np.mean(scores)))

write_model_timestamp('Stacked Voting Ensemble', NUM_FOLDS, scores, "word2vec max 50k features, Logistic Regression, MultinomialNB, and RandomForests")

CV Spread for class "toxic":
    0.9687     0.9668     0.9701     0.9695     0.9691     0.9717     0.9675     0.9692     0.9699     0.9708  
    CV score for class "toxic" is 0.9693

CV Spread for class "severe_toxic":
    0.9819     0.9765     0.9695     0.9822     0.9843     0.9742     0.9784     0.9773     0.9845     0.9773  
    CV score for class "severe_toxic" is 0.9786

CV Spread for class "obscene":
    0.9861     0.9843     0.9807     0.9816     0.9843     0.9785     0.9837     0.9863     0.9851     0.9815  
    CV score for class "obscene" is 0.9832

CV Spread for class "threat":
    0.9568     0.9421     0.9694     0.9549     0.9532     0.9669     0.9763     0.9622     0.9580     0.9276  
    CV score for class "threat" is 0.9567

CV Spread for class "insult":
    0.9747     0.9738     0.9729     0.9744     0.9774     0.9711     0.9736     0.9751     0.9775     0.9746  
    CV score for class "insult" is 0.9745

CV Spread for class "identity_hate":
    0.9582     0.9593     

In [44]:
print("{: <14} {:}".format("CLASS", "AVG ROC AUC"))

for item in zip(class_names, scores):
    print("{: <14} {:0.4f}".format(item[0], item[1]))

CLASS          AVG ROC AUC
toxic          0.9698
severe_toxic   0.9859
obscene        0.9854
threat         0.9828
insult         0.9765
identity_hate  0.9761
