In [1]:
import os
import numpy as np
import pandas as pd

from textstat.textstat import textstat

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from scipy.sparse import hstack

# Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from NbSvmClassifier import NbSvmClassifier
import xgboost as xgb


# BoW feature extraction
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim



In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
sample_submission_file = '../data/sample_submission.csv'

train_all = pd.read_csv(train_file)
test_for_submission = pd.read_csv(test_file)
sample_submission = pd.read_csv(sample_submission_file)

# Clean data

TODO: Account for more cases

In [3]:
train_all['comment_text'].fillna("unknown", inplace=True)
test_for_submission['comment_text'].fillna("unknown", inplace=True)

# Split train data

In [4]:
kf = KFold(n_splits=5, random_state=0)
cv_splits_train = []
cv_splits_test = []
for train_index, test_index in kf.split(train_all):
    cv_splits_train.append(train_index)
    cv_splits_test.append(test_index)
cv_folds = len(cv_splits_train)

In [5]:
cv_train = []
cv_test = []
for i in range(cv_folds):
    cv_train.append(train_all.loc[cv_splits_train[i], :])
    cv_test.append(train_all.loc[cv_splits_test[i], :])

In [6]:
train = cv_train
test = cv_test

Count and check that the data is split such that the percentages of labels in train/test are roughly equal

In [7]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [8]:
def print_count_of_each_label(df):
    for label in labels:
        print('{}: {} / {} ({}%)'.format(label.rjust(len(labels[-1])),
                                         df.loc[df[label] == 1].shape[0],
                                         len(df),
                                         np.round(df.loc[df[label] == 1].shape[0]/len(df)*100, 3)))

for i in range(cv_folds):
    print('CV train {}'.format(i))
    print_count_of_each_label(cv_train[i])
    print('CV test {}'.format(i))
    print_count_of_each_label(cv_test[i])

CV train 0
        toxic: 7366 / 76680 (9.606%)
 severe_toxic: 792 / 76680 (1.033%)
      obscene: 4108 / 76680 (5.357%)
       threat: 234 / 76680 (0.305%)
       insult: 3826 / 76680 (4.99%)
identity_hate: 654 / 76680 (0.853%)
CV test 0
        toxic: 1871 / 19171 (9.76%)
 severe_toxic: 173 / 19171 (0.902%)
      obscene: 1001 / 19171 (5.221%)
       threat: 71 / 19171 (0.37%)
       insult: 939 / 19171 (4.898%)
identity_hate: 160 / 19171 (0.835%)
CV train 1
        toxic: 7413 / 76681 (9.667%)
 severe_toxic: 772 / 76681 (1.007%)
      obscene: 4136 / 76681 (5.394%)
       threat: 244 / 76681 (0.318%)
       insult: 3841 / 76681 (5.009%)
identity_hate: 657 / 76681 (0.857%)
CV test 1
        toxic: 1824 / 19170 (9.515%)
 severe_toxic: 193 / 19170 (1.007%)
      obscene: 973 / 19170 (5.076%)
       threat: 61 / 19170 (0.318%)
       insult: 924 / 19170 (4.82%)
identity_hate: 157 / 19170 (0.819%)
CV train 2
        toxic: 7397 / 76681 (9.646%)
 severe_toxic: 764 / 76681 (0.996%)
      o

# Split train/test into X and y

So that we can use sklearn classifiers easily

In [9]:
X_train = [None] * cv_folds
y_train = [None] * cv_folds
X_test = [None] * cv_folds
y_test = [None] * cv_folds
for i in range(cv_folds):
    X_train[i], y_train[i] = train[i][["comment_text"]], train[i][labels]
    X_test[i], y_test[i] = test[i][["comment_text"]], test[i][labels]

In [10]:
print(X_train[0].shape, y_train[0].shape)
print(X_test[0].shape, y_test[0].shape)

(76680, 1) (76680, 6)
(19171, 1) (19171, 6)


# Extract features

## Extract textstat features

In [11]:
def extract_textstat_features(df):
    features_df = pd.DataFrame()
    features_df['comment_text_len'] = df['comment_text'].apply(len)
    features_df['comment_text_lex_count'] = df['comment_text'].apply(textstat.lexicon_count)
    features_df['comment_text_syl_count'] = df['comment_text'].apply(textstat.syllable_count)
    features_df['comment_text_sent_count'] = df['comment_text'].apply(textstat.sentence_count)
    features_df['comment_text_flesch_reading_ease'] = df['comment_text'].apply(textstat.flesch_reading_ease)
    features_df['comment_text_flesch_kincaid_grade'] = df['comment_text'].apply(textstat.flesch_kincaid_grade)
    
    features_df['comment_text_syl_over_lex'] = features_df['comment_text_syl_count'] / features_df['comment_text_lex_count']
    features_df['comment_text_lex_over_sent'] = features_df['comment_text_lex_count'] / features_df['comment_text_sent_count']
    
    return features_df

In [12]:
train_textstat_features_file = '../data/train_textstat_features.csv'
if os.path.isfile(train_textstat_features_file):
    X_train_all_features_textstat = pd.read_csv(train_textstat_features_file, index_col=0)
else:
    X_train_all_features_textstat = extract_textstat_features(train_all)
    X_train_all_features_textstat.to_csv(train_textstat_features_file)

## Split textstat features

In [13]:
X_train_features_textstat = []
X_test_features_textstat = []
for i in range(cv_folds):
    X_train_features_textstat.append(X_train_all_features_textstat.loc[cv_splits_train[i], :])
    X_test_features_textstat.append(X_train_all_features_textstat.loc[cv_splits_test[i], :])

## Extract BoW

In [14]:
re_tok = re.compile('([{}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(string.punctuation))
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [15]:
n = train_all.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )
train_term_doc = vec.fit_transform(train_all['comment_text'])

In [16]:
X_train_features_bow = []
X_test_features_bow = []
for i in range(cv_folds):
    X_train_features_bow.append(train_term_doc[cv_splits_train[i], :])
    X_test_features_bow.append(train_term_doc[cv_splits_test[i], :])

## Extract emotion scores

In [17]:
emotion_lexicon_file = "../data/features/NRC-AffectIntensity-Lexicon.txt"
emotion_lexicon = pd.read_csv(emotion_lexicon_file, sep = "\t")

In [18]:
emotion_term_score = {'anger': {}, 'fear': {}, 'joy': {}, 'sadness': {}}
for row in emotion_lexicon.itertuples():
    emotion_term_score[row.AffectDimension][row.term] = row.score

In [19]:
translator = str.maketrans('', '', string.punctuation)
emotions = ['anger', 'fear', 'joy', 'sadness']
def avg_emotion_score(comment_text):
    try:
        comment_cleaned = comment_text.translate(translator)
    except:
        comment_cleaned = ""
    comment_cleaned = comment_cleaned.lower()
    comment_cleaned_words = comment_cleaned.split(" ")
    
    emotion_scores = {'anger': 0, 'fear': 0, 'joy': 0, 'sadness': 0}
    for emotion in emotions:
        scores = [emotion_term_score[emotion].get(word) for word in comment_cleaned_words
                    if emotion_term_score[emotion].get(word) is not None]
        if len(scores) == 0:
            continue
        emotion_scores[emotion] = np.sum(scores) / len(comment_cleaned_words)
        
    return [emotion_scores[emotion] for emotion in emotions]
    
def extract_emotion_features(df):
    features_df = df['comment_text'].apply(avg_emotion_score)
    return pd.DataFrame(features_df.values.tolist(), columns=['comment_text_emotion_{}'.format(emotion) for emotion in emotions])

In [20]:
X_train_all_features_emotion = extract_emotion_features(train_all)

In [21]:
X_train_features_emotion = []
X_test_features_emotion = []
for i in range(cv_folds):
    X_train_features_emotion.append(X_train_all_features_emotion.loc[cv_splits_train[i], :])
    X_test_features_emotion.append(X_train_all_features_emotion.loc[cv_splits_test[i], :])

## Extract Word2Vec

In [22]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin', binary=True)

In [45]:
def featurize_w2v(comment_text):
    sentence = tokenize(comment_text)
    f = np.zeros(w2v_model.vector_size)
    count = 0 
    for w in sentence:
        try:
            vec = w2v_model[w]
            count += 1
        except KeyError:
            continue
        f += vec
    if count > 0:
        f /= count
    return f

In [44]:
X_train_all_features_w2v = train_all['comment_text'].apply(featurize_w2v)

In [46]:
X_train_all_features_w2v = np.array(X_train_all_features_w2v.tolist())

In [47]:
X_train_features_w2v = []
X_test_features_w2v = []
for i in range(cv_folds):
    X_train_features_w2v.append(X_train_all_features_w2v[cv_splits_train[i]])
    X_test_features_w2v.append(X_train_all_features_w2v[cv_splits_test[i]])

## Extract glove Word2Vec

In [65]:
w2v_glove_model = gensim.models.KeyedVectors.load_word2vec_format('../data/glove.6B.50d.gensim.txt')

In [69]:
def featurize_w2v_glove(comment_text):
    sentence = tokenize(comment_text)
    f = np.zeros(w2v_glove_model.vector_size)
    count = 0 
    for w in sentence:
        try:
            vec = w2v_glove_model[w]
            count += 1
        except KeyError:
            continue
        f += vec
    if count > 0:
        f /= count
    return f

In [70]:
X_train_all_features_w2v_glove = train_all['comment_text'].apply(featurize_w2v_glove)

In [71]:
X_train_all_features_w2v_glove = np.array(X_train_all_features_w2v_glove.tolist())

In [72]:
X_train_features_w2v_glove = []
X_test_features_w2v_glove = []
for i in range(cv_folds):
    X_train_features_w2v_glove.append(X_train_all_features_w2v_glove[cv_splits_train[i]])
    X_test_features_w2v_glove.append(X_train_all_features_w2v_glove[cv_splits_test[i]])

# Function to calculate mean column-wise log loss of y_pred vs y_actual

In [27]:
def calculate_score(y_actual, y_pred):
    col_log_loss = [log_loss(np.array(y_actual[label]),
                             np.array([1.-np.array(y_pred[label]), np.array(y_pred[label])]).T) for label in labels]
    return col_log_loss

# Test classification scores

## Perfect score

In [28]:
np.mean(calculate_score(y_test[0], y_test[0]))

9.9920072216264108e-16

## ZeroR

In [29]:
scores_zeror = []
for i in range(cv_folds):
    data = np.array([np.zeros(len(labels))] * len(X_test[i]))
    y_pred_zeror = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_zeror)
    scores_zeror.append(scores)
print(np.mean(scores_zeror, axis=0))
print(np.mean(scores_zeror))

[ 3.32844347  0.34772673  1.84096823  0.10990296  1.71701177  0.29331534]
1.27289475201


## All 0.5

In [30]:
scores_half = []
for i in range(cv_folds):
    data = np.array([np.ones(len(labels))*0.5] * len(X_test[i]))
    y_pred_half = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_half)
    scores_half.append(scores)
print(np.mean(scores_half, axis=0))
print(np.mean(scores_half))

[ 0.69314718  0.69314718  0.69314718  0.69314718  0.69314718  0.69314718]
0.69314718056


## Test calculate_score

In [31]:
scores_half = []
for i in range(cv_folds):
    data = np.array([np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1.0])] * len(X_test[i]))
    y_pred_half = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_half)
    scores_half.append(scores)
print(np.mean(scores_half, axis=0))
print(np.mean(scores_half))

[  0.69314718   0.91220862   1.15881057   1.6050267    2.19335539
  34.24546105]
6.80133491769


In [32]:
def best_classifier_per_label(classifiers, y_pred, scores):
    classifier_scores = np.array([np.mean(scores[classifier_name], axis=0) for classifier_name, _ in classifiers])
    best_classifier_score_per_label = [(classifiers[min_idx][0], classifier_scores[min_idx][i])
                                           for i, min_idx in enumerate(np.argmin(classifier_scores, axis=0))]
    for i, label in enumerate(labels):
        print("[{}] {} : {}".format(label, best_classifier_score_per_label[i][0], best_classifier_score_per_label[i][1]))
    print("Average: {}".format(np.mean([x[1] for x in best_classifier_score_per_label])))
    

## Run classifiers

In [33]:
def run_classifiers(classifiers, X_train_features, X_test_features, y_train, clf, y_pred, scores):
    for classifier_name, classifier in classifiers:
        print('Training with {}'.format(classifier_name))
        clf[classifier_name] = [{}]*cv_folds
        y_pred[classifier_name] = [{}]*cv_folds
        scores[classifier_name] = [{}]*cv_folds
        for fold in range(cv_folds):
            y_pred[classifier_name][fold] = pd.DataFrame()
            for label in labels:
                clf[classifier_name][fold][label] = classifier()
                clf[classifier_name][fold][label].fit(X_train_features[fold], y_train[fold][label])

                y_pred[classifier_name][fold][label] = clf[classifier_name][fold][label].predict_proba(X_test_features[fold]).T[1]

            scores[classifier_name][fold] = calculate_score(y_test[fold], y_pred[classifier_name][fold])
        print('Column-wise log loss for {}: {} - {}'.format(classifier_name, np.mean(scores[classifier_name], axis=0), np.mean(scores[classifier_name])))
    best_classifier_per_label(classifiers, y_pred, scores)

## Textstat features only

In [53]:
classifiers_textstat = []
# classifiers_textstat += [
#     ('Logistic Regression {}'.format(c), lambda: LogisticRegression(solver="newton-cg", C=2.0**c, max_iter=1000)) for c in np.arange(-3,4,1)]
classifiers_textstat += [
    
    ('XGBoost', lambda: xgb.XGBClassifier()),
    
#     ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    
#     ('NB-SVM', lambda: NbSvmClassifier()),
    
#     ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
#     ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
#     ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
#     ("Gradient Boosting", lambda: GradientBoostingClassifier()),
    
#     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
#     ("Naive Bayes", lambda: GaussianNB()),
#     ("Neural Net", lambda: MLPClassifier(alpha=1)),
#     ("AdaBoost", lambda: AdaBoostClassifier()),
#     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
    
    # Cannot even run
    #     ("Gaussian Process", lambda: GaussianProcessClassifier(1.0 * RBF(1.0)))  # Memory error??? Even with 32GB ram???
    #     ("Linear SVM", lambda: SVC(kernel="linear", C=0.025)),                   # Slow like shit
    #     ("RBF SVM", lambda: SVC(gamma=2, C=1))                                   # Slow like shit
]

In [55]:
clf_textstat = {}
y_pred_textstat = {}
scores_textstat = {}
run_classifiers(classifiers_textstat, X_train_features_textstat, X_test_features_textstat, y_train,
                clf_textstat, y_pred_textstat, scores_textstat)

Training with XGBoost
Column-wise log loss for XGBoost: [ 0.28483668  0.04888504  0.18583719  0.01931983  0.17817057  0.04614669] - 0.12719933169122624
[toxic] XGBoost : 0.284836675541959
[severe_toxic] XGBoost : 0.04888503970659461
[obscene] XGBoost : 0.18583719111278976
[threat] XGBoost : 0.0193198264879471
[insult] XGBoost : 0.17817056803190126
[identity_hate] XGBoost : 0.04614668926616587
Average: 0.12719933169122624


# Emotion only

In [56]:
classifiers_emotion = []
classifiers_emotion += [
    
    ('XGBoost', lambda: xgb.XGBClassifier()),
    
#     ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    
#     ('NB-SVM', lambda: NbSvmClassifier()),
    
#     ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
#     ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
#     ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
#     ("Gradient Boosting", lambda: GradientBoostingClassifier()),
    
#     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
#     ("Naive Bayes", lambda: GaussianNB()),
#     ("Neural Net", lambda: MLPClassifier(alpha=1)),
#     ("AdaBoost", lambda: AdaBoostClassifier()),
#     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
    
]

In [57]:
clf_emotion = {}
y_pred_emotion = {}
scores_emotion = {}
run_classifiers(classifiers_emotion, X_train_features_emotion, X_test_features_emotion, y_train,
                clf_emotion, y_pred_emotion, scores_emotion)

Training with XGBoost
Column-wise log loss for XGBoost: [ 0.27910365  0.04835497  0.18336427  0.01567885  0.17384561  0.04629481] - 0.12444035929347044
[toxic] XGBoost : 0.27910365378493474
[severe_toxic] XGBoost : 0.048354966282939324
[obscene] XGBoost : 0.18336426875087616
[threat] XGBoost : 0.01567884675641421
[insult] XGBoost : 0.17384561448144115
[identity_hate] XGBoost : 0.046294805704217123
Average: 0.12444035929347046


## Textstat + emotion

In [58]:
classifiers_textstat_emotion = []
classifiers_textstat_emotion += [
    
    ('XGBoost', lambda: xgb.XGBClassifier()),
    
#     ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    
#     ('NB-SVM', lambda: NbSvmClassifier()),
    
#     ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
#     ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
#     ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
#     ("Gradient Boosting", lambda: GradientBoostingClassifier()),
    
#     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
#     ("Naive Bayes", lambda: GaussianNB()),
#     ("Neural Net", lambda: MLPClassifier(alpha=1)),
#     ("AdaBoost", lambda: AdaBoostClassifier()),
#     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
    
]

In [59]:
# concat features
X_train_features_textstat_emotion = [None]*cv_folds
X_test_features_textstat_emotion = [None]*cv_folds
for fold in range(cv_folds):
    X_train_features_textstat_emotion[fold] = np.hstack((X_train_features_textstat[fold], X_train_features_emotion[fold].values))
    X_test_features_textstat_emotion[fold] = np.hstack((X_test_features_textstat[fold], X_test_features_emotion[fold].values))

print(X_train_features_textstat[0].shape)
print(X_train_features_emotion[0].shape)
print(X_train_features_textstat_emotion[0].shape)

(76680, 8)
(76680, 4)
(76680, 12)


In [60]:
clf_textstat_emotion = {}
y_pred_textstat_emotion = {}
scores_textstat_emotion = {}
run_classifiers(classifiers_textstat_emotion, X_train_features_textstat_emotion,
                X_test_features_textstat_emotion, y_train,
                clf_textstat_emotion, y_pred_textstat_emotion, scores_textstat_emotion)

Training with XGBoost
Column-wise log loss for XGBoost: [ 0.25704964  0.04406231  0.16881971  0.01425027  0.16175613  0.04428455] - 0.1150371041726767
[toxic] XGBoost : 0.25704964456713714
[severe_toxic] XGBoost : 0.04406231405357034
[obscene] XGBoost : 0.16881971271279
[threat] XGBoost : 0.014250270017938266
[insult] XGBoost : 0.16175613442820064
[identity_hate] XGBoost : 0.04428454925642382
Average: 0.11503710417267671


## Bag of words only

In [34]:
classifiers_bow = [
#     ("Multinominal Naive Bayes", lambda: MultinomialNB()),
#     ('XGBoost', lambda: xgb.XGBClassifier()),
    ('NB-SVM', lambda: NbSvmClassifier()),
#     ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
]

In [35]:
clf_bow = {}
y_pred_bow = {}
scores_bow = {}
run_classifiers(classifiers_bow, X_train_features_bow,
                X_test_features_bow, y_train,
                clf_bow, y_pred_bow, scores_bow)

Training with NB-SVM
Column-wise log loss for NB-SVM: [ 0.10984432  0.02989424  0.06178523  0.01158265  0.07788502  0.0280709 ] - 0.05317706064445081
[toxic] NB-SVM : 0.1098443236820131
[severe_toxic] NB-SVM : 0.029894241779756425
[obscene] NB-SVM : 0.06178523018708384
[threat] NB-SVM : 0.01158264798630578
[insult] NB-SVM : 0.0778850242041904
[identity_hate] NB-SVM : 0.028070896027355303
Average: 0.0531770606444508


## Word2Vec only

In [39]:
classifiers_w2v = [
#     ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
#     ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
    ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000))
]

In [49]:
clf_w2v = {}
y_pred_w2v = {}
scores_w2v = {}

run_classifiers(classifiers_w2v, X_train_features_w2v,
                X_test_features_w2v, y_train,
                clf_w2v, y_pred_w2v, scores_w2v)

Training with Logistic Regression
Column-wise log loss for Logistic Regression: [ 0.14969022  0.03185584  0.09973018  0.0133629   0.10659322  0.03318054] - 0.07240214930700903
[toxic] Logistic Regression : 0.14969021900629492
[severe_toxic] Logistic Regression : 0.03185584269411572
[obscene] Logistic Regression : 0.09973017523216073
[threat] Logistic Regression : 0.013362904976916574
[insult] Logistic Regression : 0.1065932172309775
[identity_hate] Logistic Regression : 0.033180536701588756
Average: 0.07240214930700903


## Word2Vec glove only

In [73]:
classifiers_w2v_glove = [
#     ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
#     ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
    ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000))
]

In [74]:
clf_w2v_glove = {}
y_pred_w2v_glove = {}
scores_w2v_glove = {}

run_classifiers(classifiers_w2v_glove, X_train_features_w2v_glove,
                X_test_features_w2v_glove, y_train,
                clf_w2v_glove, y_pred_w2v_glove, scores_w2v_glove)

Training with Logistic Regression
Column-wise log loss for Logistic Regression: [ 0.21309508  0.04024206  0.14121731  0.01670117  0.13662473  0.03881534] - 0.09778261574609753
[toxic] Logistic Regression : 0.21309507787243942
[severe_toxic] Logistic Regression : 0.040242061860483166
[obscene] Logistic Regression : 0.14121730655828618
[threat] Logistic Regression : 0.016701174299176898
[insult] Logistic Regression : 0.13662473380238926
[identity_hate] Logistic Regression : 0.03881534008381026
Average: 0.09778261574609753


# Average results from multiple features and classifiers

In [50]:
def average_predictions(y_preds, classifier_names, weights):
    normalized_weights = np.array(weights) / np.sum(weights)
    y_pred_avg = [None] * cv_folds
    
    for i, fold in enumerate(range(cv_folds)):
        for i, y_pred in enumerate(y_preds):
            if i==0:
                y_pred_avg[fold] = y_pred[classifier_names[i]][fold]*weights[i]
            else:
                y_pred_avg[fold] += y_pred[classifier_names[i]][fold]*weights[i]
    return calculate_score(y_test[fold], y_pred_avg[fold])

## Textstat+emotion + BoW

In [61]:
textstat_emotion_AVG_bow_score = average_predictions([y_pred_textstat_emotion, y_pred_bow],
                                                     ["XGBoost", "NB-SVM"],
                                                     [0.5, 0.5])
print(textstat_emotion_AVG_bow_score)
print(np.mean(textstat_emotion_AVG_bow_score))


# 0.11366952025039936
# 0.053176833214058444

[0.15449181633605233, 0.031560525357768703, 0.092664343353911063, 0.012277951374001441, 0.10168048385854131, 0.032841573882729395]
0.0709194490272


## Textstat+emotion + Word2Vec

In [62]:
textstat_emotion_AVG_w2v_score = average_predictions([y_pred_textstat_emotion, y_pred_w2v],
                                                     ["XGBoost", "Logistic Regression"],
                                                     [0.5, 0.5])
print(textstat_emotion_AVG_w2v_score)
print(np.mean(textstat_emotion_AVG_w2v_score))


# 0.11366952025039936
# 0.07266043582520464

[0.17380224597115454, 0.034570600071424248, 0.11705944494758774, 0.013036789672118227, 0.11878032058068376, 0.036281322589013093]
0.0822551206387


## BoW + Word2Vec

In [77]:
bow_AVG_w2v_score = average_predictions([y_pred_bow, y_pred_w2v],
                                        ["NB-SVM", "Logistic Regression"],
                                        [0.9, 0.1])
print(bow_AVG_w2v_score)
print(np.mean(bow_AVG_w2v_score))

# 0.053176833214058444
# 0.07266043582520464

[0.10556171529822136, 0.027286902212003302, 0.06117745396973194, 0.011202445642929175, 0.077549464425811843, 0.028430402620803086]
0.0518680640283


## BoW + Word2Vec + textstat+emotion

In [64]:
bow_AVG_w2v_AVG_textstat_emotion_score = average_predictions([y_pred_bow, y_pred_w2v, y_pred_textstat_emotion],
                                        ["NB-SVM", "Logistic Regression", "XGBoost"],
                                        [0.9, 0.1, 0.001])
print(bow_AVG_w2v_AVG_textstat_emotion_score)
print(np.mean(bow_AVG_w2v_AVG_textstat_emotion_score))

# 0.053176833214058444
# 0.07266043582520464
# 0.11366952025039936

[0.10558159526655088, 0.027285213606180359, 0.062445173643995523, 0.01120044290915395, 0.077625754272640332, 0.028431435235483218]
0.0520949358223


# RUN ON TEST SET

In [None]:
def run_classifiers(classifiers, X_train_features, X_test_features, y_train, clf, y_pred, scores):
    for classifier_name, classifier in classifiers:
        print('Training with {}'.format(classifier_name))
        clf[classifier_name] = [{}]*cv_folds
        y_pred[classifier_name] = [{}]*cv_folds
        scores[classifier_name] = [{}]*cv_folds
        for fold in range(cv_folds):
            y_pred[classifier_name][fold] = pd.DataFrame()
            for label in labels:
                clf[classifier_name][fold][label] = classifier()
                clf[classifier_name][fold][label].fit(X_train_features[fold], y_train[fold][label])

                y_pred[classifier_name][fold][label] = clf[classifier_name][fold][label].predict_proba(X_test_features[fold]).T[1]

            scores[classifier_name][fold] = calculate_score(y_test[fold], y_pred[classifier_name][fold])
        print('Column-wise log loss for {}: {} - {}'.format(classifier_name, np.mean(scores[classifier_name], axis=0), np.mean(scores[classifier_name])))
    best_classifier_per_label(classifiers, y_pred, scores)
    
    


In [125]:
def predict(clf, classifier_name, X_test_features, fold=0):
    y_pred_test = []
    for label in labels:
        y_pred_test.append(clf[classifier_name][fold][label].predict_proba(X_test_features).T[1])
    return pd.DataFrame(np.array(y_pred_test).T, columns=labels)

In [112]:
X_testa_features_bow = vec.transform(test_for_submission['comment_text'])

In [113]:
X_testa_features_w2v = test_for_submission['comment_text'].apply(featurize_w2v)

In [114]:
X_testa_features_w2v = np.array(X_testa_features_w2v.tolist())

In [120]:
X_testa_features_bow.shape

(226998, 285100)

In [121]:
X_testa_features_w2v.shape

(226998, 300)

In [126]:
y_pred_test_bow = predict(clf_bow, "NB-SVM", X_testa_features_bow)

In [127]:
y_pred_test_w2v = predict(clf_w2v, "Logistic Regression", X_testa_features_w2v)

In [137]:
y_pred_test_bow["id"] = test_for_submission["id"]
y_pred_test_w2v["id"] = test_for_submission["id"]

In [141]:
y_pred_test_bow = y_pred_test_bow[["id"] + labels]
y_pred_test_w2v = y_pred_test_w2v[["id"] + labels]

In [147]:
y_pred_test_bow.to_csv("submission_bow.csv", index=False, encoding="UTF-8")

In [146]:
y_pred_test_w2v.to_csv("submission_w2v.csv", index=False, encoding="UTF-8")

In [150]:
y_pred_test_bow.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.012127,0.001585,0.005665,0.000498,0.005277,0.001622
1,6102620,0.009424,0.000887,0.006853,0.000426,0.005164,0.001454
2,14563293,0.004883,0.001114,0.003772,0.000485,0.003223,0.000956
3,21086297,0.039752,0.00265,0.01117,0.000468,0.011503,0.00121
4,22982444,0.010744,0.002018,0.006046,0.000539,0.004098,0.001528


In [151]:
y_pred_test_w2v.head()  0.012127*0.9 + 0.009607*0.1

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.009607,0.006013,0.013383,0.000152,0.012652,0.004393
1,6102620,0.007159,0.001165,0.003517,0.000453,0.004856,0.002553
2,14563293,0.008073,0.001183,0.005025,0.000245,0.006145,0.000629
3,21086297,0.00044,6.9e-05,1.6e-05,2e-05,0.000372,0.000163
4,22982444,0.004941,0.018521,0.002493,0.000103,0.006026,0.005221


In [153]:
y_pred_test_bow_w2v = y_pred_test_bow.loc[:, labels] *0.9 + y_pred_test_w2v.loc[:, labels] *0.1


y_pred_test_bow_w2v["id"] = test_for_submission["id"]

y_pred_test_bow_w2v = y_pred_test_bow_w2v[["id"] + labels]

y_pred_test_bow_w2v.to_csv("submission_bow_w2v.csv", index=False, encoding="UTF-8")

In [154]:
y_pred_test_bow_w2v.shape

(226998, 7)