In [1]:
import os
import numpy as np
import pandas as pd

from textstat.textstat import textstat

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from scipy.sparse import hstack

# Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from NbSvmClassifier import NbSvmClassifier
import xgboost as xgb


# BoW feature extraction
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim



In [2]:
# Locations of the input CSV files, relative to the notebook's directory.
train_file = '../data/train.csv'
test_file = '../data/test.csv'
sample_submission_file = '../data/sample_submission.csv'

# Read the three files in the order: training data, submission data, sample submission.
train_all, test_for_submission, sample_submission = [
    pd.read_csv(csv_path)
    for csv_path in (train_file, test_file, sample_submission_file)
]

# Clean data

TODO: Account for more cases

In [3]:
# Replace missing comments with the placeholder "unknown" so every row holds a string.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column operates on an intermediate object and is not guaranteed to
# modify the parent frame (pandas warns about it, and under copy-on-write it
# has no effect on the frame at all).
train_all['comment_text'] = train_all['comment_text'].fillna("unknown")
test_for_submission['comment_text'] = test_for_submission['comment_text'].fillna("unknown")

# Split train data

In [4]:
# Split the rows of train_all into 5 consecutive, unshuffled folds.
# random_state is deliberately not passed: it has no effect unless shuffle=True,
# and current scikit-learn raises a ValueError for random_state without shuffle.
# The resulting folds are the same as before. Pass shuffle=True together with a
# seed if randomised folds are wanted instead.
kf = KFold(n_splits=5)
cv_splits_train = []
cv_splits_test = []
for train_index, test_index in kf.split(train_all):
    # Each entry is an array of row positions belonging to that side of the fold.
    cv_splits_train.append(train_index)
    cv_splits_test.append(test_index)
cv_folds = len(cv_splits_train)

In [5]:
# Materialise every fold as a pair of DataFrames by selecting its rows from train_all.
cv_train = [train_all.loc[cv_splits_train[k], :] for k in range(cv_folds)]
cv_test = [train_all.loc[cv_splits_test[k], :] for k in range(cv_folds)]

In [6]:
# Shorter aliases for the per-fold frames; these are the same list objects, not copies.
train, test = cv_train, cv_test

Count and check that the data is split such that the percentages of labels in train/test are roughly equal

In [7]:
# Names of the binary target columns, in the order used for every per-label result.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [8]:
def print_count_of_each_label(df):
    """Print how many rows of `df` have each column in `labels` equal to 1.

    One line is printed per label: the label name right-aligned to the width of
    the last entry of `labels`, the number of positive rows, the total number
    of rows, and the positive share as a percentage rounded to three decimals.
    """
    total = len(df)
    for label in labels:
        positives = df.loc[df[label] == 1].shape[0]
        percentage = np.round(positives / total * 100, 3)
        padded_name = label.rjust(len(labels[-1]))
        print('{}: {} / {} ({}%)'.format(padded_name, positives, total, percentage))

# Report the label balance of every fold: the training part first, then the held-out part.
for i in range(cv_folds):
    for heading, fold_frames in (('CV train {}', cv_train), ('CV test {}', cv_test)):
        print(heading.format(i))
        print_count_of_each_label(fold_frames[i])

CV train 0
        toxic: 7366 / 76680 (9.606%)
 severe_toxic: 792 / 76680 (1.033%)
      obscene: 4108 / 76680 (5.357%)
       threat: 234 / 76680 (0.305%)
       insult: 3826 / 76680 (4.99%)
identity_hate: 654 / 76680 (0.853%)
CV test 0
        toxic: 1871 / 19171 (9.76%)
 severe_toxic: 173 / 19171 (0.902%)
      obscene: 1001 / 19171 (5.221%)
       threat: 71 / 19171 (0.37%)
       insult: 939 / 19171 (4.898%)
identity_hate: 160 / 19171 (0.835%)
CV train 1
        toxic: 7413 / 76681 (9.667%)
 severe_toxic: 772 / 76681 (1.007%)
      obscene: 4136 / 76681 (5.394%)
       threat: 244 / 76681 (0.318%)
       insult: 3841 / 76681 (5.009%)
identity_hate: 657 / 76681 (0.857%)
CV test 1
        toxic: 1824 / 19170 (9.515%)
 severe_toxic: 193 / 19170 (1.007%)
      obscene: 973 / 19170 (5.076%)
       threat: 61 / 19170 (0.318%)
       insult: 924 / 19170 (4.82%)
identity_hate: 157 / 19170 (0.819%)
CV train 2
        toxic: 7397 / 76681 (9.646%)
 severe_toxic: 764 / 76681 (0.996%)
      o

# Split train/test into X and y

So that we can use sklearn classifiers easily

In [9]:
# Per fold: X_* holds a one-column frame with the comment text, y_* the label columns.
X_train = [train[k][["comment_text"]] for k in range(cv_folds)]
y_train = [train[k][labels] for k in range(cv_folds)]
X_test = [test[k][["comment_text"]] for k in range(cv_folds)]
y_test = [test[k][labels] for k in range(cv_folds)]

In [10]:
# Sanity check: shapes of the first fold's inputs and targets (train, then test).
for fold_inputs, fold_targets in ((X_train, y_train), (X_test, y_test)):
    print(fold_inputs[0].shape, fold_targets[0].shape)

(76680, 1) (76680, 6)
(19171, 1) (19171, 6)


# Extract features

## Extract textstat features

In [11]:
def extract_textstat_features(df):
    """Compute length and readability statistics for every comment.

    Args:
        df: DataFrame with a string column 'comment_text'.

    Returns:
        DataFrame with the same index as `df` holding the character length,
        the textstat word/syllable/sentence counts, the two Flesch scores and
        two ratio features (syllables per word, words per sentence). A ratio
        whose denominator is zero is reported as 0 rather than NaN/inf, so the
        ratio columns never contain non-finite values.
    """
    comments = df['comment_text']

    features_df = pd.DataFrame()
    features_df['comment_text_len'] = comments.apply(len)
    features_df['comment_text_lex_count'] = comments.apply(textstat.lexicon_count)
    features_df['comment_text_syl_count'] = comments.apply(textstat.syllable_count)
    features_df['comment_text_sent_count'] = comments.apply(textstat.sentence_count)
    features_df['comment_text_flesch_reading_ease'] = comments.apply(textstat.flesch_reading_ease)
    features_df['comment_text_flesch_kincaid_grade'] = comments.apply(textstat.flesch_kincaid_grade)

    features_df['comment_text_syl_over_lex'] = features_df['comment_text_syl_count'] / features_df['comment_text_lex_count']
    features_df['comment_text_lex_over_sent'] = features_df['comment_text_lex_count'] / features_df['comment_text_sent_count']

    # A zero denominator (e.g. a comment with no countable words) gives NaN for
    # 0/0 and +/-inf for x/0; neither is accepted by the estimators used later.
    ratio_columns = ['comment_text_syl_over_lex', 'comment_text_lex_over_sent']
    features_df[ratio_columns] = (
        features_df[ratio_columns].replace([np.inf, -np.inf], np.nan).fillna(0)
    )

    return features_df

In [12]:
train_textstat_features_file = '../data/train_textstat_features.csv'
# Compute the textstat features only when no cached CSV exists; otherwise reuse
# the cache, restoring the saved first column as the index.
if not os.path.isfile(train_textstat_features_file):
    X_train_all_features_textstat = extract_textstat_features(train_all)
    X_train_all_features_textstat.to_csv(train_textstat_features_file)
else:
    X_train_all_features_textstat = pd.read_csv(train_textstat_features_file, index_col=0)

## Split textstat features

In [13]:
# Slice the textstat features into per-fold train/test frames using the fold row selections.
X_train_features_textstat = [
    X_train_all_features_textstat.loc[cv_splits_train[k], :] for k in range(cv_folds)
]
X_test_features_textstat = [
    X_train_all_features_textstat.loc[cv_splits_test[k], :] for k in range(cv_folds)
]

## Extract BoW

In [14]:
# Matches a single punctuation character (ASCII punctuation plus a few typographic
# symbols). Note: inside the character class the sequence "\]" coming from
# string.punctuation is read as an escaped "]", so the backslash itself is NOT
# part of the set.
re_tok = re.compile('([{}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(string.punctuation))


def tokenize(s):
    """Split `s` into tokens, treating every matched punctuation character as its own token.

    Each punctuation character is padded with spaces and the result is split on
    whitespace, so an empty or whitespace-only string yields an empty list.
    """
    padded = re_tok.sub(r' \1 ', s)
    return padded.split()

In [15]:
n = train_all.shape[0]
# TF-IDF over uni- and bi-grams produced by the custom tokenizer. Terms must
# appear in at least 3 documents and in at most 90% of them.
# The boolean options are passed as real booleans: scikit-learn's parameter
# validation rejects the integer 1 for use_idf / smooth_idf / sublinear_tf.
# NOTE(review): the vectorizer is fitted on all of train_all, including rows that
# later serve as held-out folds, so vocabulary and IDF weights see the test rows.
vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize,
                      min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
                      smooth_idf=True, sublinear_tf=True)
train_term_doc = vec.fit_transform(train_all['comment_text'])

In [16]:
# Row-slice the term-document matrix into per-fold train/test matrices.
X_train_features_bow = [train_term_doc[cv_splits_train[k], :] for k in range(cv_folds)]
X_test_features_bow = [train_term_doc[cv_splits_test[k], :] for k in range(cv_folds)]

## Extract emotion scores

In [17]:
# Tab-separated lexicon file; the code below reads its `term`, `score` and
# `AffectDimension` columns.
emotion_lexicon_file = "../data/features/NRC-AffectIntensity-Lexicon.txt"
emotion_lexicon = pd.read_csv(emotion_lexicon_file, sep = "\t")

In [18]:
# Nested lookup: emotion name -> {term -> score}, filled from the lexicon rows.
emotion_term_score = {emotion: {} for emotion in ('anger', 'fear', 'joy', 'sadness')}
for entry in emotion_lexicon.itertuples():
    scores_for_emotion = emotion_term_score[entry.AffectDimension]
    scores_for_emotion[entry.term] = entry.score

In [19]:
# Translation table that deletes every ASCII punctuation character.
translator = str.maketrans('', '', string.punctuation)
emotions = ['anger', 'fear', 'joy', 'sadness']


def avg_emotion_score(comment_text):
    """Return the average lexicon score of a comment for each emotion.

    The comment is stripped of ASCII punctuation, lower-cased and split on any
    whitespace; each word is then looked up in `emotion_term_score`.

    Args:
        comment_text: the comment; anything that is not a `str` (e.g. NaN) is
            treated as an empty comment.

    Returns:
        A list with one value per entry of `emotions` (same order): the mean
        score of the words found in that emotion's lexicon, or 0 when no word
        of the comment is in it.
    """
    # Explicit type check instead of a bare `except:` so that unrelated errors
    # are not silently turned into an empty comment.
    if not isinstance(comment_text, str):
        comment_text = ""

    # split() without an argument splits on every whitespace run, so words next
    # to newlines/tabs are matched and no empty strings are produced.
    words = comment_text.translate(translator).lower().split()

    averages = []
    for emotion in emotions:
        lookup = emotion_term_score[emotion].get
        found = [score for score in map(lookup, words) if score is not None]
        averages.append(np.mean(found) if found else 0)
    return averages
    
def extract_emotion_features(df):
    """Build a frame of average emotion scores for every comment in `df`.

    Args:
        df: DataFrame with a 'comment_text' column.

    Returns:
        DataFrame with one 'comment_text_emotion_<name>' column per entry of
        `emotions`. It carries the same index as `df`, so label-based row
        selection on the result lines up with `df` even when `df` does not have
        a default 0..n-1 index.
    """
    per_comment_scores = df['comment_text'].apply(avg_emotion_score)
    column_names = ['comment_text_emotion_{}'.format(emotion) for emotion in emotions]
    return pd.DataFrame(per_comment_scores.values.tolist(), columns=column_names, index=df.index)

In [20]:
# Average per-emotion lexicon scores for every comment of the full training set.
X_train_all_features_emotion = extract_emotion_features(train_all)

In [21]:
# Slice the emotion features into per-fold train/test frames using the fold row selections.
X_train_features_emotion = [
    X_train_all_features_emotion.loc[cv_splits_train[k], :] for k in range(cv_folds)
]
X_test_features_emotion = [
    X_train_all_features_emotion.loc[cv_splits_test[k], :] for k in range(cv_folds)
]

## Word2Vec

In [22]:
# Load word vectors from a binary word2vec-format file.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin', binary=True)

In [23]:
def featurize_w2v(comment_text):
    """Represent a comment as the average of its tokens' word vectors.

    The comment is tokenised with `tokenize`; vectors of tokens known to
    `w2v_model` are summed and the sum is divided by the total number of
    tokens, so unknown tokens still count towards the divisor.

    Args:
        comment_text: the comment as a string.

    Returns:
        A 1-D array of length `w2v_model.vector_size`. A comment without any
        token yields the all-zero vector instead of NaNs from a 0/0 division.
    """
    tokens = tokenize(comment_text)
    summed = np.zeros(w2v_model.vector_size)
    if not tokens:
        # Nothing to average; dividing by zero would fill the vector with NaN.
        return summed
    for token in tokens:
        try:
            summed += w2v_model[token]
        except KeyError:
            # Token is not in the model's vocabulary; it contributes nothing.
            continue
    summed /= len(tokens)
    return summed

In [24]:
# One averaged word vector per comment; the result is a Series whose elements are arrays.
X_train_all_features_w2v = train_all['comment_text'].apply(featurize_w2v)

In [25]:
# Stack the per-comment vectors into one array so folds can be taken by row position.
X_train_all_features_w2v = np.array(X_train_all_features_w2v.tolist())

In [26]:
# Row-slice the stacked word-vector array into per-fold train/test arrays.
X_train_features_w2v = [X_train_all_features_w2v[cv_splits_train[k]] for k in range(cv_folds)]
X_test_features_w2v = [X_train_all_features_w2v[cv_splits_test[k]] for k in range(cv_folds)]

# Function to calculate the column-wise log loss of y_pred vs y_actual (callers take the mean)

In [27]:
def calculate_score(y_actual, y_pred):
    """Compute the binary log loss separately for every column in `labels`.

    Args:
        y_actual: mapping/DataFrame of true 0/1 values, one column per label.
        y_pred: mapping/DataFrame of predicted probabilities of class 1, one
            column per label.

    Returns:
        A list with one log-loss value per entry of `labels`, in that order.
        Callers average the list to obtain a single score.
    """
    col_log_loss = []
    for label in labels:
        positive_proba = np.asarray(y_pred[label])
        # Two-column probability matrix: [P(class 0), P(class 1)] per row.
        proba = np.column_stack((1. - positive_proba, positive_proba))
        # labels=[0, 1] keeps the call valid when a column of y_actual happens
        # to contain only one of the two classes.
        col_log_loss.append(log_loss(np.asarray(y_actual[label]), proba, labels=[0, 1]))
    return col_log_loss

# Test classification scores

## Perfect score

In [28]:
# Sanity check: scoring the true labels against themselves should give a loss close to 0.
np.mean(calculate_score(y_test[0], y_test[0]))

9.9920072216264108e-16

## ZeroR

In [29]:
# Baseline: predict probability 0 for every label on every held-out row.
scores_zeror = []
for i in range(cv_folds):
    data = np.zeros((len(X_test[i]), len(labels)))
    y_pred_zeror = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_zeror)
    scores_zeror.append(scores)
# Per-label loss averaged over folds, then the overall average.
print(np.mean(scores_zeror, axis=0))
print(np.mean(scores_zeror))

[ 3.32844347  0.34772673  1.84096823  0.10990296  1.71701177  0.29331534]
1.27289475201


## All 0.5

In [30]:
# Baseline: predict probability 0.5 for every label on every held-out row.
scores_half = []
for i in range(cv_folds):
    data = np.full((len(X_test[i]), len(labels)), 0.5)
    y_pred_half = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_half)
    scores_half.append(scores)
# Per-label loss averaged over folds, then the overall average.
print(np.mean(scores_half, axis=0))
print(np.mean(scores_half))

[ 0.69314718  0.69314718  0.69314718  0.69314718  0.69314718  0.69314718]
0.69314718056


## Test calculate_score

In [31]:
# Check calculate_score with a different constant probability per label (0.5 up to 1.0).
# Note that this reuses, and therefore overwrites, scores_half / y_pred_half.
scores_half = []
for i in range(cv_folds):
    data = np.tile([0.5, 0.6, 0.7, 0.8, 0.9, 1.0], (len(X_test[i]), 1))
    y_pred_half = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_half)
    scores_half.append(scores)
# Per-label loss averaged over folds, then the overall average.
print(np.mean(scores_half, axis=0))
print(np.mean(scores_half))

[  0.69314718   0.91220862   1.15881057   1.6050267    2.19335539
  34.24546105]
6.80133491769


In [32]:
def best_classifier_per_label(classifiers, y_pred, scores):
    """Print, for each label, the classifier with the lowest fold-averaged score.

    Args:
        classifiers: list of (name, factory) pairs; only the names are used.
        y_pred: accepted for signature compatibility; not used here.
        scores: dict mapping classifier name to a per-fold list of per-label scores.

    After the per-label lines, the mean of the winning scores is printed.
    """
    names = [name for name, _ in classifiers]
    # Rows: classifiers, columns: labels; each cell is the score averaged over folds.
    mean_scores = np.array([np.mean(scores[name], axis=0) for name in names])
    winner_rows = np.argmin(mean_scores, axis=0)

    winners = []
    for column, row in enumerate(winner_rows):
        winners.append((names[row], mean_scores[row][column]))

    for position in range(len(labels)):
        winner_name, winner_score = winners[position]
        print("[{}] {} : {}".format(labels[position], winner_name, winner_score))
    print("Average: {}".format(np.mean([score for _, score in winners])))
    

## Run classifiers

In [33]:
def run_classifiers(classifiers, X_train_features, X_test_features, y_train, clf, y_pred, scores):
    """Fit one model per (classifier, fold, label) and score it on the held-out fold.

    Args:
        classifiers: list of (name, factory) pairs; calling factory() must
            return a new estimator with `fit` and `predict_proba`.
        X_train_features: per-fold training feature matrices.
        X_test_features: per-fold held-out feature matrices.
        y_train: per-fold frames of training targets, one column per label.
        clf: dict filled in place; clf[name][fold][label] is the fitted model.
        y_pred: dict filled in place; y_pred[name][fold] is a DataFrame with
            the predicted probability of class 1, one column per label.
        scores: dict filled in place; scores[name][fold] is the list returned
            by `calculate_score` for that fold.

    Besides its arguments the function reads the module-level `cv_folds`,
    `labels` and `y_test`. It prints the per-classifier losses and finally
    calls `best_classifier_per_label`.
    """
    for classifier_name, classifier in classifiers:
        print('Training with {}'.format(classifier_name))
        # One independent dict per fold. `[{}] * cv_folds` would repeat a single
        # shared dict, so every fold would overwrite the previous fold's models.
        clf[classifier_name] = [{} for _ in range(cv_folds)]
        y_pred[classifier_name] = [None] * cv_folds
        scores[classifier_name] = [None] * cv_folds
        for fold in range(cv_folds):
            fold_models = clf[classifier_name][fold]
            fold_pred = pd.DataFrame()
            for label in labels:
                model = classifier()
                fold_models[label] = model
                model.fit(X_train_features[fold], y_train[fold][label])

                # Second column of predict_proba, i.e. the second entry of the model's classes.
                fold_pred[label] = model.predict_proba(X_test_features[fold]).T[1]

            y_pred[classifier_name][fold] = fold_pred
            scores[classifier_name][fold] = calculate_score(y_test[fold], fold_pred)
        print('Column-wise log loss for {}: {} - {}'.format(classifier_name, np.mean(scores[classifier_name], axis=0), np.mean(scores[classifier_name])))
    best_classifier_per_label(classifiers, y_pred, scores)

## Textstat features only

In [None]:
# Models evaluated on the textstat features. Every entry is (display name, factory);
# calling the factory returns a new, unfitted estimator.
classifiers_textstat = [
    ("XGBoost", lambda: xgb.XGBClassifier()),
    ("Logistic Regression", lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    ("NB-SVM", lambda: NbSvmClassifier()),

    # Tree-based models
    ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
    ("Gradient Boosting", lambda: GradientBoostingClassifier()),

    # Remaining model families
    ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
    ("Naive Bayes", lambda: GaussianNB()),
    ("Neural Net", lambda: MLPClassifier(alpha=1)),
    ("AdaBoost", lambda: AdaBoostClassifier()),
    ("QDA", lambda: QuadraticDiscriminantAnalysis()),
]

# Disabled: sweep over the logistic-regression regularisation strength.
# classifiers_textstat += [
#     ('Logistic Regression {}'.format(c), lambda: LogisticRegression(solver="newton-cg", C=2.0**c, max_iter=1000)) for c in np.arange(-3,4,1)]

# Disabled: could not be run on this data.
#     ("Gaussian Process", lambda: GaussianProcessClassifier(1.0 * RBF(1.0)))  # memory error, even with 32GB RAM
#     ("Linear SVM", lambda: SVC(kernel="linear", C=0.025)),                   # far too slow
#     ("RBF SVM", lambda: SVC(gamma=2, C=1))                                   # far too slow

In [None]:
# Result containers filled in place by run_classifiers, keyed by classifier name.
clf_textstat, y_pred_textstat, scores_textstat = {}, {}, {}
run_classifiers(
    classifiers_textstat,
    X_train_features_textstat,
    X_test_features_textstat,
    y_train,
    clf_textstat,
    y_pred_textstat,
    scores_textstat,
)

## Emotion only

In [None]:
# Models evaluated on the emotion features. Every entry is (display name, factory);
# calling the factory returns a new, unfitted estimator.
classifiers_emotion = [
    ("XGBoost", lambda: xgb.XGBClassifier()),
    ("Logistic Regression", lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    ("NB-SVM", lambda: NbSvmClassifier()),

    # Tree-based models
    ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
    ("Gradient Boosting", lambda: GradientBoostingClassifier()),

    # Remaining model families
    ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
    ("Naive Bayes", lambda: GaussianNB()),
    ("Neural Net", lambda: MLPClassifier(alpha=1)),
    ("AdaBoost", lambda: AdaBoostClassifier()),
    ("QDA", lambda: QuadraticDiscriminantAnalysis()),
]

In [None]:
# Result containers filled in place by run_classifiers, keyed by classifier name.
clf_emotion, y_pred_emotion, scores_emotion = {}, {}, {}
run_classifiers(
    classifiers_emotion,
    X_train_features_emotion,
    X_test_features_emotion,
    y_train,
    clf_emotion,
    y_pred_emotion,
    scores_emotion,
)

## Textstat + emotion

In [None]:
# Models evaluated on the combined textstat + emotion features. Every entry is
# (display name, factory); calling the factory returns a new, unfitted estimator.
classifiers_textstat_emotion = [
    ("XGBoost", lambda: xgb.XGBClassifier()),
    ("Logistic Regression", lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    ("NB-SVM", lambda: NbSvmClassifier()),

    # Tree-based models
    ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
    ("Gradient Boosting", lambda: GradientBoostingClassifier()),

    # Remaining model families
    ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
    ("Naive Bayes", lambda: GaussianNB()),
    ("Neural Net", lambda: MLPClassifier(alpha=1)),
    ("AdaBoost", lambda: AdaBoostClassifier()),
    ("QDA", lambda: QuadraticDiscriminantAnalysis()),
]

In [None]:
# Per fold, place the textstat columns and the emotion columns side by side in one dense array.
X_train_features_textstat_emotion = [
    np.hstack((X_train_features_textstat[fold], X_train_features_emotion[fold].values))
    for fold in range(cv_folds)
]
X_test_features_textstat_emotion = [
    np.hstack((X_test_features_textstat[fold], X_test_features_emotion[fold].values))
    for fold in range(cv_folds)
]

# The combined column count should equal the sum of the two parts.
for per_fold_features in (X_train_features_textstat,
                          X_train_features_emotion,
                          X_train_features_textstat_emotion):
    print(per_fold_features[0].shape)

In [None]:
# Result containers filled in place by run_classifiers, keyed by classifier name.
clf_textstat_emotion, y_pred_textstat_emotion, scores_textstat_emotion = {}, {}, {}
run_classifiers(
    classifiers_textstat_emotion,
    X_train_features_textstat_emotion,
    X_test_features_textstat_emotion,
    y_train,
    clf_textstat_emotion,
    y_pred_textstat_emotion,
    scores_textstat_emotion,
)

## Bag of words only

In [None]:
# Models evaluated on the bag-of-words features, as (display name, factory) pairs.
classifiers_bow = []
classifiers_bow.append(("Multinominal Naive Bayes", lambda: MultinomialNB()))
classifiers_bow.append(("XGBoost", lambda: xgb.XGBClassifier()))
classifiers_bow.append(("NB-SVM", lambda: NbSvmClassifier()))
classifiers_bow.append(
    ("Logistic Regression", lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000))
)

In [None]:
# Result containers filled in place by run_classifiers, keyed by classifier name.
clf_bow, y_pred_bow, scores_bow = {}, {}, {}
run_classifiers(
    classifiers_bow,
    X_train_features_bow,
    X_test_features_bow,
    y_train,
    clf_bow,
    y_pred_bow,
    scores_bow,
)

## Bag of words + textstat

In [None]:
# Only NB-SVM is evaluated on bag-of-words + textstat; the other candidates stay disabled.
classifiers_bow_textstat = [("NB-SVM", lambda: NbSvmClassifier())]
# Disabled candidates:
#     ('XGBoost', lambda: xgb.XGBClassifier()),
#     ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000))

In [None]:
# Per fold, append the dense textstat columns to the sparse bag-of-words matrix
# and convert the stacked result to CSR.
X_train_features_bow_textstat = [
    hstack((X_train_features_bow[fold], X_train_features_textstat[fold].values)).tocsr()
    for fold in range(cv_folds)
]
X_test_features_bow_textstat = [
    hstack((X_test_features_bow[fold], X_test_features_textstat[fold].values)).tocsr()
    for fold in range(cv_folds)
]

# The combined column count should equal the sum of the two parts.
for per_fold_features in (X_train_features_bow,
                          X_train_features_textstat,
                          X_train_features_bow_textstat):
    print(per_fold_features[0].shape)

In [None]:
# Result containers filled in place by run_classifiers, keyed by classifier name.
clf_bow_textstat, y_pred_bow_textstat, scores_bow_textstat = {}, {}, {}
run_classifiers(
    classifiers_bow_textstat,
    X_train_features_bow_textstat,
    X_test_features_bow_textstat,
    y_train,
    clf_bow_textstat,
    y_pred_bow_textstat,
    scores_bow_textstat,
)

## Bag of words + textstat + emotion

In [None]:
# Only NB-SVM is evaluated on bag-of-words + textstat + emotion; the other candidates stay disabled.
classifiers_bow_textstat_emotion = [("NB-SVM", lambda: NbSvmClassifier())]
# Disabled candidates:
#     ('XGBoost', lambda: xgb.XGBClassifier()),
#     ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000))

In [None]:
# Per fold, append the dense textstat and emotion columns to the sparse
# bag-of-words matrix.
# scipy's hstack returns a COO matrix by default, which does not support row
# indexing; convert to CSR, as the bag-of-words + textstat cell already does,
# so the classifiers receive the same matrix format in both experiments.
X_train_features_bow_textstat_emotion = [None]*cv_folds
X_test_features_bow_textstat_emotion = [None]*cv_folds
for fold in range(cv_folds):
    X_train_features_bow_textstat_emotion[fold] = hstack((X_train_features_bow[fold], X_train_features_textstat[fold].values, X_train_features_emotion[fold].values)).tocsr()
    X_test_features_bow_textstat_emotion[fold] = hstack((X_test_features_bow[fold], X_test_features_textstat[fold].values, X_test_features_emotion[fold].values)).tocsr()

# The combined column count should equal the sum of the three parts.
print(X_train_features_bow[0].shape)
print(X_train_features_textstat[0].shape)
print(X_train_features_emotion[0].shape)
print(X_train_features_bow_textstat_emotion[0].shape)

In [None]:
# Result containers filled in place by run_classifiers, keyed by classifier name.
clf_bow_textstat_emotion, y_pred_bow_textstat_emotion, scores_bow_textstat_emotion = {}, {}, {}
run_classifiers(
    classifiers_bow_textstat_emotion,
    X_train_features_bow_textstat_emotion,
    X_test_features_bow_textstat_emotion,
    y_train,
    clf_bow_textstat_emotion,
    y_pred_bow_textstat_emotion,
    scores_bow_textstat_emotion,
)

## Word2Vec

In [41]:
# Models evaluated on the averaged word-vector features, as (display name, factory) pairs.
classifiers_w2v = []
classifiers_w2v.append(
    ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1))
)
classifiers_w2v.append(("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)))
classifiers_w2v.append(
    ("Logistic Regression", lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000))
)

In [42]:
# Result containers filled in place by run_classifiers, keyed by classifier name.
clf_w2v, y_pred_w2v, scores_w2v = {}, {}, {}
run_classifiers(
    classifiers_w2v,
    X_train_features_w2v,
    X_test_features_w2v,
    y_train,
    clf_w2v,
    y_pred_w2v,
    scores_w2v,
)

Training with Random Forest
Column-wise log loss for Random Forest: [ 0.26095864  0.04228687  0.16740486  0.01842564  0.15764403  0.04248773] - 0.1148679622405156
Training with Extra Trees
Column-wise log loss for Extra Trees: [ 0.2545771   0.03963212  0.16112278  0.01777193  0.1532527   0.04061723] - 0.11116230999035771
Training with Logistic Regression
Column-wise log loss for Logistic Regression: [ 0.1495776   0.03237393  0.10044754  0.01371428  0.10646988  0.03337939] - 0.07266043582520464
[toxic] Logistic Regression : 0.14957760244765905
[severe_toxic] Logistic Regression : 0.032373928675010925
[obscene] Logistic Regression : 0.10044753728050695
[threat] Logistic Regression : 0.013714278381529976
[insult] Logistic Regression : 0.10646988202741728
[identity_hate] Logistic Regression : 0.033379386139103705
Average: 0.07266043582520466
