In [1]:
import os
import numpy as np
import pandas as pd

from textstat.textstat import textstat

from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import log_loss

# Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# BoW feature extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
sample_submission_file = '../data/sample_submission.csv'

train_all = pd.read_csv(train_file)
test_for_submission = pd.read_csv(test_file)
sample_submission = pd.read_csv(sample_submission_file)

# Split train data

In [3]:
rs = ShuffleSplit(n_splits=5, test_size=.2, random_state=0)
cv_splits_train = []
cv_splits_test = []
for train_index, test_index in rs.split(train_all):
    cv_splits_train.append(train_index)
    cv_splits_test.append(test_index)
cv_folds = len(cv_splits_train)

In [4]:
cv_train = []
cv_test = []
for i in range(cv_folds):
    cv_train.append(train_all.loc[cv_splits_train[i], :])
    cv_test.append(train_all.loc[cv_splits_test[i], :])

In [5]:
train = cv_train
test = cv_test

Count and check that the data is split such that the percentages of labels in train/test are roughly equal

In [6]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [7]:
def print_count_of_each_label(df):
    for label in labels:
        print('{}: {} / {} ({}%)'.format(label.rjust(len(labels[-1])),
                                         df.loc[df[label] == 1].shape[0],
                                         len(df),
                                         np.round(df.loc[df[label] == 1].shape[0]/len(df)*100, 3)))

for i in range(cv_folds):
    print('CV train {}'.format(i))
    print_count_of_each_label(cv_train[i])
    print('CV test {}'.format(i))
    print_count_of_each_label(cv_test[i])

CV train 0
        toxic: 7396 / 76680 (9.645%)
 severe_toxic: 768 / 76680 (1.002%)
      obscene: 4073 / 76680 (5.312%)
       threat: 251 / 76680 (0.327%)
       insult: 3831 / 76680 (4.996%)
identity_hate: 641 / 76680 (0.836%)
CV test 0
        toxic: 1841 / 19171 (9.603%)
 severe_toxic: 197 / 19171 (1.028%)
      obscene: 1036 / 19171 (5.404%)
       threat: 54 / 19171 (0.282%)
       insult: 934 / 19171 (4.872%)
identity_hate: 173 / 19171 (0.902%)
CV train 1
        toxic: 7326 / 76680 (9.554%)
 severe_toxic: 771 / 76680 (1.005%)
      obscene: 4067 / 76680 (5.304%)
       threat: 256 / 76680 (0.334%)
       insult: 3786 / 76680 (4.937%)
identity_hate: 646 / 76680 (0.842%)
CV test 1
        toxic: 1911 / 19171 (9.968%)
 severe_toxic: 194 / 19171 (1.012%)
      obscene: 1042 / 19171 (5.435%)
       threat: 49 / 19171 (0.256%)
       insult: 979 / 19171 (5.107%)
identity_hate: 168 / 19171 (0.876%)
CV train 2
        toxic: 7377 / 76680 (9.621%)
 severe_toxic: 754 / 76680 (0.983%)
  

# Split train/test into X and y

So that we can use sklearn classifiers easily

In [8]:
X_train = [None] * cv_folds
y_train = [None] * cv_folds
X_test = [None] * cv_folds
y_test = [None] * cv_folds
for i in range(cv_folds):
    X_train[i], y_train[i] = train[i][["comment_text"]], train[i][labels]
    X_test[i], y_test[i] = test[i][["comment_text"]], test[i][labels]

In [9]:
print(X_train[0].shape, y_train[0].shape)
print(X_test[0].shape, y_test[0].shape)

(76680, 1) (76680, 6)
(19171, 1) (19171, 6)


# Extract features

## Extract textstat features

In [10]:
def extract_features(df):
    features_df = pd.DataFrame()
    features_df['comment_text_len'] = df['comment_text'].apply(len)
    features_df['comment_text_lex_count'] = df['comment_text'].apply(textstat.lexicon_count)
    features_df['comment_text_syl_count'] = df['comment_text'].apply(textstat.syllable_count)
    features_df['comment_text_sent_count'] = df['comment_text'].apply(textstat.sentence_count)
    features_df['comment_text_flesch_reading_ease'] = df['comment_text'].apply(textstat.flesch_reading_ease)
    features_df['comment_text_flesch_kincaid_grade'] = df['comment_text'].apply(textstat.flesch_kincaid_grade)
    
    features_df['comment_text_syl_over_lex'] = features_df['comment_text_syl_count'] / features_df['comment_text_lex_count']
    
    return features_df

In [11]:
textstat_features_file = '../data/textstat_features.csv'
if os.path.isfile(textstat_features_file):
    X_train_all_features_textstat = pd.read_csv(textstat_features_file)
else:
    X_train_all_features_textstat = extract_features(train_all)
    X_train_all_features_textstat.to_csv(textstat_features_file)

## Split textstat features

In [12]:
X_train_features_textstat = []
X_test_features_textstat = []
for i in range(cv_folds):
    X_train_features_textstat.append(X_train_all_features_textstat.loc[cv_splits_train[i], :])
    X_test_features_textstat.append(X_train_all_features_textstat.loc[cv_splits_test[i], :])

# Function to calculate mean column-wise log loss of y_pred vs y_actual

In [13]:
def calculate_score(y_actual, y_pred):
    return np.mean([log_loss(np.array(y_actual[label]),
                             np.array([1.-np.array(y_pred[label]), np.array(y_pred[label])]).T) for label in labels])

# Test classification scores

## Perfect score

In [14]:
calculate_score(y_test[0], y_test[0])

9.9920072216264108e-16

## ZeroR

In [15]:
scores_zeror = []
for i in range(cv_folds):
    data = np.array([np.zeros(len(labels))] * len(X_test[i]))
    y_pred_zeror = pd.DataFrame(data, columns=labels)
    scores_zeror.append(calculate_score(y_test[i], y_pred_zeror))
print(scores_zeror)
print(np.mean(scores_zeror))

[1.2716404815645752, 1.304069565864215, 1.2863536772190416, 1.2740426359571411, 1.2806485605366973]
1.28335098423


## All 0.5

In [16]:
scores_half = []
for i in range(cv_folds):
    data = np.array([np.ones(len(labels))*0.5] * len(X_test[i]))
    y_pred_half = pd.DataFrame(data, columns=labels)
    scores_half.append(calculate_score(y_test[i], y_pred_half))
print(scores_half)
print(np.mean(scores_half))

[0.69314718055994529, 0.69314718055994529, 0.69314718055994529, 0.69314718055994529, 0.69314718055994529]
0.69314718056


## Textstat features only

### Loop through a bunch of classifiers

In [17]:
classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(131)),
    ("Naive Bayes", GaussianNB()),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1)),
    ("AdaBoost", AdaBoostClassifier()),
    ("QDA", QuadraticDiscriminantAnalysis()),
    #     ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0)))  # Memory error??? Even with 32GB ram???
    #     ("Linear SVM", SVC(kernel="linear", C=0.025)),                   # Slow as shit
    #     ("RBF SVM", SVC(gamma=2, C=1))                                   # Slow as shit
]

In [18]:
clf = {}
y_pred = {}
scores_textstat = {}
for classifier_name, classifier in classifiers:
    print('Training with {}'.format(classifier_name))
    clf[classifier_name] = [{}]*cv_folds
    y_pred[classifier_name] = [{}]*cv_folds
    scores_textstat[classifier_name] = [{}]*cv_folds
    for fold in range(cv_folds):
        for label in labels:
            clf[classifier_name][fold][label] = classifier
            clf[classifier_name][fold][label].fit(X_train_features_textstat[fold], y_train[fold][label])

        y_pred[classifier_name][fold] = pd.DataFrame()
        for label in labels:
            y_pred[classifier_name][fold][label] = clf[classifier_name][fold][label].predict_proba(X_test_features_textstat[fold]).T[1]

        scores_textstat[classifier_name][fold] = calculate_score(y_test[fold], y_pred[classifier_name][fold])
    print('Column-wise log loss for {}: {} - {}'.format(classifier_name, scores_textstat[classifier_name], np.mean(scores_textstat[classifier_name])))

Training with Nearest Neighbors
Column-wise log loss for Nearest Neighbors: [0.5256809621385633, 0.52571879365431262, 0.5176337468898744, 0.5064720242972145, 0.51618901691386421] - 0.5183389087787658
Training with Naive Bayes
Column-wise log loss for Naive Bayes: [0.67802880910037111, 0.75718086146898489, 0.72469968356287995, 0.67596525454034795, 0.71034136466691755] - 0.7092431946679003
Training with Decision Tree
Column-wise log loss for Decision Tree: [0.17948596754211299, 0.18728442467369955, 0.18341879999086066, 0.18183658030306449, 0.17913390401916163] - 0.18223193530577986
Training with Random Forest
Column-wise log loss for Random Forest: [0.17470317728286464, 0.17875249356071574, 0.17497159213817096, 0.17268445506654065, 0.17346514054694429] - 0.17491537171904725
Training with Neural Net
Column-wise log loss for Neural Net: [1.2705339326373306, 1.3008524708637486, 1.2877383233392219, 1.2715774636027959, 1.2803600643980317] - 1.2822124509682258
Training with AdaBoost
Column-wis

## Bag of words