In [31]:
import numpy as np
import pandas as pd
from textstat.textstat import textstat
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
sample_submission_file = '../data/sample_submission.csv'

train_all = pd.read_csv(train_file)
test_for_submission = pd.read_csv(test_file)
sample_submission = pd.read_csv(sample_submission_file)

# Split train data

In [3]:
train, test = train_test_split(train_all, test_size=0.2)

Count and check that the data is split such that the percentages of labels in train/test are roughly equal

In [4]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [5]:
def print_count_of_each_label(df):
    for label in labels:
        print('{}: {} / {} ({}%)'.format(label.rjust(len(labels[-1])),
                                         df.loc[df[label] == 1].shape[0],
                                         len(df),
                                         np.round(df.loc[df[label] == 1].shape[0]/len(df)*100, 3)))

print('train')
print_count_of_each_label(train)
print('test')
print_count_of_each_label(test)

train
        toxic: 7449 / 76680 (9.714%)
 severe_toxic: 783 / 76680 (1.021%)
      obscene: 4153 / 76680 (5.416%)
       threat: 247 / 76680 (0.322%)
       insult: 3845 / 76680 (5.014%)
identity_hate: 657 / 76680 (0.857%)
test
        toxic: 1788 / 19171 (9.327%)
 severe_toxic: 182 / 19171 (0.949%)
      obscene: 956 / 19171 (4.987%)
       threat: 58 / 19171 (0.303%)
       insult: 920 / 19171 (4.799%)
identity_hate: 157 / 19171 (0.819%)


# Split train/test into X and y

So that we can use sklearn classifiers easily

In [6]:
X_train, y_train = train[["comment_text"]], train[labels]
X_test, y_test = test[["comment_text"]], test[labels]

In [7]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(76680, 1) (76680, 6)
(19171, 1) (19171, 6)


# Function to calculate mean column-wise log loss of y_pred vs y_actual

In [8]:
y_test.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
61761,0,0,0,0,0,0
6322,0,0,0,0,0,0
26528,0,0,0,0,0,0
47980,0,0,0,0,0,0
90134,0,0,0,0,0,0


In [9]:
def calculate_score(y_actual, y_pred):
    return np.mean([log_loss(np.array(y_actual[label]),
                             np.array([1.-np.array(y_pred[label]), np.array(y_pred[label])]).T) for label in labels])

# Test classification scores

## Perfect score

In [10]:
calculate_score(y_test, y_test)

9.9920072216264108e-16

## ZeroR

In [11]:
data = np.array([np.zeros(len(labels))] * len(X_test))
y_pred_zeror = pd.DataFrame(data, columns=labels)
calculate_score(y_test, y_pred_zeror)

1.2193936235262668

## All 0.5

In [12]:
data = np.array([np.ones(len(labels))*0.5] * len(X_test))
y_pred_half = pd.DataFrame(data, columns=labels)
calculate_score(y_test, y_pred_half)

0.69314718055994529

## Textstat features only

In [13]:
def extract_features(df):
    features_df = pd.DataFrame()
    features_df['comment_text_len'] = df['comment_text'].apply(len)
    features_df['comment_text_lex_count'] = df['comment_text'].apply(textstat.lexicon_count)
    features_df['comment_text_syl_count'] = df['comment_text'].apply(textstat.syllable_count)
    features_df['comment_text_sent_count'] = df['comment_text'].apply(textstat.sentence_count)
    features_df['comment_text_flesch_reading_ease'] = df['comment_text'].apply(textstat.flesch_reading_ease)
    features_df['comment_text_flesch_kincaid_grade'] = df['comment_text'].apply(textstat.flesch_kincaid_grade)
    
    features_df['comment_text_syl_over_lex'] = features_df['comment_text_syl_count'] / features_df['comment_text_lex_count']
    
    return features_df

In [14]:
X_train_features_textstat = extract_features(X_train)

In [15]:
X_test_features_textstat = extract_features(X_test)

### Loop through a bunch of classifiers

In [28]:
classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(131)),
    ("Naive Bayes", GaussianNB()),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1)),
    ("AdaBoost", AdaBoostClassifier()),
    ("QDA", QuadraticDiscriminantAnalysis()),
    #     ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0)))  # Memory error??? Even with 32GB ram???
    #     ("Linear SVM", SVC(kernel="linear", C=0.025)),                   # Slow as shit
    #     ("RBF SVM", SVC(gamma=2, C=1))                                   # Slow as shit
]

In [29]:
clf = {}
y_pred = {}
for classifier_name, classifier in classifiers:
    print('Training with {}'.format(classifier_name))
    clf[classifier_name] = {}
    for label in labels:
        clf[classifier_name][label] = classifier
        clf[classifier_name][label].fit(X_train_features_textstat, y_train[label])
    
    print('Predicting with {}'.format(classifier_name))
    y_pred[classifier_name] = pd.DataFrame()
    for label in labels:
        y_pred[classifier_name][label] = clf[classifier_name][label].predict_proba(X_test_features_textstat).T[1]
    
    score = calculate_score(y_test, y_pred[classifier_name])
    print('Column-wise log loss for {}: {}'.format(classifier_name, score))

Training with Nearest Neighbors
Predicting with Nearest Neighbors
Column-wise log loss for Nearest Neighbors: 0.4477744043060689
Training with Naive Bayes
Predicting with Naive Bayes
Column-wise log loss for Naive Bayes: 0.5520901850154825
Training with Decision Tree
Predicting with Decision Tree
Column-wise log loss for Decision Tree: 0.17642219076440394
Training with Random Forest
Predicting with Random Forest
Column-wise log loss for Random Forest: 0.1660879190388692
Training with Neural Net
Predicting with Neural Net
Column-wise log loss for Neural Net: 0.17423034114113337
Training with AdaBoost
Predicting with AdaBoost
Column-wise log loss for AdaBoost: 0.6448197566728328
Training with QDA
Predicting with QDA
Column-wise log loss for QDA: 0.42661147513758674
