In [1]:
import os
import numpy as np
import pandas as pd

from textstat.textstat import textstat

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

# Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# from NbSvmClassifier import NbSvmClassifier

# BoW feature extraction
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
sample_submission_file = '../data/sample_submission.csv'

train_all = pd.read_csv(train_file)
test_for_submission = pd.read_csv(test_file)
sample_submission = pd.read_csv(sample_submission_file)

# Clean data

TODO: Account for more cases

In [3]:
train_all['comment_text'].fillna("unknown", inplace=True)
test_for_submission['comment_text'].fillna("unknown", inplace=True)

# Split train data

In [4]:
kf = KFold(n_splits=5, random_state=0)
cv_splits_train = []
cv_splits_test = []
for train_index, test_index in kf.split(train_all):
    cv_splits_train.append(train_index)
    cv_splits_test.append(test_index)
cv_folds = len(cv_splits_train)

In [5]:
cv_train = []
cv_test = []
for i in range(cv_folds):
    cv_train.append(train_all.loc[cv_splits_train[i], :])
    cv_test.append(train_all.loc[cv_splits_test[i], :])

In [6]:
train = cv_train
test = cv_test

Count and check that the data is split such that the percentages of labels in train/test are roughly equal

In [7]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [8]:
def print_count_of_each_label(df):
    for label in labels:
        print('{}: {} / {} ({}%)'.format(label.rjust(len(labels[-1])),
                                         df.loc[df[label] == 1].shape[0],
                                         len(df),
                                         np.round(df.loc[df[label] == 1].shape[0]/len(df)*100, 3)))

for i in range(cv_folds):
    print('CV train {}'.format(i))
    print_count_of_each_label(cv_train[i])
    print('CV test {}'.format(i))
    print_count_of_each_label(cv_test[i])

CV train 0
        toxic: 7366 / 76680 (9.606%)
 severe_toxic: 792 / 76680 (1.033%)
      obscene: 4108 / 76680 (5.357%)
       threat: 234 / 76680 (0.305%)
       insult: 3826 / 76680 (4.99%)
identity_hate: 654 / 76680 (0.853%)
CV test 0
        toxic: 1871 / 19171 (9.76%)
 severe_toxic: 173 / 19171 (0.902%)
      obscene: 1001 / 19171 (5.221%)
       threat: 71 / 19171 (0.37%)
       insult: 939 / 19171 (4.898%)
identity_hate: 160 / 19171 (0.835%)
CV train 1
        toxic: 7413 / 76681 (9.667%)
 severe_toxic: 772 / 76681 (1.007%)
      obscene: 4136 / 76681 (5.394%)
       threat: 244 / 76681 (0.318%)
       insult: 3841 / 76681 (5.009%)
identity_hate: 657 / 76681 (0.857%)
CV test 1
        toxic: 1824 / 19170 (9.515%)
 severe_toxic: 193 / 19170 (1.007%)
      obscene: 973 / 19170 (5.076%)
       threat: 61 / 19170 (0.318%)
       insult: 924 / 19170 (4.82%)
identity_hate: 157 / 19170 (0.819%)
CV train 2
        toxic: 7397 / 76681 (9.646%)
 severe_toxic: 764 / 76681 (0.996%)
      o

# Split train/test into X and y

So that we can use sklearn classifiers easily

In [9]:
X_train = [None] * cv_folds
y_train = [None] * cv_folds
X_test = [None] * cv_folds
y_test = [None] * cv_folds
for i in range(cv_folds):
    X_train[i], y_train[i] = train[i][["comment_text"]], train[i][labels]
    X_test[i], y_test[i] = test[i][["comment_text"]], test[i][labels]

In [10]:
print(X_train[0].shape, y_train[0].shape)
print(X_test[0].shape, y_test[0].shape)

(76680, 1) (76680, 6)
(19171, 1) (19171, 6)


# Extract features

## Extract textstat features

In [11]:
def extract_features(df):
    features_df = pd.DataFrame()
    features_df['comment_text_len'] = df['comment_text'].apply(len)
    features_df['comment_text_lex_count'] = df['comment_text'].apply(textstat.lexicon_count)
    features_df['comment_text_syl_count'] = df['comment_text'].apply(textstat.syllable_count)
    features_df['comment_text_sent_count'] = df['comment_text'].apply(textstat.sentence_count)
    features_df['comment_text_flesch_reading_ease'] = df['comment_text'].apply(textstat.flesch_reading_ease)
    features_df['comment_text_flesch_kincaid_grade'] = df['comment_text'].apply(textstat.flesch_kincaid_grade)
    
    features_df['comment_text_syl_over_lex'] = features_df['comment_text_syl_count'] / features_df['comment_text_lex_count']
    
    return features_df

In [12]:
train_textstat_features_file = '../data/train_textstat_features.csv'
if os.path.isfile(train_textstat_features_file):
    X_train_all_features_textstat = pd.read_csv(train_textstat_features_file)
else:
    X_train_all_features_textstat = extract_features(train_all)
    X_train_all_features_textstat.to_csv(train_textstat_features_file)

## Split textstat features

In [13]:
X_train_features_textstat = []
X_test_features_textstat = []
for i in range(cv_folds):
    X_train_features_textstat.append(X_train_all_features_textstat.loc[cv_splits_train[i], :])
    X_test_features_textstat.append(X_train_all_features_textstat.loc[cv_splits_test[i], :])

## Extract BoW

In [14]:
re_tok = re.compile('([{}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(string.punctuation))
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [15]:
n = train_all.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )
train_term_doc = vec.fit_transform(train_all['comment_text'])

In [16]:
X_train_features_bow = []
X_test_features_bow = []
for i in range(cv_folds):
    X_train_features_bow.append(train_term_doc[cv_splits_train[i], :])
    X_test_features_bow.append(train_term_doc[cv_splits_test[i], :])

# Function to calculate mean column-wise log loss of y_pred vs y_actual

In [17]:
def calculate_score(y_actual, y_pred):
    return np.mean([log_loss(np.array(y_actual[label]),
                             np.array([1.-np.array(y_pred[label]), np.array(y_pred[label])]).T) for label in labels])

# Test classification scores

## Perfect score

In [18]:
calculate_score(y_test[0], y_test[0])

9.9920072216264108e-16

## ZeroR

In [19]:
scores_zeror = []
for i in range(cv_folds):
    data = np.array([np.zeros(len(labels))] * len(X_test[i]))
    y_pred_zeror = pd.DataFrame(data, columns=labels)
    scores_zeror.append(calculate_score(y_test[i], y_pred_zeror))
print(scores_zeror)
print(np.mean(scores_zeror))

[1.2656350955831603, 1.2407774653431667, 1.2816162202528159, 1.2738088112259711, 1.3026361676327827]
1.27289475201


## All 0.5

In [20]:
scores_half = []
for i in range(cv_folds):
    data = np.array([np.ones(len(labels))*0.5] * len(X_test[i]))
    y_pred_half = pd.DataFrame(data, columns=labels)
    scores_half.append(calculate_score(y_test[i], y_pred_half))
print(scores_half)
print(np.mean(scores_half))

[0.69314718055994529, 0.69314718055994529, 0.69314718055994529, 0.69314718055994529, 0.69314718055994529]
0.69314718056


## Textstat features only

In [21]:
# https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline-eda-0-052-lb#261316

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.linear_model import LogisticRegression

class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_x', '_y', '_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_x', '_y', '_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        # Store labels, X and y
        self._x = x
        self._y = y
        self._classes = unique_labels(y)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)
        self._r = np.log(pr(x,1,y) / pr(x,0,y))
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs)
        
        try:
            x_nb = x * self._r
        except:
            x_nb = x.multiply(self._r)
    
        self._clf.fit(x_nb, y)
        return self

In [28]:
classifiers_textstat = [
    
    ('NB-SVM', NbSvmClassifier()),
    
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Extra Trees", ExtraTreesClassifier(max_depth=5)),
    ("Gradient Boosting", GradientBoostingClassifier()),
    
    # Does not perform too well, may need hyper parameter tuning
    #     ("Nearest Neighbors", KNeighborsClassifier(131)),
    #     ("Naive Bayes", GaussianNB()),
    #     ("Neural Net", MLPClassifier(alpha=1)),
    #     ("AdaBoost", AdaBoostClassifier()),
    #     ("QDA", QuadraticDiscriminantAnalysis()),
    
    # Cannot even run
    #     ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0)))  # Memory error??? Even with 32GB ram???
    #     ("Linear SVM", SVC(kernel="linear", C=0.025)),                   # Slow like shit
    #     ("RBF SVM", SVC(gamma=2, C=1))                                   # Slow like shit
]

In [29]:
clf_textstat = {}
y_pred_textstat = {}
scores_textstat = {}
for classifier_name, classifier in classifiers_textstat:
    print('Training with {}'.format(classifier_name))
    clf_textstat[classifier_name] = [{}]*cv_folds
    y_pred_textstat[classifier_name] = [{}]*cv_folds
    scores_textstat[classifier_name] = [{}]*cv_folds
    for fold in range(cv_folds):
        for label in labels:
            clf_textstat[classifier_name][fold][label] = classifier
            clf_textstat[classifier_name][fold][label].fit(X_train_features_textstat[fold], y_train[fold][label])

        y_pred_textstat[classifier_name][fold] = pd.DataFrame()
        for label in labels:
            y_pred_textstat[classifier_name][fold][label] = clf_textstat[classifier_name][fold][label].predict_proba(X_test_features_textstat[fold]).T[1]

        scores_textstat[classifier_name][fold] = calculate_score(y_test[fold], y_pred_textstat[classifier_name][fold])
    print('Column-wise log loss for {}: {} - {}'.format(classifier_name, scores_textstat[classifier_name], np.mean(scores_textstat[classifier_name])))

Training with NB-SVM
Column-wise log loss for NB-SVM: [0.18095665548136211, 0.17882501320839905, 0.18230149530739689, 0.18217079958540527, 0.18592839572795675] - 0.182036471862104
Training with Decision Tree
Column-wise log loss for Decision Tree: [0.18957052605820379, 0.17613638822970826, 0.1793007937950295, 0.18664318223035425, 0.22541131934805081] - 0.1914124419322693
Training with Random Forest
Column-wise log loss for Random Forest: [0.17240761235645463, 0.17024284803354539, 0.17496624440305722, 0.1744611051823873, 0.1716097767158847] - 0.17273751733826587
Training with Extra Trees
Column-wise log loss for Extra Trees: [0.17893788975208857, 0.17846357096391977, 0.18201174008308574, 0.17996148024629258, 0.18446533344684615] - 0.18076800289844655
Training with Gradient Boosting
Column-wise log loss for Gradient Boosting: [0.17491601261758297, 0.17192840005695872, 0.1770418719005408, 0.17536055567933465, 0.19379075699436746] - 0.1786075194497569


## Bag of words only

In [30]:
classifiers_bow = [
    
    ('NB-SVM 2', NbSvmClassifier(C=4, dual=True)),
    ('NB-SVM', NbSvmClassifier()),
    
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Extra Trees", ExtraTreesClassifier(max_depth=5)),
    
    #     ("Gradient Boosting", GradientBoostingClassifier()),           # Sloooow
    
    # Does not perform too well, may need hyper parameter tuning
    #     ("Nearest Neighbors", KNeighborsClassifier(131)),
    #     ("Naive Bayes", GaussianNB()),
    #     ("Neural Net", MLPClassifier(alpha=1)),
    #     ("AdaBoost", AdaBoostClassifier()),
    #     ("QDA", QuadraticDiscriminantAnalysis()),
    
    # Cannot even run
    #     ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0)))  # Memory error??? Even with 32GB ram???
    #     ("Linear SVM", SVC(kernel="linear", C=0.025)),                   # Slow like shit
    #     ("RBF SVM", SVC(gamma=2, C=1))                                   # Slow like shit
]

In [31]:
clf_bow = {}
y_pred_bow = {}
scores_bow = {}
for classifier_name, classifier in classifiers_bow:
    print('Training with {}'.format(classifier_name))
    clf_bow[classifier_name] = [{}]*cv_folds
    y_pred_bow[classifier_name] = [{}]*cv_folds
    scores_bow[classifier_name] = [{}]*cv_folds
    for fold in range(cv_folds):
        print('Fold {}'.format(fold))
        for label in labels:
            clf_bow[classifier_name][fold][label] = classifier
            clf_bow[classifier_name][fold][label].fit(X_train_features_bow[fold], y_train[fold][label])

        y_pred_bow[classifier_name][fold] = pd.DataFrame()
        for label in labels:
            y_pred_bow[classifier_name][fold][label] = clf_bow[classifier_name][fold][label].predict_proba(X_test_features_bow[fold]).T[1]

        scores_bow[classifier_name][fold] = calculate_score(y_test[fold], y_pred_bow[classifier_name][fold])
    print('Column-wise log loss for {}: {} - {}'.format(classifier_name, scores_bow[classifier_name], np.mean(scores_bow[classifier_name])))

Training with NB-SVM 2
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for NB-SVM 2: [0.19844112468989492, 0.19392082828970536, 0.20404326571297679, 0.20250041765877583, 0.20624392623515853] - 0.2010299125173023
Training with NB-SVM
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for NB-SVM: [0.16948686654519271, 0.16559772931639868, 0.17308711286704992, 0.17281802990400899, 0.17602815742741826] - 0.1714035792120137
Training with Decision Tree
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Decision Tree: [0.2035290665424109, 0.18481231106222365, 0.19680163652584448, 0.19196176863100647, 0.19546617070047986] - 0.19451419069239306
Training with Random Forest
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Random Forest: [0.18285599858864762, 0.1796933537266997, 0.18518115357689954, 0.18389392534292792, 0.18812967836177996] - 0.18395082191939097
Training with Extra Trees
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Extra Trees: [0.18