In [96]:
import numpy as np
import pandas as pd
from textstat.textstat import textstat
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Relative paths of the competition CSV files.
train_file, test_file, sample_submission_file = (
    '../data/train.csv',
    '../data/test.csv',
    '../data/sample_submission.csv',
)

# Read each file into its own DataFrame, in the same order as the paths above.
train_all, test_for_submission, sample_submission = map(
    pd.read_csv, (train_file, test_file, sample_submission_file)
)

# Split train data

In [3]:
# Hold out 20% of the labelled rows as a local test set. The fixed seed keeps
# the split (and every score computed from it) identical across
# "Restart Kernel -> Run All" executions.
train, test = train_test_split(train_all, test_size=0.2, random_state=42)

Count the positive examples per label and check that the split leaves the label percentages roughly equal in train and test.

In [4]:
# Target columns; a separate prediction is made and scored for each one.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [5]:
def print_count_of_each_label(df, label_names=None):
    """Print how many rows of ``df`` are positive (== 1) for each label.

    One line is printed per label in the form
    ``name: positives / rows (percent%)``; names are right-aligned to the
    longest one so that the colons line up.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column per label.
    label_names : sequence of str, optional
        Columns to report on. Defaults to the notebook-level ``labels`` list.
    """
    if label_names is None:
        label_names = labels

    # Pad to the longest name; padding to the *last* name only lines up when
    # the last name happens to be the longest one.
    width = max((len(name) for name in label_names), default=0)
    n_rows = len(df)

    for name in label_names:
        n_positive = int((df[name] == 1).sum())
        # An empty frame has no meaningful percentage; report 0 instead of
        # dividing by zero.
        percent = np.round(n_positive / n_rows * 100, 3) if n_rows else 0.0
        print('{}: {} / {} ({}%)'.format(name.rjust(width), n_positive, n_rows, percent))

# Report the label balance of each split under its own heading.
for split_name, split_df in (('train', train), ('test', test)):
    print(split_name)
    print_count_of_each_label(split_df)

train
        toxic: 7416 / 76680 (9.671%)
 severe_toxic: 782 / 76680 (1.02%)
      obscene: 4076 / 76680 (5.316%)
       threat: 240 / 76680 (0.313%)
       insult: 3840 / 76680 (5.008%)
identity_hate: 670 / 76680 (0.874%)
test
        toxic: 1821 / 19171 (9.499%)
 severe_toxic: 183 / 19171 (0.955%)
      obscene: 1033 / 19171 (5.388%)
       threat: 65 / 19171 (0.339%)
       insult: 925 / 19171 (4.825%)
identity_hate: 144 / 19171 (0.751%)


# Split train/test into X and y

Separate the input text (X) from the label columns (y) so that we can use sklearn classifiers easily.

In [6]:
# Inputs: the comment text, kept as a one-column DataFrame.
X_train = train[["comment_text"]]
X_test = test[["comment_text"]]

# Targets: one column per entry of `labels`.
y_train = train[labels]
y_test = test[labels]

In [7]:
# Sanity check: X and y of the same split should have matching row counts.
print('{} {}'.format(X_train.shape, y_train.shape))
print('{} {}'.format(X_test.shape, y_test.shape))

(76680, 1) (76680, 6)
(19171, 1) (19171, 6)


# Function to calculate mean column-wise log loss of y_pred vs y_actual

In [8]:
# Peek at the first five rows of the target frame that the scorer will consume.
y_test.head(n=5)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
10562,0,0,0,0,0,0
1045,0,0,0,0,0,0
17460,1,0,1,0,0,0
40058,0,0,0,0,0,0
45940,0,0,0,0,0,0


In [9]:
def calculate_score(y_actual, y_pred):
    """Return the mean of the per-label binary log losses.

    Parameters
    ----------
    y_actual : pandas.DataFrame
        True 0/1 values, with one column per entry of the notebook-level
        ``labels`` list.
    y_pred : pandas.DataFrame
        Predicted probability of the positive class, with the same columns.
        Rows are matched to ``y_actual`` by position, not by index, because
        both frames are converted to plain arrays.

    Returns
    -------
    float
        Average over all labels of ``sklearn.metrics.log_loss``.
    """
    per_label_losses = []
    for label in labels:
        p_positive = np.asarray(y_pred[label], dtype=float)
        # log_loss takes one probability column per class: [P(0), P(1)].
        probabilities = np.column_stack([1.0 - p_positive, p_positive])
        # Naming both classes explicitly keeps log_loss from raising when a
        # rare label has only one class present in ``y_actual``.
        per_label_losses.append(
            log_loss(np.asarray(y_actual[label]), probabilities, labels=[0, 1])
        )
    return np.mean(per_label_losses)

# Test classification scores

## Perfect score

In [10]:
# Feeding the true labels back in as predictions gives the best attainable score.
calculate_score(y_actual=y_test, y_pred=y_test)

9.9920072216264108e-16

## ZeroR

In [11]:
# ZeroR baseline: probability 0 for every label of every test comment.
data = np.zeros((len(X_test), len(labels)))
y_pred_zeror = pd.DataFrame(data=data, columns=labels)
calculate_score(y_actual=y_test, y_pred=y_pred_zeror)

1.2524232464240479

## All 0.5

In [12]:
# Uninformed baseline: probability 0.5 for every label of every test comment.
data = np.full((len(X_test), len(labels)), 0.5)
y_pred_half = pd.DataFrame(data=data, columns=labels)
calculate_score(y_actual=y_test, y_pred=y_pred_half)

0.69314718055994529

## Textstat features only

In [13]:
def extract_features(df):
    """Build numeric text-statistics features from ``df['comment_text']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``comment_text`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``, with one column per statistic (character length,
        lexicon, syllable and sentence counts, Flesch reading ease,
        Flesch-Kincaid grade) plus the syllables-per-word ratio.
    """
    text = df['comment_text']

    # Output column name -> function applied to every comment.
    per_comment_stats = {
        'comment_text_len': len,
        'comment_text_lex_count': textstat.lexicon_count,
        'comment_text_syl_count': textstat.syllable_count,
        'comment_text_sent_count': textstat.sentence_count,
        'comment_text_flesch_reading_ease': textstat.flesch_reading_ease,
        'comment_text_flesch_kincaid_grade': textstat.flesch_kincaid_grade,
    }

    features_df = pd.DataFrame(index=df.index)
    for column, stat in per_comment_stats.items():
        features_df[column] = text.apply(stat)

    # A comment with a lexicon count of zero makes this ratio NaN (0/0) or
    # inf (n/0), and sklearn's input validation rejects non-finite values.
    # Fall back to 0 for those rows.
    ratio = features_df['comment_text_syl_count'] / features_df['comment_text_lex_count']
    features_df['comment_text_syl_over_lex'] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0.0)

    return features_df

In [14]:
# Textstat feature matrix for the training comments.
X_train_features_textstat = extract_features(df=X_train)

In [15]:
# Textstat feature matrix for the held-out comments.
X_test_features_textstat = extract_features(df=X_test)

### KNN, with a magic K

In [91]:
# One independent 131-nearest-neighbours classifier per label, trained on the
# textstat features. fit() returns the estimator, so it is stored directly.
clf_knn = {
    label: KNeighborsClassifier(n_neighbors=131).fit(X_train_features_textstat, y_train[label])
    for label in labels
}

In [92]:
# Keep column 1 of predict_proba for each label: the probability of class 1,
# assuming both classes 0 and 1 were present when the classifier was fitted.
y_pred_knn = pd.DataFrame({
    label: clf_knn[label].predict_proba(X_test_features_textstat)[:, 1]
    for label in labels
})

In [93]:
# Mean per-label log loss of the KNN predictions on the held-out split.
calculate_score(y_actual=y_test, y_pred=y_pred_knn)

0.15745454720171975

### Naive Bayes

In [97]:
# One independent Gaussian naive Bayes classifier per label, trained on the
# textstat features. fit() returns the estimator, so it is stored directly.
clf_nb = {
    label: GaussianNB().fit(X_train_features_textstat, y_train[label])
    for label in labels
}

In [98]:
# Keep column 1 of predict_proba for each label: the probability of class 1,
# assuming both classes 0 and 1 were present when the classifier was fitted.
y_pred_nb = pd.DataFrame({
    label: clf_nb[label].predict_proba(X_test_features_textstat)[:, 1]
    for label in labels
})

In [100]:
# Mean per-label log loss of the naive Bayes predictions on the held-out split.
calculate_score(y_actual=y_test, y_pred=y_pred_nb)

0.56757429568583684