In [1]:
import os
import numpy as np
import pandas as pd

from textstat.textstat import textstat

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from scipy.sparse import hstack

# Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from NbSvmClassifier import NbSvmClassifier

# BoW feature extraction
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
sample_submission_file = '../data/sample_submission.csv'

train_all = pd.read_csv(train_file)
test_for_submission = pd.read_csv(test_file)
sample_submission = pd.read_csv(sample_submission_file)

# Clean data

TODO: Account for more cases

In [3]:
train_all['comment_text'].fillna("unknown", inplace=True)
test_for_submission['comment_text'].fillna("unknown", inplace=True)

# Split train data

In [4]:
kf = KFold(n_splits=5, random_state=0)
cv_splits_train = []
cv_splits_test = []
for train_index, test_index in kf.split(train_all):
    cv_splits_train.append(train_index)
    cv_splits_test.append(test_index)
cv_folds = len(cv_splits_train)

In [5]:
cv_train = []
cv_test = []
for i in range(cv_folds):
    cv_train.append(train_all.loc[cv_splits_train[i], :])
    cv_test.append(train_all.loc[cv_splits_test[i], :])

In [6]:
train = cv_train
test = cv_test

Count and check that the data is split such that the percentages of labels in train/test are roughly equal

In [7]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [8]:
def print_count_of_each_label(df):
    for label in labels:
        print('{}: {} / {} ({}%)'.format(label.rjust(len(labels[-1])),
                                         df.loc[df[label] == 1].shape[0],
                                         len(df),
                                         np.round(df.loc[df[label] == 1].shape[0]/len(df)*100, 3)))

for i in range(cv_folds):
    print('CV train {}'.format(i))
    print_count_of_each_label(cv_train[i])
    print('CV test {}'.format(i))
    print_count_of_each_label(cv_test[i])

CV train 0
        toxic: 7366 / 76680 (9.606%)
 severe_toxic: 792 / 76680 (1.033%)
      obscene: 4108 / 76680 (5.357%)
       threat: 234 / 76680 (0.305%)
       insult: 3826 / 76680 (4.99%)
identity_hate: 654 / 76680 (0.853%)
CV test 0
        toxic: 1871 / 19171 (9.76%)
 severe_toxic: 173 / 19171 (0.902%)
      obscene: 1001 / 19171 (5.221%)
       threat: 71 / 19171 (0.37%)
       insult: 939 / 19171 (4.898%)
identity_hate: 160 / 19171 (0.835%)
CV train 1
        toxic: 7413 / 76681 (9.667%)
 severe_toxic: 772 / 76681 (1.007%)
      obscene: 4136 / 76681 (5.394%)
       threat: 244 / 76681 (0.318%)
       insult: 3841 / 76681 (5.009%)
identity_hate: 657 / 76681 (0.857%)
CV test 1
        toxic: 1824 / 19170 (9.515%)
 severe_toxic: 193 / 19170 (1.007%)
      obscene: 973 / 19170 (5.076%)
       threat: 61 / 19170 (0.318%)
       insult: 924 / 19170 (4.82%)
identity_hate: 157 / 19170 (0.819%)
CV train 2
        toxic: 7397 / 76681 (9.646%)
 severe_toxic: 764 / 76681 (0.996%)
      o

# Split train/test into X and y

So that we can use sklearn classifiers easily

In [9]:
X_train = [None] * cv_folds
y_train = [None] * cv_folds
X_test = [None] * cv_folds
y_test = [None] * cv_folds
for i in range(cv_folds):
    X_train[i], y_train[i] = train[i][["comment_text"]], train[i][labels]
    X_test[i], y_test[i] = test[i][["comment_text"]], test[i][labels]

In [10]:
print(X_train[0].shape, y_train[0].shape)
print(X_test[0].shape, y_test[0].shape)

(76680, 1) (76680, 6)
(19171, 1) (19171, 6)


# Extract features

## Extract textstat features

In [11]:
def extract_textstat_features(df):
    features_df = pd.DataFrame()
    features_df['comment_text_len'] = df['comment_text'].apply(len)
    features_df['comment_text_lex_count'] = df['comment_text'].apply(textstat.lexicon_count)
    features_df['comment_text_syl_count'] = df['comment_text'].apply(textstat.syllable_count)
    features_df['comment_text_sent_count'] = df['comment_text'].apply(textstat.sentence_count)
    features_df['comment_text_flesch_reading_ease'] = df['comment_text'].apply(textstat.flesch_reading_ease)
    features_df['comment_text_flesch_kincaid_grade'] = df['comment_text'].apply(textstat.flesch_kincaid_grade)
    
    features_df['comment_text_syl_over_lex'] = features_df['comment_text_syl_count'] / features_df['comment_text_lex_count']
    features_df['comment_text_lex_over_sent'] = features_df['comment_text_lex_count'] / features_df['comment_text_sent_count']
    
#         df.loc[:,'comment_len'] = df.loc[:,'comment_text'].apply(len)
#         df.loc[:,'comment_avg_syllable'] = df.loc[:,'comment_text'].apply(average_syllable)
#         df.loc[:,'comment_syllable'] = df.loc[:,'comment_text'].apply(textstat.syllable_count)
#         df.loc[:,'comment_num_sent'] = df.loc[:, 'comment_text'].apply(textstat.sentence_count)
#         df.loc[:, "comment_word_per_sent"] = df.loc[:, "comment_text"].apply(lambda x: textstat.lexicon_count(x) / textstat.sentence_count(x))
#         df.loc[:,'comment_flesch_reading_ease'] = df.loc[:,'comment_text'].apply(textstat.flesch_reading_ease)
#         df.loc[:,'comment_flesch_kincaid_grade'] = df.loc[:,'comment_text'].apply(textstat.flesch_kincaid_grade)
    
    return features_df

In [12]:
train_textstat_features_file = '../data/train_textstat_features.csv'
if os.path.isfile(train_textstat_features_file):
    X_train_all_features_textstat = pd.read_csv(train_textstat_features_file, index_col=0)
else:
    X_train_all_features_textstat = extract_textstat_features(train_all)
    X_train_all_features_textstat.to_csv(train_textstat_features_file)

## Split textstat features

In [13]:
X_train_features_textstat = []
X_test_features_textstat = []
for i in range(cv_folds):
    X_train_features_textstat.append(X_train_all_features_textstat.loc[cv_splits_train[i], :])
    X_test_features_textstat.append(X_train_all_features_textstat.loc[cv_splits_test[i], :])

## Extract BoW

In [14]:
re_tok = re.compile('([{}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(string.punctuation))
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [15]:
n = train_all.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )
train_term_doc = vec.fit_transform(train_all['comment_text'])

In [16]:
X_train_features_bow = []
X_test_features_bow = []
for i in range(cv_folds):
    X_train_features_bow.append(train_term_doc[cv_splits_train[i], :])
    X_test_features_bow.append(train_term_doc[cv_splits_test[i], :])

## Extract emotion scores

In [17]:
emotion_lexicon_file = "../data/features/NRC-AffectIntensity-Lexicon.txt"
emotion_lexicon = pd.read_csv(emotion_lexicon_file, sep = "\t")

In [18]:
emotion_term_score = {'anger': {}, 'fear': {}, 'joy': {}, 'sadness': {}}
for row in emotion_lexicon.itertuples():
    emotion_term_score[row.AffectDimension][row.term] = row.score

In [19]:
translator = str.maketrans('', '', string.punctuation)
emotions = ['anger', 'fear', 'joy', 'sadness']
def avg_emotion_score(comment_text):
    try:
        comment_cleaned = comment_text.translate(translator)
    except:
        comment_cleaned = ""
    comment_cleaned = comment_cleaned.lower()
    comment_cleaned_words = comment_cleaned.split(" ")
    
    emotion_scores = {'anger': 0, 'fear': 0, 'joy': 0, 'sadness': 0}
    for emotion in emotions:
        scores = [emotion_term_score[emotion].get(word) for word in comment_cleaned_words
                    if emotion_term_score[emotion].get(word) is not None]
        if len(scores) == 0:
            continue
        emotion_scores[emotion] = np.mean(scores)
        
    return [emotion_scores[emotion] for emotion in emotions]
    
def extract_emotion_features(df):
    features_df = df['comment_text'].apply(avg_emotion_score)
    return pd.DataFrame(features_df.values.tolist(), columns=['comment_text_emotion_{}'.format(emotion) for emotion in emotions])

In [20]:
X_train_all_features_emotion = extract_emotion_features(train_all)

In [21]:
X_train_features_emotion = []
X_test_features_emotion = []
for i in range(cv_folds):
    X_train_features_emotion.append(X_train_all_features_emotion.loc[cv_splits_train[i], :])
    X_test_features_emotion.append(X_train_all_features_emotion.loc[cv_splits_test[i], :])

# Function to calculate mean column-wise log loss of y_pred vs y_actual

In [22]:
def calculate_score(y_actual, y_pred):
    col_log_loss = [log_loss(np.array(y_actual[label]),
                             np.array([1.-np.array(y_pred[label]), np.array(y_pred[label])]).T) for label in labels]
    return col_log_loss

# Test classification scores

## Perfect score

In [23]:
np.mean(calculate_score(y_test[0], y_test[0]))

9.9920072216264108e-16

## ZeroR

In [24]:
scores_zeror = []
for i in range(cv_folds):
    data = np.array([np.zeros(len(labels))] * len(X_test[i]))
    y_pred_zeror = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_zeror)
    scores_zeror.append(scores)
print(np.mean(scores_zeror, axis=0))
print(np.mean(scores_zeror))

[ 3.32844347  0.34772673  1.84096823  0.10990296  1.71701177  0.29331534]
1.27289475201


## All 0.5

In [25]:
scores_half = []
for i in range(cv_folds):
    data = np.array([np.ones(len(labels))*0.5] * len(X_test[i]))
    y_pred_half = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_half)
    scores_half.append(scores)
print(np.mean(scores_half, axis=0))
print(np.mean(scores_half))

[ 0.69314718  0.69314718  0.69314718  0.69314718  0.69314718  0.69314718]
0.69314718056


## Test calculate_score

In [26]:
scores_half = []
for i in range(cv_folds):
    data = np.array([np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1.0])] * len(X_test[i]))
    y_pred_half = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_half)
    scores_half.append(scores)
print(np.mean(scores_half, axis=0))
print(np.mean(scores_half))

[  0.69314718   0.91220862   1.15881057   1.6050267    2.19335539
  34.24546105]
6.80133491769


## Textstat features only

In [27]:
classifiers_textstat = []
# classifiers_textstat += [
#     ('Logistic Regression {}'.format(c), lambda: LogisticRegression(solver="newton-cg", C=2.0**c, max_iter=1000)) for c in np.arange(-3,4,1)]
classifiers_textstat += [
    ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    
    ('NB-SVM', lambda: NbSvmClassifier()),
    
    ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
    ("Gradient Boosting", lambda: GradientBoostingClassifier()),
    
    # Does not perform too well, may need hyper parameter tuning
    #     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
    #     ("Naive Bayes", lambda: GaussianNB()),
    #     ("Neural Net", lambda: MLPClassifier(alpha=1)),
    #     ("AdaBoost", lambda: AdaBoostClassifier()),
    #     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
    
    # Cannot even run
    #     ("Gaussian Process", lambda: GaussianProcessClassifier(1.0 * RBF(1.0)))  # Memory error??? Even with 32GB ram???
    #     ("Linear SVM", lambda: SVC(kernel="linear", C=0.025)),                   # Slow like shit
    #     ("RBF SVM", lambda: SVC(gamma=2, C=1))                                   # Slow like shit
]

In [33]:
clf_textstat = {}
y_pred_textstat = {}
scores_textstat = {}
for classifier_name, classifier in classifiers_textstat:
    print('Training with {}'.format(classifier_name))
    clf_textstat[classifier_name] = [{}]*cv_folds
    y_pred_textstat[classifier_name] = [{}]*cv_folds
    scores_textstat[classifier_name] = [{}]*cv_folds
    for fold in range(cv_folds):
        y_pred_textstat[classifier_name][fold] = pd.DataFrame()
        for label in labels:
            clf_textstat[classifier_name][fold][label] = classifier()
            clf_textstat[classifier_name][fold][label].fit(X_train_features_textstat[fold], y_train[fold][label])
            
            y_pred_textstat[classifier_name][fold][label] = clf_textstat[classifier_name][fold][label].predict_proba(X_test_features_textstat[fold]).T[1]
            
        scores = calculate_score(y_test[fold], y_pred_textstat[classifier_name][fold])
        scores_textstat[classifier_name][fold] = scores
    print('Column-wise log loss for {}: {} - {}'.format(classifier_name, np.mean(scores_textstat[classifier_name], axis=0), np.mean(scores_textstat[classifier_name])))

Training with Logistic Regression


  np.exp(prob, prob)


Column-wise log loss for Logistic Regression: [ 0.30794229  0.05469084  0.20307188  0.02067343  0.19241465  0.04878659] - 0.13792994630312155
Training with NB-SVM
Column-wise log loss for NB-SVM: [ 0.3088186   0.05477122  0.2064551   0.02241579  0.19257606  0.04878047] - 0.1389695399837736
Training with Decision Tree
Column-wise log loss for Decision Tree: [ 0.29664338  0.05876473  0.19443927  0.02251912  0.18707125  0.05133406] - 0.13512863452000448
Training with Random Forest
Column-wise log loss for Random Forest: [ 0.28864478  0.05000099  0.18833004  0.01977042  0.18025965  0.0464014 ] - 0.12890121405005844
Training with Extra Trees
Column-wise log loss for Extra Trees: [ 0.30802356  0.05347133  0.20226986  0.0211011   0.19282647  0.04805904] - 0.13762522740427266
Training with Gradient Boosting
Column-wise log loss for Gradient Boosting: [ 0.28496513  0.05051078  0.18617237  0.02208947  0.17856005  0.04840794] - 0.1284509573612043


# Emotion only

In [34]:
classifiers_emotion = []
classifiers_emotion += [
    ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    
    ('NB-SVM', lambda: NbSvmClassifier()),
    
    ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
    ("Gradient Boosting", lambda: GradientBoostingClassifier()),
    
    # Does not perform too well, may need hyper parameter tuning
    #     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
    #     ("Naive Bayes", lambda: GaussianNB()),
    #     ("Neural Net", lambda: MLPClassifier(alpha=1)),
    #     ("AdaBoost", lambda: AdaBoostClassifier()),
    #     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
    
]

In [35]:
clf_emotion = {}
y_pred_emotion = {}
scores_emotion = {}
for classifier_name, classifier in classifiers_emotion:
    print('Training with {}'.format(classifier_name))
    clf_emotion[classifier_name] = [{}]*cv_folds
    y_pred_emotion[classifier_name] = [{}]*cv_folds
    scores_emotion[classifier_name] = [{}]*cv_folds
    for fold in range(cv_folds):
        y_pred_emotion[classifier_name][fold] = pd.DataFrame()
        for label in labels:
            clf_emotion[classifier_name][fold][label] = classifier()
            clf_emotion[classifier_name][fold][label].fit(X_train_features_emotion[fold], y_train[fold][label])

            y_pred_emotion[classifier_name][fold][label] = clf_emotion[classifier_name][fold][label].predict_proba(X_test_features_emotion[fold]).T[1]

        scores = calculate_score(y_test[fold], y_pred_emotion[classifier_name][fold])
        scores_emotion[classifier_name][fold] = scores
    print('Column-wise log loss for {}: {} - {}'.format(classifier_name, np.mean(scores_emotion[classifier_name], axis=0), np.mean(scores_emotion[classifier_name])))

Training with Logistic Regression
Column-wise log loss for Logistic Regression: [ 0.30281782  0.05346437  0.19816069  0.01778966  0.18850244  0.04790268] - 0.13477294240101637
Training with NB-SVM
Column-wise log loss for NB-SVM: [ 0.30284635  0.05354436  0.19828223  0.0178291   0.18855963  0.04798816] - 0.13484163988018344
Training with Decision Tree
Column-wise log loss for Decision Tree: [ 0.29119536  0.05792187  0.1897252   0.02283176  0.18021672  0.05319166] - 0.1325137627182451
Training with Random Forest
Column-wise log loss for Random Forest: [ 0.28915403  0.05011628  0.18697609  0.01669613  0.17832108  0.04682818] - 0.1280152994526549
Training with Extra Trees
Column-wise log loss for Extra Trees: [ 0.2965821   0.05189271  0.19287636  0.01693412  0.18453332  0.04727002] - 0.1316814358144425
Training with Gradient Boosting
Column-wise log loss for Gradient Boosting: [ 0.28031718  0.05082792  0.17995954  0.02624012  0.17276956  0.04804572] - 0.1263600061065549


## Textstat + emotion

In [36]:
classifiers_textstat_emotion = []
classifiers_textstat_emotion += [
    ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    
    ('NB-SVM', lambda: NbSvmClassifier()),
    
    ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Extra Trees", lambda: ExtraTreeslambda: Classifier(max_depth=5)),
    ("Gradient Boosting", lambda: GradientBoostingClassifier()),
    
    # Does not perform too well, may need hyper parameter tuning
    #     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
    #     ("Naive Bayes", lambda: GaussianNB()),
    #     ("Neural Net", lambda: MLPClassifier(alpha=1)),
    #     ("AdaBoost", lambda: AdaBoostClassifier()),
    #     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
    
]

In [37]:
# concat features
X_train_features_textstat_emotion = [None]*cv_folds
X_test_features_textstat_emotion = [None]*cv_folds
for fold in range(cv_folds):
    X_train_features_textstat_emotion[fold] = np.hstack((X_train_features_textstat[fold], X_train_features_emotion[fold].values))
    X_test_features_textstat_emotion[fold] = np.hstack((X_test_features_textstat[fold], X_test_features_emotion[fold].values))

print(X_train_features_textstat[0].shape)
print(X_train_features_emotion[0].shape)
print(X_train_features_textstat_emotion[0].shape)

(76680, 8)
(76680, 4)
(76680, 12)


In [38]:
clf_textstat_emotion = {}
y_pred_textstat_emotion = {}
scores_textstat_emotion = {}
for classifier_name, classifier in classifiers_textstat_emotion:
    print('Training with {}'.format(classifier_name))
    clf_textstat_emotion[classifier_name] = [{}]*cv_folds
    y_pred_textstat_emotion[classifier_name] = [{}]*cv_folds
    scores_textstat_emotion[classifier_name] = [{}]*cv_folds
    for fold in range(cv_folds):
        y_pred_textstat_emotion[classifier_name][fold] = pd.DataFrame()
        for label in labels:
            clf_textstat_emotion[classifier_name][fold][label] = classifier()
            clf_textstat_emotion[classifier_name][fold][label].fit(X_train_features_textstat_emotion[fold], y_train[fold][label])

            y_pred_textstat_emotion[classifier_name][fold][label] = clf_textstat_emotion[classifier_name][fold][label].predict_proba(X_test_features_textstat_emotion[fold]).T[1]

        scores = calculate_score(y_test[fold], y_pred_textstat_emotion[classifier_name][fold])
        scores_textstat_emotion[classifier_name][fold] = scores
    print('Column-wise log loss for {}: {} - {}'.format(classifier_name, np.mean(scores_textstat_emotion[classifier_name], axis=0), np.mean(scores_textstat_emotion[classifier_name])))

Training with Logistic Regression




Column-wise log loss for Logistic Regression: [ 0.29033679  0.05177318  0.19207388  0.01697266  0.18157816  0.04752037] - 0.13004250616136864
Training with NB-SVM
Column-wise log loss for NB-SVM: [ 0.29464626  0.05191786  0.19800408  0.02284097  0.18207067  0.04755327] - 0.13283885191981842
Training with Decision Tree
Column-wise log loss for Decision Tree: [ 0.27283879  0.05568164  0.17735648  0.02070593  0.17172848  0.05227849] - 0.12509830237204234
Training with Random Forest
Column-wise log loss for Random Forest: [ 0.27397721  0.04776043  0.17894093  0.01656611  0.17107763  0.04546845] - 0.12229846011036805
Training with Extra Trees
Column-wise log loss for Extra Trees: [ 0.29267864  0.05099159  0.19224774  0.01751213  0.18423401  0.04737975] - 0.13084064269916476
Training with Gradient Boosting
Column-wise log loss for Gradient Boosting: [ 0.2547657   0.04662796  0.1659162   0.02121334  0.16029433  0.04724113] - 0.1160097776950937


## Bag of words only

In [39]:
classifiers_bow = [
    ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    
    ('NB-SVM 2', lambda: NbSvmClassifier(C=4, dual=True)),
    ('NB-SVM', lambda: NbSvmClassifier()),
    
    ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
    
    #     ("Gradient Boosting", lambda: GradientBoostingClassifier()),           # Sloooow
    
    # Does not perform too well, may need hyper parameter tuning
    #     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
    #     ("Naive Bayes", lambda: GaussianNB()),
    #     ("Neural Net", lambda: MLPClassifier(alpha=1)),
    #     ("AdaBoost", lambda: AdaBoostClassifier()),
    #     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
]

In [40]:
clf_bow = {}
y_pred_bow = {}
scores_bow = {}
for classifier_name, classifier in classifiers_bow:
    print('Training with {}'.format(classifier_name))
    clf_bow[classifier_name] = [{}]*cv_folds
    y_pred_bow[classifier_name] = [{}]*cv_folds
    scores_bow[classifier_name] = [{}]*cv_folds
    for fold in range(cv_folds):
        print('Fold {}'.format(fold))
        y_pred_bow[classifier_name][fold] = pd.DataFrame()
        for label in labels:
            clf_bow[classifier_name][fold][label] = classifier()
            clf_bow[classifier_name][fold][label].fit(X_train_features_bow[fold], y_train[fold][label])

            y_pred_bow[classifier_name][fold][label] = clf_bow[classifier_name][fold][label].predict_proba(X_test_features_bow[fold]).T[1]

        scores = calculate_score(y_test[fold], y_pred_bow[classifier_name][fold])
        scores_bow[classifier_name][fold] = scores
    print('Column-wise log loss for {}: {} - {}'.format(classifier_name, np.mean(scores_bow[classifier_name], axis=0), np.mean(scores_bow[classifier_name])))

Training with Logistic Regression
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Logistic Regression: [ 0.12359399  0.02828408  0.07265129  0.01201408  0.08213134  0.02879822] - 0.05791216907554267
Training with NB-SVM 2
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for NB-SVM 2: [ 0.10573381  0.03360432  0.06352179  0.01242645  0.08139436  0.03061778] - 0.054549752566824365
Training with NB-SVM
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for NB-SVM: [ 0.10984278  0.02989426  0.06178523  0.01158265  0.07788519  0.0280709 ] - 0.053176833214058444
Training with Decision Tree
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Decision Tree: [ 0.25173574  0.04746888  0.13167932  0.02264682  0.14923202  0.04553562] - 0.10804973374567567
Training with Random Forest
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Random Forest: [ 0.3169088   0.05632756  0.20790394  0.02147864  0.19725372  0.04893481] - 0.14146791095929903
Training with E

## Bag of words + textstat

In [50]:
classifiers_bow_textstat = [
#     ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)), # Slooooow! Why??
    
#     ('NB-SVM 2', lambda: NbSvmClassifier(C=4, dual=True)),
#     ('NB-SVM', lambda: NbSvmClassifier()),
    
    ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
    
    #     ("Gradient Boosting", lambda: GradientBoostingClassifier()),           # Sloooow
    
    # Does not perform too well, may need hyper parameter tuning
    #     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
    #     ("Naive Bayes", lambda: GaussianNB()),
    #     ("Neural Net", lambda: MLPClassifier(alpha=1)),
    #     ("AdaBoost", lambda: AdaBoostClassifier()),
    #     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
]

In [51]:
# concat features
X_train_features_bow_textstat = [None]*cv_folds
X_test_features_bow_textstat = [None]*cv_folds
for fold in range(cv_folds):
    X_train_features_bow_textstat[fold] = hstack((X_train_features_bow[fold], X_train_features_textstat[fold].values))
    X_test_features_bow_textstat[fold] = hstack((X_test_features_bow[fold], X_test_features_textstat[fold].values))

print(X_train_features_bow[0].shape)
print(X_train_features_textstat[0].shape)
print(X_train_features_bow_textstat[0].shape)

(76680, 285100)
(76680, 8)
(76680, 285108)


In [52]:
clf_bow_textstat = {}
y_pred_bow_textstat = {}
scores_bow_textstat = {}

for classifier_name, classifier in classifiers_bow_textstat:
    print('Training with {}'.format(classifier_name))
    clf_bow_textstat[classifier_name] = [{}]*cv_folds
    y_pred_bow_textstat[classifier_name] = [{}]*cv_folds
    scores_bow_textstat[classifier_name] = [{}]*cv_folds
    for fold in range(cv_folds):
        print('Fold {}'.format(fold))
        y_pred_bow_textstat[classifier_name][fold] = pd.DataFrame()
        for label in labels:
            clf_bow_textstat[classifier_name][fold][label] = classifier()
            clf_bow_textstat[classifier_name][fold][label].fit(X_train_features_bow_textstat[fold], y_train[fold][label])

            y_pred_bow_textstat[classifier_name][fold][label] = clf_bow_textstat[classifier_name][fold][label].predict_proba(X_test_features_bow_textstat[fold]).T[1]

        scores = calculate_score(y_test[fold], y_pred_bow_textstat[classifier_name][fold])
        scores_bow_textstat[classifier_name][fold] = scores
    print('Column-wise log loss for {}: {} - {}'.format(classifier_name, np.mean(scores_bow_textstat[classifier_name], axis=0), np.mean(scores_bow_textstat[classifier_name])))

Training with Decision Tree
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Decision Tree: [ 0.25068726  0.04770229  0.13236065  0.0241218   0.14983967  0.04600826] - 0.10845332202417264
Training with Random Forest
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Random Forest: [ 0.31686288  0.05630328  0.2080952   0.02148042  0.1976552   0.04895261] - 0.14155826494408374
Training with Extra Trees
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Extra Trees: [ 0.31078971  0.05465875  0.20224768  0.02050163  0.18999523  0.04755232] - 0.1376242203721537


Training with Logistic Regression
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Logistic Regression: [ 0.12352003  0.02783171  0.07256627  0.01234592  0.08213656  0.0289028 ] - 0.057883879582133674
Training with NB-SVM 2
Fold 0
---------------------------------------------------------------------------
C:\Users\Low WeiLin\Documents\Code\toxic-comment-classification\src\NbSvmClassifier.py in pr(x, y_i, y)
     40 
     41         def pr(x, y_i, y):
---> 42             p = x[y==y_i].sum(0)
     43             return (p+1) / ((y==y_i).sum()+1)
     44         self._r = np.log(pr(x,1,y) / pr(x,0,y))

TypeError: only integer scalar arrays can be converted to a scalar index

## Bag of words + textstat + emotion

In [53]:
classifiers_bow_textstat_emotion = [
    ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)), # Slooooow! Why??
    
    ("Decision Tree", lambda: DecisionTreeClassifier()),
    ("Random Forest", lambda: RandomForestClassifier()),
    ("Extra Trees", lambda: ExtraTreesClassifier()),
    
    # Does not perform too well, may need hyper parameter tuning
    #     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
    #     ("Naive Bayes", lambda: GaussianNB()),
    #     ("Neural Net", lambda: MLPClassifier(alpha=1)),
    #     ("AdaBoost", lambda: AdaBoostClassifier()),
    #     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
]

In [54]:
# concat features
X_train_features_bow_textstat_emotion = [None]*cv_folds
X_test_features_bow_textstat_emotion = [None]*cv_folds
for fold in range(cv_folds):
    X_train_features_bow_textstat_emotion[fold] = hstack((X_train_features_bow[fold], X_train_features_textstat[fold].values, X_train_features_emotion[fold].values))
    X_test_features_bow_textstat_emotion[fold] = hstack((X_test_features_bow[fold], X_test_features_textstat[fold].values, X_test_features_emotion[fold].values))

print(X_train_features_bow[0].shape)
print(X_train_features_textstat[0].shape)
print(X_train_features_emotion[0].shape)
print(X_train_features_bow_textstat_emotion[0].shape)

(76680, 285100)
(76680, 8)
(76680, 4)
(76680, 285112)


In [55]:
clf_bow_textstat_emotion = {}
y_pred_bow_textstat_emotion = {}
scores_bow_textstat_emotion = {}

for classifier_name, classifier in classifiers_bow_textstat_emotion:
    print('Training with {}'.format(classifier_name))
    clf_bow_textstat_emotion[classifier_name] = [{}]*cv_folds
    y_pred_bow_textstat_emotion[classifier_name] = [{}]*cv_folds
    scores_bow_textstat_emotion[classifier_name] = [{}]*cv_folds
    for fold in range(cv_folds):
        print('Fold {}'.format(fold))
        y_pred_bow_textstat_emotion[classifier_name][fold] = pd.DataFrame()
        for label in labels:
            clf_bow_textstat_emotion[classifier_name][fold][label] = classifier()
            clf_bow_textstat_emotion[classifier_name][fold][label].fit(X_train_features_bow_textstat_emotion[fold], y_train[fold][label])

            y_pred_bow_textstat_emotion[classifier_name][fold][label] = clf_bow_textstat_emotion[classifier_name][fold][label].predict_proba(X_test_features_bow_textstat_emotion[fold]).T[1]

        scores = calculate_score(y_test[fold], y_pred_bow_textstat_emotion[classifier_name][fold])
        scores_bow_textstat_emotion[classifier_name][fold] = scores
    print('Column-wise log loss for {}: {} - {}'.format(classifier_name, np.mean(scores_bow_textstat_emotion[classifier_name], axis=0), np.mean(scores_bow_textstat_emotion[classifier_name])))

Training with Logistic Regression
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Logistic Regression: [ 0.12185255  0.02761395  0.0719265   0.01136909  0.08155415  0.02864128] - 0.05715958636594785
Training with Decision Tree
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Decision Tree: [ 2.18076677  0.45943146  0.93726288  0.15602615  1.39595742  0.40610127] - 0.92259099000155
Training with Random Forest
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Random Forest: [ 0.42593906  0.11274696  0.21027722  0.05634246  0.24891257  0.14090212] - 0.19918673038733836
Training with Extra Trees
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Column-wise log loss for Extra Trees: [ 0.38588522  0.10994773  0.18596305  0.05088959  0.24099076  0.13203048] - 0.18428447077487053
