In [1]:
import os
import numpy as np
import pandas as pd

from textstat.textstat import textstat

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from scipy.sparse import hstack

# Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from NbSvmClassifier import NbSvmClassifier
import xgboost as xgb


# BoW feature extraction
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim



In [2]:
train_file = '../data/train_cleaned.csv'
test_file = '../data/test_cleaned.csv'
sample_submission_file = '../data/sample_submission.csv'

train_all = pd.read_csv(train_file)
test_for_submission = pd.read_csv(test_file)
sample_submission = pd.read_csv(sample_submission_file)

# Clean data

TODO: Account for more cases

In [3]:
train_all['comment_text'].fillna("unknown", inplace=True)
test_for_submission['comment_text'].fillna("unknown", inplace=True)

# Split train data

In [4]:
kf = KFold(n_splits=5, random_state=0)
cv_splits_train = []
cv_splits_test = []
for train_index, test_index in kf.split(train_all):
    cv_splits_train.append(train_index)
    cv_splits_test.append(test_index)
cv_folds = len(cv_splits_train)

In [5]:
cv_train = []
cv_test = []
for i in range(cv_folds):
    cv_train.append(train_all.loc[cv_splits_train[i], :])
    cv_test.append(train_all.loc[cv_splits_test[i], :])

In [6]:
train = cv_train
test = cv_test

Count and check that the data is split such that the percentages of labels in train/test are roughly equal

In [7]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [8]:
def print_count_of_each_label(df):
    for label in labels:
        print('{}: {} / {} ({}%)'.format(label.rjust(len(labels[-1])),
                                         df.loc[df[label] == 1].shape[0],
                                         len(df),
                                         np.round(df.loc[df[label] == 1].shape[0]/len(df)*100, 3)))

for i in range(cv_folds):
    print('CV train {}'.format(i))
    print_count_of_each_label(cv_train[i])
    print('CV test {}'.format(i))
    print_count_of_each_label(cv_test[i])

CV train 0
        toxic: 7366 / 76680 (9.606%)
 severe_toxic: 792 / 76680 (1.033%)
      obscene: 4108 / 76680 (5.357%)
       threat: 234 / 76680 (0.305%)
       insult: 3826 / 76680 (4.99%)
identity_hate: 654 / 76680 (0.853%)
CV test 0
        toxic: 1871 / 19171 (9.76%)
 severe_toxic: 173 / 19171 (0.902%)
      obscene: 1001 / 19171 (5.221%)
       threat: 71 / 19171 (0.37%)
       insult: 939 / 19171 (4.898%)
identity_hate: 160 / 19171 (0.835%)
CV train 1
        toxic: 7413 / 76681 (9.667%)
 severe_toxic: 772 / 76681 (1.007%)
      obscene: 4136 / 76681 (5.394%)
       threat: 244 / 76681 (0.318%)
       insult: 3841 / 76681 (5.009%)
identity_hate: 657 / 76681 (0.857%)
CV test 1
        toxic: 1824 / 19170 (9.515%)
 severe_toxic: 193 / 19170 (1.007%)
      obscene: 973 / 19170 (5.076%)
       threat: 61 / 19170 (0.318%)
       insult: 924 / 19170 (4.82%)
identity_hate: 157 / 19170 (0.819%)
CV train 2
        toxic: 7397 / 76681 (9.646%)
 severe_toxic: 764 / 76681 (0.996%)
      o

# Split train/test into X and y

So that we can use sklearn classifiers easily

In [9]:
X_train = [None] * cv_folds
y_train = [None] * cv_folds
X_test = [None] * cv_folds
y_test = [None] * cv_folds
for i in range(cv_folds):
    X_train[i], y_train[i] = train[i][["comment_text"]], train[i][labels]
    X_test[i], y_test[i] = test[i][["comment_text"]], test[i][labels]

In [10]:
print(X_train[0].shape, y_train[0].shape)
print(X_test[0].shape, y_test[0].shape)

(76680, 1) (76680, 6)
(19171, 1) (19171, 6)


# Extract features

## Extract textstat features

In [11]:
def extract_textstat_features(df):
    features_df = pd.DataFrame()
    features_df['comment_text_len'] = df['comment_text'].apply(len)
    features_df['comment_text_lex_count'] = df['comment_text'].apply(textstat.lexicon_count)
    features_df['comment_text_syl_count'] = df['comment_text'].apply(textstat.syllable_count)
    features_df['comment_text_sent_count'] = df['comment_text'].apply(textstat.sentence_count)
    features_df['comment_text_flesch_reading_ease'] = df['comment_text'].apply(textstat.flesch_reading_ease)
    features_df['comment_text_flesch_kincaid_grade'] = df['comment_text'].apply(textstat.flesch_kincaid_grade)
    
    features_df['comment_text_syl_over_lex'] = features_df['comment_text_syl_count'] / features_df['comment_text_lex_count']
    features_df['comment_text_lex_over_sent'] = features_df['comment_text_lex_count'] / features_df['comment_text_sent_count']
    
    return features_df

In [46]:
train_textstat_features_file = '../data/train_textstat_features.csv'
if os.path.isfile(train_textstat_features_file):
    X_train_all_features_textstat = pd.read_csv(train_textstat_features_file, index_col=0)
else:
    X_train_all_features_textstat = extract_textstat_features(train_all)
    X_train_all_features_textstat.to_csv(train_textstat_features_file)

## Split textstat features

In [47]:
X_train_features_textstat = []
X_test_features_textstat = []
for i in range(cv_folds):
    X_train_features_textstat.append(X_train_all_features_textstat.loc[cv_splits_train[i], :])
    X_test_features_textstat.append(X_train_all_features_textstat.loc[cv_splits_test[i], :])

## Extract BoW

In [14]:
re_tok = re.compile('([{}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(string.punctuation))
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [15]:
n = train_all.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )
train_term_doc = vec.fit_transform(train_all['comment_text'])

In [16]:
X_train_features_bow = []
X_test_features_bow = []
for i in range(cv_folds):
    X_train_features_bow.append(train_term_doc[cv_splits_train[i], :])
    X_test_features_bow.append(train_term_doc[cv_splits_test[i], :])

## Extract emotion scores

In [17]:
emotion_lexicon_file = "../data/features/NRC-AffectIntensity-Lexicon.txt"
emotion_lexicon = pd.read_csv(emotion_lexicon_file, sep = "\t")

In [18]:
emotion_term_score = {'anger': {}, 'fear': {}, 'joy': {}, 'sadness': {}}
for row in emotion_lexicon.itertuples():
    emotion_term_score[row.AffectDimension][row.term] = row.score

In [19]:
translator = str.maketrans('', '', string.punctuation)
emotions = ['anger', 'fear', 'joy', 'sadness']
def avg_emotion_score(comment_text):
    try:
        comment_cleaned = comment_text.translate(translator)
    except:
        comment_cleaned = ""
    comment_cleaned = comment_cleaned.lower()
    comment_cleaned_words = comment_cleaned.split(" ")
    
    emotion_scores = {'anger': 0, 'fear': 0, 'joy': 0, 'sadness': 0}
    for emotion in emotions:
        scores = [emotion_term_score[emotion].get(word) for word in comment_cleaned_words
                    if emotion_term_score[emotion].get(word) is not None]
        if len(scores) == 0:
            continue
        emotion_scores[emotion] = np.sum(scores) / len(comment_cleaned_words)
        
    return [emotion_scores[emotion] for emotion in emotions]
    
def extract_emotion_features(df):
    features_df = df['comment_text'].apply(avg_emotion_score)
    return pd.DataFrame(features_df.values.tolist(), columns=['comment_text_emotion_{}'.format(emotion) for emotion in emotions])

In [20]:
X_train_all_features_emotion = extract_emotion_features(train_all)

In [21]:
X_train_features_emotion = []
X_test_features_emotion = []
for i in range(cv_folds):
    X_train_features_emotion.append(X_train_all_features_emotion.loc[cv_splits_train[i], :])
    X_test_features_emotion.append(X_train_all_features_emotion.loc[cv_splits_test[i], :])

## Extract Word2Vec

In [22]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin', binary=True)

In [23]:
def featurize_w2v(comment_text):
    sentence = tokenize(comment_text)
    f = np.zeros(w2v_model.vector_size)
    count = 0 
    for w in sentence:
        try:
            vec = w2v_model[w]
            count += 1
        except KeyError:
            continue
        f += vec
    if count > 0:
        f /= count
    return f

In [24]:
X_train_all_features_w2v = train_all['comment_text'].apply(featurize_w2v)

In [25]:
X_train_all_features_w2v = np.array(X_train_all_features_w2v.tolist())

In [26]:
X_train_features_w2v = []
X_test_features_w2v = []
for i in range(cv_folds):
    X_train_features_w2v.append(X_train_all_features_w2v[cv_splits_train[i]])
    X_test_features_w2v.append(X_train_all_features_w2v[cv_splits_test[i]])

## Extract glove Word2Vec

In [27]:
w2v_glove_model = gensim.models.KeyedVectors.load_word2vec_format('../data/glove.6B.50d.gensim.txt')

In [28]:
def featurize_w2v_glove(comment_text):
    sentence = tokenize(comment_text)
    f = np.zeros(w2v_glove_model.vector_size)
    count = 0 
    for w in sentence:
        try:
            vec = w2v_glove_model[w]
            count += 1
        except KeyError:
            continue
        f += vec
    if count > 0:
        f /= count
    return f

In [29]:
X_train_all_features_w2v_glove = train_all['comment_text'].apply(featurize_w2v_glove)

In [30]:
X_train_all_features_w2v_glove = np.array(X_train_all_features_w2v_glove.tolist())

In [31]:
X_train_features_w2v_glove = []
X_test_features_w2v_glove = []
for i in range(cv_folds):
    X_train_features_w2v_glove.append(X_train_all_features_w2v_glove[cv_splits_train[i]])
    X_test_features_w2v_glove.append(X_train_all_features_w2v_glove[cv_splits_test[i]])

# Function to calculate mean column-wise log loss of y_pred vs y_actual

In [32]:
def calculate_score(y_actual, y_pred):
    col_log_loss = [log_loss(np.array(y_actual[label]),
                             np.array([1.-np.array(y_pred[label]), np.array(y_pred[label])]).T) for label in labels]
    return col_log_loss

# Test classification scores

## Perfect score

In [33]:
np.mean(calculate_score(y_test[0], y_test[0]))

9.9920072216264108e-16

## ZeroR

In [34]:
scores_zeror = []
for i in range(cv_folds):
    data = np.array([np.zeros(len(labels))] * len(X_test[i]))
    y_pred_zeror = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_zeror)
    scores_zeror.append(scores)
print(np.mean(scores_zeror, axis=0))
print(np.mean(scores_zeror))

[ 3.32844347  0.34772673  1.84096823  0.10990296  1.71701177  0.29331534]
1.27289475201


## All 0.5

In [35]:
scores_half = []
for i in range(cv_folds):
    data = np.array([np.ones(len(labels))*0.5] * len(X_test[i]))
    y_pred_half = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_half)
    scores_half.append(scores)
print(np.mean(scores_half, axis=0))
print(np.mean(scores_half))

[ 0.69314718  0.69314718  0.69314718  0.69314718  0.69314718  0.69314718]
0.69314718056


## Test calculate_score

In [36]:
scores_half = []
for i in range(cv_folds):
    data = np.array([np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1.0])] * len(X_test[i]))
    y_pred_half = pd.DataFrame(data, columns=labels)
    scores = calculate_score(y_test[i], y_pred_half)
    scores_half.append(scores)
print(np.mean(scores_half, axis=0))
print(np.mean(scores_half))

[  0.69314718   0.91220862   1.15881057   1.6050267    2.19335539
  34.24546105]
6.80133491769


In [37]:
def best_classifier_per_label(classifiers, y_pred, scores):
    classifier_scores = np.array([np.mean(scores[classifier_name], axis=0) for classifier_name, _ in classifiers])
    best_classifier_score_per_label = [(classifiers[min_idx][0], classifier_scores[min_idx][i])
                                           for i, min_idx in enumerate(np.argmin(classifier_scores, axis=0))]
    for i, label in enumerate(labels):
        print("[{}] {} : {}".format(label, best_classifier_score_per_label[i][0], best_classifier_score_per_label[i][1]))
    print("Average: {}".format(np.mean([x[1] for x in best_classifier_score_per_label])))
    

## Run classifiers

In [38]:
def run_classifiers(classifiers, X_train_features, X_test_features, y_train, clf, y_pred, scores):
    for classifier_name, classifier in classifiers:
        print('Training with {}'.format(classifier_name))
        clf[classifier_name] = [{}]*cv_folds
        y_pred[classifier_name] = [{}]*cv_folds
        scores[classifier_name] = [{}]*cv_folds
        for fold in range(cv_folds):
            y_pred[classifier_name][fold] = pd.DataFrame()
            for label in labels:
                clf[classifier_name][fold][label] = classifier()
                clf[classifier_name][fold][label].fit(X_train_features[fold], y_train[fold][label])

                y_pred[classifier_name][fold][label] = clf[classifier_name][fold][label].predict_proba(X_test_features[fold]).T[1]

            scores[classifier_name][fold] = calculate_score(y_test[fold], y_pred[classifier_name][fold])
        print('Column-wise log loss for {}: {} - {}'.format(classifier_name, np.mean(scores[classifier_name], axis=0), np.mean(scores[classifier_name])))
    best_classifier_per_label(classifiers, y_pred, scores)

## Textstat features only

In [48]:
classifiers_textstat = []
# classifiers_textstat += [
#     ('Logistic Regression {}'.format(c), lambda: LogisticRegression(solver="newton-cg", C=2.0**c, max_iter=1000)) for c in np.arange(-3,4,1)]
classifiers_textstat += [
    
    ('XGBoost', lambda: xgb.XGBClassifier()),
    
#     ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    
#     ('NB-SVM', lambda: NbSvmClassifier()),
    
#     ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
#     ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
#     ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
#     ("Gradient Boosting", lambda: GradientBoostingClassifier()),
    
#     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
#     ("Naive Bayes", lambda: GaussianNB()),
#     ("Neural Net", lambda: MLPClassifier(alpha=1)),
#     ("AdaBoost", lambda: AdaBoostClassifier()),
#     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
    
    # Cannot even run
    #     ("Gaussian Process", lambda: GaussianProcessClassifier(1.0 * RBF(1.0)))  # Memory error??? Even with 32GB ram???
    #     ("Linear SVM", lambda: SVC(kernel="linear", C=0.025)),                   # Slow like shit
    #     ("RBF SVM", lambda: SVC(gamma=2, C=1))                                   # Slow like shit
]

In [49]:
clf_textstat = {}
y_pred_textstat = {}
scores_textstat = {}
run_classifiers(classifiers_textstat, X_train_features_textstat, X_test_features_textstat, y_train,
                clf_textstat, y_pred_textstat, scores_textstat)

Training with XGBoost
Column-wise log loss for XGBoost: [ 0.28518643  0.04876666  0.18595811  0.01931855  0.17828931  0.04625131] - 0.12729506156026116
[toxic] XGBoost : 0.2851864250209148
[severe_toxic] XGBoost : 0.04876666009355714
[obscene] XGBoost : 0.18595811303769
[threat] XGBoost : 0.01931855298711939
[insult] XGBoost : 0.17828930868592613
[identity_hate] XGBoost : 0.046251309536359586
Average: 0.12729506156026119


# Emotion only

In [50]:
classifiers_emotion = []
classifiers_emotion += [
    
    ('XGBoost', lambda: xgb.XGBClassifier()),
    
#     ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    
#     ('NB-SVM', lambda: NbSvmClassifier()),
    
#     ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
#     ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
#     ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
#     ("Gradient Boosting", lambda: GradientBoostingClassifier()),
    
#     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
#     ("Naive Bayes", lambda: GaussianNB()),
#     ("Neural Net", lambda: MLPClassifier(alpha=1)),
#     ("AdaBoost", lambda: AdaBoostClassifier()),
#     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
    
]

In [51]:
clf_emotion = {}
y_pred_emotion = {}
scores_emotion = {}
run_classifiers(classifiers_emotion, X_train_features_emotion, X_test_features_emotion, y_train,
                clf_emotion, y_pred_emotion, scores_emotion)

Training with XGBoost
Column-wise log loss for XGBoost: [ 0.27819804  0.04838562  0.18275198  0.01566718  0.17313829  0.04611881] - 0.1240433215547912
[toxic] XGBoost : 0.2781980387027064
[severe_toxic] XGBoost : 0.04838562343177353
[obscene] XGBoost : 0.1827519805642647
[threat] XGBoost : 0.015667178484192957
[insult] XGBoost : 0.17313829326075458
[identity_hate] XGBoost : 0.04611881488505517
Average: 0.12404332155479124


## Textstat + emotion

In [52]:
classifiers_textstat_emotion = []
classifiers_textstat_emotion += [
    
    ('XGBoost', lambda: xgb.XGBClassifier()),
    
#     ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
    
#     ('NB-SVM', lambda: NbSvmClassifier()),
    
#     ("Decision Tree", lambda: DecisionTreeClassifier(max_depth=5)),
#     ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
#     ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
#     ("Gradient Boosting", lambda: GradientBoostingClassifier()),
    
#     ("Nearest Neighbors", lambda: KNeighborsClassifier(131)),
#     ("Naive Bayes", lambda: GaussianNB()),
#     ("Neural Net", lambda: MLPClassifier(alpha=1)),
#     ("AdaBoost", lambda: AdaBoostClassifier()),
#     ("QDA", lambda: QuadraticDiscriminantAnalysis()),
    
]

In [53]:
# concat features
X_train_features_textstat_emotion = [None]*cv_folds
X_test_features_textstat_emotion = [None]*cv_folds
for fold in range(cv_folds):
    X_train_features_textstat_emotion[fold] = np.hstack((X_train_features_textstat[fold], X_train_features_emotion[fold].values))
    X_test_features_textstat_emotion[fold] = np.hstack((X_test_features_textstat[fold], X_test_features_emotion[fold].values))

print(X_train_features_textstat[0].shape)
print(X_train_features_emotion[0].shape)
print(X_train_features_textstat_emotion[0].shape)

(76680, 8)
(76680, 4)
(76680, 12)


In [54]:
clf_textstat_emotion = {}
y_pred_textstat_emotion = {}
scores_textstat_emotion = {}
run_classifiers(classifiers_textstat_emotion, X_train_features_textstat_emotion,
                X_test_features_textstat_emotion, y_train,
                clf_textstat_emotion, y_pred_textstat_emotion, scores_textstat_emotion)

Training with XGBoost
Column-wise log loss for XGBoost: [ 0.25610325  0.04379224  0.16820614  0.01415375  0.16093455  0.04423599] - 0.11457098700482465
[toxic] XGBoost : 0.2561032508787663
[severe_toxic] XGBoost : 0.04379223912821207
[obscene] XGBoost : 0.16820614460524047
[threat] XGBoost : 0.014153745836174634
[insult] XGBoost : 0.1609345477500837
[identity_hate] XGBoost : 0.04423599383047068
Average: 0.11457098700482465


## Bag of words only

In [55]:
classifiers_bow = [
#     ("Multinominal Naive Bayes", lambda: MultinomialNB()),
#     ('XGBoost', lambda: xgb.XGBClassifier()),
    ('NB-SVM', lambda: NbSvmClassifier()),
#     ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000)),
]

In [56]:
clf_bow = {}
y_pred_bow = {}
scores_bow = {}
run_classifiers(classifiers_bow, X_train_features_bow,
                X_test_features_bow, y_train,
                clf_bow, y_pred_bow, scores_bow)

Training with NB-SVM
Column-wise log loss for NB-SVM: [ 0.10949647  0.02983262  0.06156993  0.01155572  0.07772115  0.02805985] - 0.053039290859545
[toxic] NB-SVM : 0.10949647420069089
[severe_toxic] NB-SVM : 0.029832618984479965
[obscene] NB-SVM : 0.06156992798872225
[threat] NB-SVM : 0.011555724499387436
[insult] NB-SVM : 0.07772115379411594
[identity_hate] NB-SVM : 0.028059845689873485
Average: 0.053039290859545


## Word2Vec only

In [57]:
classifiers_w2v = [
#     ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
#     ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
    ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000))
]

In [58]:
clf_w2v = {}
y_pred_w2v = {}
scores_w2v = {}

run_classifiers(classifiers_w2v, X_train_features_w2v,
                X_test_features_w2v, y_train,
                clf_w2v, y_pred_w2v, scores_w2v)

Training with Logistic Regression
Column-wise log loss for Logistic Regression: [ 0.1488576   0.03158888  0.09918152  0.01325023  0.10593741  0.03298833] - 0.07196732888336996
[toxic] Logistic Regression : 0.1488575982133069
[severe_toxic] Logistic Regression : 0.031588880319762515
[obscene] Logistic Regression : 0.09918152319866243
[threat] Logistic Regression : 0.013250234539208492
[insult] Logistic Regression : 0.10593741147747289
[identity_hate] Logistic Regression : 0.03298832555180655
Average: 0.07196732888336996


## Word2Vec glove only

In [59]:
classifiers_w2v_glove = [
#     ("Random Forest", lambda: RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
#     ("Extra Trees", lambda: ExtraTreesClassifier(max_depth=5)),
    ('Logistic Regression', lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000))
]

In [60]:
clf_w2v_glove = {}
y_pred_w2v_glove = {}
scores_w2v_glove = {}

run_classifiers(classifiers_w2v_glove, X_train_features_w2v_glove,
                X_test_features_w2v_glove, y_train,
                clf_w2v_glove, y_pred_w2v_glove, scores_w2v_glove)

Training with Logistic Regression
Column-wise log loss for Logistic Regression: [ 0.21252174  0.04004988  0.14054401  0.01664609  0.13617216  0.03871943] - 0.0974422186662644
[toxic] Logistic Regression : 0.2125217366696403
[severe_toxic] Logistic Regression : 0.040049878264603364
[obscene] Logistic Regression : 0.14054401396746347
[threat] Logistic Regression : 0.01664608637476516
[insult] Logistic Regression : 0.13617216245193126
[identity_hate] Logistic Regression : 0.03871943426918303
Average: 0.09744221866626442


# Average results from multiple features and classifiers

In [61]:
def average_predictions(y_preds, classifier_names, weights):
    normalized_weights = np.array(weights) / np.sum(weights)
    y_pred_avg = [None] * cv_folds
    
    for i, fold in enumerate(range(cv_folds)):
        for i, y_pred in enumerate(y_preds):
            if i==0:
                y_pred_avg[fold] = y_pred[classifier_names[i]][fold]*weights[i]
            else:
                y_pred_avg[fold] += y_pred[classifier_names[i]][fold]*weights[i]
    return calculate_score(y_test[fold], y_pred_avg[fold])

## Textstat+emotion + BoW

In [62]:
textstat_emotion_AVG_bow_score = average_predictions([y_pred_textstat_emotion, y_pred_bow],
                                                     ["XGBoost", "NB-SVM"],
                                                     [0.5, 0.5])
print(textstat_emotion_AVG_bow_score)
print(np.mean(textstat_emotion_AVG_bow_score))


# 0.11366952025039936
# 0.053176833214058444

[0.15373256593047591, 0.031310522596735772, 0.092055481177447052, 0.012053420399072751, 0.10086175858086022, 0.032783796952256014]
0.0704662576061


## Textstat+emotion + Word2Vec

In [63]:
textstat_emotion_AVG_w2v_score = average_predictions([y_pred_textstat_emotion, y_pred_w2v],
                                                     ["XGBoost", "Logistic Regression"],
                                                     [0.5, 0.5])
print(textstat_emotion_AVG_w2v_score)
print(np.mean(textstat_emotion_AVG_w2v_score))


# 0.11366952025039936
# 0.07266043582520464

[0.17295761240014643, 0.03422214995791574, 0.11626576535968658, 0.013057280943429297, 0.11770963094349506, 0.036066543945743018]
0.0817131639251


## BoW + Word2Vec

In [64]:
bow_AVG_w2v_score = average_predictions([y_pred_bow, y_pred_w2v],
                                        ["NB-SVM", "Logistic Regression"],
                                        [0.9, 0.1])
print(bow_AVG_w2v_score)
print(np.mean(bow_AVG_w2v_score))

# 0.053176833214058444
# 0.07266043582520464

[0.1050669017736116, 0.027189610923196897, 0.061038653338048435, 0.011188138411788708, 0.077229579341500418, 0.028412604984272102]
0.0516875814621


## BoW + Word2Vec + textstat+emotion

In [65]:
bow_AVG_w2v_AVG_textstat_emotion_score = average_predictions([y_pred_bow, y_pred_w2v, y_pred_textstat_emotion],
                                        ["NB-SVM", "Logistic Regression", "XGBoost"],
                                        [0.9, 0.1, 0.001])
print(bow_AVG_w2v_AVG_textstat_emotion_score)
print(np.mean(bow_AVG_w2v_AVG_textstat_emotion_score))

# 0.053176833214058444
# 0.07266043582520464
# 0.11366952025039936

[0.10508637038300961, 0.027188077859946355, 0.062308883119674356, 0.011184702132559446, 0.077312308974954058, 0.028414052128422844]
0.0519157324331


# RUN ON TEST SET

In [90]:
def train_classifier(classifier, X_train_features, y_train):
    clfs = {}
    for label in labels:
        clf = classifier()
        clf.fit(X_train_features, y_train[label])
        clfs[label] = clf
    return clfs

In [91]:
def predict(clfs, X_test_features):
    y_pred_test = []
    for label in labels:
        y_pred_test.append(clfs[label].predict_proba(X_test_features).T[1])
    return pd.DataFrame(np.array(y_pred_test).T, columns=labels)

In [68]:
X_testa_features_bow = vec.transform(test_for_submission['comment_text'])

In [69]:
X_testa_features_w2v = test_for_submission['comment_text'].apply(featurize_w2v)

In [70]:
X_testa_features_w2v = np.array(X_testa_features_w2v.tolist())

In [71]:
X_testa_features_bow.shape

(226998, 280405)

In [72]:
X_testa_features_w2v.shape

(226998, 300)

In [92]:
clf_bow_all = train_classifier(lambda: NbSvmClassifier(), train_term_doc, train_all)

In [93]:
clf_w2v_all = train_classifier(lambda: LogisticRegression(solver="newton-cg", C=2.0, max_iter=1000), X_train_all_features_w2v, train_all)

In [94]:
y_pred_test_bow = predict(clf_bow_all, X_testa_features_bow)

In [95]:
y_pred_test_w2v = predict(clf_w2v_all, X_testa_features_w2v)

In [96]:
y_pred_test_bow["id"] = test_for_submission["id"]
y_pred_test_w2v["id"] = test_for_submission["id"]

In [97]:
y_pred_test_bow = y_pred_test_bow[["id"] + labels]
y_pred_test_w2v = y_pred_test_w2v[["id"] + labels]

In [98]:
y_pred_test_bow.to_csv("submission_bow_cleaned_all_train.csv", index=False, encoding="UTF-8")

In [99]:
y_pred_test_w2v.to_csv("submission_w2v_cleaned_all_train.csv", index=False, encoding="UTF-8")

In [100]:
y_pred_test_bow.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.016136,0.001635,0.006372,0.000466,0.006074,0.001744
1,6102620,0.008654,0.000813,0.006233,0.000393,0.004405,0.001313
2,14563293,0.003823,0.001041,0.004285,0.000454,0.003652,0.00102
3,21086297,0.042234,0.001421,0.008851,0.000488,0.011289,0.001449
4,22982444,0.020978,0.002039,0.007677,0.000539,0.00748,0.001574


In [102]:
y_pred_test_w2v.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.041901,0.008376,0.042874,0.000995,0.029339,0.011189
1,6102620,0.005981,0.001169,0.003204,0.000368,0.004394,0.002523
2,14563293,0.00609,0.000696,0.005111,0.000389,0.005651,0.000504
3,21086297,0.001286,0.000207,9.3e-05,5.6e-05,0.00124,0.000421
4,22982444,0.010488,0.006483,0.015254,0.000562,0.008759,0.003868


In [103]:
y_pred_test_bow_w2v = y_pred_test_bow.loc[:, labels] *0.9 + y_pred_test_w2v.loc[:, labels] *0.1


y_pred_test_bow_w2v["id"] = test_for_submission["id"]

y_pred_test_bow_w2v = y_pred_test_bow_w2v[["id"] + labels]

y_pred_test_bow_w2v.to_csv("submission_bow_w2v_cleaned_all_train.csv", index=False, encoding="UTF-8")

In [104]:
y_pred_test_bow_w2v.shape

(226998, 7)