In [1]:
import numpy as np
import pandas as pd
import gc

In [2]:
y_pred_1  = pd.read_csv('tfidf-logistic.csv')            # 0.9790
y_pred_2  = pd.read_csv('tfidf-logistic+features.csv')   # 0.9745
y_pred_3  = pd.read_csv('simple_dnn.csv')                # 0.9748
y_pred_4  = pd.read_csv('simple_dnn+features.csv')       # 0.9751
y_pred_5  = pd.read_csv('submission.csv')

In [3]:
# The value of an ensemble is (a) the individual scores of the models and
# (b) their correlation with one another. We want to multiple individually high
# scoring models that all have low correlations. Based on this analysis, it
# looks like these kernels have relatively low correlations and will blend to a
# much higher score.
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
for label in labels:
    print(label)
    print(np.corrcoef([y_pred_1[label].rank(pct=True), y_pred_2[label].rank(pct=True), y_pred_3[label].rank(pct=True), 
                       y_pred_4[label].rank(pct=True), y_pred_5[label].rank(pct=True)]))


toxic
[[1.         0.99731126 0.84100189 0.8411098  0.87030988]
 [0.99731126 1.         0.84166243 0.84173662 0.86936754]
 [0.84100189 0.84166243 1.         0.84684528 0.83835945]
 [0.8411098  0.84173662 0.84684528 1.         0.83757562]
 [0.87030988 0.86936754 0.83835945 0.83757562 1.        ]]
severe_toxic
[[1.         0.99402814 0.66239964 0.6655294  0.74421882]
 [0.99402814 1.         0.66423415 0.66566336 0.73497474]
 [0.66239964 0.66423415 1.         0.8053681  0.81003827]
 [0.6655294  0.66566336 0.8053681  1.         0.81317306]
 [0.74421882 0.73497474 0.81003827 0.81317306 1.        ]]
obscene
[[1.         0.99805006 0.73017522 0.75210469 0.79226175]
 [0.99805006 1.         0.73352335 0.75332947 0.79129096]
 [0.73017522 0.73352335 1.         0.78531108 0.77330405]
 [0.75210469 0.75332947 0.78531108 1.         0.79473481]
 [0.79226175 0.79129096 0.77330405 0.79473481 1.        ]]
threat
[[1.         0.99504724 0.58096228 0.56521832 0.64074011]
 [0.99504724 1.         0.57906527 

In [4]:
from scipy.stats import rankdata

predict_list = [y_pred_1.values, y_pred_2.values, y_pred_3.values, y_pred_4.values, y_pred_5.values]

print("Rank averaging on", len(predict_list), "files")
predictions = np.zeros_like(predict_list[0])
for predict in predict_list:
    for i in range(6):
        predictions[:, i] = np.add(predictions[:, i], rankdata(predict[:, i])/predictions.shape[0])  
predictions /= len(predict_list)

Rank averaging on 5 files


In [5]:
y_pred_1.shape

(153164, 7)

In [6]:
predictions.shape

(153164, 7)

In [7]:
submission = y_pred_1.copy()
submission.loc[:, 1:] = predictions[:, 1:]
submission.to_csv('rank_averaged_submission.csv', index=False)

In [8]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.977348,0.980719,0.973166,0.989238,0.981793,0.0
1,0000247867823ef7,0.296345,0.429938,0.271074,0.202968,0.371624,0.0
2,00013b17ad220c46,0.562737,0.601538,0.608515,0.511349,0.529675,0.0
3,00017563c3f7919a,0.153331,0.403541,0.174817,0.414881,0.290114,0.0
4,00017695ad8997eb,0.484358,0.555281,0.387569,0.708575,0.411731,0.0


In [7]:
# submission = pd.DataFrame()
# submission['id'] = y_pred_1['id']
# for label in labels:
#     submission[label] = y_pred_1[label].rank(pct=True) * 0.3 + y_pred_4[label].rank(pct=True) * 0.15 + \
#                         y_pred_3[label].rank(pct=True) * 0.15 + y_pred_5[label].rank(pct=True) * 0.4

# submission.to_csv('mean.csv', index=False)

In [8]:
# merge = y_pred_1.columns.tolist()
# merge.remove('id')

# y_pred = y_pred_1.copy()
# for i in merge:
#     y_pred[i] =  (2*y_pred_1[i] + 1*y_pred_2[i] + 1*y_pred_3[i] +4*y_pred_4[i] + 4*y_pred_5[i])/12.0  