In [41]:
import os
import pandas as pd
import numpy as np

user_folder = os.path.expanduser('~')
data_folder = os.path.join(user_folder, 'E:/git/database/Toxic_Comment/blends')
files = os.listdir(data_folder)
files

['lgb_submission.csv',
 'Logistic_regression_with_words_and_char_n-grams.csv',
 'Minimal_LSTM_NB-SVM_baseline_ensemble.csv',
 'one_more_blend.csv',
 'Pooled_GRU_Fasttext.csv',
 'toxic_avenger.csv',
 'who09829_gru.csv']

In [51]:
gru = pd.read_csv(os.path.join(data_folder, files[4])) # PL score 0.9829
lstm_nb_svm = pd.read_csv(os.path.join(data_folder, files[2])) # 0.9811
lr = pd.read_csv(os.path.join(data_folder, files[1])) # 0.9788
lgb = pd.read_csv(os.path.join(data_folder, files[0])) # 0.9785
blend_p = pd.read_csv(os.path.join(data_folder, files[3])) # 0.9850

# ave = pd.read_csv(os.path.join(data_folder, files[5])) # 0.9823


weights = pd.Series([0.9829, 0.9811, 0.9788, 0.9785, 0.9850])
weights = 1/weights

### scaling

In [52]:
# Bojan suggests scaling with min-max to make sure that all the submissions have
# orderings that can be compared. Since our metric is AUC, this is okay and may
# improve performance.

from sklearn.preprocessing import minmax_scale
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
for label in labels:
    print('Scaling {}... Please stand by.'.format(label))
    lgb[label] = minmax_scale(lgb[label])
    gru[label] = minmax_scale(gru[label])
    lr[label] = minmax_scale(lr[label])
    lstm_nb_svm[label] = minmax_scale(lstm_nb_svm[label])
    blend_p[label] = minmax_scale(blend_p[label])


Scaling toxic... Please stand by.
Scaling severe_toxic... Please stand by.
Scaling obscene... Please stand by.
Scaling threat... Please stand by.
Scaling insult... Please stand by.
Scaling identity_hate... Please stand by.


In [53]:
for label in labels:
    print(label)
    print(np.corrcoef([gru[label], lstm_nb_svm[label], lr[label], lgb[label], blend_p[label]]))

toxic
[[1.         0.94926809 0.90236963 0.90111066 0.97783595]
 [0.94926809 1.         0.95823705 0.94273615 0.98356096]
 [0.90236963 0.95823705 1.         0.95377901 0.95351634]
 [0.90111066 0.94273615 0.95377901 1.         0.94253547]
 [0.97783595 0.98356096 0.95351634 0.94253547 1.        ]]
severe_toxic
[[1.         0.85339415 0.82051887 0.78884913 0.94681823]
 [0.85339415 1.         0.88252021 0.8186413  0.9192528 ]
 [0.82051887 0.88252021 1.         0.86371851 0.89209998]
 [0.78884913 0.8186413  0.86371851 1.         0.86059469]
 [0.94681823 0.9192528  0.89209998 0.86059469 1.        ]]
obscene
[[1.         0.96100403 0.92729218 0.93439385 0.98447173]
 [0.96100403 1.         0.955631   0.94607069 0.98364744]
 [0.92729218 0.955631   1.         0.94986687 0.95644366]
 [0.93439385 0.94607069 0.94986687 1.         0.95585908]
 [0.98447173 0.98364744 0.95644366 0.95585908 1.        ]]
threat
[[1.         0.78330735 0.79996174 0.75896158 0.93353892]
 [0.78330735 1.         0.84338327 

In [45]:
import copy
submission = copy.deepcopy(gru)

In [46]:
datasets = [gru, lstm_nb_svm, lr, lgb, blend_p]
datasets_rmid = [df.drop('id', axis=1) for df in datasets]

In [47]:
result = [weights[i]*datasets_rmid[i] for i in range(len(datasets))]
result = sum(result)/sum(weights)

In [48]:
submission.iloc[:,1:] = result

In [49]:
submission.to_csv('myBlend_scale2_result.csv', index=False)

In [50]:
submission.head(1)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.992482,0.312776,0.978632,0.041128,0.946862,0.316
