In [1]:
import os,gc
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.cross_validation import cross_val_score

from mlens.metrics import make_scorer
from mlens.model_selection import Evaluator

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from copy import deepcopy

[MLENS] backend: threading


In [2]:
# Home directory of the current user (kept for any cell that still refers to it).
user_folder = os.path.expanduser("~")
# Folder holding train.csv, sample_submission.csv and the per-model prediction CSVs.
# The previous os.path.join(home, 'E:/...') only produced the intended location on
# Windows (a drive-qualified component discards the home prefix); on other systems
# it silently pointed at "~/E:/git/...". Use the path directly and allow an
# override through the TOXIC_COMMENT_DATA_DIR environment variable.
data_folder = os.environ.get("TOXIC_COMMENT_DATA_DIR", "E:/git/database/Toxic_Comment")

In [3]:
# The six label columns that are predicted and scored independently.
class_names = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

train_csv = os.path.join(data_folder, "train.csv")
sample_csv = os.path.join(data_folder, "sample_submission.csv")

# Only the label columns of the training file are kept as ground truth.
train_target = pd.read_csv(train_csv)[class_names]
# The sample submission is the frame whose label columns get overwritten later.
submission = pd.read_csv(sample_csv)
gc.collect()

20

In [4]:
# Base models whose saved predictions are combined by the ensembles below.
models = ['lr', 'lstm', 'svm', 'rf', 'lgb']

# model name -> DataFrame of that model's predictions (train set / test set).
train_preds, test_preds = {}, {}

for model in models:
    train_file = os.path.join(data_folder, "%s_train_preds.csv" % model)
    test_file = os.path.join(data_folder, "%s_submission.csv" % model)
    train_preds[model] = pd.read_csv(train_file)
    test_preds[model] = pd.read_csv(test_file)

gc.collect()

0

In [6]:
# label -> DataFrame with one column per model holding that model's
# train-set prediction for the label.
train_class_data = {}
for label in class_names:
    # Start from a zero frame with one row per training target row, then
    # overwrite each model column with the stored predictions.
    per_model = pd.DataFrame(
        np.zeros((train_target.shape[0], len(models))),
        columns=models,
    )
    for model in models:
        per_model[model] = train_preds[model][label]
    train_class_data[label] = per_model
    del per_model

In [7]:
# label -> DataFrame with one column per model holding that model's
# test-set prediction for the label.
test_class_data = {}
for label in class_names:
    # Start from a zero frame with one row per submission row, then
    # overwrite each model column with the stored predictions.
    per_model = pd.DataFrame(
        np.zeros((submission.shape[0], len(models))),
        columns=models,
    )
    for model in models:
        per_model[model] = test_preds[model][label]
    test_class_data[label] = per_model
    del per_model

In [8]:
def scoring(y_true, y_score, models):
    """Compute the ROC AUC of every model's predictions against the truth.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, passed unchanged to ``roc_auc_score``.
    y_score : mapping or DataFrame
        Indexed by model name; ``y_score[name]`` holds that model's scores.
    models : sequence of str
        Model names to evaluate; determines the order of the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(models),)`` with one AUC per model.
    """
    per_model_auc = [roc_auc_score(y_true, y_score[name]) for name in models]
    return np.array(per_model_auc, dtype=float)

### ensembling

In [11]:
# base ensembling: select extreme
def select_score(array):
    """Collapse one row of model scores into a single "extreme" score.

    The scores vote with a 0.5 threshold: if more than half of them are
    below 0.5 the minimum is returned, if fewer than half are below 0.5
    the maximum is returned, and on an exact tie the mean is returned.

    Parameters
    ----------
    array : numpy.ndarray or pandas.Series
        Scores of the individual models for one sample. Must support
        element-wise comparison and ``.mean()``.

    Returns
    -------
    scalar
        The selected minimum, maximum or mean of ``array``.
    """
    lowest, highest = min(array), max(array)
    half = 0.5 * len(array)
    n_below = (array < 0.5).sum()

    if n_below > half:
        return lowest
    if n_below < half:
        return highest
    # Exactly half of the scores fall on each side of the threshold.
    return array.mean()

# Replace every label column of the submission with the row-wise
# select_score over the per-model test predictions, then save it.
for label in class_names:
    submission[label] = test_class_data[label].apply(select_score, axis=1)

select_csv = os.path.join(data_folder, "ensemble_select2_submission.csv")
submission.to_csv(select_csv, index=False)

In [10]:
# base ensembling: average
# Replace every label column of the submission with the mean of the
# per-model test predictions. DataFrame.mean(axis=1) is the vectorised
# equivalent of the previous row-wise apply(np.mean, axis=1) and avoids
# one Python-level call per row.
for label in class_names:
    submission[label] = test_class_data[label].mean(axis=1)

submission.to_csv(os.path.join(data_folder, "ensemble_average_submission.csv"), index=False)

In [40]:
# Train-set counterparts of the two ensembles, used for scoring below.
# Both start as independent copies of the lgb train-prediction frame so that
# its other columns carry over; only the label columns are overwritten.
# DataFrame.copy() (deep by default) replaces copy.deepcopy, and the row mean
# uses the vectorised DataFrame.mean(axis=1) instead of apply(np.mean, axis=1).
train_submission_select = train_preds['lgb'].copy()
train_submission_ave = train_preds['lgb'].copy()

for label in class_names:
    X_train = train_class_data[label]
    train_submission_select[label] = X_train.apply(select_score, axis=1)
    train_submission_ave[label] = X_train.mean(axis=1)

In [41]:
# Compare ROC AUC on the 'toxic' label for the individual models and for
# the two ensembles.
toxic_truth = train_target['toxic']

# Despite its name, `lr` holds one AUC per entry of `models`, in that order.
lr = scoring(toxic_truth, train_class_data['toxic'], models)
select = roc_auc_score(toxic_truth, train_submission_select['toxic'])
ave = roc_auc_score(toxic_truth, train_submission_ave['toxic'])
[lr, select, ave]

[array([0.98599063, 0.98852906, 0.9666938 , 0.99991472, 0.99132291]),
 0.9954486343493429,
 0.9973911182788009]

### ensembler
ensemble_logistic_submission: LogisticRegression(), ['lr', 'lgb', 'lstm', 'rf', 'svm']
<br>
ensemble_logistic_submission: LogisticRegression(), ['lr', 'lgb', 'lstm', 'svm']
<br>
ensemble_select_submission: select_score(), ['lr', 'lgb', 'lstm', 'rf', 'svm']
<br>
ensemble_select2_submission: select_score(), ['lr', 'lgb', 'lstm', 'svm']
<br>



In [29]:
# ROC AUC on the 'toxic' label: the lr model alone vs. an ensemble frame.
# NOTE(review): `train_submission` is not assigned in any visible cell, and this
# cell's execution count (29) predates the cells that build
# train_submission_select / train_submission_ave. It presumably relied on
# leftover kernel state; confirm which ensemble was meant before re-running.
toxic_truth = train_target['toxic']
lr = roc_auc_score(toxic_truth, train_preds['lr']['toxic'])
select = roc_auc_score(toxic_truth, train_submission['toxic'])
[lr, select]

[0.9859906303243691, 0.9893216925063395]