In [1]:
%%capture
# Switch the kernel's working directory to ../src; %%capture (which must stay the
# first line of the cell) hides the output that %cd would otherwise print.
# NOTE(review): presumably done so the project modules imported below resolve from src — confirm.
%cd ../src

In [2]:
from config import Config
# Instantiate the project configuration and override its model directory with a
# path relative to the current working directory.
config = Config()
config.model_dir = '../models'

In [3]:
from ensamble import OptimizeThresh
# Run the threshold optimiser over the 'tuned' predictions for the 'scaled' kind.
# As the printed log and the next cell show, run() returns a nested dict of the
# form {kind: {model_name: optimal_threshold}}.
opt = OptimizeThresh(cfg = config, 
                     source_dir = 'tuned',
                     kinds = ['scaled'])
dict_model_threshold = opt.run()

scaled     all_folds_decision_tree.csv              with acc: 0.9411 and optim_threshold 0.515
scaled     all_folds_logistic_regression.csv        with acc: 0.9284 and optim_threshold 0.381
scaled     all_folds_gradient_boosting.csv          with acc: 0.9432 and optim_threshold 0.482
scaled     all_folds_random_forest.csv              with acc: 0.9432 and optim_threshold 0.457
scaled     all_folds_xgboost.csv                    with acc: 0.9426 and optim_threshold 0.598



In [4]:
# Display the per-model thresholds returned by opt.run().
dict_model_threshold

{'scaled': {'decision_tree': 0.515,
  'logistic_regression': 0.381,
  'gradient_boosting': 0.482,
  'random_forest': 0.457,
  'xgboost': 0.598}}

In [5]:
from ensamble import Ensambler
# Build the ensembler over the 'tuned' predictions, load them, then run its optimiser.
# run_optimizer() returns two values, unpacked here as a dict of per-model
# coefficients (keyed by prediction column name) and a single threshold.
# NOTE(review): `mean_thresh` presumably averages the per-fold thresholds printed
# in the log below — confirm against the Ensambler implementation.
e = Ensambler(cfg = config, source_dir = 'tuned')
e.load_predictions()
dict_coefs, mean_thresh = e.run_optimizer()

order of models:
['scaled_decision_tree_optim_pred', 'scaled_logistic_regression_optim_pred', 'scaled_gradient_boosting_optim_pred', 'scaled_random_forest_optim_pred', 'scaled_xgboost_optim_pred']
Optimization terminated successfully.
         Current function value: -0.943445
         Iterations: 10
         Function evaluations: 64
optim_th: 0.53
Accuracy score for fold 0: 0.944 with AUC: 0.8219458259783844
coefs:        [0.13508471 0.10830701 0.28437283 0.14665575 0.32557969]
threshold:    0.53
Optimization terminated successfully.
         Current function value: -0.936981
         Iterations: 10
         Function evaluations: 69
optim_th: 0.23
Accuracy score for fold 1: 0.9369285714285714 with AUC: 0.821207618780113
coefs:        [0.18491963 0.57151507 0.06118526 0.15885052 0.02352951]
threshold:    0.23
Optimization terminated successfully.
         Current function value: -0.943642
         Iterations: 11
         Function evaluations: 65
optim_th: 0.49
Accuracy score for fold 2

In [6]:
# Display the ensemble coefficient assigned to each model's prediction column.
dict_coefs

{'scaled_decision_tree_optim_pred': 0.2903341107549342,
 'scaled_logistic_regression_optim_pred': 0.22431573605866673,
 'scaled_gradient_boosting_optim_pred': 0.1346512206041277,
 'scaled_random_forest_optim_pred': 0.1452921689053081,
 'scaled_xgboost_optim_pred': 0.20781338815307832}

In [7]:
# Display the ensemble threshold returned by run_optimizer(); it is applied to the
# weighted vote when the final prediction is built further down.
mean_thresh

0.374

Combine for submission

In [8]:
import os
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this ignores every warning category for the rest of the kernel
# session, including pandas warnings that could flag problems in the cells below.
# Consider a narrower filter (specific category/module) instead.
warnings.filterwarnings('ignore')

In [9]:
# Combine the final per-model predictions into one weighted-vote ensemble prediction.
# Relies on `dict_model_threshold`, `dict_coefs` and `mean_thresh` from the cells above.
source_dir = '../models/predictions/tuned/scaled_final'

# os.listdir() gives no ordering guarantee and also returns non-prediction entries
# (e.g. checkpoint folders), so keep only CSV files and sort them for reproducibility.
files = sorted(f for f in os.listdir(source_dir) if f.endswith('.csv'))
# Drop the models that are not part of the ensemble.
for ignore in ['svm']:
    files = [f for f in files if ignore not in f]
if not files:
    raise FileNotFoundError(f'no prediction CSV files found in {source_dir}')

req_cols = ['id', 'kfold', 'churn_probability', 'prob']
key_cols = ['id', 'kfold', 'churn_probability']
df = None
for f in files:
    # 'final_<model>.csv' -> '<model>', which is the key used in dict_model_threshold.
    alg = f.replace('final_', '').replace('.csv', '')
    if alg not in dict_model_threshold['scaled']:
        raise KeyError(f'no optimised threshold available for model {alg!r} (file {f!r})')

    d = pd.read_csv(os.path.join(source_dir, f))[req_cols]
    # Binarise the model's probability with that model's own optimised threshold.
    d[f'scaled_{alg}_optim_pred'] = np.where(d['prob'] > dict_model_threshold['scaled'][alg], 1, 0)
    d = d.drop(columns = 'prob')

    if df is None:
        df = d
        continue

    merged = df.merge(d, on = key_cols)
    # The default inner merge silently drops (or duplicates) rows when the key
    # columns differ between files; fail loudly instead of losing predictions.
    if len(merged) != len(df):
        raise ValueError(
            f'merging {f!r} changed the row count from {len(df)} to {len(merged)}; '
            'prediction files do not share the same id/kfold/churn_probability rows'
        )
    df = merged

# Weighted vote: each model's 0/1 prediction times its ensemble coefficient, summed per row.
model_cols = list(dict_coefs)
missing_cols = [c for c in model_cols if c not in df.columns]
if missing_cols:
    raise KeyError(f'no prediction file produced the ensemble columns: {missing_cols}')

weights = np.array([dict_coefs[c] for c in model_cols])
weighted_vote = (df[model_cols].values * weights).sum(axis = 1)
# The final label is 1 when the weighted vote exceeds the ensemble threshold.
df['final_pred'] = np.where(weighted_vote > mean_thresh, 1, 0)

In [10]:
from sklearn.metrics import accuracy_score
# Split the combined frame by its `kfold` label, then score the ensemble's final
# prediction against the known target on the rows labelled 'train'.
# NOTE(review): if the final models were fitted on these same rows, this is an
# in-sample score and will look better than the cross-validated one — confirm.
train = df.loc[df['kfold'].eq('train')]
test = df.loc[df['kfold'].eq('test')]
accuracy_score(train['churn_probability'], train['final_pred'])

0.9664995214217346

In [11]:
# Build the submission frame: one row per test id, with the ensemble's 0/1 label
# exposed under the `churn_probability` column name.
# Derived from `df` rather than from the previous `test`, and without in-place
# mutation, so the cell can be re-run safely: with the in-place rename a second run
# failed with a KeyError because `final_pred` had already been renamed away.
test = (
    df.loc[df['kfold'] == 'test', ['id', 'final_pred']]
      .rename(columns = {'final_pred' : 'churn_probability'})
)
test.head()

Unnamed: 0,id,churn_probability
69999,69999,0
70000,70000,0
70001,70001,1
70002,70002,0
70003,70003,0


In [12]:
### Sample submission file
# The sample file supplies the expected ids (plus any extra columns it carries); its
# own `churn_probability` column is dropped so the ensemble's column is written out.
sample = pd.read_csv('../data/raw/sample.csv')
sub = test.merge(sample.drop(columns = ['churn_probability']), on = 'id')

# The default inner merge silently drops ids that are missing on either side;
# refuse to write a submission that does not cover every sample row.
if len(sub) != len(sample):
    raise ValueError(
        f'submission has {len(sub)} rows but the sample file has {len(sample)}; '
        'some ids are missing a prediction or are duplicated'
    )

# Make sure the target directory exists so to_csv does not fail on a fresh checkout.
os.makedirs('../submissions', exist_ok = True)
sub.to_csv('../submissions/tuned.csv', index = False)

In [13]:
# Read the written submission back from disk to inspect what was actually saved.
pd.read_csv('../submissions/tuned.csv')

Unnamed: 0,id,churn_probability
0,69999,0
1,70000,0
2,70001,1
3,70002,0
4,70003,0
...,...,...
29995,99994,0
29996,99995,0
29997,99996,0
29998,99997,0
