In [1]:
%%capture
# Switch the kernel's working directory to ../src; the cell magic above captures
# the cell's output so nothing is displayed.
# NOTE(review): presumably this is what lets the later `from config import ...`
# and `from ensamble import ...` resolve, and why later paths are '../'-relative
# — confirm. The relative `%cd` is not idempotent: re-running this cell moves
# one directory further.
%cd ../src

In [2]:
from config import Config
# Build the project configuration object with its defaults, then override the
# model directory with a path relative to the current working directory.
config = Config()
config.model_dir = '../models'

In [3]:
from ensamble import OptimizeThresh
# Search a decision threshold for every model of the 'baseline' run, once per
# feature kind ('pca' and 'scaled'). The run prints one line per predictions
# file with its accuracy and the chosen threshold.
opt = OptimizeThresh(cfg = config, 
                     source_dir = 'baseline',
                     kinds = ['pca', 'scaled'])
# Result is a nested dict {kind: {model_name: threshold}} (see the display
# of `dict_model_threshold` in the next cell).
dict_model_threshold = opt.run()

pca        all_folds_decision_tree.csv              with acc: 0.9076 and optim_threshold 0.585
pca        all_folds_logistic_regression.csv        with acc: 0.9142 and optim_threshold 0.118
pca        all_folds_svm.csv                        with acc: 0.8988 and optim_threshold 0.804
pca        all_folds_gradient_boosting.csv          with acc: 0.9213 and optim_threshold 0.418
pca        all_folds_random_forest.csv              with acc: 0.9085 and optim_threshold 0.292
pca        all_folds_xgboost.csv                    with acc: 0.926 and optim_threshold 0.59

scaled     all_folds_decision_tree.csv              with acc: 0.9366 and optim_threshold 0.565
scaled     all_folds_logistic_regression.csv        with acc: 0.9284 and optim_threshold 0.381
scaled     all_folds_svm.csv                        with acc: 0.8986 and optim_threshold 0.217
scaled     all_folds_gradient_boosting.csv          with acc: 0.9432 and optim_threshold 0.485
scaled     all_folds_random_forest.csv             

In [4]:
# Inspect the thresholds returned by opt.run(): {kind: {model_name: threshold}}.
dict_model_threshold

{'pca': {'decision_tree': 0.585,
  'logistic_regression': 0.11800000000000001,
  'svm': 0.804,
  'gradient_boosting': 0.418,
  'random_forest': 0.292,
  'xgboost': 0.59},
 'scaled': {'decision_tree': 0.5650000000000001,
  'logistic_regression': 0.381,
  'svm': 0.217,
  'gradient_boosting': 0.485,
  'random_forest': 0.659,
  'xgboost': 0.523}}

In [5]:
from ensamble import Ensambler
# Fit blending weights for the 'baseline' run. The optimizer prints, per fold,
# the fitted coefficients, the chosen threshold and the accuracy / AUC.
e = Ensambler(cfg = config, source_dir = 'baseline')
e.load_predictions()
# dict_coefs maps each '<kind>_<model>_optim_pred' column name to its weight;
# mean_thresh is a single float cut-off for the weighted vote.
# NOTE(review): presumably both are averaged over the folds — confirm in ensamble.py.
dict_coefs, mean_thresh = e.run_optimizer()

order of models:
['scaled_decision_tree_optim_pred', 'scaled_logistic_regression_optim_pred', 'scaled_gradient_boosting_optim_pred', 'scaled_random_forest_optim_pred', 'scaled_xgboost_optim_pred']
Optimization terminated successfully.
         Current function value: -0.943338
         Iterations: 10
         Function evaluations: 64
optim_th: 0.4
Accuracy score for fold 0: 0.9435 with AUC: 0.8222912933282255
coefs:        [0.25322986 0.02927221 0.26777721 0.32007415 0.14239786]
threshold:    0.4
Optimization terminated successfully.
         Current function value: -0.943410
         Iterations: 14
         Function evaluations: 64
optim_th: 0.38
Accuracy score for fold 1: 0.9420714285714286 with AUC: 0.8064231493865949
coefs:        [0.22738802 0.13890979 0.14438766 0.25816767 0.23802247]
threshold:    0.38
Optimization terminated successfully.
         Current function value: -0.943535
         Iterations: 9
         Function evaluations: 62
optim_th: 0.37
Accuracy score for fold 2:

In [6]:
# Inspect the blending weight assigned to each model's thresholded prediction column.
dict_coefs

{'scaled_decision_tree_optim_pred': 0.22141197433984344,
 'scaled_logistic_regression_optim_pred': 0.16685569629122515,
 'scaled_gradient_boosting_optim_pred': 0.15359142599980663,
 'scaled_random_forest_optim_pred': 0.31671772772544904,
 'scaled_xgboost_optim_pred': 0.14601027039949538}

In [7]:
# Inspect the cut-off later applied to the weighted sum of model votes.
mean_thresh

0.41

Combine for submission

In [20]:
import os
import numpy as np
import pandas as pd
import warnings
# NOTE(review): with no category or module argument this suppresses every
# warning for the rest of the session, including pandas deprecation and
# SettingWithCopy warnings raised by the cells below — consider narrowing it.
warnings.filterwarnings('ignore')

In [32]:
# Blend the final per-model predictions of the 'scaled' feature set into one
# binary prediction per row:
#   1. binarise each model's probability with its own optimised threshold,
#   2. take the weighted sum of those votes using the ensemble coefficients,
#   3. binarise the weighted sum with the ensemble threshold.
# Relies on `dict_model_threshold`, `dict_coefs` and `mean_thresh` from earlier cells.
source_dir = '../models/predictions/baseline/scaled_final'
ignored_models = ['svm']

# Keep only CSV prediction files (os.listdir also returns folders and stray
# files) and sort them, because os.listdir returns entries in arbitrary order.
files = sorted(
    f for f in os.listdir(source_dir)
    if f.endswith('.csv') and not any(ignore in f for ignore in ignored_models)
)

req_cols = ['id', 'kfold', 'churn_probability', 'prob']
key_cols = ['id', 'kfold', 'churn_probability']

# `None` (rather than an empty DataFrame tested with `.empty`) marks "nothing
# merged yet": an inner merge that legitimately produces zero rows must not be
# silently replaced by the next file's frame.
df = None
for f in files:
    alg = f.replace('final_', '').replace('.csv', '')
    d = pd.read_csv(f'{source_dir}/{f}')[req_cols]
    d[f'scaled_{alg}_optim_pred'] = np.where(d['prob'] > dict_model_threshold['scaled'][alg], 1, 0)
    d = d.drop(columns = 'prob')
    df = d if df is None else df.merge(d, on = key_cols)

if df is None:
    raise FileNotFoundError(f'no prediction CSV files found in {source_dir}')

# Fail early, with a readable message, if a model the ensemble was fitted on
# has no prediction file here.
ensemble_cols = list(dict_coefs)
missing_cols = [c for c in ensemble_cols if c not in df.columns]
if missing_cols:
    raise KeyError(f'prediction columns missing for ensemble members: {missing_cols}')

# Weighted vote: columns and weights are both taken in dict_coefs order.
weights = [dict_coefs[c] for c in ensemble_cols]
df['final_pred'] = np.sum(df[ensemble_cols].values * weights, axis = 1)
df['final_pred'] = np.where(df['final_pred'] > mean_thresh, 1, 0)

In [38]:
from sklearn.metrics import accuracy_score

# Partition the blended frame using the marker stored in the 'kfold' column.
is_train = df['kfold'].eq('train')
is_test = df['kfold'].eq('test')
train = df.loc[is_train]
test = df.loc[is_test]

# Accuracy of the thresholded ensemble prediction on the rows flagged 'train'.
accuracy_score(
    y_true = train['churn_probability'],
    y_pred = train['final_pred'],
)

0.9463849483564051

In [39]:
# Build the two-column submission frame (id + predicted label) for the rows
# flagged 'test'. It is derived from `df` in a single expression so the cell
# can be re-run: overwriting `test` with a column slice and renaming it in
# place raised a KeyError on 'final_pred' on the second execution.
test = (
    df.loc[df['kfold'] == 'test', ['id', 'final_pred']]
    .rename(columns = {'final_pred' : 'churn_probability'})
)
test.head()

Unnamed: 0,id,churn_probability
69999,69999,0
70000,70000,0
70001,70001,1
70002,70002,0
70003,70003,0


In [43]:
### Sample submission file
# Align the predictions with the ids of the sample submission and write the
# result to disk. The sample's own 'churn_probability' column is dropped so
# that only our predictions end up in the output.
sample = pd.read_csv('../data/raw/sample.csv')
sub = test.merge(
    sample.drop(columns = ['churn_probability']),
    on = 'id',
    validate = 'one_to_one',  # duplicate ids on either side would duplicate rows
)
# An inner merge silently drops ids missing on either side; a short submission
# must fail here rather than at upload time.
assert len(sub) == len(sample), (
    f'submission has {len(sub)} rows but the sample file has {len(sample)}'
)
os.makedirs('../submissions', exist_ok = True)
sub.to_csv('../submissions/baseline.csv', index = False)

In [44]:
# Read the written submission back from disk to check its columns and row count.
pd.read_csv('../submissions/baseline.csv')

Unnamed: 0,id,churn_probability
0,69999,0
1,70000,0
2,70001,1
3,70002,0
4,70003,0
...,...,...
29995,99994,0
29996,99995,0
29997,99996,0
29998,99997,0
