In [1]:
# Standard python libraries
import os
import time
import logging
 
# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch
# Imports from our package
import lightautoml
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler
from lightautoml.automl.blend import WeightedBlender
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.report import ReportDeco

In [2]:
N_THREADS = 8 
N_FOLDS = 5 
RANDOM_STATE = 42 
TEST_SIZE = 0.2 
TARGET_NAME = 'Is_Lead'

In [14]:
p = Profiler()
p.change_deco_settings({'enabled': True})

In [49]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [50]:
data.drop(["ID"],inplace=True,axis=1)

In [51]:
data.head()

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [52]:
train_data, test_data = train_test_split(data, 
                                         test_size=TEST_SIZE, 
                                         stratify=data[TARGET_NAME],random_state=RANDOM_STATE)
#logging.info('Data splitted. Parts sizes: train_data = {}, test_data = {}'
#              .format(train_data.shape, test_data.shape))

In [53]:
test_data.head()

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
30702,Male,59,RG283,Self_Employed,X3,51,No,844547,Yes,0
217904,Female,27,RG264,Salaried,X1,14,Yes,884535,No,0
81680,Male,49,RG284,Salaried,X3,109,Yes,3390285,Yes,1
157008,Male,62,RG272,Other,X2,69,,1171774,No,1
83329,Male,32,RG262,Self_Employed,X3,32,No,1965556,Yes,0


In [54]:
task = Task('binary')
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

In [55]:
model0 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'seed': 42, 'num_threads': N_THREADS}
)
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

In [56]:

pipe = LGBSimpleFeatures()
params_tuner1 = OptunaTuner(n_trials=20, timeout=30) 
model1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': 1, 'num_threads': N_THREADS}
)

model2 = BoostLGBM(
    default_params={'learning_rate': 0.025, 'num_leaves': 64, 'seed': 2, 'num_threads': N_THREADS}
)
pipeline_lvl1 = MLPipeline([
    (model1, params_tuner1),
    model2
], pre_selection=selector, features_pipeline=pipe, post_selection=None)

In [57]:
pipe1 = LGBSimpleFeatures()
model = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'max_bin': 1024, 'seed': 3, 'num_threads': N_THREADS},
    freeze_defaults=True
)
pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)

In [58]:
automl = AutoML(reader, [
    [pipeline_lvl1],
    [pipeline_lvl2],
], skip_conn=False)

In [59]:
test_data.head()

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
30702,Male,59,RG283,Self_Employed,X3,51,No,844547,Yes,0
217904,Female,27,RG264,Salaried,X1,14,Yes,884535,No,0
81680,Male,49,RG284,Salaried,X3,109,Yes,3390285,Yes,1
157008,Male,62,RG272,Other,X2,69,,1171774,No,1
83329,Male,32,RG262,Self_Employed,X3,32,No,1965556,Yes,0


In [60]:
oof_pred = automl.fit_predict(train_data, roles={'target': TARGET_NAME})
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

Train data shape: (196580, 10)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 9999999958.27476 secs
Start fitting LightGBM ...

===== Start working with fold 0 for LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.872291
[200]	valid's auc: 0.872084
Early stopping, best iteration is:
[141]	valid's auc: 0.872392
LightGBM fitting and predicting completed
Optuna may run 6299999956.929967 secs


INFO:optuna.storages._in_memory:A new study created in memory with name: no-name-7a4c15a8-6780-49f8-bf6c-dd723f321218


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.871261
[200]	valid's auc: 0.869895
Early stopping, best iteration is:
[104]	valid's auc: 0.871357
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


INFO:optuna.study:Trial 0 finished with value: 0.8713569563576735 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 244}. Best is trial 0 with value: 0.8713569563576735.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.871812
Early stopping, best iteration is:
[47]	valid's auc: 0.87213
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


INFO:optuna.study:Trial 1 finished with value: 0.8721300802462069 and parameters: {'feature_fraction': 0.8659969709057025, 'num_leaves': 159}. Best is trial 1 with value: 0.8721300802462069.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.871724
[200]	valid's auc: 0.872024
Early stopping, best iteration is:
[135]	valid's auc: 0.872167
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


INFO:optuna.study:Trial 2 finished with value: 0.8721671216876423 and parameters: {'feature_fraction': 0.5780093202212182, 'num_leaves': 53}. Best is trial 2 with value: 0.8721671216876423.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.870868
[200]	valid's auc: 0.869871
Early stopping, best iteration is:
[109]	valid's auc: 0.871071
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


INFO:optuna.study:Trial 3 finished with value: 0.8710708970328733 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 223}. Best is trial 2 with value: 0.8721671216876423.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.871545
Early stopping, best iteration is:
[63]	valid's auc: 0.871793
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


INFO:optuna.study:Trial 4 finished with value: 0.8717925418988401 and parameters: {'feature_fraction': 0.8005575058716043, 'num_leaves': 185}. Best is trial 2 with value: 0.8721671216876423.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.871724
[200]	valid's auc: 0.872024
Early stopping, best iteration is:
[135]	valid's auc: 0.872167

===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.87606
[200]	valid's auc: 0.876591
Early stopping, best iteration is:
[183]	valid's auc: 0.876647

===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.877362
[200]	valid's auc: 0.877957
Early stopping, best iteration is:
[189]	valid's auc: 0.878001

===== Start working with fold 3 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.86661
[200]	valid's auc: 0.866909
Early s

INFO:root:oof_pred:
array([[0.19915326],
       [0.0752631 ],
       [0.22942251],
       ...,
       [0.14231992],
       [0.09307461],
       [0.0903881 ]], dtype=float32)
Shape = (196580, 1)


In [29]:
test_data.head()

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
27048,1.0,29.0,0.0,3.0,0.0,21.0,0.0,1279935.0,0.0,0
19457,0.0,37.0,14.0,1.0,0.0,32.0,0.0,1060621.0,0.0,0
227262,1.0,71.0,33.0,1.0,1.0,80.0,0.0,647243.0,1.0,1
340435,1.0,43.0,4.0,3.0,2.0,93.0,0.0,1576991.0,1.0,1
65516,1.0,29.0,30.0,2.0,0.0,32.0,0.0,412710.0,1.0,0


In [31]:
data.columns,test.columns

(Index(['Gender', 'Age', 'Region_Code', 'Occupation', 'Channel_Code', 'Vintage',
        'Credit_Product', 'Avg_Account_Balance', 'Is_Active', 'Is_Lead'],
       dtype='object'),
 Index(['Gender', 'Age', 'Region_Code', 'Occupation', 'Channel_Code', 'Vintage',
        'Credit_Product', 'Avg_Account_Balance', 'Is_Active'],
       dtype='object'))

In [61]:
test = pd.read_csv("test.csv")

In [62]:
test['Is_Active'] =[0]*test.shape[0]

In [64]:
test.drop(['ID'],inplace=True,axis=1)

In [66]:
test.head()

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
0,Male,29,RG254,Other,X1,25,Yes,742366,0
1,Male,43,RG268,Other,X2,49,,925537,0
2,Male,31,RG270,Salaried,X1,14,No,215949,0
3,Male,29,RG272,Other,X1,33,No,868070,0
4,Female,29,RG270,Other,X1,19,No,657087,0


In [67]:
test_pred = automl.predict(test)
#logging.info('Prediction for test data:\n{}\nShape = {}'
 #             .format(test_pred, test_pred.shape))
#logging.info('Check scores...')
#logging.info('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))
#logging.info('TEST score: {}'.format(roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])))

In [68]:
t = list(test_pred.data[:,0])

In [69]:
df = pd.read_csv("test.csv")

In [70]:
ids  = df['ID']

In [72]:
ans=pd.DataFrame()
ans['ID']=ids
ans['Is_Lead']=t
ans.to_csv("auto.csv",index=False)