<a href="https://colab.research.google.com/github/nbrrawal/Kaggle/blob/master/LAMA_Create_your_own_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install LightAutoML (LAMA) from pip**

In [None]:
!pip install lightautoml
!pip install albumentations==0.4.6

Collecting lightautoml
  Downloading LightAutoML-0.2.16-py3-none-any.whl (262 kB)
[?25l[K     |█▎                              | 10 kB 27.3 MB/s eta 0:00:01[K     |██▌                             | 20 kB 32.0 MB/s eta 0:00:01[K     |███▊                            | 30 kB 36.3 MB/s eta 0:00:01[K     |█████                           | 40 kB 22.4 MB/s eta 0:00:01[K     |██████▎                         | 51 kB 17.4 MB/s eta 0:00:01[K     |███████▌                        | 61 kB 14.0 MB/s eta 0:00:01[K     |████████▊                       | 71 kB 12.9 MB/s eta 0:00:01[K     |██████████                      | 81 kB 14.2 MB/s eta 0:00:01[K     |███████████▎                    | 92 kB 15.5 MB/s eta 0:00:01[K     |████████████▌                   | 102 kB 11.7 MB/s eta 0:00:01[K     |█████████████▊                  | 112 kB 11.7 MB/s eta 0:00:01[K     |███████████████                 | 122 kB 11.7 MB/s eta 0:00:01[K     |████████████████▎               | 133 kB 11.7 M

# **Import all the required libraries**

In [None]:
# Standard python libraries
import os
import time
import logging

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch
# LightAutoML imports
import lightautoml
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler
from lightautoml.automl.blend import WeightedBlender
from lightautoml.dataset.roles import DatetimeRole

In [None]:
# Notebook-wide configuration: every tunable value lives in this cell.

# Thread count for LightGBM and linear models.
N_THREADS = 8
# Number of cross-validation folds used by AutoML.
N_FOLDS = 5
# Single seed reused wherever randomness has to be fixed.
RANDOM_STATE = 42
# Share of rows held out for the final metric check.
TEST_SIZE = 0.2
# Name of the target column in the dataset.
TARGET_NAME = 'TARGET'

In [None]:
# Reproducibility / resource setup: cap the number of threads torch may use
# and seed numpy's global random generator with the notebook-wide seed.
torch.set_num_threads(N_THREADS)
np.random.seed(RANDOM_STATE)

In [None]:
# Profiling setup.
# LightAutoML's profiling decorators are switched off by default to save time and memory.
# They are enabled here so that a profiling report can be built after training;
# `p` is the Profiler object used later to produce that report.
p = Profiler()
profiler_settings = {'enabled': True}
p.change_deco_settings(profiler_settings)

In [None]:
# Read the sample training table shipped with the LightAutoML repository
# and preview its first rows.
SAMPLE_DATA_URL = 'https://raw.githubusercontent.com/sberbank-ai-lab/LightAutoML/master/example_data/test_data_files/sampled_app_train.csv'
data = pd.read_csv(SAMPLE_DATA_URL)
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,...,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,270000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.072508,-18590,-226,-2432.0,-2137,,1,1,0,1,0,0,High skill tech staff,2.0,1,1,FRIDAY,14,0,1,1,0,0,0,...,0.0701,0.0684,0.0078,0.0738,reg oper account,block of flats,0.0686,Panel,No,0.0,0.0,0.0,0.0,-1275.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,675000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.035792,-20316,-3822,-13156.0,-3758,,1,1,1,1,0,0,Security staff,2.0,2,2,WEDNESDAY,15,0,0,0,0,1,1,...,,,,,,,,,,1.0,1.0,1.0,1.0,-1732.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,Municipal apartment,0.031329,-9545,-1076,-4077.0,-1058,18.0,1,1,0,1,0,0,Laborers,4.0,2,2,THURSDAY,12,0,0,0,0,0,0,...,,0.0023,,0.0,,block of flats,0.0018,Wooden,No,4.0,1.0,4.0,1.0,-1597.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,112500.0,Unaccompanied,Working,Higher education,Married,With parents,0.008474,-11421,-2272,-5106.0,-1556,,1,1,0,1,0,0,,3.0,2,2,FRIDAY,15,0,0,0,0,0,0,...,,0.0343,,0.0133,,block of flats,0.0214,"Stone, brick",No,0.0,0.0,0.0,0.0,-1069.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,1354500.0,Family,Working,Secondary / secondary special,Married,House / apartment,0.008474,-13203,-1775,-5743.0,-4256,9.0,1,1,1,1,1,0,Managers,2.0,2,2,FRIDAY,10,0,0,0,0,0,0,...,0.0761,0.0878,0.0039,0.0043,,block of flats,0.0753,Panel,No,0.0,0.0,0.0,0.0,-1104.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


In [None]:
# (Optional Step)
# This Cell shows some user feature preparations to create task more difficult. Some feature engineering

%%time
#creating a new columns to make the existing features more understandable 
data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                    ).astype(str)
#creating three new columns
data['constant'] = 1
data['allnan'] = np.nan
data['report_dt'] = np.datetime64('2018-01-01')
#drop 'DAYS_BIRTH' and 'DAYS_EMPLOYE' column from dataset
data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

CPU times: user 139 ms, sys: 8.96 ms, total: 148 ms
Wall time: 149 ms


In [None]:
# (Optional Step) Data splitting for train-test
# Block below can be omitted if you are going to train model only or you have specific train and test files:
%%time
train_data, test_data = train_test_split(data, 
                                         test_size=TEST_SIZE, 
                                         stratify=data[TARGET_NAME], 
                                         random_state=RANDOM_STATE)
logging.info('Data splitted. Parts sizes: train_data = {}, test_data = {}'
              .format(train_data.shape, test_data.shape))

CPU times: user 15.2 ms, sys: 1.04 ms, total: 16.2 ms
Wall time: 18.3 ms


In [None]:
# Preview the first five rows of the training split (five is head()'s default).
train_data.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,...,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BIRTH_DATE,EMP_DATE,constant,allnan,report_dt
6444,112261,0,Cash loans,F,N,N,1,90000.0,640080.0,31261.5,450000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.019101,-2282.0,-3618,,1,1,0,1,0,0,Sales staff,3.0,2,2,TUESDAY,12,0,0,0,0,0,0,Self-employed,,...,block of flats,0.0726,"Stone, brick",No,2.0,0.0,2.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,1985-06-28,2012-06-21,1,,2018-01-01
3586,115058,0,Cash loans,F,N,Y,0,180000.0,239850.0,23850.0,225000.0,Family,Pensioner,Higher education,Married,House / apartment,0.008575,-6264.0,-4008,,1,0,0,1,0,0,,2.0,2,2,SUNDAY,12,0,0,0,0,0,0,XNA,,...,block of flats,0.2889,"Stone, brick",No,2.0,0.0,2.0,0.0,-1002.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,1953-12-27,2018-01-01,1,,2018-01-01
9349,326623,0,Cash loans,F,N,Y,0,112500.0,337500.0,31086.0,337500.0,Unaccompanied,Working,Incomplete higher,Married,House / apartment,0.026392,-8007.0,-4693,,1,1,1,1,1,0,,2.0,2,2,SUNDAY,14,0,0,0,0,0,0,Business Entity Type 3,0.803434,...,block of flats,0.0811,Panel,No,1.0,0.0,1.0,0.0,-1520.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,1975-06-21,2016-06-17,1,,2018-01-01
7734,191976,0,Cash loans,M,Y,Y,1,67500.0,135000.0,9018.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.025164,-1889.0,-3102,64.0,1,1,1,1,0,0,Core staff,3.0,2,2,THURSDAY,15,0,0,0,0,0,0,Self-employed,0.265465,...,block of flats,0.0572,Block,No,3.0,0.0,3.0,0.0,-1042.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,1988-04-27,2009-06-05,1,,2018-01-01
2174,281519,0,Revolving loans,F,N,Y,0,67500.0,202500.0,10125.0,202500.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.031329,-7976.0,-4276,,1,1,0,1,0,0,Core staff,2.0,2,2,MONDAY,11,0,0,0,0,0,0,School,0.807457,...,block of flats,0.0099,"Stone, brick",No,0.0,0.0,0.0,0.0,-1248.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,1975-06-13,1997-01-22,1,,2018-01-01


# **AutoML Pipeline Creation**

In [None]:
# Step 1. Create Task and PandasReader
%%time
#We are going to do a binary classification on the given dataset
task = Task('binary')
#PandasToPandasReader convert pd.DataFrame to AutoML's PandasDataset.
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

CPU times: user 6.92 ms, sys: 762 µs, total: 7.68 ms
Wall time: 8.7 ms


In [None]:
# Create feature selector (if necessary)
# We basically achieved that by creating light gbm and letting the feature importance from that to choose the best features.
# Now, I don't know much about lgbm; but as that is not a uniquely better algorithm; 
#therefore this may serve as a bottleneck for the performance of the automl model as it depends on that
%%time
#create a lightGBM model with default parameters as shown below
model0 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'seed': 42, 'num_threads': N_THREADS}
)
#Creates simple pipeline for tree based models.
#Simple but is ok for select features Numeric stay as is, Datetime transforms to numeric,
#Categorical label encoding Maps input to output features exactly one-to-one
pipe0 = LGBSimpleFeatures()
#Base class for performing feature selection using model feature importances.
mbie = ModelBasedImportanceEstimator()
#Selector based on importance threshold.
#It is important that data which passed to .fit should be ok to fit ml_algo or preprocessing pipeline should be defined.
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


CPU times: user 5.04 ms, sys: 0 ns, total: 5.04 ms
Wall time: 5.63 ms


In [None]:
# Create 1st level ML pipeline for AutoML
%%time 
#simple feature pipeline
pipe = LGBSimpleFeatures()
#initializing OptunaTuner for hyperparameter optimization
params_tuner1 = OptunaTuner(n_trials=20, timeout=30) # stop after 20 iterations or after 30 seconds 
#LGBM model with OptunaTuner
model1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': 1, 'num_threads': N_THREADS}
)
#Simple LGBM model with heuristic parameter
model2 = BoostLGBM(
    default_params={'learning_rate': 0.025, 'num_leaves': 64, 'seed': 2, 'num_threads': N_THREADS}
)
#Created two layers for the pipeline and then add them together to create the pipe(shown below):
pipeline_lvl1 = MLPipeline([
    (model1, params_tuner1),
    model2
], pre_selection=selector, features_pipeline=pipe, post_selection=None)

CPU times: user 1.65 ms, sys: 0 ns, total: 1.65 ms
Wall time: 1.66 ms


In [None]:
# Create 2nd level ML pipeline for AutoML
%%time
#creating another simple pipeline for features
pipe1 = LGBSimpleFeatures()
#creating another LGBM without tuning parameters
model = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'max_bin': 1024, 'seed': 3, 'num_threads': N_THREADS},
    freeze_defaults=True
)
#Merging above two pipelines into one pipeline
pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)

CPU times: user 1.08 ms, sys: 18 µs, total: 1.1 ms
Wall time: 1.1 ms


In [None]:
# AutoML pipeline consist of:
#     Reader for data preparation
#     First level ML pipeline (as built in step 3.1)
#     Second level ML pipeline (as built in step 3.2)
#     Skip_conn = False equals here "not to use initial features on the second level pipeline"

%%time 
automl = AutoML(reader, [
    [pipeline_lvl1],
    [pipeline_lvl2],
], skip_conn=False)

CPU times: user 1.03 ms, sys: 7 µs, total: 1.04 ms
Wall time: 1.05 ms


In [None]:
# Train AutoML on loaded data
%%time 
#Now, fit the model on train data with target column as “TARGET” and get OOF predictions.
oof_pred = automl.fit_predict(train_data, roles={'target': TARGET_NAME})
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

Train data shape: (8000, 125)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 9999999982.506426 secs
Start fitting LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.716183
Early stopping, best iteration is:
[16]	valid's auc: 0.720694
LightGBM fitting and predicting completed
Optuna may run 6299999981.687336 secs


INFO:optuna.storages._in_memory:A new study created in memory with name: no-name-9c3ff6e7-2835-4118-86e5-5199dc05b9c4


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.718332
[200]	valid's auc: 0.716862
Early stopping, best iteration is:
[133]	valid's auc: 0.722645
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


INFO:optuna.study:Trial 0 finished with value: 0.7226454127042673 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7226454127042673.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.729792
[200]	valid's auc: 0.731156
Early stopping, best iteration is:
[137]	valid's auc: 0.733123
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


INFO:optuna.study:Trial 1 finished with value: 0.7331227181123745 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: 0.7331227181123745.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.725099
Early stopping, best iteration is:
[49]	valid's auc: 0.732246
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


INFO:optuna.study:Trial 2 finished with value: 0.732246045619043 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: 0.7331227181123745.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.724741
[200]	valid's auc: 0.727237
[300]	valid's auc: 0.728632
Early stopping, best iteration is:
[262]	valid's auc: 0.732492
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


INFO:optuna.study:Trial 3 finished with value: 0.7324919415622945 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 1 with value: 0.7331227181123745.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.729792
[200]	valid's auc: 0.731156
Early stopping, best iteration is:
[137]	valid's auc: 0.733123
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.717227
Early stopping, best iteration is:
[70]	valid's auc: 0.721696
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.719159
Early stopping, best iteration is:
[26]	valid's auc: 0.727571
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.712264
[200]	valid's auc: 0.713735
Early stopping, best iteration is:
[148]	valid's auc: 0.718267
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.724742
[200]	valid's auc: 0.724981
Early stopping, best iteration is:
[126]	valid's auc: 0.728149
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_0_Mod_1_LightGBM ...


INFO:root:oof_pred:
array([[0.08736689],
       [0.0646801 ],
       [0.06023028],
       ...,
       [0.05426202],
       [0.24257046],
       [0.13158381]], dtype=float32)
Shape = (8000, 1)


CPU times: user 2min 4s, sys: 1min 6s, total: 3min 10s
Wall time: 2min 12s


In [None]:
# Analyze the fitted model: log the feature importances of the selector, of the
# top-level algorithm and of both first-level algorithms, each followed by a separator.
# The score getters are wrapped in lambdas so each one runs right before its log line.
importance_reports = (
    ('Feature importances of selector:\n{}',
     lambda: selector.get_features_score()),
    ('Feature importances of top level algorithm:\n{}',
     lambda: automl.levels[-1][0].ml_algos[0].get_features_score()),
    ('Feature importances of lowest level algorithm - model 0:\n{}',
     lambda: automl.levels[0][0].ml_algos[0].get_features_score()),
    ('Feature importances of lowest level algorithm - model 1:\n{}',
     lambda: automl.levels[0][0].ml_algos[1].get_features_score()),
)

for message_template, compute_scores in importance_reports:
    logging.info(message_template.format(compute_scores()))
    logging.info('=' * 70)

INFO:root:Feature importances of selector:
EXT_SOURCE_3              1029.681686
EXT_SOURCE_2               894.265428
BIRTH_DATE                 537.081401
EXT_SOURCE_1               424.764621
DAYS_LAST_PHONE_CHANGE     262.583100
                             ...     
FLAG_DOCUMENT_16             0.000000
FLAG_DOCUMENT_14             0.000000
FLAG_DOCUMENT_13             0.000000
FLAG_DOCUMENT_11             0.000000
FLAG_PHONE                   0.000000
Length: 110, dtype: float64
INFO:root:Feature importances of top level algorithm:
Lvl_0_Pipe_0_Mod_1_LightGBM_prediction_0    3314.700118
Lvl_0_Pipe_0_Mod_0_LightGBM_prediction_0    2325.774722
dtype: float64
INFO:root:Feature importances of lowest level algorithm - model 0:
EXT_SOURCE_2                  1732.125584
EXT_SOURCE_3                  1701.069305
dtdiff__BIRTH_DATE            1119.114075
DAYS_ID_PUBLISH               1044.948929
DAYS_LAST_PHONE_CHANGE         935.152708
                                 ...     
ord__FLAG_O

In [None]:
# Predict to test data and check scores
# 
%%time
test_pred = automl.predict(test_data)
logging.info('Prediction for test data:\n{}\nShape = {}'
              .format(test_pred, test_pred.shape))
logging.info('Check scores...')
logging.info('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))
logging.info('TEST score: {}'.format(roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])))

INFO:root:Prediction for test data:
array([[0.08588904],
       [0.06949789],
       [0.05792223],
       ...,
       [0.04920309],
       [0.04291723],
       [0.24163769]], dtype=float32)
Shape = (2000, 1)
INFO:root:Check scores...
INFO:root:OOF score: 0.6880159126505019
INFO:root:TEST score: 0.7096450407608695


CPU times: user 525 ms, sys: 2.34 ms, total: 527 ms
Wall time: 353 ms


In [None]:
# Profiling AutoML
%%time
p.profile('my_report_profile.html')
assert os.path.exists('my_report_profile.html'), 'Profile report failed to build'

CPU times: user 1.63 s, sys: 4.97 ms, total: 1.63 s
Wall time: 1.64 s
