In [1]:
import datetime
import gc
import os
import pathlib
import random
import sys
sys.path.append('..')
from typing import Any, Dict, List, Tuple, Union

from catboost import CatBoostClassifier,  EShapCalcType, EFeaturesSelectionAlgorithm, Pool
from joblib import dump
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedGroupKFold, TimeSeriesSplit
from sklearn.preprocessing import OrdinalEncoder

from scripts.get_depth_paths import get_depth_paths
from scripts.merge_dataset import merge_dataset


gc.enable()

In [2]:
class PathHandler:
    competition_dir = pathlib.Path('../../inputs')
    parquet_files_dir = competition_dir.joinpath('parquet_files')
    feature_dir = pathlib.Path('../../outputs/features')
    now_time = datetime.datetime.now()
    output_dir = pathlib.Path('../../dataset/')
    if not output_dir.is_dir():
        output_dir.mkdir(parents=True)


paths = PathHandler()

In [3]:
train_depth_paths = get_depth_paths(paths.parquet_files_dir, 'train')
test_depth_paths = get_depth_paths(paths.parquet_files_dir, 'test')
print(f'number of train paths: {sum(len(v1) for v1 in train_depth_paths.values())}')
print(f'number of test paths: {sum(len(v1) for v1 in test_depth_paths.values())}')

number of train paths: 31
number of test paths: 35


In [4]:
train_base_data = pl.read_parquet(
    paths.parquet_files_dir.joinpath('train/train_base.parquet')
)
train_base_data = train_base_data.cast(
    {
        'case_id': pl.Int64,
        'date_decision': pl.Date,
        'MONTH': pl.Int64,
        'WEEK_NUM': pl.Int64,
        'target': pl.Int64,
    }
)
display(train_base_data)

case_id,date_decision,MONTH,WEEK_NUM,target
i64,date,i64,i64,i64
0,2019-01-03,201901,0,0
1,2019-01-03,201901,0,0
2,2019-01-04,201901,0,0
3,2019-01-03,201901,0,0
4,2019-01-04,201901,0,1
…,…,…,…,…
2703450,2020-10-05,202010,91,0
2703451,2020-10-05,202010,91,0
2703452,2020-10-05,202010,91,0
2703453,2020-10-05,202010,91,0


In [5]:
test_base_data = pl.read_parquet(
    paths.parquet_files_dir.joinpath('test/test_base.parquet')
)
test_base_data = test_base_data.cast(
    {
        'case_id': pl.Int64,
        'date_decision': pl.Date,
        'MONTH': pl.Int64,
        'WEEK_NUM': pl.Int64,
    }
)
display(test_base_data)

case_id,date_decision,MONTH,WEEK_NUM
i64,date,i64,i64
57543,2020-10-06,202010,92
57549,2020-10-06,202010,92
57551,2020-10-06,202010,92
57552,2020-10-07,202010,92
57569,2020-10-06,202010,92
57630,2020-10-06,202010,92
57631,2020-10-06,202010,92
57632,2020-10-06,202010,92
57633,2020-10-06,202010,92
57634,2020-10-06,202010,92


### merge datasets

In [6]:
%%time


train_depth_data = merge_dataset(
    train_base_data,
    train_depth_paths,
    '012',
)
display(train_depth_data)
display(train_depth_data.dtypes.value_counts())

loading `static_0`
	(1003757, 168)
	(522902, 168)
loading `static_cb_0`
	(1500476, 53)
loading `applprev_1`
	(782997, 97)
	(438525, 97)
loading `other_1`
	(51109, 16)
loading `tax_registry_a_1`
	(457934, 7)
loading `tax_registry_b_1`
	(150732, 7)
loading `tax_registry_c_1`
	(482265, 7)
loading `credit_bureau_a_1`
	(335275, 193)
	(549263, 193)
	(325127, 193)
	(176608, 193)
loading `credit_bureau_b_1`
	(36500, 110)
loading `deposit_1`
	(105111, 10)
loading `person_1`
	(1526659, 69)
loading `debitcard_1`
	(111772, 13)
loading `applprev_2`
	(1221522, 13)
loading `person_2`
	(1435105, 30)
loading `credit_bureau_a_2`
	(98303, 85)
	(118481, 85)
	(23734, 85)
	(156749, 85)
	(190486, 85)
	(190313, 85)
	(231250, 85)
	(150426, 85)
	(45056, 85)
	(77457, 85)
	(103033, 85)
loading `credit_bureau_b_2`
	(36447, 28)


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,diff_numinstunpaidmax_3546851L_0_cntpmts24_3658933L_0,sum_numinstunpaidmax_3546851L_0_cntpmts24_3658933L_0,diff_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,sum_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,diff_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,sum_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,year_date_decision,month_date_decision,day_date_decision,weekday_date_decision
0,0,2019-01-03,201901,0,0,,,1917.599976,0.000000,0.0,...,,,,,,,2019,1,3,4
1,1,2019-01-03,201901,0,0,,,3134.000000,0.000000,0.0,...,,,,,,,2019,1,3,4
2,2,2019-01-04,201901,0,0,,,4937.000000,0.000000,0.0,...,,,,,,,2019,1,4,5
3,3,2019-01-03,201901,0,0,,,4643.600098,0.000000,0.0,...,,,,,,,2019,1,3,4
4,4,2019-01-04,201901,0,1,,,3390.199951,0.000000,0.0,...,,,,,,,2019,1,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,2020-10-05,202010,91,0,0.0,176561.359375,3675.400146,0.000000,0.0,...,-24.0,24.0,-112.0,112.0,-88.0,136.0,2020,10,5,1
1526655,2703451,2020-10-05,202010,91,0,0.0,301276.468750,7088.600098,6191.600098,0.0,...,-10.0,32.0,-62.0,84.0,-52.0,94.0,2020,10,5,1
1526656,2703452,2020-10-05,202010,91,0,0.0,14232.400391,7788.800293,0.000000,0.0,...,-7.0,7.0,-6.0,6.0,1.0,13.0,2020,10,5,1
1526657,2703453,2020-10-05,202010,91,0,0.0,197371.578125,1195.400024,2827.199951,0.0,...,-6.0,40.0,-72.0,106.0,-66.0,112.0,2020,10,5,1


float32           915
object            202
int64               4
int8                3
datetime64[ms]      1
bool                1
int16               1
Name: count, dtype: int64

CPU times: user 4min 52s, sys: 41.3 s, total: 5min 34s
Wall time: 25.4 s


In [7]:
display(train_depth_data.select_dtypes('int8'))
display(train_depth_data.select_dtypes('int32'))

Unnamed: 0,month_date_decision,day_date_decision,weekday_date_decision
0,1,3,4
1,1,3,4
2,1,4,5
3,1,3,4
4,1,4,5
...,...,...,...
1526654,10,5,1
1526655,10,5,1
1526656,10,5,1
1526657,10,5,1


0
1
2
3
4
...
1526654
1526655
1526656
1526657
1526658


In [8]:
train_depth_data[[col for col in train_depth_data.columns if 'date_decision' in col]]

Unnamed: 0,date_decision,year_date_decision,month_date_decision,day_date_decision,weekday_date_decision
0,2019-01-03,2019,1,3,4
1,2019-01-03,2019,1,3,4
2,2019-01-04,2019,1,4,5
3,2019-01-03,2019,1,3,4
4,2019-01-04,2019,1,4,5
...,...,...,...,...,...
1526654,2020-10-05,2020,10,5,1
1526655,2020-10-05,2020,10,5,1
1526656,2020-10-05,2020,10,5,1
1526657,2020-10-05,2020,10,5,1


In [9]:
%%time


test_depth_data = merge_dataset(
    test_base_data,
    test_depth_paths,
    '012',
)
display(test_depth_data)
display(test_depth_data.dtypes.value_counts())

loading `static_0`
	(10, 168)
	(10, 168)
	(10, 168)
loading `static_cb_0`
	(10, 53)
loading `applprev_1`
	(6, 97)
	(4, 97)
	(4, 97)
loading `other_1`
	(10, 16)
loading `tax_registry_a_1`
	(2, 7)
loading `tax_registry_b_1`
	(2, 7)
loading `tax_registry_c_1`
	(0, 7)
loading `credit_bureau_a_1`
	(2, 193)
	(2, 193)
	(1, 193)
	(2, 193)
	(1, 193)
loading `credit_bureau_b_1`
	(5, 110)
loading `deposit_1`
	(10, 10)
loading `person_1`
	(3, 69)
loading `debitcard_1`
	(10, 13)
loading `applprev_2`
	(4, 13)
loading `person_2`
	(10, 30)
loading `credit_bureau_a_2`
	(1, 85)
	(1, 85)
	(1, 85)
	(1, 85)
	(1, 85)
	(1, 85)
	(1, 85)
	(1, 85)
	(1, 85)
	(1, 85)
	(1, 85)
	(1, 85)
loading `credit_bureau_b_2`
	(1, 28)


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,applications30d_658L_0,...,diff_numinstunpaidmax_3546851L_0_cntpmts24_3658933L_0,sum_numinstunpaidmax_3546851L_0_cntpmts24_3658933L_0,diff_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,sum_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,diff_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,sum_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,year_date_decision,month_date_decision,day_date_decision,weekday_date_decision
0,57543,2020-10-06,202010,92,,,7637.200195,0.0,0.0,0.0,...,,,,,,,2020,10,6,2
1,57549,2020-10-06,202010,92,,,902.600037,0.0,0.0,0.0,...,,,,,,,2020,10,6,2
2,57551,2020-10-06,202010,92,,,3610.199951,0.0,0.0,0.0,...,,,,,,,2020,10,6,2
3,57552,2020-10-07,202010,92,,,6964.399902,0.0,0.0,0.0,...,,,,,,,2020,10,7,3
4,57569,2020-10-06,202010,92,,,5553.399902,0.0,0.0,0.0,...,,,,,,,2020,10,6,2
5,57630,2020-10-06,202010,92,,,7404.800293,0.0,0.0,0.0,...,,,,,,,2020,10,6,2
6,57631,2020-10-06,202010,92,,,2872.800049,0.0,0.0,0.0,...,,,,,,,2020,10,6,2
7,57632,2020-10-06,202010,92,,,6225.800293,0.0,0.0,1.0,...,,,,,,,2020,10,6,2
8,57633,2020-10-06,202010,92,0.0,,7917.0,0.0,0.0,0.0,...,,,,,,,2020,10,6,2
9,57634,2020-10-06,202010,92,,,5894.0,0.0,0.0,0.0,...,,,,,,,2020,10,6,2


float32           915
object            202
int64               3
int8                3
datetime64[ms]      1
bool                1
int16               1
Name: count, dtype: int64

CPU times: user 3.88 s, sys: 1.21 s, total: 5.09 s
Wall time: 2.75 s


### concatenate training and test

In [10]:
train_depth_data['is_test'] = 0
test_depth_data['is_test'] = 1

display(train_depth_data.shape, test_depth_data.shape)
depth_data = pd.concat([train_depth_data, test_depth_data], axis=0)
display(depth_data)

(1526659, 1128)

(10, 1127)

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,sum_numinstunpaidmax_3546851L_0_cntpmts24_3658933L_0,diff_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,sum_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,diff_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,sum_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,year_date_decision,month_date_decision,day_date_decision,weekday_date_decision,is_test
0,0,2019-01-03,201901,0,0.0,,,1917.599976,0.0,0.0,...,,,,,,2019,1,3,4,0
1,1,2019-01-03,201901,0,0.0,,,3134.000000,0.0,0.0,...,,,,,,2019,1,3,4,0
2,2,2019-01-04,201901,0,0.0,,,4937.000000,0.0,0.0,...,,,,,,2019,1,4,5,0
3,3,2019-01-03,201901,0,0.0,,,4643.600098,0.0,0.0,...,,,,,,2019,1,3,4,0
4,4,2019-01-04,201901,0,1.0,,,3390.199951,0.0,0.0,...,,,,,,2019,1,4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,57630,2020-10-06,202010,92,,,,7404.800293,0.0,0.0,...,,,,,,2020,10,6,2,1
6,57631,2020-10-06,202010,92,,,,2872.800049,0.0,0.0,...,,,,,,2020,10,6,2,1
7,57632,2020-10-06,202010,92,,,,6225.800293,0.0,0.0,...,,,,,,2020,10,6,2,1
8,57633,2020-10-06,202010,92,,0.0,,7917.000000,0.0,0.0,...,,,,,,2020,10,6,2,1


### drop high frequency or only one columns

In [11]:
categorical_features = train_depth_data.dtypes.index[train_depth_data.dtypes==object].to_list()
if 'date_decision' in categorical_features:
    categorical_features.remove('date_decision')
nunique_categorical_features = train_depth_data[categorical_features].nunique().sort_values()
display(nunique_categorical_features.sort_values())
drop_categorical_features = nunique_categorical_features[(nunique_categorical_features > 200) | (nunique_categorical_features == 1)].index
print(len(drop_categorical_features))
print(drop_categorical_features)

del train_depth_data, test_depth_data
gc.collect()

display(depth_data)
depth_data.drop(columns=drop_categorical_features, inplace=True)
display(depth_data)

bankacctype_710L_0                      1
last_last_addres_role_871L_2            1
max_last_subjectroles_name_838M_2       1
last_last_subjectroles_name_838M_2      1
last_subjectrole_93M_1                  1
                                     ... 
max_contaddr_district_15M_1           613
last_contaddr_district_15M_1          613
max_district_544M_1                   625
last_registaddr_district_1083M_1      636
max_registaddr_district_1083M_1       636
Length: 202, dtype: int64

31
Index(['bankacctype_710L_0', 'max_contaddr_matchlist_1032L_1',
       'paytype_783L_0', 'paytype1st_925L_0', 'last_role_993L_1',
       'last_remitter_829L_1', 'max_isreference_387L_1', 'max_remitter_829L_1',
       'max_role_993L_1', 'last_isreference_387L_1', 'typesuite_864L_0',
       'last_contaddr_matchlist_1032L_1', 'isdebitcard_729L_0',
       'max_last_addres_role_871L_2', 'last_subjectrole_93M_1',
       'last_subjectrole_182M_1', 'last_last_subjectroles_name_838M_2',
       'max_last_subjectroles_name_838M_2', 'last_last_addres_role_871L_2',
       'lastapprcommoditytypec_5251766M_0', 'previouscontdistrict_112M_0',
       'max_max_addres_district_368M_2', 'max_last_addres_district_368M_2',
       'last_max_addres_district_368M_2', 'last_district_544M_1',
       'last_last_addres_district_368M_2', 'max_contaddr_district_15M_1',
       'last_contaddr_district_15M_1', 'max_district_544M_1',
       'last_registaddr_district_1083M_1', 'max_registaddr_district_1083M_1'],
      d

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,sum_numinstunpaidmax_3546851L_0_cntpmts24_3658933L_0,diff_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,sum_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,diff_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,sum_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,year_date_decision,month_date_decision,day_date_decision,weekday_date_decision,is_test
0,0,2019-01-03,201901,0,0.0,,,1917.599976,0.0,0.0,...,,,,,,2019,1,3,4,0
1,1,2019-01-03,201901,0,0.0,,,3134.000000,0.0,0.0,...,,,,,,2019,1,3,4,0
2,2,2019-01-04,201901,0,0.0,,,4937.000000,0.0,0.0,...,,,,,,2019,1,4,5,0
3,3,2019-01-03,201901,0,0.0,,,4643.600098,0.0,0.0,...,,,,,,2019,1,3,4,0
4,4,2019-01-04,201901,0,1.0,,,3390.199951,0.0,0.0,...,,,,,,2019,1,4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,57630,2020-10-06,202010,92,,,,7404.800293,0.0,0.0,...,,,,,,2020,10,6,2,1
6,57631,2020-10-06,202010,92,,,,2872.800049,0.0,0.0,...,,,,,,2020,10,6,2,1
7,57632,2020-10-06,202010,92,,,,6225.800293,0.0,0.0,...,,,,,,2020,10,6,2,1
8,57633,2020-10-06,202010,92,,0.0,,7917.000000,0.0,0.0,...,,,,,,2020,10,6,2,1


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,sum_numinstunpaidmax_3546851L_0_cntpmts24_3658933L_0,diff_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,sum_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,diff_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,sum_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,year_date_decision,month_date_decision,day_date_decision,weekday_date_decision,is_test
0,0,2019-01-03,201901,0,0.0,,,1917.599976,0.0,0.0,...,,,,,,2019,1,3,4,0
1,1,2019-01-03,201901,0,0.0,,,3134.000000,0.0,0.0,...,,,,,,2019,1,3,4,0
2,2,2019-01-04,201901,0,0.0,,,4937.000000,0.0,0.0,...,,,,,,2019,1,4,5,0
3,3,2019-01-03,201901,0,0.0,,,4643.600098,0.0,0.0,...,,,,,,2019,1,3,4,0
4,4,2019-01-04,201901,0,1.0,,,3390.199951,0.0,0.0,...,,,,,,2019,1,4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,57630,2020-10-06,202010,92,,,,7404.800293,0.0,0.0,...,,,,,,2020,10,6,2,1
6,57631,2020-10-06,202010,92,,,,2872.800049,0.0,0.0,...,,,,,,2020,10,6,2,1
7,57632,2020-10-06,202010,92,,,,6225.800293,0.0,0.0,...,,,,,,2020,10,6,2,1
8,57633,2020-10-06,202010,92,,0.0,,7917.000000,0.0,0.0,...,,,,,,2020,10,6,2,1


### drop many nan features

In [12]:
ratio_nans = depth_data.isna().mean()
high_ratio_nan_features = ratio_nans[ratio_nans > 0.7].index
print(high_ratio_nan_features)
print(len(high_ratio_nan_features))

display(depth_data)
depth_data.drop(columns=high_ratio_nan_features, inplace=True)
display(depth_data)

Index(['avglnamtstart24m_4525187A_0', 'cardtype_51L_0', 'clientscnt_136L_0',
       'datelastinstal40dpd_247D_0', 'equalitydataagreement_891L_0',
       'equalityempfrom_62L_0', 'inittransactionamount_650A_0',
       'interestrategrace_34L_0', 'isbidproductrequest_292L_0',
       'lastdependentsnum_448L_0',
       ...
       'diff_pctinstlsallpaidlate1d_3546856L_0_pmtscount_423L_0',
       'sum_pctinstlsallpaidlate1d_3546856L_0_pmtscount_423L_0',
       'diff_numinstpaidearly3d_3546850L_0_pmtscount_423L_0',
       'sum_numinstpaidearly3d_3546850L_0_pmtscount_423L_0',
       'diff_pmtscount_423L_0_numinstunpaidmax_3546851L_0',
       'sum_pmtscount_423L_0_numinstunpaidmax_3546851L_0',
       'diff_pmtscount_423L_0_cntpmts24_3658933L_0',
       'sum_pmtscount_423L_0_cntpmts24_3658933L_0',
       'diff_pmtscount_423L_0_numinstlsallpaid_934L_0',
       'sum_pmtscount_423L_0_numinstlsallpaid_934L_0'],
      dtype='object', length=389)
389


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,sum_numinstunpaidmax_3546851L_0_cntpmts24_3658933L_0,diff_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,sum_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,diff_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,sum_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,year_date_decision,month_date_decision,day_date_decision,weekday_date_decision,is_test
0,0,2019-01-03,201901,0,0.0,,,1917.599976,0.0,0.0,...,,,,,,2019,1,3,4,0
1,1,2019-01-03,201901,0,0.0,,,3134.000000,0.0,0.0,...,,,,,,2019,1,3,4,0
2,2,2019-01-04,201901,0,0.0,,,4937.000000,0.0,0.0,...,,,,,,2019,1,4,5,0
3,3,2019-01-03,201901,0,0.0,,,4643.600098,0.0,0.0,...,,,,,,2019,1,3,4,0
4,4,2019-01-04,201901,0,1.0,,,3390.199951,0.0,0.0,...,,,,,,2019,1,4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,57630,2020-10-06,202010,92,,,,7404.800293,0.0,0.0,...,,,,,,2020,10,6,2,1
6,57631,2020-10-06,202010,92,,,,2872.800049,0.0,0.0,...,,,,,,2020,10,6,2,1
7,57632,2020-10-06,202010,92,,,,6225.800293,0.0,0.0,...,,,,,,2020,10,6,2,1
8,57633,2020-10-06,202010,92,,0.0,,7917.000000,0.0,0.0,...,,,,,,2020,10,6,2,1


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,sum_numinstunpaidmax_3546851L_0_cntpmts24_3658933L_0,diff_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,sum_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,diff_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,sum_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,year_date_decision,month_date_decision,day_date_decision,weekday_date_decision,is_test
0,0,2019-01-03,201901,0,0.0,,,1917.599976,0.0,0.0,...,,,,,,2019,1,3,4,0
1,1,2019-01-03,201901,0,0.0,,,3134.000000,0.0,0.0,...,,,,,,2019,1,3,4,0
2,2,2019-01-04,201901,0,0.0,,,4937.000000,0.0,0.0,...,,,,,,2019,1,4,5,0
3,3,2019-01-03,201901,0,0.0,,,4643.600098,0.0,0.0,...,,,,,,2019,1,3,4,0
4,4,2019-01-04,201901,0,1.0,,,3390.199951,0.0,0.0,...,,,,,,2019,1,4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,57630,2020-10-06,202010,92,,,,7404.800293,0.0,0.0,...,,,,,,2020,10,6,2,1
6,57631,2020-10-06,202010,92,,,,2872.800049,0.0,0.0,...,,,,,,2020,10,6,2,1
7,57632,2020-10-06,202010,92,,,,6225.800293,0.0,0.0,...,,,,,,2020,10,6,2,1
8,57633,2020-10-06,202010,92,,0.0,,7917.000000,0.0,0.0,...,,,,,,2020,10,6,2,1


### drop highly correlated column from pairs

In [13]:
# numerical_features = depth_data.select_dtypes(exclude='object').columns.to_list()
# numerical_features = [col for col in numerical_features if col not in ['case_id', 'MONTH', 'date_decision', 'WEEK_NUM', 'target', 'is_test']]

# corr_matrix = depth_data[numerical_features].corr().abs()
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
# depth_data.drop(to_drop, axis=1, inplace=True)

# display(depth_data)

In [14]:
categorical_features = depth_data.select_dtypes(include='object').columns
if 'date_decision' in categorical_features:
    categorical_features.remove('datane_decision')
print(len(categorical_features))

120


In [15]:
depth_data.to_parquet(paths.output_dir.joinpath('depth_data.parquet'), index=False)
np.save(paths.output_dir.joinpath('categorical_features.npy'), categorical_features)

In [16]:
mode_cols = [col for col in depth_data.columns if 'mode' in col]

display(depth_data[mode_cols])

0
1
2
3
4
...
5
6
7
8
9


In [17]:
depth_data[[col for col in depth_data.columns if 'num_group' in col]]

0
1
2
3
4
...
5
6
7
8
9
