In [1]:
from collections import OrderedDict
import gc
import pathlib
from pprint import pprint
from typing import Dict, List, Tuple, Union
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl

sys.path.append('..')

from scripts.get_depth_paths import get_depth_paths
from scripts.merge_dataset import merge_dataset

In [2]:
class PathHandler:
    """Central place for the relative directories this notebook reads and writes."""

    # Directory holding the competition inputs.
    competition_dir = pathlib.Path('../../inputs')
    # Parquet tables sit in a sub-directory of the competition inputs.
    parquet_files_dir = competition_dir / 'parquet_files'
    # Root directory for generated artifacts.
    output_dir = pathlib.Path('../../outputs')
    # Feature description CSVs are stored under the output root.
    feature_dir = output_dir / 'features'


# Shared instance through which the cells below look up locations.
paths = PathHandler()

In [3]:
# Each CSV describes the variables of one dtype family; all of them live in
# the shared feature directory.
feature_dir = paths.feature_dir
bool_features = pd.read_csv(feature_dir / 'bool_features.csv')
float64_features = pd.read_csv(feature_dir / 'float64_features.csv')
string_features = pd.read_csv(feature_dir / 'string_features.csv')
date_features = pd.read_csv(feature_dir / 'date_features.csv')

# Tag every table with a polars dtype (note that the float64 table is tagged
# with Float32). Plain column assignment is used on purpose: the dtype classes
# are callables, which `DataFrame.assign` would invoke instead of storing.
for feature_table, polars_dtype in (
    (bool_features, pl.Boolean),
    (float64_features, pl.Float32),
    (string_features, pl.String),
    (date_features, pl.Date),
):
    feature_table['cast_dtype'] = polars_dtype

# Stack the four tables; each table keeps its own row labels.
cast_features = pd.concat(
    [bool_features, float64_features, string_features, date_features]
)
display(cast_features)

Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types,transform,cast_dtype
0,remitter_829L,Flag indicating whether the client is a remitter.,2,bool,2168942,['person_1'],1,2,bool,7,['person_1'],1,L,Boolean
1,contaddr_matchlist_1032L,Indicates whether the contact address is found...,2,bool,1447773,['person_1'],1,2,bool,7,['person_1'],1,L,Boolean
2,isbidproduct_1095L,Flag indicating if the product is a cross-sell.,2,bool,0,['static_0'],1,2,bool,0,['static_0'],1,L,Boolean
3,isdebitcard_729L,Flag indicating if the product is a debit card.,2,bool,1334357,['static_0'],1,2,bool,9,['static_0'],1,L,Boolean
4,contaddr_smempladdr_334L,Indicates whether the contact address is the s...,3,bool,1447773,['person_1'],1,2,bool,7,['person_1'],1,L,Boolean
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,lastdelinqdate_224D,Date of the last delinquency occurrence.,4402,str,977975,['static_0'],1,1,,30,['static_0'],0,D,Date
53,datelastinstal40dpd_247D,Date of last instalment that was more than 40 ...,5176,str,1392841,['static_0'],1,1,,30,['static_0'],0,D,Date
54,datelastunpaid_3546854D,Date of the last unpaid instalment.,5232,str,887659,['static_0'],1,1,,30,['static_0'],0,D,Date
55,maxdpdinstldate_3546855D,Date of instalment on which client was most da...,5288,str,826000,['static_0'],1,1,,30,['static_0'],0,D,Date


In [4]:
# The transform code is the last character of each variable name.
cast_features['transform'] = cast_features['Variable'].apply(lambda name: name[-1])

# Count how many variables fall into each (transform code, cast dtype) pair;
# the dtype objects are stringified so they can serve as column labels.
dtype_labels = cast_features['cast_dtype'].astype(str)
display(pd.crosstab(cast_features['transform'], dtype_labels))

cast_dtype,Boolean,Date,Float32,String
transform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0,0,102,0
D,0,57,0,0
L,13,0,143,31
M,0,0,0,63
P,0,0,33,0
T,0,0,17,5


In [5]:
# Look up the parquet paths for the train split. The result is used as a
# mapping whose values are sized collections of paths.
train_depth_paths = get_depth_paths(paths.parquet_files_dir, 'train')
# Total number of paths across all values of the mapping. The label says
# "train" because these are the train-split paths (it previously said "test").
print(f'the number of train paths: {sum(len(v1) for v1 in train_depth_paths.values())}')

the number of test paths: 31


In [6]:
# Location of the train base table inside the parquet directory.
train_base_path = paths.parquet_files_dir / 'train/train_base.parquet'

# Target dtype for each base column; `date_decision` is kept as a string.
train_base_schema = {
    'case_id': pl.Int64,
    'date_decision': pl.String,
    'MONTH': pl.Int64,
    'WEEK_NUM': pl.Int64,
    'target': pl.Int64,
}

# Read the table and cast the base columns in one chained step.
train_base_data = pl.read_parquet(train_base_path).cast(train_base_schema)
display(train_base_data)

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1
…,…,…,…,…
2703450,"""2020-10-05""",202010,91,0
2703451,"""2020-10-05""",202010,91,0
2703452,"""2020-10-05""",202010,91,0
2703453,"""2020-10-05""",202010,91,0


In [7]:
%%time


# Build the merged dataset from the base table and the per-depth parquet paths,
# using the four feature description tables loaded earlier.
# All arguments are positional; their meaning is defined by
# `scripts.merge_dataset.merge_dataset`, which is not visible here.
depth_data = merge_dataset(
    train_base_data,
    train_depth_paths,
    bool_features,
    float64_features,
    string_features,
    date_features,
    None,  # NOTE(review): purpose of this argument is not visible here — confirm against merge_dataset
    '012'  # presumably selects depths 0, 1 and 2 — TODO confirm against merge_dataset
)
# Show the merged result.
display(depth_data)

loading `static_0`


bankacctype_710L,cardtype_51L,credtype_322L,disbursementtype_67L,inittransactioncode_186L,lastapprcommoditycat_1041M,lastapprcommoditytypec_5251766M,lastcancelreason_561M,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectreason_759M,lastrejectreasonclient_4145040M,lastst_736L,paytype1st_925L,paytype_783L,previouscontdistrict_112M,twobodfilling_608L,typesuite_864L,avgdpdtolclosure24_3658938P
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f32
,,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""OTHER""","""OTHER""","""a55475b1""","""BO""",,
,,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""OTHER""","""OTHER""","""a55475b1""","""BO""",,
,,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""D""","""OTHER""","""OTHER""","""a55475b1""","""BO""","""AL""",
,,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""","""D""","""OTHER""","""OTHER""","""a55475b1""","""BO""","""AL""",
,,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""P24_27_36""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""T""","""OTHER""","""OTHER""","""a55475b1""","""BO""","""AL""",
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
,"""INSTANT""","""REL""","""DD""","""NDF""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P94_109_143""","""P94_109_143""","""A""","""OTHER""","""OTHER""","""P178_112_160""","""FO""",,0.0
,"""INSTANT""","""REL""","""DD""","""NDF""","""P12_6_178""","""a55475b1""","""a55475b1""","""P12_6_178""","""a55475b1""","""P94_109_143""","""P94_109_143""","""A""","""OTHER""","""OTHER""","""P173_115_85""","""FO""",,0.0
,"""INSTANT""","""REL""","""DD""","""NDF""","""P148_110_5""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""K""","""OTHER""","""OTHER""","""P178_112_160""","""FO""",,0.0
"""CA""",,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""A""","""OTHER""","""OTHER""","""P121_97_69""","""FO""",,0.0


	(1003757, 168)


bankacctype_710L,cardtype_51L,credtype_322L,disbursementtype_67L,inittransactioncode_186L,lastapprcommoditycat_1041M,lastapprcommoditytypec_5251766M,lastcancelreason_561M,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectreason_759M,lastrejectreasonclient_4145040M,lastst_736L,paytype1st_925L,paytype_783L,previouscontdistrict_112M,twobodfilling_608L,typesuite_864L,avgdpdtolclosure24_3658938P
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f32
,,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""OTHER""","""OTHER""","""a55475b1""","""FO""","""AL""",
,,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""","""a55475b1""","""P198_131_9""","""P94_109_143""","""D""","""OTHER""","""OTHER""","""a55475b1""","""FO""","""AL""",
,,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""","""a55475b1""","""P45_84_106""","""P94_109_143""","""D""","""OTHER""","""OTHER""","""a55475b1""","""FO""","""AL""",
,,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""D""","""OTHER""","""OTHER""","""a55475b1""","""FO""","""AL""",
,,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""OTHER""","""OTHER""","""a55475b1""","""FO""","""AL""",
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""CA""",,"""CAL""","""GBA""","""CASH""","""P12_6_178""","""P142_50_170""","""a55475b1""","""a55475b1""","""a55475b1""","""P94_109_143""","""P94_109_143""","""K""","""OTHER""","""OTHER""","""P123_39_170""","""FO""",,0.0
"""CA""",,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""A""","""OTHER""","""OTHER""","""P162_18_172""","""FO""",,0.0
"""CA""",,"""CAL""","""GBA""","""CASH""","""P159_130_59""","""P75_90_70""","""P180_60_137""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""T""","""OTHER""","""OTHER""","""P133_44_167""","""BO""",,1.0
"""CA""",,"""CAL""","""GBA""","""CASH""","""a55475b1""","""a55475b1""","""a55475b1""","""P159_130_59""","""P174_113_42""","""a55475b1""","""a55475b1""","""A""","""OTHER""","""OTHER""","""P123_6_84""","""BO""",,0.0


	(522902, 168)
loading `static_cb_0`
	(1500476, 53)
loading `applprev_1`


case_id,num_group1
i64,i64
2,0
2,1
3,0
4,0
5,0
…,…
2651092,3
2651092,4
2651092,5
2651092,6


	(782997, 40)


case_id,num_group1
i64,i64
40704,0
40734,0
40737,0
40791,0
40791,1
…,…
2703453,6
2703453,7
2703453,8
2703454,0


	(438525, 40)
loading `other_1`


case_id,num_group1
i64,i64
43801,0
43991,0
44001,0
44053,0
44130,0
…,…
2703443,0
2703448,0
2703450,0
2703451,0


	(51109, 6)
loading `tax_registry_a_1`


case_id,num_group1
i64,i64
28631,2
28631,3
28631,0
28631,1
28632,5
…,…
2701515,7
2701515,6
2702290,1
2702290,2


	(457934, 4)
loading `tax_registry_b_1`


case_id,num_group1
i64,i64
49435,7
49435,1
49435,8
49435,3
49435,4
…,…
2703452,2
2703452,4
2703452,3
2703452,0


	(150732, 4)
loading `tax_registry_c_1`


case_id,num_group1
i64,i64
357,5
357,1
357,4
357,0
357,3
…,…
2629815,10
2629815,1
2629815,6
2629815,3


	(482265, 4)
loading `credit_bureau_a_1`


case_id,num_group1
i64,i64
388,0
388,1
388,2
388,3
388,4
…,…
2588481,6
2588481,7
2588481,8
2588481,9


	(335275, 78)


case_id,num_group1
i64,i64
19694,0
19694,1
19694,2
19694,3
19694,4
…,…
2651092,5
2651092,6
2651092,7
2651092,8


	(549263, 78)


case_id,num_group1
i64,i64
40626,0
40626,1
40626,2
40626,3
40626,4
…,…
2683578,4
2683578,5
2683578,6
2683578,7


	(325127, 78)


case_id,num_group1
i64,i64
51903,0
51903,1
51903,2
51903,3
51903,4
…,…
2703454,5
2703454,6
2703454,7
2703454,8


	(176608, 78)
loading `credit_bureau_b_1`


case_id,num_group1
i64,i64
467,0
467,1
467,2
1445,0
1445,1
…,…
2703357,0
2703357,1
2703377,0
2703436,0


	(36500, 44)
loading `deposit_1`


case_id,num_group1
i64,i64
225,0
331,0
358,0
390,0
390,2
…,…
2703430,8
2703430,2
2703439,0
2703453,1


	(105111, 4)
loading `person_1`


case_id,num_group1
i64,i64
0,0
0,1
0,2
0,3
1,0
…,…
2703451,1
2703452,0
2703453,0
2703453,1


	(1526659, 36)
loading `debitcard_1`


case_id,num_group1
i64,i64
225,0
331,0
358,0
390,0
390,2
…,…
2703430,8
2703430,2
2703439,0
2703453,1


	(111772, 5)
loading `applprev_2`


case_id,num_group1,num_group2
i64,i64,i64
2,1,1
2,0,1
2,0,0
2,1,0
3,0,1
…,…,…
2703454,0,0
2703454,1,3
2703454,0,1
2703454,1,0


	(1221522, 4)
loading `person_2`


case_id,num_group1,num_group2
i64,i64,i64
5,0,0
6,0,0
6,0,1
6,1,0
6,1,1
…,…,…
2703450,0,0
2703451,0,0
2703452,0,0
2703453,0,0


	(1435041, 9)
loading `credit_bureau_a_2`


case_id,num_group1,num_group2
i64,i64,i64
388,0,0
388,0,1
388,0,2
388,0,3
388,0,4
…,…,…
2548729,2,31
2548729,2,32
2548729,2,33
2548729,2,34


	(98303, 17)


case_id,num_group1,num_group2
i64,i64,i64
6683,0,0
6683,0,1
6683,0,2
6683,0,3
6683,0,4
…,…,…
2570525,0,7
2570525,0,8
2570525,0,9
2570525,0,10


	(118481, 17)


case_id,num_group1,num_group2
i64,i64,i64
56408,0,0
56408,0,1
56408,0,2
56408,0,3
56408,0,4
…,…,…
2703454,8,19
2703454,8,20
2703454,8,21
2703454,8,22


	(23734, 17)


case_id,num_group1,num_group2
i64,i64,i64
13927,0,0
13927,0,1
13927,0,2
13927,0,3
13927,0,4
…,…,…
2593511,5,19
2593511,5,20
2593511,5,21
2593511,5,22


	(156749, 17)


case_id,num_group1,num_group2
i64,i64,i64
21161,0,0
21161,0,1
21161,0,2
21161,0,3
21161,0,4
…,…,…
2619253,9,19
2619253,9,20
2619253,9,21
2619253,9,22


	(190486, 17)


case_id,num_group1,num_group2
i64,i64,i64
29427,0,0
29427,0,1
29427,0,2
29427,0,3
29427,0,4
…,…,…
2640040,1,31
2640040,1,32
2640040,1,33
2640040,1,34


	(190313, 17)


case_id,num_group1,num_group2
i64,i64,i64
36830,0,0
36830,0,1
36830,0,2
36830,0,3
36830,0,4
…,…,…
2658153,0,31
2658153,0,32
2658153,0,33
2658153,0,34


	(231250, 17)


case_id,num_group1,num_group2
i64,i64,i64
42865,0,0
42865,0,1
42865,0,2
42865,0,3
42865,0,4
…,…,…
2677343,0,19
2677343,0,20
2677343,0,21
2677343,0,22


	(150426, 17)


case_id,num_group1,num_group2
i64,i64,i64
49417,0,0
49417,0,1
49417,0,2
49417,0,3
49417,0,4
…,…,…
2681255,15,19
2681255,15,20
2681255,15,21
2681255,15,22


	(45056, 17)


case_id,num_group1,num_group2
i64,i64,i64
51083,0,0
51083,0,1
51083,0,2
51083,0,3
51083,0,4
…,…,…
2688744,0,19
2688744,0,20
2688744,0,21
2688744,0,22


	(77457, 17)


case_id,num_group1,num_group2
i64,i64,i64
53716,0,0
53716,0,1
53716,0,2
53716,0,3
53716,0,4
…,…,…
2700533,7,31
2700533,7,32
2700533,7,33
2700533,7,34


	(103033, 17)
loading `credit_bureau_b_2`


case_id,num_group1,num_group2
i64,i64,i64
467,2,19
467,2,25
467,2,18
467,2,0
467,2,6
…,…,…
2703436,0,10
2703436,1,28
2703436,0,6
2703436,1,21


	(36446, 4)
(1526659, 469)


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,...,day_contractmaturitydate_151D,day_lastupdate_260D,day_contractenddate_991D,day_openingdate_313D,day_birth_259D,day_birthdate_87D,day_empl_employedfrom_271D,day_openingdate_857D,day_empls_employedfrom_796D,day_pmts_date_1107D
0,0,2019-01-03,201901,0,0,,,1917.599976,0.000000,0.0,...,,,,,1.0,,15.0,,,
1,1,2019-01-03,201901,0,0,,,3134.000000,0.000000,0.0,...,,,,,1.0,,29.0,,,
2,2,2019-01-04,201901,0,0,,,4937.000000,0.000000,0.0,...,,,,,1.0,,15.0,,,
3,3,2019-01-03,201901,0,0,,,4643.600098,0.000000,0.0,...,,,,,1.0,,15.0,,,
4,4,2019-01-04,201901,0,1,,,3390.199951,0.000000,0.0,...,,,,,1.0,,15.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,2020-10-05,202010,91,0,0.0,176561.359375,3675.400146,0.000000,0.0,...,,,,,1.0,,,,,
1526655,2703451,2020-10-05,202010,91,0,0.0,301276.468750,7088.600098,6191.600098,0.0,...,,,,,1.0,,,,,
1526656,2703452,2020-10-05,202010,91,0,0.0,14232.400391,7788.800293,0.000000,0.0,...,,,,,1.0,,,,,
1526657,2703453,2020-10-05,202010,91,0,0.0,197371.578125,1195.400024,2827.199951,0.0,...,,,,18.0,1.0,,,18.0,,


CPU times: user 1min 13s, sys: 25 s, total: 1min 38s
Wall time: 23.4 s


In [8]:
# How many columns of the merged frame have each dtype.
column_dtypes = depth_data.dtypes
display(column_dtypes.value_counts())

# Show only the columns whose dtype is `object`.
object_columns = column_dtypes.index[column_dtypes == object]
display(depth_data[object_columns])

float32    466
object     102
bool        11
int64        4
Name: count, dtype: int64

Unnamed: 0,date_decision,bankacctype_710L,cardtype_51L,credtype_322L,disbursementtype_67L,inittransactioncode_186L,lastapprcommoditycat_1041M,lastapprcommoditytypec_5251766M,lastcancelreason_561M,lastrejectcommoditycat_161M,...,conts_role_79M,empls_economicalst_849M,empls_employer_name_740M,relatedpersons_role_762T,collater_typofvalofguarant_298M,collater_typofvalofguarant_407M,collaterals_typeofguarante_359M,collaterals_typeofguarante_669M,subjectroles_name_541M,subjectroles_name_838M
0,2019-01-03,,,CAL,GBA,CASH,a55475b1,a55475b1,a55475b1,a55475b1,...,,,,,,,,,,
1,2019-01-03,,,CAL,GBA,CASH,a55475b1,a55475b1,a55475b1,a55475b1,...,,,,,,,,,,
2,2019-01-04,,,CAL,GBA,CASH,a55475b1,a55475b1,a55475b1,a55475b1,...,,,,,,,,,,
3,2019-01-03,,,CAL,GBA,CASH,a55475b1,a55475b1,P94_109_143,a55475b1,...,,,,,,,,,,
4,2019-01-04,,,CAL,GBA,CASH,a55475b1,a55475b1,P24_27_36,a55475b1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2020-10-05,CA,,CAL,GBA,CASH,P12_6_178,P142_50_170,a55475b1,a55475b1,...,a55475b1,a55475b1,a55475b1,,9a0c095e,9a0c095e,c7a5ad39,c7a5ad39,ab3c25cf,ab3c25cf
1526655,2020-10-05,CA,,CAL,GBA,CASH,a55475b1,a55475b1,a55475b1,a55475b1,...,a55475b1,a55475b1,a55475b1,,9a0c095e,9a0c095e,c7a5ad39,c7a5ad39,ab3c25cf,ab3c25cf
1526656,2020-10-05,CA,,CAL,GBA,CASH,P159_130_59,P75_90_70,P180_60_137,a55475b1,...,a55475b1,a55475b1,a55475b1,,9a0c095e,9a0c095e,c7a5ad39,c7a5ad39,ab3c25cf,ab3c25cf
1526657,2020-10-05,CA,,CAL,GBA,CASH,a55475b1,a55475b1,a55475b1,P159_130_59,...,a55475b1,a55475b1,a55475b1,,9a0c095e,8fd95e4b,3cbe86ba,c7a5ad39,ab3c25cf,ab3c25cf
