In [21]:
from collections import OrderedDict
import gc
import pathlib
from pprint import pprint
from typing import Dict, List, Tuple
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl

sys.path.append('..')
from scripts.get_depth_paths import get_depth_paths


gc.enable()

In [22]:
class PathHandler:
    competition_dir = pathlib.Path('../../inputs')
    parquet_files_dir = competition_dir.joinpath('parquet_files')
    output_dir = pathlib.Path('../../outputs/features')
    if not output_dir.is_dir():
        output_dir.mkdir(parents=True)


paths = PathHandler()

In [23]:
feature_definition = pd.read_csv(
    paths.competition_dir.joinpath('feature_definitions.csv')
)
display(feature_definition)

Unnamed: 0,Variable,Description
0,actualdpd_943P,Days Past Due (DPD) of previous contract (actu...
1,actualdpdtolerance_344P,DPD of client with tolerance.
2,addres_district_368M,District of the person's address.
3,addres_role_871L,Role of person's address.
4,addres_zip_823M,Zip code of the address.
...,...,...
460,totinstallast1m_4525188A,Total amount of monthly instalments paid in th...
461,twobodfilling_608L,Type of application process.
462,type_25L,Contact type of a person.
463,typesuite_864L,Persons accompanying the client during the loa...


In [24]:
train_base = pd.read_parquet(paths.parquet_files_dir.joinpath('train/train_base.parquet'))
test_base = pd.read_parquet(paths.parquet_files_dir.joinpath('test/test_base.parquet'))
display(train_base)
display(test_base)

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target
0,0,2019-01-03,201901,0,0
1,1,2019-01-03,201901,0,0
2,2,2019-01-04,201901,0,0
3,3,2019-01-03,201901,0,0
4,4,2019-01-04,201901,0,1
...,...,...,...,...,...
1526654,2703450,2020-10-05,202010,91,0
1526655,2703451,2020-10-05,202010,91,0
1526656,2703452,2020-10-05,202010,91,0
1526657,2703453,2020-10-05,202010,91,0


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM
0,57543,2020-10-06,202010,92
1,57549,2020-10-06,202010,92
2,57551,2020-10-06,202010,92
3,57552,2020-10-07,202010,92
4,57569,2020-10-06,202010,92
5,57630,2020-10-06,202010,92
6,57631,2020-10-06,202010,92
7,57632,2020-10-06,202010,92
8,57633,2020-10-06,202010,92
9,57634,2020-10-06,202010,92


In [25]:
train_depth_paths = get_depth_paths(paths.parquet_files_dir, 'train')
print(f'the number of train paths: {sum(len(v1) for v1 in train_depth_paths.values())}')

the number of train paths: 31


In [26]:
test_depth_paths = get_depth_paths(paths.parquet_files_dir, 'test')
print(f'the number of test paths: {sum(len(v1) for v1 in test_depth_paths.values())}')

the number of test paths: 35


In [27]:
def get_values(depth_paths: Dict[str, List[str]]) -> pd.DataFrame:
    unique_values = OrderedDict()
    num_nan_values = OrderedDict()
    unique_files = OrderedDict()
    for k, path_list in depth_paths.items():
        print(f'loading `{k}`')
        for path in path_list:
            print(f'\tloading {path.stem}')
            data = pd.read_parquet(path)
            for col in data.columns:
                if col == 'case_id':
                    continue
                value = data[col].unique()
                if col not in unique_values.keys():
                    unique_values[col] = list(value)
                    num_nan_values[col] = data[col].isna().sum()
                    unique_files[col] = [k]
                else:
                    unique_values[col] += list(value)
                    num_nan_values[col] += data[col].isna().sum()
                    unique_files[col] += [k]
                del value
                gc.collect()
            del data
            gc.collect()
            
    unique_values = [
        (
            k,
            len(set(v)),
            set(map(lambda x: type(x).__name__, list(set(v))))
        )
        for k, v in unique_values.items()
    ]
    unique_values = pd.DataFrame(
        unique_values,
        columns=['Variable', 'num_unique_values', 'unique_types']
    )
    
    num_nan_values = pd.DataFrame.from_dict(
        num_nan_values,
        orient='index',
    )
    num_nan_values.reset_index(inplace=True)
    num_nan_values.columns = ['Variable', 'num_nan']
    unique_files = [(k, list(set(v))) for k, v in unique_files.items()]
    unique_files = pd.DataFrame(
        unique_files,
        columns=['Variable', 'files']
    )
    
    data = unique_values.merge(num_nan_values, on='Variable')
    data = data.merge(unique_files, on='Variable')
    return data


def drop_nontype(x: set) -> list:
    x = list(x)
    if 'NoneType' in x:
        x.remove('NoneType')
    return x

In [28]:
%%time


train_depth_values = get_values(train_depth_paths)
train_depth_values['unique_types'] = train_depth_values['unique_types'].map(drop_nontype)
train_depth_values['num_unique_types'] = train_depth_values['unique_types'].map(len)
display(train_depth_values)

loading `static_0`
	loading train_static_0_0


KeyboardInterrupt: 

In [None]:
%%time


test_depth_values = get_values(test_depth_paths)
test_depth_values['unique_types'] = test_depth_values['unique_types'].map(drop_nontype)
test_depth_values['num_unique_types'] = test_depth_values['unique_types'].map(len)
display(test_depth_values)

loading `static_0`
	loading test_static_0_0
	loading test_static_0_1
	loading test_static_0_2
loading `static_cb_0`
	loading test_static_cb_0
loading `applprev_1`
	loading test_applprev_1_0
	loading test_applprev_1_1
	loading test_applprev_1_2
loading `other_1`
	loading test_other_1
loading `tax_registry_a_1`
	loading test_tax_registry_a_1
loading `tax_registry_b_1`
	loading test_tax_registry_b_1
loading `tax_registry_c_1`
	loading test_tax_registry_c_1
loading `credit_bureau_a_1`
	loading test_credit_bureau_a_1_0
	loading test_credit_bureau_a_1_1
	loading test_credit_bureau_a_1_2
	loading test_credit_bureau_a_1_3
	loading test_credit_bureau_a_1_4
loading `credit_bureau_b_1`
	loading test_credit_bureau_b_1
loading `deposit_1`
	loading test_deposit_1
loading `person_1`
	loading test_person_1
loading `debitcard_1`
	loading test_debitcard_1
loading `applprev_2`
	loading test_applprev_2
loading `person_2`
	loading test_person_2
loading `credit_bureau_a_2`
	loading test_credit_bureau_a_2_0


Unnamed: 0,Variable,num_unique_values,unique_types,num_nan,files,num_unique_types
0,actualdpdtolerance_344P,2,[float64],9,[static_0],1
1,amtinstpaidbefduel24m_4187115A,6,[float64],26,[static_0],1
2,annuity_780A,30,[float64],0,[static_0],1
3,annuitynextmonth_57A,3,[float64],0,[static_0],1
4,applicationcnt_361L,1,[float64],0,[static_0],1
...,...,...,...,...,...,...
461,subjectroles_name_541M,2,[str],0,[credit_bureau_a_2],1
462,subjectroles_name_838M,2,[str],0,[credit_bureau_a_2],1
463,pmts_date_1107D,9,[str],0,[credit_bureau_b_2],1
464,pmts_dpdvalue_108P,3,[float64],0,[credit_bureau_b_2],1


CPU times: user 31.4 s, sys: 84 ms, total: 31.4 s
Wall time: 31.4 s


In [None]:
print(set(feature_definition['Variable']) - set(train_depth_values['Variable']))
print(set(train_depth_values['Variable']) - set(feature_definition['Variable']))

print(set(feature_definition['Variable']) - set(test_depth_values['Variable']))
print(set(test_depth_values['Variable']) - set(feature_definition['Variable']))

{'score_940'}
{'num_group1', 'num_group2'}
{'score_940'}
{'num_group1', 'num_group2'}


In [None]:
train_depth_values.columns = [
    'Variable', 'train_num_unique_values', 'train_unique_types', 'train_num_nan',  'train_files', 'train_num_unique_types'
]
test_depth_values.columns = [
    'Variable', 'test_num_unique_values', 'test_unique_types', 'test_num_nan', 'test_files', 'test_num_unique_types'
]

feature_definition = feature_definition.merge(train_depth_values, on='Variable')
feature_definition = feature_definition.merge(test_depth_values, on='Variable')
display(feature_definition)

Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,actualdpd_943P,Days Past Due (DPD) of previous contract (actu...,191,[float64],2500,[applprev_1],1,1,[float64],0,[applprev_1],1
1,actualdpdtolerance_344P,DPD of client with tolerance.,172,[float64],418178,[static_0],1,2,[float64],9,[static_0],1
2,addres_district_368M,District of the person's address.,508,[str],0,[person_2],1,1,[str],0,[person_2],1
3,addres_role_871L,Role of person's address.,9,[str],1575736,[person_2],1,1,[],10,[person_2],0
4,addres_zip_823M,Zip code of the address.,2027,[str],0,[person_2],1,1,[str],0,[person_2],1
...,...,...,...,...,...,...,...,...,...,...,...,...
459,totinstallast1m_4525188A,Total amount of monthly instalments paid in th...,130317,[float64],1174211,[static_0],1,5,[float64],28,[static_0],1
460,twobodfilling_608L,Type of application process.,3,[str],91,[static_0],1,2,[str],0,[static_0],1
461,type_25L,Contact type of a person.,9,[str],6117,[person_1],1,3,[str],0,[person_1],1
462,typesuite_864L,Persons accompanying the client during the loa...,2,[str],1121505,[static_0],1,2,[str],11,[static_0],1


In [None]:
feature_definition = (
    feature_definition
    .sort_values('train_num_unique_values')
    .reset_index(drop=True)
)
display(feature_definition)
display(feature_definition['train_num_unique_types'].value_counts())
display(feature_definition['test_num_unique_types'].value_counts())

Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,deferredmnthsnum_166L,Number of deferred months.,1,[float64],0,[static_0],1,3,[float64],20,[static_0],1
1,paytype_783L,Type of payment.,2,[str],58575,[static_0],1,2,[str],11,[static_0],1
2,remitter_829L,Flag indicating whether the client is a remitter.,2,[bool],2168942,[person_1],1,2,[bool],7,[person_1],1
3,formonth_118L,Number of rejections in a month.,2,[float64],1463962,[static_cb_0],1,1,[float64],10,[static_cb_0],1
4,role_993L,Person's role.,2,[str],2949075,[person_1],1,1,[],10,[person_1],0
...,...,...,...,...,...,...,...,...,...,...,...,...
459,debtoutstand_525A,Outstanding amount of existing contract.,1053264,[float64],14554700,[credit_bureau_a_1],1,10,[float64],42,[credit_bureau_a_1],1
460,totaloutstanddebtvalue_39A,Total outstanding debt for active contracts in...,1081144,[float64],14635534,[credit_bureau_a_1],1,10,[float64],44,[credit_bureau_a_1],1
461,pmts_overdue_1140A,Overdue payment for an active contract (num_gr...,1115386,[float64],152850520,[credit_bureau_a_2],1,14,[float64],65,[credit_bureau_a_2],1
462,outstandingamount_362A,Active contract's outstanding amount.,1255021,[float64],14536309,[credit_bureau_a_1],1,7,[float64],48,[credit_bureau_a_1],1


train_num_unique_types
1    464
Name: count, dtype: int64

test_num_unique_types
1    432
0     32
Name: count, dtype: int64

In [None]:
display(feature_definition.query('train_num_unique_types==1 & test_num_unique_types==0'))

Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
4,role_993L,Person's role.,2,[str],2949075,[person_1],1,1,[],10,[person_1],0
15,equalitydataagreement_891L,Flag indicating sudden changes in client's soc...,3,[bool],1448632,[static_0],1,1,[],30,[static_0],0
17,equalityempfrom_62L,Flag indicating a sudden change in the client'...,3,[bool],1488847,[static_0],1,1,[],30,[static_0],0
26,isreference_387L,Flag indicating whether the person is a refere...,3,[bool],2949075,[person_1],1,1,[],10,[person_1],0
32,gender_992L,Gender of a person.,3,[str],2949075,[person_1],1,1,[],10,[person_1],0
50,requesttype_4525192L,Tax authority request type.,4,[str],827212,[static_cb_0],1,1,[],10,[static_cb_0],0
64,maritalst_703L,Marital status of the client.,6,[str],2962646,[person_1],1,1,[],10,[person_1],0
70,periodicityofpmts_997L,Frequency of instalments for active credit con...,6,[str],83764,[credit_bureau_b_1],1,1,[],10,[credit_bureau_b_1],0
73,housingtype_772L,Type of housing of the person.,7,[str],2964176,[person_1],1,1,[],10,[person_1],0
74,credacc_cards_status_52L,Card status of the previous credit account.,7,[str],13733404,[applprev_2],1,1,[],10,[applprev_2],0


In [None]:
def get_unique_type(x: list) -> str:
    if isinstance(x, list) and len(x) > 0:
        return x[0]


feature_definition['train_unique_types'] = (
    feature_definition['train_unique_types']
    .map(get_unique_type)
)
feature_definition['test_unique_types'] = (
    feature_definition['test_unique_types']
    .map(get_unique_type)
)

In [None]:
feature_definition.replace(
    {'train_unique_types': {'bool_': 'bool'}, 'test_unique_types': {'bool_': 'bool'}},
    inplace=True
)
display(feature_definition['train_unique_types'].value_counts())
display(feature_definition['test_unique_types'].value_counts())

fea_def_both_f64_types = (
    feature_definition
    .query('train_unique_types=="float64" & test_unique_types=="float64"')
    .reset_index(drop=True)
)
fea_def_both_str_types = (
    feature_definition
    .query('train_unique_types=="str" & test_unique_types=="str"')
    .reset_index(drop=True)
)
fea_def_both_bool_types = (
    feature_definition
    .query('train_unique_types=="bool" & test_unique_types=="bool"')
    .reset_index(drop=True)
)
fea_def_complex_types = (
    feature_definition
    .query('train_unique_types != test_unique_types')
    .reset_index(drop=True)
)
display(fea_def_both_f64_types)
display(fea_def_both_str_types)
display(fea_def_both_bool_types)
display(fea_def_complex_types)

train_unique_types
float64    295
str        156
bool        13
Name: count, dtype: int64

test_unique_types
float64    298
str        124
bool        10
Name: count, dtype: int64

Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,deferredmnthsnum_166L,Number of deferred months.,1,float64,0,[static_0],1,3,float64,20,[static_0],1
1,formonth_118L,Number of rejections in a month.,2,float64,1463962,[static_cb_0],1,1,float64,10,[static_cb_0],1
2,forweek_601L,Number of rejected applications in the last week.,2,float64,1463962,[static_cb_0],1,1,float64,10,[static_cb_0],1
3,forquarter_462L,Number of credit applications that were reject...,2,float64,1463962,[static_cb_0],1,1,float64,10,[static_cb_0],1
4,residualamount_1093A,Residual amount of closed guarantee contract.,3,float64,69666,[credit_bureau_b_1],1,2,float64,9,[credit_bureau_b_1],1
...,...,...,...,...,...,...,...,...,...,...,...,...
289,debtoutstand_525A,Outstanding amount of existing contract.,1053264,float64,14554700,[credit_bureau_a_1],1,10,float64,42,[credit_bureau_a_1],1
290,totaloutstanddebtvalue_39A,Total outstanding debt for active contracts in...,1081144,float64,14635534,[credit_bureau_a_1],1,10,float64,44,[credit_bureau_a_1],1
291,pmts_overdue_1140A,Overdue payment for an active contract (num_gr...,1115386,float64,152850520,[credit_bureau_a_2],1,14,float64,65,[credit_bureau_a_2],1
292,outstandingamount_362A,Active contract's outstanding amount.,1255021,float64,14536309,[credit_bureau_a_1],1,7,float64,48,[credit_bureau_a_1],1


Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,paytype_783L,Type of payment.,2,str,58575,[static_0],1,2,str,11,[static_0],1
1,typesuite_864L,Persons accompanying the client during the loa...,2,str,1121505,[static_0],1,2,str,11,[static_0],1
2,bankacctype_710L,Type of applicant's bank account.,2,str,1109629,[static_0],1,2,str,14,[static_0],1
3,paytype1st_925L,Type of first payment of the client.,2,str,58575,[static_0],1,2,str,11,[static_0],1
4,description_5085714M,Categorization of clients by credit bureau.,2,str,0,[static_cb_0],1,2,str,0,[static_cb_0],1
...,...,...,...,...,...,...,...,...,...,...,...,...
119,dateofcredend_289D,End date of an active credit contract.,10749,str,13281261,[credit_bureau_a_1],1,8,str,43,[credit_bureau_a_1],1
120,dateofcredend_353D,End date of a closed credit contract.,11060,str,9188054,[credit_bureau_a_1],1,12,str,39,[credit_bureau_a_1],1
121,profession_152M,Profession of the client during their previous...,11508,str,0,[applprev_1],1,1,str,0,[applprev_1],1
122,name_4917606M,Name of employer.,55857,str,0,[tax_registry_b_1],1,3,str,0,[tax_registry_b_1],1


Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,remitter_829L,Flag indicating whether the client is a remitter.,2,bool,2168942,[person_1],1,2,bool,7,[person_1],1
1,contaddr_matchlist_1032L,Indicates whether the contact address is found...,2,bool,1447773,[person_1],1,2,bool,7,[person_1],1
2,isbidproduct_1095L,Flag indicating if the product is a cross-sell.,2,bool,0,[static_0],1,2,bool,0,[static_0],1
3,isdebitcard_729L,Flag indicating if the product is a debit card.,2,bool,1334357,[static_0],1,2,bool,9,[static_0],1
4,contaddr_smempladdr_334L,Indicates whether the contact address is the s...,3,bool,1447773,[person_1],1,2,bool,7,[person_1],1
5,opencred_647L,Number of active loans from the previous appli...,3,bool,305137,[static_0],1,3,bool,8,[static_0],1
6,isdebitcard_527L,Previous application flag indicating if produc...,3,bool,6062966,[applprev_1],1,3,bool,23,[applprev_1],1
7,isbidproductrequest_292L,Flag indicating if the product is a cross-sell.,3,bool,1514201,[static_0],1,2,bool,29,[static_0],1
8,safeguarantyflag_411L,Flag indicating if client is using a flexible ...,3,bool,1447334,[person_1],1,2,bool,7,[person_1],1
9,isbidproduct_390L,Flag for determining if the product is a cross...,3,bool,66,[applprev_1],1,2,bool,0,[applprev_1],1


Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,role_993L,Person's role.,2,str,2949075,[person_1],1,1,,10,[person_1],0
1,equalitydataagreement_891L,Flag indicating sudden changes in client's soc...,3,bool,1448632,[static_0],1,1,,30,[static_0],0
2,equalityempfrom_62L,Flag indicating a sudden change in the client'...,3,bool,1488847,[static_0],1,1,,30,[static_0],0
3,isreference_387L,Flag indicating whether the person is a refere...,3,bool,2949075,[person_1],1,1,,10,[person_1],0
4,gender_992L,Gender of a person.,3,str,2949075,[person_1],1,1,,10,[person_1],0
5,requesttype_4525192L,Tax authority request type.,4,str,827212,[static_cb_0],1,1,,10,[static_cb_0],0
6,maritalst_703L,Marital status of the client.,6,str,2962646,[person_1],1,1,,10,[person_1],0
7,periodicityofpmts_997L,Frequency of instalments for active credit con...,6,str,83764,[credit_bureau_b_1],1,1,,10,[credit_bureau_b_1],0
8,housingtype_772L,Type of housing of the person.,7,str,2964176,[person_1],1,1,,10,[person_1],0
9,credacc_cards_status_52L,Card status of the previous credit account.,7,str,13733404,[applprev_2],1,1,,10,[applprev_2],0


In [None]:
fea_def_f64_types = pd.concat(
    [
        fea_def_both_f64_types,
        fea_def_complex_types.query('train_unique_types=="float64"')
    ],
    ignore_index=True
)

fea_def_str_types = pd.concat(
    [
        fea_def_both_str_types,
        fea_def_complex_types.query('train_unique_types=="str"')
    ],
    ignore_index=True
)

fea_def_bool_types = pd.concat(
    [
        fea_def_both_bool_types,
        fea_def_complex_types.query('train_unique_types=="bool"')
    ],
    ignore_index=True
)

display(fea_def_f64_types)
display(fea_def_str_types)
display(fea_def_bool_types)

Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,deferredmnthsnum_166L,Number of deferred months.,1,float64,0,[static_0],1,3,float64,20,[static_0],1
1,formonth_118L,Number of rejections in a month.,2,float64,1463962,[static_cb_0],1,1,float64,10,[static_cb_0],1
2,forweek_601L,Number of rejected applications in the last week.,2,float64,1463962,[static_cb_0],1,1,float64,10,[static_cb_0],1
3,forquarter_462L,Number of credit applications that were reject...,2,float64,1463962,[static_cb_0],1,1,float64,10,[static_cb_0],1
4,residualamount_1093A,Residual amount of closed guarantee contract.,3,float64,69666,[credit_bureau_b_1],1,2,float64,9,[credit_bureau_b_1],1
...,...,...,...,...,...,...,...,...,...,...,...,...
290,totaloutstanddebtvalue_39A,Total outstanding debt for active contracts in...,1081144,float64,14635534,[credit_bureau_a_1],1,10,float64,44,[credit_bureau_a_1],1
291,pmts_overdue_1140A,Overdue payment for an active contract (num_gr...,1115386,float64,152850520,[credit_bureau_a_2],1,14,float64,65,[credit_bureau_a_2],1
292,outstandingamount_362A,Active contract's outstanding amount.,1255021,float64,14536309,[credit_bureau_a_1],1,7,float64,48,[credit_bureau_a_1],1
293,pmts_overdue_1152A,Overdue payment for a closed contract (num_gro...,3260916,float64,113377640,[credit_bureau_a_2],1,19,float64,100,[credit_bureau_a_2],1


Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,paytype_783L,Type of payment.,2,str,58575,[static_0],1,2,str,11,[static_0],1
1,typesuite_864L,Persons accompanying the client during the loa...,2,str,1121505,[static_0],1,2,str,11,[static_0],1
2,bankacctype_710L,Type of applicant's bank account.,2,str,1109629,[static_0],1,2,str,14,[static_0],1
3,paytype1st_925L,Type of first payment of the client.,2,str,58575,[static_0],1,2,str,11,[static_0],1
4,description_5085714M,Categorization of clients by credit bureau.,2,str,0,[static_cb_0],1,2,str,0,[static_cb_0],1
...,...,...,...,...,...,...,...,...,...,...,...,...
151,datelastinstal40dpd_247D,Date of last instalment that was more than 40 ...,5176,str,1392841,[static_0],1,1,,30,[static_0],0
152,datelastunpaid_3546854D,Date of the last unpaid instalment.,5232,str,887659,[static_0],1,1,,30,[static_0],0
153,maxdpdinstldate_3546855D,Date of instalment on which client was most da...,5288,str,826000,[static_0],1,1,,30,[static_0],0
154,assignmentdate_238D,Tax authority data - date of assignment.,8888,str,1363480,[static_cb_0],1,1,float64,10,[static_cb_0],1


Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,remitter_829L,Flag indicating whether the client is a remitter.,2,bool,2168942,[person_1],1,2,bool,7,[person_1],1
1,contaddr_matchlist_1032L,Indicates whether the contact address is found...,2,bool,1447773,[person_1],1,2,bool,7,[person_1],1
2,isbidproduct_1095L,Flag indicating if the product is a cross-sell.,2,bool,0,[static_0],1,2,bool,0,[static_0],1
3,isdebitcard_729L,Flag indicating if the product is a debit card.,2,bool,1334357,[static_0],1,2,bool,9,[static_0],1
4,contaddr_smempladdr_334L,Indicates whether the contact address is the s...,3,bool,1447773,[person_1],1,2,bool,7,[person_1],1
5,opencred_647L,Number of active loans from the previous appli...,3,bool,305137,[static_0],1,3,bool,8,[static_0],1
6,isdebitcard_527L,Previous application flag indicating if produc...,3,bool,6062966,[applprev_1],1,3,bool,23,[applprev_1],1
7,isbidproductrequest_292L,Flag indicating if the product is a cross-sell.,3,bool,1514201,[static_0],1,2,bool,29,[static_0],1
8,safeguarantyflag_411L,Flag indicating if client is using a flexible ...,3,bool,1447334,[person_1],1,2,bool,7,[person_1],1
9,isbidproduct_390L,Flag for determining if the product is a cross...,3,bool,66,[applprev_1],1,2,bool,0,[applprev_1],1


In [None]:
fea_def_f64_types['transform'] = fea_def_f64_types['Variable'].map(lambda x: x[-1])
fea_def_str_types['transform'] = fea_def_str_types['Variable'].map(lambda x: x[-1])
fea_def_bool_types['transform'] = fea_def_bool_types['Variable'].map(lambda x: x[-1])

display(fea_def_both_f64_types)
display(fea_def_both_str_types)
display(fea_def_both_bool_types)

display(fea_def_f64_types['transform'].value_counts())
display(fea_def_str_types['transform'].value_counts())
display(fea_def_bool_types['transform'].value_counts())

Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,deferredmnthsnum_166L,Number of deferred months.,1,float64,0,[static_0],1,3,float64,20,[static_0],1
1,formonth_118L,Number of rejections in a month.,2,float64,1463962,[static_cb_0],1,1,float64,10,[static_cb_0],1
2,forweek_601L,Number of rejected applications in the last week.,2,float64,1463962,[static_cb_0],1,1,float64,10,[static_cb_0],1
3,forquarter_462L,Number of credit applications that were reject...,2,float64,1463962,[static_cb_0],1,1,float64,10,[static_cb_0],1
4,residualamount_1093A,Residual amount of closed guarantee contract.,3,float64,69666,[credit_bureau_b_1],1,2,float64,9,[credit_bureau_b_1],1
...,...,...,...,...,...,...,...,...,...,...,...,...
289,debtoutstand_525A,Outstanding amount of existing contract.,1053264,float64,14554700,[credit_bureau_a_1],1,10,float64,42,[credit_bureau_a_1],1
290,totaloutstanddebtvalue_39A,Total outstanding debt for active contracts in...,1081144,float64,14635534,[credit_bureau_a_1],1,10,float64,44,[credit_bureau_a_1],1
291,pmts_overdue_1140A,Overdue payment for an active contract (num_gr...,1115386,float64,152850520,[credit_bureau_a_2],1,14,float64,65,[credit_bureau_a_2],1
292,outstandingamount_362A,Active contract's outstanding amount.,1255021,float64,14536309,[credit_bureau_a_1],1,7,float64,48,[credit_bureau_a_1],1


Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,paytype_783L,Type of payment.,2,str,58575,[static_0],1,2,str,11,[static_0],1
1,typesuite_864L,Persons accompanying the client during the loa...,2,str,1121505,[static_0],1,2,str,11,[static_0],1
2,bankacctype_710L,Type of applicant's bank account.,2,str,1109629,[static_0],1,2,str,14,[static_0],1
3,paytype1st_925L,Type of first payment of the client.,2,str,58575,[static_0],1,2,str,11,[static_0],1
4,description_5085714M,Categorization of clients by credit bureau.,2,str,0,[static_cb_0],1,2,str,0,[static_cb_0],1
...,...,...,...,...,...,...,...,...,...,...,...,...
119,dateofcredend_289D,End date of an active credit contract.,10749,str,13281261,[credit_bureau_a_1],1,8,str,43,[credit_bureau_a_1],1
120,dateofcredend_353D,End date of a closed credit contract.,11060,str,9188054,[credit_bureau_a_1],1,12,str,39,[credit_bureau_a_1],1
121,profession_152M,Profession of the client during their previous...,11508,str,0,[applprev_1],1,1,str,0,[applprev_1],1
122,name_4917606M,Name of employer.,55857,str,0,[tax_registry_b_1],1,3,str,0,[tax_registry_b_1],1


Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types
0,remitter_829L,Flag indicating whether the client is a remitter.,2,bool,2168942,[person_1],1,2,bool,7,[person_1],1
1,contaddr_matchlist_1032L,Indicates whether the contact address is found...,2,bool,1447773,[person_1],1,2,bool,7,[person_1],1
2,isbidproduct_1095L,Flag indicating if the product is a cross-sell.,2,bool,0,[static_0],1,2,bool,0,[static_0],1
3,isdebitcard_729L,Flag indicating if the product is a debit card.,2,bool,1334357,[static_0],1,2,bool,9,[static_0],1
4,contaddr_smempladdr_334L,Indicates whether the contact address is the s...,3,bool,1447773,[person_1],1,2,bool,7,[person_1],1
5,opencred_647L,Number of active loans from the previous appli...,3,bool,305137,[static_0],1,3,bool,8,[static_0],1
6,isdebitcard_527L,Previous application flag indicating if produc...,3,bool,6062966,[applprev_1],1,3,bool,23,[applprev_1],1
7,isbidproductrequest_292L,Flag indicating if the product is a cross-sell.,3,bool,1514201,[static_0],1,2,bool,29,[static_0],1
8,safeguarantyflag_411L,Flag indicating if client is using a flexible ...,3,bool,1447334,[person_1],1,2,bool,7,[person_1],1
9,isbidproduct_390L,Flag for determining if the product is a cross...,3,bool,66,[applprev_1],1,2,bool,0,[applprev_1],1


transform
L    143
A    102
P     33
T     17
Name: count, dtype: int64

transform
M    63
D    57
L    31
T     5
Name: count, dtype: int64

transform
L    13
Name: count, dtype: int64

In [None]:
fea_def_data_types = fea_def_str_types.query('transform=="D"')
display(fea_def_data_types)

Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types,transform
63,pmts_date_1107D,Payment date for an active contract according ...,58,str,0,[credit_bureau_b_2],1,9,str,0,[credit_bureau_b_2],1,D
68,responsedate_4917613D,Tax authority's response date.,208,str,1275564,[static_cb_0],1,2,str,0,[static_cb_0],1,D
73,deductiondate_4917603D,Tax deduction date.,260,str,0,[tax_registry_b_1],1,9,str,0,[tax_registry_b_1],1,D
77,recorddate_4527225D,Date of tax deduction record.,397,str,0,[tax_registry_a_1],1,1,str,0,[tax_registry_a_1],1,D
79,lastupdate_260D,Last update date for the active contracts.,612,str,3892,[credit_bureau_b_1],1,7,str,0,[credit_bureau_b_1],1,D
80,refreshdate_3813885D,Date when the credit bureau's public sources h...,644,str,4855840,[credit_bureau_a_1],1,28,str,6,[credit_bureau_a_1],1,D
81,birth_259D,Date of birth of the person.,681,str,1447332,[person_1],1,4,str,7,[person_1],1,D
82,lastupdate_1112D,Date of last update for an active contract fro...,713,str,13281261,[credit_bureau_a_1],1,7,str,43,[credit_bureau_a_1],1,D
83,dateofbirth_337D,Client's date of birth.,725,str,114785,[static_cb_0],1,10,str,1,[static_cb_0],1,D
87,contractenddate_991D,End date of deposit contract.,1525,str,79682,[deposit_1],1,4,str,7,[deposit_1],1,D


In [None]:
fea_def_f64_types.to_csv(
    paths.output_dir.joinpath('float64_features.csv'),
    index=False
)
fea_def_str_types.query('transform!="D"').to_csv(
    paths.output_dir.joinpath('string_features.csv'),
    index=False
)
fea_def_str_types.query('transform=="D"').to_csv(
    paths.output_dir.joinpath('date_features.csv'),
    index=False
)
fea_def_bool_types.to_csv(
    paths.output_dir.joinpath('bool_features.csv'),
    index=False
)