In [1]:
from copy import deepcopy
import datetime
import gc
import os
import pathlib
import random
import sys
sys.path.append('..')
from typing import Any, Dict, List, Tuple, Union

from joblib import load
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import OrdinalEncoder
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

from scripts.dae import DAE
from scripts.get_depth_paths import get_depth_paths
from scripts.get_logger import get_logger
from scripts.merge_dataset import merge_dataset

gc.enable()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class PathHandler:
    competition_dir = pathlib.Path('../../inputs')
    parquet_files_dir = competition_dir.joinpath('parquet_files')
    feature_dir = pathlib.Path('../../outputs/features')
    output_dir = sorted(pathlib.Path('../../outputs/output_dae/').glob('model_outputs*'))[-1]
    if not output_dir.is_dir():
        output_dir.mkdir(parents=True)


paths = PathHandler()
print(paths.output_dir)

../../outputs/output_dae/model_outputs_2024-05-06-09-11


- P - Transform DPD (Days past due)
- M - Masking categories
- A - Transform amount
- D - Transform date
- T - Unspecified Transform
- L - Unspecified Transform

In [3]:
bool_features = pd.read_csv(paths.feature_dir.joinpath('bool_features.csv'))
float64_features = pd.read_csv(paths.feature_dir.joinpath('float64_features.csv'))
string_features = pd.read_csv(paths.feature_dir.joinpath('string_features.csv'))
date_features = pd.read_csv(paths.feature_dir.joinpath('date_features.csv'))

bool_features['cast_dtype'] = pl.Int8
float64_features['cast_dtype'] = pl.Float32
string_features['cast_dtype'] = pl.String
date_features['cast_dtype'] = pl.Date

cast_features = pd.concat([bool_features, float64_features, string_features])
display(cast_features)

Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types,transform,cast_dtype
0,remitter_829L,Flag indicating whether the client is a remitter.,2,bool,2168942,['person_1'],1,2,bool,7,['person_1'],1,L,Int8
1,contaddr_matchlist_1032L,Indicates whether the contact address is found...,2,bool,1447773,['person_1'],1,2,bool,7,['person_1'],1,L,Int8
2,isbidproduct_1095L,Flag indicating if the product is a cross-sell.,2,bool,0,['static_0'],1,2,bool,0,['static_0'],1,L,Int8
3,isdebitcard_729L,Flag indicating if the product is a debit card.,2,bool,1334357,['static_0'],1,2,bool,9,['static_0'],1,L,Int8
4,contaddr_smempladdr_334L,Indicates whether the contact address is the s...,3,bool,1447773,['person_1'],1,2,bool,7,['person_1'],1,L,Int8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,addres_role_871L,Role of person's address.,9,str,1575736,['person_2'],1,1,,10,['person_2'],0,L,String
95,cacccardblochreas_147M,Card blocking reason.,10,str,109249,['applprev_2'],1,1,,10,['applprev_2'],0,M,String
96,relatedpersons_role_762T,Relationship type of a client's related person...,11,str,1614684,['person_2'],1,1,,10,['person_2'],0,T,String
97,riskassesment_302T,Estimated probability that the client will def...,17,str,1446917,['static_cb_0'],1,1,,10,['static_cb_0'],0,T,String


In [4]:
display(string_features.query('transform=="M"'))

Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types,transform,cast_dtype
4,description_5085714M,Categorization of clients by credit bureau.,2,str,0,['static_cb_0'],1,2,str,0,['static_cb_0'],1,M,String
5,language1_981M,The primary language of the person.,3,str,0,['person_1'],1,3,str,0,['person_1'],1,M,String
16,collater_typofvalofguarant_298M,Collateral valuation type (active contract).,5,str,0,['credit_bureau_a_2'],1,3,str,0,['credit_bureau_a_2'],1,M,String
17,education_88M,Education level of the client.,5,str,0,['static_cb_0'],1,1,str,0,['static_cb_0'],1,M,String
18,education_1103M,Level of education of the client provided by e...,5,str,0,['static_cb_0'],1,2,str,0,['static_cb_0'],1,M,String
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,profession_152M,Profession of the client during their previous...,11508,str,0,['applprev_1'],1,1,str,0,['applprev_1'],1,M,String
84,name_4917606M,Name of employer.,55857,str,0,['tax_registry_b_1'],1,3,str,0,['tax_registry_b_1'],1,M,String
85,name_4527232M,Name of employer.,147037,str,0,['tax_registry_a_1'],1,2,str,0,['tax_registry_a_1'],1,M,String
95,cacccardblochreas_147M,Card blocking reason.,10,str,109249,['applprev_2'],1,1,,10,['applprev_2'],0,M,String


### load feature definition, out-of-fold and feature importances.

In [5]:
feature_definition = pd.read_csv(paths.competition_dir.joinpath('feature_definitions.csv'))
oof = pd.read_csv(paths.output_dir.joinpath('oof.csv'))

display(feature_definition)
display(oof)

Unnamed: 0,Variable,Description
0,actualdpd_943P,Days Past Due (DPD) of previous contract (actu...
1,actualdpdtolerance_344P,DPD of client with tolerance.
2,addres_district_368M,District of the person's address.
3,addres_role_871L,Role of person's address.
4,addres_zip_823M,Zip code of the address.
...,...,...
460,totinstallast1m_4525188A,Total amount of monthly instalments paid in th...
461,twobodfilling_608L,Type of application process.
462,type_25L,Contact type of a person.
463,typesuite_864L,Persons accompanying the client during the loa...


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,loss1,loss2,cossim,fold
0,0,2019-01-03,201901,0,0,0.015749,4.842071,0.912992,0.0
1,1,2019-01-03,201901,0,0,0.086462,5.322999,0.703418,0.0
2,2,2019-01-04,201901,0,0,0.056739,3.662865,0.762971,0.0
3,3,2019-01-03,201901,0,0,0.172068,1.928418,0.420912,0.0
4,5,2019-01-02,201901,0,0,0.046806,4.076612,0.642091,0.0
...,...,...,...,...,...,...,...,...,...
1478660,2703450,2020-10-05,202010,91,0,0.174230,0.819802,0.852726,3.0
1478661,2703451,2020-10-05,202010,91,0,0.180311,0.762218,0.816764,3.0
1478662,2703452,2020-10-05,202010,91,0,0.097672,0.906576,0.802836,3.0
1478663,2703453,2020-10-05,202010,91,0,0.507093,0.909053,0.676809,3.0


### merge dataset

In [6]:
train_base_data = pl.read_parquet(
    paths.parquet_files_dir.joinpath('train/train_base.parquet')
)
train_base_data = train_base_data.cast(
    {
        'case_id': pl.Int64,
        'date_decision': pl.String,
        'MONTH': pl.Int64,
        'WEEK_NUM': pl.Int64,
        'target': pl.Int64,
    }
)
display(train_base_data)

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1
…,…,…,…,…
2703450,"""2020-10-05""",202010,91,0
2703451,"""2020-10-05""",202010,91,0
2703452,"""2020-10-05""",202010,91,0
2703453,"""2020-10-05""",202010,91,0


In [7]:
train_depth_paths = get_depth_paths(paths.parquet_files_dir, 'train')
print(f'number of test paths: {sum(len(v1) for v1 in train_depth_paths.values())}')

number of test paths: 31


In [8]:
%%time


depth_data = merge_dataset(
    train_base_data,
    train_depth_paths,
    bool_features,
    float64_features,
    string_features,
    date_features,
    None,
    '012'
)
display(depth_data)
display(depth_data.dtypes.value_counts())

loading `static_0`
	(1003757, 168)
	(522902, 168)
loading `static_cb_0`
	(1500476, 53)
loading `applprev_1`
	(782997, 92)
	(438525, 92)
loading `other_1`
	(51109, 21)
loading `tax_registry_a_1`
	(457934, 6)
loading `tax_registry_b_1`
	(150732, 7)
loading `tax_registry_c_1`
	(482265, 6)
loading `credit_bureau_a_1`
	(335275, 240)
	(549263, 240)
	(325127, 240)
	(176608, 240)
loading `credit_bureau_b_1`
	(36500, 134)
loading `deposit_1`
	(105111, 7)
loading `person_1`
	(1526659, 46)
loading `debitcard_1`
	(111772, 14)
loading `applprev_2`
	(1221522, 4)
loading `person_2`
	(1435105, 8)
loading `credit_bureau_a_2`
	(98303, 127)
	(118481, 127)
	(23734, 127)
	(156749, 127)
	(190486, 127)
	(190313, 127)
	(231250, 127)
	(150426, 127)
	(45056, 127)
	(77457, 127)
	(103033, 127)
loading `credit_bureau_b_2`
	(36447, 26)


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,day_first_contractmaturitydate_151D_1,day_first_lastupdate_260D_1,day_first_contractenddate_991D_1,day_first_openingdate_313D_1,day_first_birth_259D_1,day_first_birthdate_87D_1,day_first_empl_employedfrom_271D_1,day_first_openingdate_857D_1,day_first_first_empls_employedfrom_796D_2,day_first_first_pmts_date_1107D_2
0,0,2019-01-03,201901,0,0,,,1917.599976,0.000000,0.0,...,,,,,1,,15.0,,,
1,1,2019-01-03,201901,0,0,,,3134.000000,0.000000,0.0,...,,,,,1,,29.0,,,
2,2,2019-01-04,201901,0,0,,,4937.000000,0.000000,0.0,...,,,,,1,,15.0,,,
3,3,2019-01-03,201901,0,0,,,4643.600098,0.000000,0.0,...,,,,,1,,15.0,,,
4,4,2019-01-04,201901,0,1,,,3390.199951,0.000000,0.0,...,,,,,1,,15.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,2020-10-05,202010,91,0,0.0,176561.359375,3675.400146,0.000000,0.0,...,,,,,1,,,,,
1526655,2703451,2020-10-05,202010,91,0,0.0,301276.468750,7088.600098,6191.600098,0.0,...,,,,,1,,,,,
1526656,2703452,2020-10-05,202010,91,0,0.0,14232.400391,7788.800293,0.000000,0.0,...,,,,,1,,,,,
1526657,2703453,2020-10-05,202010,91,0,0.0,197371.578125,1195.400024,2827.199951,0.0,...,,,28.0,29.0,1,,,29.0,,


float32    784
float64    168
object      97
int8         6
int64        4
int32        3
Name: count, dtype: int64

CPU times: user 17min 3s, sys: 38.1 s, total: 17min 41s
Wall time: 1min 9s


### encode object columns

In [9]:
def encode_objects(depth_data: pd.DataFrame, paths: object) -> pd.DataFrame:
    object_columns = depth_data.dtypes.index[depth_data.dtypes==object].to_list()
    if 'date_decision' in object_columns:
        object_columns.remove('date_decision')
    
    object_data = []
    for col in object_columns:
        encoder = load(paths.output_dir.joinpath(f'encoders/encoder_{col}.joblib'))
        object_data.append(encoder.transform(depth_data[col].values.reshape(-1, 1)).astype(np.float32))
    depth_data.drop(columns=object_columns, inplace=True)
    object_data = np.concatenate(object_data, axis=1)
    object_data = pd.DataFrame(object_data, columns=object_columns)
    
    return pd.concat([depth_data, object_data], axis=1)
    
display(depth_data)
depth_data = encode_objects(depth_data, paths)
display(depth_data)

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,day_first_contractmaturitydate_151D_1,day_first_lastupdate_260D_1,day_first_contractenddate_991D_1,day_first_openingdate_313D_1,day_first_birth_259D_1,day_first_birthdate_87D_1,day_first_empl_employedfrom_271D_1,day_first_openingdate_857D_1,day_first_first_empls_employedfrom_796D_2,day_first_first_pmts_date_1107D_2
0,0,2019-01-03,201901,0,0,,,1917.599976,0.000000,0.0,...,,,,,1,,15.0,,,
1,1,2019-01-03,201901,0,0,,,3134.000000,0.000000,0.0,...,,,,,1,,29.0,,,
2,2,2019-01-04,201901,0,0,,,4937.000000,0.000000,0.0,...,,,,,1,,15.0,,,
3,3,2019-01-03,201901,0,0,,,4643.600098,0.000000,0.0,...,,,,,1,,15.0,,,
4,4,2019-01-04,201901,0,1,,,3390.199951,0.000000,0.0,...,,,,,1,,15.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,2020-10-05,202010,91,0,0.0,176561.359375,3675.400146,0.000000,0.0,...,,,,,1,,,,,
1526655,2703451,2020-10-05,202010,91,0,0.0,301276.468750,7088.600098,6191.600098,0.0,...,,,,,1,,,,,
1526656,2703452,2020-10-05,202010,91,0,0.0,14232.400391,7788.800293,0.000000,0.0,...,,,,,1,,,,,
1526657,2703453,2020-10-05,202010,91,0,0.0,197371.578125,1195.400024,2827.199951,0.0,...,,,28.0,29.0,1,,,29.0,,


FileNotFoundError: [Errno 2] No such file or directory: '../../outputs/output_dae/model_outputs_2024-05-06-09-11/encoders/encoder_first_name_4917606M_1.joblib'

In [None]:
%%time


depth_data, categorical_features = encode_objects(
    depth_data,
    paths.output_dir.joinpath('encoders')
)
display(depth_data)

categorical_features += [col for col in depth_data.columns if col.startswith('year_') or col.startswith('month_') or col.startswith('day_')]
print(len(categorical_features))