In [1]:
from collections import OrderedDict
import gc
import pathlib
from pprint import pprint
import random
from typing import Dict, List, Tuple, Union

from joblib import load
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder


# Make sure the cyclic garbage collector is switched on; the helper functions
# in this notebook call gc.collect() explicitly to release large intermediates.
gc.enable()

In [2]:
# Kaggle input locations used throughout the notebook.
# Competition data: the test parquet files are read from `parquet_files/test` under this directory.
competition_dir = pathlib.Path('/kaggle/input/home-credit-credit-risk-model-stability')
# Per-feature dtype tables (bool_features.csv, float64_features.csv, string_features.csv).
feature_dtype_dir = pathlib.Path('/kaggle/input/0-home-credit-eda')
# Saved artifacts: encoder_<column>.joblib files, lgb_fold<k>.txt models and training_features.npy.
model_dir = pathlib.Path('/kaggle/input/2-home-credit-train')

In [3]:
# Sanity check: list the datasets mounted under ../input.
!ls ../input

0-home-credit-eda  2-home-credit-train	home-credit-credit-risk-model-stability


In [4]:
# Per-feature metadata tables; their `Variable` column holds the feature (column) names
# that merge_dataset looks up when casting.
bool_features = pd.read_csv(feature_dtype_dir.joinpath('bool_features.csv'))
float64_features = pd.read_csv(feature_dtype_dir.joinpath('float64_features.csv'))
string_features = pd.read_csv(feature_dtype_dir.joinpath('string_features.csv'))

# Attach the polars dtype each group of features is cast to.
bool_features['cast_dtype'] = pl.Boolean
# Note: features listed in the float64 table are cast to Float32, not Float64.
float64_features['cast_dtype'] = pl.Float32
string_features['cast_dtype'] = pl.String

Unnamed: 0,Variable,Description,train_num_unique_values,train_unique_types,train_num_nan,train_files,train_num_unique_types,test_num_unique_values,test_unique_types,test_num_nan,test_files,test_num_unique_types,cast_dtype
0,remitter_829L,Flag indicating whether the client is a remitter.,2,bool,2168942,['train_person_1'],1,2,bool,6,['test_person_1'],1,Boolean
1,contaddr_matchlist_1032L,Indicates whether the contact address is found...,2,bool,1447773,['train_person_1'],1,2,bool,5,['test_person_1'],1,Boolean
2,isbidproduct_1095L,Flag indicating if the product is a cross-sell.,2,bool,0,['train_static_0'],1,2,bool,0,['train_static_0'],1,Boolean
3,isdebitcard_729L,Flag indicating if the product is a debit card.,2,bool,1334357,['train_static_0'],1,2,bool,2,['train_static_0'],1,Boolean
4,contaddr_smempladdr_334L,Indicates whether the contact address is the s...,3,bool,1447773,['train_person_1'],1,2,bool,5,['test_person_1'],1,Boolean
...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,dateofbirth_342D,Client's date of birth.,665,str,1463976,['train_static_cb_0'],1,1,,10,['test_static_cb_0'],0,String
152,birthdate_574D,Client's date of birth (credit bureau data).,668,str,892605,['train_static_cb_0'],1,1,float64,10,['test_static_cb_0'],1,String
153,empls_employedfrom_796D,"Start of employment (num_group1 - person, num_...",802,str,1637653,['train_person_2'],1,1,,10,['test_person_2'],0,String
154,assignmentdate_238D,Tax authority data - date of assignment.,8888,str,1363480,['train_static_cb_0'],1,1,float64,10,['test_static_cb_0'],1,String


In [5]:
def get_depth_paths(load_dir: pathlib.Path, prefix: str):
    """Collect the parquet files of every known table, grouped by table name.

    Parameters
    ----------
    load_dir:
        Directory that contains the `parquet_files/<prefix>` folder.
    prefix:
        Either 'test' or 'train'; selects both the sub-folder and the
        file-name prefix.

    Returns
    -------
    OrderedDict mapping each table name (e.g. 'static_0', 'person_1') to the
    sorted list of matching `<prefix>_<table>*.parquet` paths. Tables without
    any matching file map to an empty list. The key order is fixed and the
    last character of each key is the table depth ('0', '1' or '2').
    """
    assert prefix in ['test', 'train']

    # Fixed table order: depth-0 tables first, then depth 1, then depth 2.
    table_names = (
        'static_0',
        'static_cb_0',
        'applprev_1',
        'other_1',
        'tax_registry_a_1',
        'tax_registry_b_1',
        'tax_registry_c_1',
        'credit_bureau_a_1',
        'credit_bureau_b_1',
        'deposit_1',
        'person_1',
        'debitcard_1',
        'applprev_2',
        'person_2',
        'credit_bureau_a_2',
        'credit_bureau_b_2',
    )
    parquet_dir = load_dir.joinpath(f'parquet_files/{prefix}')

    depth_paths = OrderedDict()
    for table in table_names:
        # A table may be split across several files (suffix _0, _1, ...).
        depth_paths[table] = sorted(parquet_dir.glob(f'{prefix}_{table}*.parquet'))
    return depth_paths

In [6]:
# Locate every test parquet file, grouped by table name, and report the total file count.
test_depth_paths = get_depth_paths(competition_dir, 'test')
print(f'the number of test paths: {sum(len(v1) for v1 in test_depth_paths.values())}')

the number of test paths: 35


In [7]:
def process_D_features(data: pl.DataFrame, D_columns: List[str]):
    """Replace each date-string column with integer year/month/day columns.

    Every column named in ``D_columns`` is expected to hold 'YYYY-MM-DD'
    strings. Nulls are first replaced by '0000-00-00', so a missing date ends
    up as 0 in all three parts. The original column is dropped and the new
    columns are named ``year_<col>``, ``month_<col>`` and ``day_<col>``.

    Parameters
    ----------
    data:
        Frame containing the date columns.
    D_columns:
        Names of the date columns to split.

    Returns
    -------
    The frame with every listed column replaced by its three parts.
    """
    for date_col in D_columns:
        part_names = ['year_' + date_col, 'month_' + date_col, 'day_' + date_col]
        # Split on '-' into a three-field struct, rename the fields, cast to Int16.
        date_parts = (
            pl.col(date_col)
            .fill_null('0000-00-00')
            .str.split_exact('-', n=2)
            .struct.rename_fields(part_names)
            .cast(pl.Int16)
            .alias('fields')
        )
        # 'fields' is a temporary struct column; unnest expands it into the
        # three part columns.
        data = data.with_columns(date_parts).drop(date_col).unnest('fields')
    return data

In [8]:
def _cast_feature_columns(
        base_data: pl.DataFrame,
        features: pd.DataFrame,
        fill_nulls_with_nan: bool = False,
    ) -> pl.DataFrame:
    """Cast the columns listed in one feature table to their target dtype.

    ``features`` must have a `Variable` column (column names) and a
    `cast_dtype` column (polars dtypes). Names that are not present in
    ``base_data`` are ignored. When ``fill_nulls_with_nan`` is true, nulls are
    replaced with NaN before the cast.
    """
    present = set(base_data.columns)
    for col, cast_dtype in zip(features['Variable'], features['cast_dtype']):
        if col not in present:
            continue
        expr = pl.col(col)
        if fill_nulls_with_nan:
            expr = expr.fill_null(np.nan)
        base_data = base_data.with_columns(expr.cast(cast_dtype))
    return base_data


def merge_dataset(
        base_data: pl.DataFrame,
        depth_paths: Dict[str, List[pathlib.Path]],
        bool_features: pd.DataFrame,
        float64_features: pd.DataFrame,
        string_features: pd.DataFrame,
        depth: str,
    ) -> pd.DataFrame:
    """Join the selected depth tables onto the base table and normalise dtypes.

    Parameters
    ----------
    base_data:
        Base table with a `case_id` column; every other table is left-joined
        onto it.
    depth_paths:
        Mapping of table name to its parquet files. The last character of
        each table name is taken as the table depth.
    bool_features, float64_features, string_features:
        Feature tables with `Variable` and `cast_dtype` columns describing
        which columns to cast and to which polars dtype.
    depth:
        '0', '1' or '2' to load only tables of that depth, or '012' for all.

    Returns
    -------
    pandas DataFrame holding the base columns plus all joined features. The
    columns whose name ends in 'D' are replaced by year/month/day parts.

    Notes
    -----
    Depth-1 and depth-2 tables are reduced to one row per `case_id` by taking
    the column-wise maximum after dropping the group index columns. Tables
    with no parquet files are skipped with a message.
    """
    assert depth in ['0', '1', '2', '012']

    for i, (k, path_list) in enumerate(depth_paths.items()):

        if depth != '012' and depth != k[-1]:
            continue

        # pl.concat cannot combine an empty list, so a table without files is
        # skipped instead of aborting the whole merge.
        if not path_list:
            print(f'skipping `{k}`: no parquet files found')
            continue

        print(f'loading `{k}`')
        depth_data = []
        for p in path_list:
            sub_data = pl.read_parquet(p).cast({'case_id': pl.Int64})
            if k[-1] == '1':
                sub_data = sub_data.drop('num_group1').group_by('case_id').max().sort('case_id')
            elif k[-1] == '2':
                sub_data = sub_data.drop(['num_group1', 'num_group2']).group_by('case_id').max().sort('case_id')

            depth_data.append(sub_data)

            print(f'\t{p.stem}: {sub_data.shape}')

            del sub_data
            gc.collect()

        depth_data = pl.concat(depth_data, how='vertical_relaxed')
        # The suffix keeps clashing column names apart; it is derived from the
        # position of the table in `depth_paths`.
        base_data = base_data.join(depth_data, how='left', on='case_id', suffix=f'_{i}')

        del depth_data
        gc.collect()

    # Cast dtypes.
    # NOTE(review): boolean columns get their nulls replaced by NaN before the
    # cast to Boolean. Confirm what value those rows end up with; the step is
    # kept unchanged so the features presumably match what the models were
    # trained on.
    base_data = _cast_feature_columns(base_data, bool_features, fill_nulls_with_nan=True)
    base_data = _cast_feature_columns(base_data, float64_features)
    base_data = _cast_feature_columns(base_data, string_features)

    # Process D (date) features.
    D_features = [col for col in base_data.columns if col.endswith('D')]
    base_data = process_D_features(base_data, D_features)
    base_data = base_data.to_pandas()
    return base_data

In [9]:
# Load the base test table; every other table is joined onto it via `case_id`.
test_base_data = pl.read_parquet(
    competition_dir.joinpath('parquet_files/test/test_base.parquet')
)
# Pin the dtypes of the base columns (ids and week/month as Int64, decision date as String).
test_base_data = test_base_data.cast(
    {
        'case_id': pl.Int64,
        'date_decision': pl.String,
        'MONTH': pl.Int64,
        'WEEK_NUM': pl.Int64,
    }
)
display(test_base_data)

case_id,date_decision,MONTH,WEEK_NUM
i64,str,i64,i64
57543,"""2021-05-14""",202201,100
57549,"""2022-01-17""",202201,100
57551,"""2020-11-27""",202201,100
57552,"""2020-11-27""",202201,100
57569,"""2021-12-20""",202201,100
57630,"""2021-03-16""",202201,100
57631,"""2022-06-04""",202201,100
57632,"""2022-02-05""",202201,100
57633,"""2022-01-25""",202201,100
57634,"""2021-01-27""",202201,100


In [10]:
%%time


# Join all test tables onto the base table ('012' selects every depth);
# the result is a pandas DataFrame.
depth_data = merge_dataset(
    test_base_data,
    test_depth_paths,
    bool_features,
    float64_features,
    string_features,
    '012'
)
display(depth_data)

loading `static_0`
	test_static_0_0: (10, 168)
	test_static_0_1: (10, 168)
	test_static_0_2: (10, 168)
loading `static_cb_0`
	test_static_cb_0: (10, 53)
loading `applprev_1`
	test_applprev_1_0: (1, 40)
	test_applprev_1_1: (1, 40)
	test_applprev_1_2: (2, 40)
loading `other_1`
	test_other_1: (9, 6)
loading `tax_registry_a_1`
	test_tax_registry_a_1: (2, 4)
loading `tax_registry_b_1`
	test_tax_registry_b_1: (2, 4)
loading `tax_registry_c_1`
	test_tax_registry_c_1: (0, 4)
loading `credit_bureau_a_1`
	test_credit_bureau_a_1_0: (1, 78)
	test_credit_bureau_a_1_1: (1, 78)
	test_credit_bureau_a_1_2: (1, 78)
	test_credit_bureau_a_1_3: (1, 78)
	test_credit_bureau_a_1_4: (1, 78)
loading `credit_bureau_b_1`
	test_credit_bureau_b_1: (3, 44)
loading `deposit_1`
	test_deposit_1: (5, 4)
loading `person_1`
	test_person_1: (6, 36)
loading `debitcard_1`
	test_debitcard_1: (5, 5)
loading `applprev_2`
	test_applprev_2: (1, 4)
loading `person_2`
	test_person_2: (3, 9)
loading `credit_bureau_a_2`
	test_credit_

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,...,day_empl_employedfrom_271D,year_openingdate_857D,month_openingdate_857D,day_openingdate_857D,year_empls_employedfrom_796D,month_empls_employedfrom_796D,day_empls_employedfrom_796D,year_pmts_date_1107D,month_pmts_date_1107D,day_pmts_date_1107D
0,57543,2021-05-14,202201,100,0.0,191767.359375,3674.600098,1218.200073,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,57549,2022-01-17,202201,100,0.0,129704.398438,5742.600098,3546.600098,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,57551,2020-11-27,202201,100,0.0,71036.398438,2844.600098,0.0,0.0,1.0,...,8,0,0,0,0,0,0,0,0,0
3,57552,2020-11-27,202201,100,0.0,183992.0,6298.800293,12155.400391,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,57569,2021-12-20,202201,100,0.0,0.0,4682.600098,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
5,57630,2021-03-16,202201,100,0.0,0.0,8905.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,57631,2022-06-04,202201,100,0.0,,2540.600098,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,57632,2022-02-05,202201,100,0.0,63647.402344,4732.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8,57633,2022-01-25,202201,100,0.0,,8273.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,57634,2021-01-27,202201,100,0.0,39948.800781,1165.800049,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


CPU times: user 5.69 s, sys: 116 ms, total: 5.81 s
Wall time: 6.04 s


In [11]:
def encode_objects(depth_data: pd.DataFrame) -> pd.DataFrame:
    """Replace object-dtype columns with their saved encoder output.

    For every object column except `date_decision`, the encoder stored as
    ``encoder_<column>.joblib`` in ``model_dir`` is loaded and applied. The
    encoded values are float32 and are appended after the remaining columns,
    in the order the object columns appeared.

    Parameters
    ----------
    depth_data:
        Feature frame. The encoded object columns are dropped from it **in
        place** (this avoids holding a second copy of a large frame), so the
        caller should continue with the returned frame.

    Returns
    -------
    Frame with the same rows and index as ``depth_data``, where each encoded
    column holds float32 codes. If there is nothing to encode, ``depth_data``
    itself is returned unchanged.
    """
    object_columns = depth_data.dtypes.index[depth_data.dtypes == object].to_list()
    if 'date_decision' in object_columns:
        object_columns.remove('date_decision')

    # Nothing to encode (for example when the frame was already encoded):
    # np.concatenate would fail on an empty list, so return early.
    if not object_columns:
        return depth_data

    encoded = []
    for col in object_columns:
        # NOTE: joblib.load unpickles the file; only load encoders from a trusted source.
        encoder = load(model_dir.joinpath(f'encoder_{col}.joblib'))
        column_values = depth_data[col].to_numpy().reshape(-1, 1)
        encoded.append(encoder.transform(column_values).astype(np.float32))

    row_index = depth_data.index
    depth_data.drop(columns=object_columns, inplace=True)
    # Reuse the original index so the column-wise concat lines rows up even
    # when the frame does not carry a default RangeIndex.
    object_data = pd.DataFrame(
        np.concatenate(encoded, axis=1),
        columns=object_columns,
        index=row_index,
    )

    return pd.concat([depth_data, object_data], axis=1)
    
# Show the merged features before and after the object columns are encoded.
display(depth_data)
# encode_objects drops the encoded columns from its argument in place, so keep the returned frame.
depth_data = encode_objects(depth_data)
display(depth_data)

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,...,day_empl_employedfrom_271D,year_openingdate_857D,month_openingdate_857D,day_openingdate_857D,year_empls_employedfrom_796D,month_empls_employedfrom_796D,day_empls_employedfrom_796D,year_pmts_date_1107D,month_pmts_date_1107D,day_pmts_date_1107D
0,57543,2021-05-14,202201,100,0.0,191767.359375,3674.600098,1218.200073,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,57549,2022-01-17,202201,100,0.0,129704.398438,5742.600098,3546.600098,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,57551,2020-11-27,202201,100,0.0,71036.398438,2844.600098,0.0,0.0,1.0,...,8,0,0,0,0,0,0,0,0,0
3,57552,2020-11-27,202201,100,0.0,183992.0,6298.800293,12155.400391,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,57569,2021-12-20,202201,100,0.0,0.0,4682.600098,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
5,57630,2021-03-16,202201,100,0.0,0.0,8905.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,57631,2022-06-04,202201,100,0.0,,2540.600098,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,57632,2022-02-05,202201,100,0.0,63647.402344,4732.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8,57633,2022-01-25,202201,100,0.0,,8273.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,57634,2021-01-27,202201,100,0.0,39948.800781,1165.800049,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,...,conts_role_79M,empls_economicalst_849M,empls_employer_name_740M,relatedpersons_role_762T,collater_typofvalofguarant_298M,collater_typofvalofguarant_407M,collaterals_typeofguarante_359M,collaterals_typeofguarante_669M,subjectroles_name_541M,subjectroles_name_838M
0,57543,2021-05-14,202201,100,0.0,191767.359375,3674.600098,1218.200073,0.0,0.0,...,5.0,6.0,1.0,10.0,1.0,1.0,3.0,3.0,1.0,1.0
1,57549,2022-01-17,202201,100,0.0,129704.398438,5742.600098,3546.600098,0.0,2.0,...,5.0,6.0,1.0,10.0,1.0,1.0,3.0,3.0,1.0,1.0
2,57551,2020-11-27,202201,100,0.0,71036.398438,2844.600098,0.0,0.0,1.0,...,4.0,5.0,0.0,10.0,1.0,1.0,3.0,3.0,1.0,1.0
3,57552,2020-11-27,202201,100,0.0,183992.0,6298.800293,12155.400391,0.0,0.0,...,4.0,5.0,0.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0
4,57569,2021-12-20,202201,100,0.0,0.0,4682.600098,0.0,0.0,1.0,...,4.0,5.0,0.0,7.0,1.0,2.0,3.0,1.0,1.0,0.0
5,57630,2021-03-16,202201,100,0.0,0.0,8905.0,0.0,0.0,0.0,...,5.0,6.0,1.0,10.0,1.0,2.0,3.0,3.0,1.0,1.0
6,57631,2022-06-04,202201,100,0.0,,2540.600098,0.0,0.0,0.0,...,5.0,6.0,1.0,10.0,1.0,2.0,3.0,3.0,1.0,1.0
7,57632,2022-02-05,202201,100,0.0,63647.402344,4732.0,0.0,0.0,0.0,...,5.0,6.0,1.0,10.0,1.0,2.0,1.0,3.0,0.0,1.0
8,57633,2022-01-25,202201,100,0.0,,8273.0,0.0,0.0,0.0,...,5.0,6.0,1.0,10.0,1.0,2.0,3.0,3.0,1.0,1.0
9,57634,2021-01-27,202201,100,0.0,39948.800781,1165.800049,0.0,0.0,0.0,...,5.0,6.0,1.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0


In [12]:
def predict(
        models: List[object],
        submit: pd.DataFrame,
        X: pd.DataFrame,
        batch_size: int = 128,
    ) -> pd.DataFrame:
    """Score ``X`` in batches with every model and store the mean prediction.

    Parameters
    ----------
    models:
        Non-empty list of fitted models. Each must expose
        ``predict(data, num_iteration=...)`` and a ``best_iteration``
        attribute.
    submit:
        Frame that receives the predictions. It is modified **in place**: a
        float32 `score` column is added (or overwritten). It must have as
        many rows as ``X``, and scores are attached by row position.
    X:
        Feature frame to score.
    batch_size:
        Number of rows scored per model call; must be positive.

    Returns
    -------
    The same ``submit`` object, now carrying the `score` column.

    Raises
    ------
    ValueError
        If ``models`` is empty (the average would silently become NaN) or if
        ``batch_size`` is not positive.
    """
    if len(models) == 0:
        raise ValueError('`models` must contain at least one model')
    if batch_size < 1:
        raise ValueError(f'`batch_size` must be a positive integer, got {batch_size}')

    n_rows = len(X)
    probas = np.zeros(n_rows, dtype=np.float32)
    for start in range(0, n_rows, batch_size):
        stop = min(start + batch_size, n_rows)
        batch_X = X.iloc[start:stop]

        # Accumulate the predictions of all models for this slice of rows.
        for model in models:
            proba = model.predict(batch_X, num_iteration=model.best_iteration)
            probas[start:stop] += proba.astype(np.float32)
            del proba

        del batch_X
        # One collection per batch keeps memory in check without paying for a
        # full collection after every single model call.
        gc.collect()

    # Turn the accumulated sum into the mean over models.
    probas /= len(models)

    submit['score'] = probas

    return submit

In [13]:
# Load the five per-fold LightGBM models (lgb_fold1.txt ... lgb_fold5.txt).
models = []
for k in range(5):
    models.append(lgb.Booster(model_file=model_dir.joinpath(f'lgb_fold{k+1}.txt')))
    
# Feature names saved alongside the models; used to select and order the model inputs.
# NOTE(review): allow_pickle=True unpickles the file - only safe because it comes from our own dataset.
training_features = np.load(model_dir.joinpath('training_features.npy'), allow_pickle=True)
depth_data = depth_data[training_features]
display(depth_data)

# NOTE(review): scores are attached to the rows of test_base_data by position, which assumes
# merge_dataset kept the row order of test_base_data - confirm.
submission = predict(models, test_base_data.to_pandas(), depth_data, 2048)
display(submission)

# Keep only the id and score columns for the submission file.
submission = submission[['case_id', 'score']]
display(submission)

Unnamed: 0,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,...,conts_role_79M,empls_economicalst_849M,empls_employer_name_740M,relatedpersons_role_762T,collater_typofvalofguarant_298M,collater_typofvalofguarant_407M,collaterals_typeofguarante_359M,collaterals_typeofguarante_669M,subjectroles_name_541M,subjectroles_name_838M
0,0.0,191767.359375,3674.600098,1218.200073,0.0,0.0,0.0,0.0,0.0,9.0,...,5.0,6.0,1.0,10.0,1.0,1.0,3.0,3.0,1.0,1.0
1,0.0,129704.398438,5742.600098,3546.600098,0.0,2.0,0.0,0.0,0.0,10.0,...,5.0,6.0,1.0,10.0,1.0,1.0,3.0,3.0,1.0,1.0
2,0.0,71036.398438,2844.600098,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,4.0,5.0,0.0,10.0,1.0,1.0,3.0,3.0,1.0,1.0
3,0.0,183992.0,6298.800293,12155.400391,0.0,0.0,0.0,0.0,0.0,9.0,...,4.0,5.0,0.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0
4,0.0,0.0,4682.600098,0.0,0.0,1.0,0.0,0.0,0.0,6.0,...,4.0,5.0,0.0,7.0,1.0,2.0,3.0,1.0,1.0,0.0
5,0.0,0.0,8905.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,5.0,6.0,1.0,10.0,1.0,2.0,3.0,3.0,1.0,1.0
6,0.0,,2540.600098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,6.0,1.0,10.0,1.0,2.0,3.0,3.0,1.0,1.0
7,0.0,63647.402344,4732.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,5.0,6.0,1.0,10.0,1.0,2.0,1.0,3.0,0.0,1.0
8,0.0,,8273.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,5.0,6.0,1.0,10.0,1.0,2.0,3.0,3.0,1.0,1.0
9,0.0,39948.800781,1165.800049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,6.0,1.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,score
0,57543,2021-05-14,202201,100,0.005338
1,57549,2022-01-17,202201,100,0.099182
2,57551,2020-11-27,202201,100,0.003017
3,57552,2020-11-27,202201,100,0.00915
4,57569,2021-12-20,202201,100,0.107124
5,57630,2021-03-16,202201,100,0.007665
6,57631,2022-06-04,202201,100,0.045314
7,57632,2022-02-05,202201,100,0.003059
8,57633,2022-01-25,202201,100,0.023403
9,57634,2021-01-27,202201,100,0.014238


Unnamed: 0,case_id,score
0,57543,0.005338
1,57549,0.099182
2,57551,0.003017
3,57552,0.00915
4,57569,0.107124
5,57630,0.007665
6,57631,0.045314
7,57632,0.003059
8,57633,0.023403
9,57634,0.014238


# 

In [14]:
# Write the final case_id/score table to submission.csv without the pandas index.
submission.to_csv('submission.csv', index=False)