In [None]:
from collections import OrderedDict
import gc
import pathlib
from pprint import pprint
import random
from typing import Dict, List, Tuple, Union

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder


# Make sure automatic garbage collection is switched on; later cells also call
# gc.collect() explicitly right after dropping large intermediate frames.
gc.enable()

In [None]:
# Kaggle input directories used by this notebook.
# Competition data: the test parquet files are read from below this directory.
competition_dir = pathlib.Path('/kaggle/input/home-credit-credit-risk-model-stability')
# Source of the per-dtype feature lists (bool/float64/string CSV files).
feature_dtype_dir = pathlib.Path('/kaggle/input/0-home-credit-eda')
# Source of the saved LightGBM fold models and the training feature list.
model_dir = pathlib.Path('/kaggle/input/1-home-credit-train')

In [None]:
# Each CSV lists, in its `Variable` column, the feature names that should be
# cast to one particular dtype when the parquet files are loaded.
bool_features = pl.read_csv(feature_dtype_dir.joinpath('bool_features.csv'))
float64_features = pl.read_csv(feature_dtype_dir.joinpath('float64_features.csv'))
string_features = pl.read_csv(feature_dtype_dir.joinpath('string_features.csv'))

for feature_table in (bool_features, float64_features, string_features):
    display(feature_table)

# Column name -> polars dtype. `case_id` is the join key and always Int64.
# Features listed as float64 are deliberately stored as Float32.
cast_features = OrderedDict([('case_id', pl.Int64)])

dtype_by_table = (
    (bool_features, pl.Boolean),
    (float64_features, pl.Float32),
    (string_features, pl.String),
)
for feature_table, target_dtype in dtype_by_table:
    for feature_name in feature_table.select('Variable').to_series():
        cast_features[feature_name] = target_dtype

print(f'the number of features: {len(cast_features)}')
pprint(cast_features)

In [None]:
# Parquet file names of every test table, keyed by table name. The last
# character of each key ('0', '1' or '2') is the table depth that
# `merge_dataset` inspects to decide how the table is aggregated.
test_depth_files = OrderedDict([
    # depth 0
    ('test_static_0', [
        'test_static_0_0.parquet',
        'test_static_0_1.parquet',
    ]),
    ('test_static_cb_0', ['test_static_cb_0.parquet']),
    # depth 1
    ('test_applprev_1', [
        'test_applprev_1_0.parquet',
        'test_applprev_1_1.parquet',
    ]),
    ('test_other_1', ['test_other_1.parquet']),
    ('test_tax_registry_a_1', ['test_tax_registry_a_1.parquet']),
    ('test_tax_registry_b_1', ['test_tax_registry_b_1.parquet']),
    ('test_tax_registry_c_1', ['test_tax_registry_c_1.parquet']),
    ('test_credit_bureau_a_1', [
        'test_credit_bureau_a_1_0.parquet',
        'test_credit_bureau_a_1_1.parquet',
        'test_credit_bureau_a_1_2.parquet',
        'test_credit_bureau_a_1_3.parquet',
    ]),
    ('test_credit_bureau_b_1', ['test_credit_bureau_b_1.parquet']),
    ('test_deposit_1', ['test_deposit_1.parquet']),
    ('test_person_1', ['test_person_1.parquet']),
    ('test_debitcard_1', ['test_debitcard_1.parquet']),
    # depth 2
    ('test_applprev_2', ['test_applprev_2.parquet']),
    ('test_person_2', ['test_person_2.parquet']),
    ('test_credit_bureau_a_2', [
        'test_credit_bureau_a_2_0.parquet',
        'test_credit_bureau_a_2_1.parquet',
        'test_credit_bureau_a_2_2.parquet',
        'test_credit_bureau_a_2_3.parquet',
        'test_credit_bureau_a_2_4.parquet',
        'test_credit_bureau_a_2_5.parquet',
        'test_credit_bureau_a_2_6.parquet',
        'test_credit_bureau_a_2_7.parquet',
        'test_credit_bureau_a_2_8.parquet',
        'test_credit_bureau_a_2_9.parquet',
        'test_credit_bureau_a_2_10.parquet',
    ]),
    ('test_credit_bureau_b_2', ['test_credit_bureau_b_2.parquet']),
])

print(f'the number of test files: {sum(map(len, test_depth_files.values()))}')

# Same mapping, with every file name resolved to a path below the competition
# directory's `parquet_files/test/` folder.
test_depth_paths = OrderedDict(
    (
        table_name,
        [competition_dir.joinpath('parquet_files/test/' + file_name) for file_name in file_names],
    )
    for table_name, file_names in test_depth_files.items()
)

print(f'number of test paths: {sum(map(len, test_depth_paths.values()))}')

In [None]:
def merge_dataset(
        base_data: pl.DataFrame,
        depth_paths: Dict[str, List[pathlib.Path]],
        cast_features: Dict[str, object],
        depth: str,
    ) -> pd.DataFrame:
    """Left-join the selected depth tables onto ``base_data`` by ``case_id``.

    Parameters
    ----------
    base_data
        Frame holding one row per case; must contain a ``case_id`` column.
    depth_paths
        Table name -> parquet files of that table. The last character of
        the table name is taken as the table's depth.
    cast_features
        Column name -> polars dtype. Only the columns present in a given
        file are cast.
    depth
        ``'0'``, ``'1'`` or ``'2'`` to merge only the tables of that depth,
        or ``'012'`` to merge every table.

    Returns
    -------
    pd.DataFrame
        ``base_data`` with all selected tables joined on, converted to pandas.

    Notes
    -----
    Files of depth-1 and depth-2 tables have their ``num_group*`` columns
    dropped and are then reduced, file by file, with
    ``group_by('case_id').sum()``. Depth-0 files are joined as they are.
    Column names that clash during a join receive the suffix ``_<i>``, where
    ``i`` is the table's position in ``depth_paths``.
    """
    assert depth in ['0', '1', '2', '012']

    merged = base_data
    for table_idx, (table_name, parquet_paths) in enumerate(depth_paths.items()):
        table_depth = table_name[-1]

        # Skip tables of other depths unless every depth was requested.
        if depth != '012' and table_depth != depth:
            continue

        print(f'loading `{table_name}`')
        parts = []
        for parquet_path in parquet_paths:
            part = pl.read_parquet(parquet_path)

            dtype_overrides = {
                column: dtype
                for column, dtype in cast_features.items()
                if column in part.columns
            }
            part = part.cast(dtype_overrides)

            # The group counters are dropped before aggregating so they are
            # not summed together with the real features.
            if table_depth == '1':
                part = part.drop('num_group1')
            elif table_depth == '2':
                part = part.drop(['num_group1', 'num_group2'])

            if table_depth in ('1', '2'):
                part = part.group_by('case_id').sum().sort('case_id')

            parts.append(part)
            print(f'\t{part.shape}')

            # Drop the local reference right away to keep peak memory low.
            del part
            gc.collect()

        table_data = pl.concat(parts, how='vertical_relaxed')
        del parts

        merged = merged.join(table_data, how='left', on='case_id', suffix=f'_{table_idx}')

        del table_data
        gc.collect()

    return merged.to_pandas()

In [None]:
# Dtypes enforced on the base table; `case_id` is Int64 so that it matches
# the join key dtype used for the depth tables.
base_dtypes = {
    'case_id': pl.Int64,
    'date_decision': pl.String,
    'MONTH': pl.Int32,
    'WEEK_NUM': pl.Int16,
}
test_base_path = competition_dir.joinpath('parquet_files/test/test_base.parquet')
test_base_data = pl.read_parquet(test_base_path).cast(base_dtypes)
display(test_base_data)

In [None]:
%%time


# Join every depth-0/1/2 test table onto the base table (returned as pandas).
depth_data = merge_dataset(
    base_data=test_base_data,
    depth_paths=test_depth_paths,
    cast_features=cast_features,
    depth='012',
)
display(depth_data)

In [None]:
def predict(
        models: List[object],
        submit: pd.DataFrame,
        X: pd.DataFrame,
        batch_size: int = 128,
    ) -> pd.DataFrame:
    """Score ``X`` with every model and store the mean prediction in ``submit``.

    Parameters
    ----------
    models
        Non-empty list of fitted models. Each must expose a
        ``best_iteration`` attribute and a
        ``predict(data, num_iteration=...)`` method returning one value per
        row of ``data``.
    submit
        Frame with as many rows as ``X``. It is modified in place: a
        ``score`` column is added (or overwritten).
    X
        Feature frame, scored in row batches of ``batch_size``.
    batch_size
        Number of rows handed to the models at a time; must be >= 1.

    Returns
    -------
    pd.DataFrame
        The same ``submit`` object, now carrying the ``score`` column.
        Scores are float32 values in ``[0, 1]``.

    Raises
    ------
    ValueError
        If ``models`` is empty or ``batch_size`` is smaller than 1.
    """
    if len(models) == 0:
        # Otherwise 0/0 would yield NaN scores that end up as silent zeros.
        raise ValueError('`models` must contain at least one model')
    if batch_size < 1:
        raise ValueError(f'`batch_size` must be >= 1, got {batch_size}')

    n_rows = len(X)
    probas = np.zeros(n_rows, dtype=np.float32)

    for start in range(0, n_rows, batch_size):
        stop = min(start + batch_size, n_rows)
        batch = X.iloc[start:stop]

        for model in models:
            proba = model.predict(batch, num_iteration=model.best_iteration)
            # `nan` has to be given by keyword: the second positional
            # parameter of np.nan_to_num is `copy`, not the replacement value.
            proba = np.nan_to_num(proba, nan=0.0)
            # Clipping also brings the large finite stand-ins for +/-inf
            # back into the valid probability range.
            proba = np.clip(proba, 0, 1)
            probas[start:stop] += proba.astype(np.float32)

        # One collection per batch is enough; the per-model arrays are
        # released by reference counting as soon as they are rebound.
        del batch
        gc.collect()

    # Every addend is finite and within [0, 1], so the mean is as well.
    probas /= len(models)

    submit['score'] = probas

    return submit

In [None]:
# One saved LightGBM booster per fold: lgb_fold1.txt ... lgb_fold5.txt.
models = [
    lgb.Booster(model_file=model_dir.joinpath(f'lgb_fold{fold+1}.txt'))
    for fold in range(5)
]

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects.
# Presumably acceptable because the file is produced by our own training
# notebook -- confirm, and never point this at an untrusted file.
training_features = np.load(model_dir.joinpath('training_features.npy'), allow_pickle=True)

# Keep only the model's feature columns. Rebinding the name lets the wider
# merged frame be released.
depth_data = depth_data[training_features]
display(depth_data)

submission = predict(models, test_base_data.to_pandas(), depth_data, batch_size=1024)
display(submission)

# Reduce the frame to the two columns written to the submission file.
submission = submission[['case_id', 'score']]
display(submission)

In [None]:
# Write the case_id/score table to the working directory, without the index.
submission.to_csv('submission.csv', index=False)