In [None]:
!pip install polars

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import polars as pl

import os

# Root directory of the competition data; the loading cells append
# "parquet_files/<split>/<file>.parquet" to it with os.path.join.
# NOTE(review): os.path.join does not expand "~", so this relies on the parquet
# reader expanding the home directory itself — confirm, or wrap the value in
# os.path.expanduser.
DATA_PATH = "~/public/home-credit-credit-risk-model-stability"

In [2]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast every column whose name ends in "P" or "A" to ``Float64``.

    The last letter of a column name is used as a type hint. This function is
    only an example hook: extend it with any other dtype rules the tables need.

    Args:
        df: Table to normalise.

    Returns:
        A frame with the matching columns cast to ``pl.Float64``; ``df`` itself
        when no column name matches.
    """
    # `endswith` (unlike `col[-1]`) is safe for an empty column name.
    float_cols = [col for col in df.columns if col.endswith(("P", "A"))]
    if not float_cols:
        return df

    # One `with_columns` call casts all columns at once instead of building a
    # new intermediate DataFrame per column.
    return df.with_columns([pl.col(col).cast(pl.Float64) for col in float_cols])


def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Convert string-like columns of ``df`` to ordered categoricals.

    Each column whose dtype name is "object", "string" or "str" is cast to a
    categorical whose categories are the column's distinct values followed by
    an extra "Unknown" category. Missing values stay missing.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object, to allow chaining.
    """
    unknown_label = "Unknown"
    # NOTE(review): "str" is the dtype name recent pandas versions are expected
    # to report for their default string dtype — confirm against the installed
    # version; the entry is harmless otherwise.
    string_like = {"object", "string", "str"}

    for col in df.columns:
        if df[col].dtype.name not in string_like:
            continue

        as_category = df[col].astype("string").astype("category")
        categories = as_category.cat.categories.to_list()
        # CategoricalDtype rejects duplicate categories, so only append the
        # placeholder when the data does not already contain that value.
        if unknown_label not in categories:
            categories.append(unknown_label)

        df[col] = as_category.astype(
            pd.CategoricalDtype(categories=categories, ordered=True)
        )
    return df

# Train Data

In [3]:
def _read_train_table(stem: str, n_parts: int = 0) -> pl.DataFrame:
    """Read one training table and apply ``set_table_dtypes`` to it.

    Args:
        stem: Table name without the "train_" prefix and ".parquet" suffix,
            e.g. "static_cb_0" or "credit_bureau_a_1".
        n_parts: Number of shard files named "<stem>_0" .. "<stem>_{n_parts-1}".
            ``0`` means the table is stored as a single, un-sharded file.

    Returns:
        The table as one DataFrame; shards are stacked in index order with
        ``how="vertical_relaxed"``.
    """

    def _read(file_stem: str) -> pl.DataFrame:
        path = os.path.join(DATA_PATH, f"parquet_files/train/train_{file_stem}.parquet")
        return pl.read_parquet(path).pipe(set_table_dtypes)

    if n_parts == 0:
        return _read(stem)
    return pl.concat(
        [_read(f"{stem}_{part}") for part in range(n_parts)],
        how="vertical_relaxed",
    )


# The base table is read as-is; it is not passed through set_table_dtypes.
train_basetable = pl.read_parquet(os.path.join(DATA_PATH, "parquet_files/train/train_base.parquet"))

# Tables joined on case_id only.
train_static = _read_train_table("static_0", n_parts=2)
train_static_cb = _read_train_table("static_cb_0")

# Tables whose file names end in "_1".
train_applprev_1 = _read_train_table("applprev_1", n_parts=2)
train_other_1 = _read_train_table("other_1")
train_tax_registry_a_1 = _read_train_table("tax_registry_a_1")
train_tax_registry_b_1 = _read_train_table("tax_registry_b_1")
train_tax_registry_c_1 = _read_train_table("tax_registry_c_1")
train_credit_bureau_a_1 = _read_train_table("credit_bureau_a_1", n_parts=4)
train_credit_bureau_b_1 = _read_train_table("credit_bureau_b_1")
train_deposit_1 = _read_train_table("deposit_1")
train_person_1 = _read_train_table("person_1")
train_debitcard_1 = _read_train_table("debitcard_1")

# Tables whose file names end in "_2".
train_applprev_2 = _read_train_table("applprev_2")
train_person_2 = _read_train_table("person_2")
train_credit_bureau_a_2 = _read_train_table("credit_bureau_a_2", n_parts=11)
train_credit_bureau_b_2 = _read_train_table("credit_bureau_b_2")

In [4]:
# Draw up to 500_000 base cases and left-join every training table onto them.
# - `seed` makes the sample (and therefore the saved output) reproducible.
# - `n` is capped at the table height because sampling without replacement
#   fails when more rows are requested than exist.
# NOTE(review): the base/static tables are joined on case_id only, so
# `num_group1` presumably first enters the frame via train_applprev_1 and every
# later join matches on that table's group index — confirm this row alignment
# is intended rather than aggregating each table per case_id first.
train_sample = (
    train_basetable.sample(n=min(500_000, train_basetable.height), seed=42)
    .join(train_static, how="left", on="case_id")
    .join(train_static_cb, how="left", on="case_id")
    .join(train_applprev_1, how="left", on="case_id")
    .join(train_other_1, how="left", on=["case_id", "num_group1"])
    .join(train_tax_registry_a_1, how="left", on=["case_id", "num_group1"])
    .join(train_tax_registry_b_1, how="left", on=["case_id", "num_group1"])
    .join(train_tax_registry_c_1, how="left", on=["case_id", "num_group1"])
    .join(train_credit_bureau_a_1, how="left", on=["case_id", "num_group1"])
    .join(train_credit_bureau_b_1, how="left", on=["case_id", "num_group1"])
    .join(train_deposit_1, how="left", on=["case_id", "num_group1"])
    .join(train_person_1, how="left", on=["case_id", "num_group1"])
    .join(train_debitcard_1, how="left", on=["case_id", "num_group1"])
    .join(train_applprev_2, how="left", on=["case_id", "num_group1"])
    .join(train_person_2, how="left", on=["case_id", "num_group1", "num_group2"])
    .join(
        train_credit_bureau_a_2, how="left", on=["case_id", "num_group1", "num_group2"]
    )
    .join(
        train_credit_bureau_b_2, how="left", on=["case_id", "num_group1", "num_group2"]
    )
)

# Test Data

In [None]:
def _read_test_table(stem: str, n_parts: int = 0) -> pl.DataFrame:
    """Read one test table and apply ``set_table_dtypes`` to it.

    Args:
        stem: Table name without the "test_" prefix and ".parquet" suffix,
            e.g. "static_cb_0" or "credit_bureau_a_1".
        n_parts: Number of shard files named "<stem>_0" .. "<stem>_{n_parts-1}".
            ``0`` means the table is stored as a single, un-sharded file.

    Returns:
        The table as one DataFrame; shards are stacked in index order with
        ``how="vertical_relaxed"``.
    """

    def _read(file_stem: str) -> pl.DataFrame:
        path = os.path.join(DATA_PATH, f"parquet_files/test/test_{file_stem}.parquet")
        return pl.read_parquet(path).pipe(set_table_dtypes)

    if n_parts == 0:
        return _read(stem)
    return pl.concat(
        [_read(f"{stem}_{part}") for part in range(n_parts)],
        how="vertical_relaxed",
    )


# The base table is read as-is; it is not passed through set_table_dtypes.
test_basetable = pl.read_parquet(os.path.join(DATA_PATH, "parquet_files/test/test_base.parquet"))

# Tables joined on case_id only.
test_static = _read_test_table("static_0", n_parts=3)
test_static_cb = _read_test_table("static_cb_0")

# Tables whose file names end in "_1".
test_applprev_1 = _read_test_table("applprev_1", n_parts=3)
test_other_1 = _read_test_table("other_1")
test_tax_registry_a_1 = _read_test_table("tax_registry_a_1")
test_tax_registry_b_1 = _read_test_table("tax_registry_b_1")
test_tax_registry_c_1 = _read_test_table("tax_registry_c_1")
test_credit_bureau_a_1 = _read_test_table("credit_bureau_a_1", n_parts=5)
test_credit_bureau_b_1 = _read_test_table("credit_bureau_b_1")
test_deposit_1 = _read_test_table("deposit_1")
test_person_1 = _read_test_table("person_1")
test_debitcard_1 = _read_test_table("debitcard_1")

# Tables whose file names end in "_2".
test_applprev_2 = _read_test_table("applprev_2")
test_person_2 = _read_test_table("person_2")
test_credit_bureau_a_2 = _read_test_table("credit_bureau_a_2", n_parts=12)
test_credit_bureau_b_2 = _read_test_table("credit_bureau_b_2")

In [None]:
# Draw up to 500_000 base cases and left-join every test table onto them.
# - `n` is capped at the table height: sampling without replacement fails when
#   more rows are requested than exist, which would break this cell on a test
#   base table with fewer than 500_000 rows.
# - `seed` makes the sample reproducible.
# NOTE(review): the base/static tables are joined on case_id only, so
# `num_group1` presumably first enters the frame via test_applprev_1 and every
# later join matches on that table's group index — confirm this row alignment
# is intended rather than aggregating each table per case_id first.
test_sample = (
    test_basetable.sample(n=min(500_000, test_basetable.height), seed=42)
    .join(test_static, how="left", on="case_id")
    .join(test_static_cb, how="left", on="case_id")
    .join(test_applprev_1, how="left", on="case_id")
    .join(test_other_1, how="left", on=["case_id", "num_group1"])
    .join(test_tax_registry_a_1, how="left", on=["case_id", "num_group1"])
    .join(test_tax_registry_b_1, how="left", on=["case_id", "num_group1"])
    .join(test_tax_registry_c_1, how="left", on=["case_id", "num_group1"])
    .join(test_credit_bureau_a_1, how="left", on=["case_id", "num_group1"])
    .join(test_credit_bureau_b_1, how="left", on=["case_id", "num_group1"])
    .join(test_deposit_1, how="left", on=["case_id", "num_group1"])
    .join(test_person_1, how="left", on=["case_id", "num_group1"])
    .join(test_debitcard_1, how="left", on=["case_id", "num_group1"])
    .join(test_applprev_2, how="left", on=["case_id", "num_group1"])
    .join(test_person_2, how="left", on=["case_id", "num_group1", "num_group2"])
    .join(
        test_credit_bureau_a_2, how="left", on=["case_id", "num_group1", "num_group2"]
    )
    .join(
        test_credit_bureau_b_2, how="left", on=["case_id", "num_group1", "num_group2"]
    )
)

In [5]:
# Persist the joined training sample as a parquet file; the relative path is
# resolved against the notebook's current working directory.
train_sample.write_parquet('train_sample_no_imputation_500_000.parquet')

In [7]:
# Show the joined training sample; as the cell's last expression it is rendered
# by the notebook's rich display.
train_sample

case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,…,sex_738L,type_25L,last180dayaveragebalance_704A,last180dayturnover_1134A,last30dayturnover_651A,openingdate_857D,cacccardblochreas_147M,conts_type_509L,credacc_cards_status_52L,num_group2,addres_district_368M,addres_role_871L,addres_zip_823M,conts_role_79M,empls_economicalst_849M,empls_employedfrom_796D,empls_employer_name_740M,relatedpersons_role_762T,collater_typofvalofguarant_298M,collater_typofvalofguarant_407M,collater_valueofguarantee_1124L,collater_valueofguarantee_876L,collaterals_typeofguarante_359M,collaterals_typeofguarante_669M,pmts_dpd_1073P,pmts_dpd_303P,pmts_month_158T,pmts_month_706T,pmts_overdue_1140A,pmts_overdue_1152A,pmts_year_1139T,pmts_year_507T,subjectroles_name_541M,subjectroles_name_838M,pmts_date_1107D,pmts_dpdvalue_108P,pmts_pmtsoverdue_635A
i64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,str,str,f64,f64,f64,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,f64,f64
208309,"""2020-02-22""",202002,59,0,0.0,189905.53,7701.0,6890.0,0.0,0.0,0.0,0.0,0.0,8.0,-10.0,-5.0,-11.0,0.0,8113.0,,0.0,134218.8,6890.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,,"""PHONE""",,,,,"""a55475b1""","""PRIMARY_MOBILE…",,0,,,,,,,,,"""a55475b1""","""8fd95e4b""",,0.0,"""c7a5ad39""","""a55475b1""",,,,2.0,,,,2017.0,"""ab3c25cf""","""a55475b1""",,,
208309,"""2020-02-22""",202002,59,0,0.0,189905.53,7701.0,6890.0,0.0,0.0,0.0,0.0,0.0,8.0,-10.0,-5.0,-11.0,0.0,8113.0,,0.0,134218.8,6890.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,,,,,,,"""a55475b1""","""PRIMARY_MOBILE…",,0,,,,,,,,,"""a55475b1""","""9a0c095e""",,0.0,"""c7a5ad39""","""a55475b1""",,,,2.0,,,,2017.0,"""ab3c25cf""","""a55475b1""",,,
208309,"""2020-02-22""",202002,59,0,0.0,189905.53,7701.0,6890.0,0.0,0.0,0.0,0.0,0.0,8.0,-10.0,-5.0,-11.0,0.0,8113.0,,0.0,134218.8,6890.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,"""F""","""PRIMARY_MOBILE…",,,,"""2015-07-16""","""a55475b1""","""PRIMARY_MOBILE…",,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",,"""9a0c095e""","""9a0c095e""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",,,2.0,2.0,,,2018.0,2013.0,"""ab3c25cf""","""ab3c25cf""",,,
208309,"""2020-02-22""",202002,59,0,0.0,189905.53,7701.0,6890.0,0.0,0.0,0.0,0.0,0.0,8.0,-10.0,-5.0,-11.0,0.0,8113.0,,0.0,134218.8,6890.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,"""F""","""PRIMARY_MOBILE…",,,,"""2015-07-16""","""a55475b1""","""HOME_PHONE""",,1,,,,,,,,,"""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,3.0,3.0,,,2018.0,2013.0,"""a55475b1""","""a55475b1""",,,
208309,"""2020-02-22""",202002,59,0,0.0,189905.53,7701.0,6890.0,0.0,0.0,0.0,0.0,0.0,8.0,-10.0,-5.0,-11.0,0.0,8113.0,,0.0,134218.8,6890.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,,,,,,,"""a55475b1""","""PRIMARY_MOBILE…",,0,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1373094,"""2019-05-21""",201905,20,0,0.0,0.0,3052.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,684.0,685.0,,678.0,,,,6919.66,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,,,,,,,"""a55475b1""","""HOME_PHONE""",,0,,,,,,,,,,,,,,,,,,,,,,,,,,,
1373094,"""2019-05-21""",201905,20,0,0.0,0.0,3052.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,684.0,685.0,,678.0,,,,6919.66,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,,,,,,,"""a55475b1""","""PRIMARY_MOBILE…",,1,,,,,,,,,,,,,,,,,,,,,,,,,,,
1593574,"""2019-10-23""",201910,42,0,0.0,0.0,2792.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,"""F""","""PRIMARY_MOBILE…",,,,,"""a55475b1""","""HOME_PHONE""",,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",,"""9a0c095e""","""9a0c095e""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",,,2.0,2.0,,,2018.0,2009.0,"""ab3c25cf""","""ab3c25cf""",,,
1593574,"""2019-10-23""",201910,42,0,0.0,0.0,2792.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,"""F""","""PRIMARY_MOBILE…",,,,,"""a55475b1""","""PRIMARY_MOBILE…",,1,,,,,,,,,"""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,3.0,3.0,,,2018.0,2009.0,"""a55475b1""","""a55475b1""",,,
