In [None]:
!pip install polars

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import polars as pl

import os

# Root directory of the competition data. "~" is expanded here, once, because
# os.path.join does not expand it and not every consumer of the path does.
DATA_PATH = os.path.expanduser("~/public/home-credit-credit-risk-model-stability")

In [3]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast every column whose name ends in "P" or "A" to ``pl.Float64``.

    The last letter of a column name is used as the type hint; only the
    "P" and "A" suffixes are handled here. Extend this function with any
    further suffix -> dtype rules that are needed.

    Args:
        df: Table whose columns should be normalised.

    Returns:
        A frame with the matching columns cast to ``Float64`` and every other
        column left as it was. If no column matches, ``df`` itself is
        returned.
    """
    # Build all casts first and apply them in a single with_columns call,
    # instead of creating one intermediate frame per matching column.
    # str.endswith is also safe for an empty column name, unlike col[-1].
    float_casts = [
        pl.col(name).cast(pl.Float64)
        for name in df.columns
        if name.endswith(("P", "A"))
    ]
    if not float_casts:
        return df
    return df.with_columns(float_casts)


def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Turn text columns into ordered categoricals that include "Unknown".

    Every column whose dtype name is "object", "string" or "str" is converted
    to strings and then to an ordered categorical. The categories are the
    values found in the column, followed by an extra "Unknown" category so the
    label is available even when it does not occur in the data. If "Unknown"
    is already one of the values it is not added a second time (pandas rejects
    duplicate categories with a ValueError).

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object, for convenient chaining.
    """
    # "str" is presumably the dtype name of the default string dtype in
    # pandas 3; listing it is harmless on older versions — TODO confirm.
    text_dtype_names = ("object", "string", "str")

    for col in df.columns:
        if df[col].dtype.name not in text_dtype_names:
            continue

        as_category = df[col].astype("string").astype("category")
        labels = as_category.cat.categories.to_list()
        if "Unknown" not in labels:
            labels.append("Unknown")

        ordered_dtype = pd.CategoricalDtype(categories=labels, ordered=True)
        df[col] = as_category.astype(ordered_dtype)

    return df

In [4]:
def read_train_table(name: str, n_shards: int = 0) -> pl.DataFrame:
    """Read one training table below DATA_PATH and apply ``set_table_dtypes``.

    Args:
        name: Table name without the ``train_`` prefix and the ``.parquet``
            suffix, e.g. ``"static_cb_0"`` or ``"credit_bureau_a_1"``.
        n_shards: ``0`` (default) reads the single file
            ``train_<name>.parquet``. A positive value reads the files
            ``train_<name>_0.parquet`` .. ``train_<name>_<n_shards - 1>.parquet``
            and stacks them with ``how="vertical_relaxed"``.

    Returns:
        The table as a single DataFrame; ``set_table_dtypes`` has been applied
        to every file that was read.
    """
    if n_shards == 0:
        path = os.path.join(DATA_PATH, f"parquet_files/train/train_{name}.parquet")
        return pl.read_parquet(path).pipe(set_table_dtypes)

    shards = [
        pl.read_parquet(
            os.path.join(DATA_PATH, f"parquet_files/train/train_{name}_{shard}.parquet")
        ).pipe(set_table_dtypes)
        for shard in range(n_shards)
    ]
    return pl.concat(shards, how="vertical_relaxed")


# The base table is read as-is, without set_table_dtypes.
train_basetable = pl.read_parquet(os.path.join(DATA_PATH, "parquet_files/train/train_base.parquet"))

# Tables read without a num_group key in the joins of this notebook.
train_static = read_train_table("static_0", n_shards=2)
train_static_cb = read_train_table("static_cb_0")

# Tables whose names end in _1.
train_applprev_1 = read_train_table("applprev_1", n_shards=2)
train_other_1 = read_train_table("other_1")
train_tax_registry_a_1 = read_train_table("tax_registry_a_1")
train_tax_registry_b_1 = read_train_table("tax_registry_b_1")
train_tax_registry_c_1 = read_train_table("tax_registry_c_1")
# NOTE(review): train_credit_bureau_a_1 and train_credit_bureau_a_2 are loaded
# here, but the joins that would use them are commented out further down in
# the visible part of the notebook. If nothing else needs them, skipping these
# two loads saves reading 15 parquet files — TODO confirm.
train_credit_bureau_a_1 = read_train_table("credit_bureau_a_1", n_shards=4)
train_credit_bureau_b_1 = read_train_table("credit_bureau_b_1")
train_deposit_1 = read_train_table("deposit_1")
train_person_1 = read_train_table("person_1")
train_debitcard_1 = read_train_table("debitcard_1")

# Tables whose names end in _2.
train_applprev_2 = read_train_table("applprev_2")
train_person_2 = read_train_table("person_2")
train_credit_bureau_a_2 = read_train_table("credit_bureau_a_2", n_shards=11)
train_credit_bureau_b_2 = read_train_table("credit_bureau_b_2")

In [5]:
# Draw a fixed-size random sample of the base table and left-join the other
# training tables onto it.
SAMPLE_SIZE = 1_000_000
# A fixed seed makes the sample, and everything derived from it, reproducible
# across kernel restarts.
SAMPLE_SEED = 42

train_sample = (
    # Clamp to the table height: sampling more rows than exist (without
    # replacement) raises.
    train_basetable.sample(
        n=min(SAMPLE_SIZE, train_basetable.height), seed=SAMPLE_SEED
    )
    # Joined on case_id only.
    .join(train_static, how="left", on="case_id")
    .join(train_static_cb, how="left", on="case_id")
    .join(train_applprev_1, how="left", on="case_id")
    # Joined on (case_id, num_group1).
    .join(train_other_1, how="left", on=["case_id", "num_group1"])
    .join(train_tax_registry_a_1, how="left", on=["case_id", "num_group1"])
    .join(train_tax_registry_b_1, how="left", on=["case_id", "num_group1"])
    .join(train_tax_registry_c_1, how="left", on=["case_id", "num_group1"])
    # Disabled join, kept from the original notebook (reason not recorded):
    # .join(train_credit_bureau_a_1, how="left", on=["case_id", "num_group1"])
    .join(train_credit_bureau_b_1, how="left", on=["case_id", "num_group1"])
    .join(train_deposit_1, how="left", on=["case_id", "num_group1"])
    .join(train_person_1, how="left", on=["case_id", "num_group1"])
    .join(train_debitcard_1, how="left", on=["case_id", "num_group1"])
    .join(train_applprev_2, how="left", on=["case_id", "num_group1"])
    # Joined on (case_id, num_group1, num_group2).
    .join(train_person_2, how="left", on=["case_id", "num_group1", "num_group2"])
    # Disabled join, kept from the original notebook (reason not recorded):
    # .join(
    #     train_credit_bureau_a_2, how="left", on=["case_id", "num_group1", "num_group2"]
    # )
    .join(
        train_credit_bureau_b_2, how="left", on=["case_id", "num_group1", "num_group2"]
    )
)

In [6]:
# Persist the joined sample to the current working directory.
# NOTE(review): the file name is spelled "trian", which looks like a typo for
# "train". It is kept unchanged because other code may read this exact name —
# confirm before renaming.
sample_output_path = 'trian_sample_no_imputation.parquet'
train_sample.write_parquet(sample_output_path)