In [1]:
import pathlib

import duckdb
import polars as pl

from columns_fill_nulls import fill_nulls

# Ask polars to render floats with its "full" format.
pl.Config(set_fmt_float="full")

# Map every bronze-layer "train*parquet" file to its path, keyed by the file
# name up to the first dot (e.g. "train_base").
train_paths: dict[str, pathlib.Path] = {
    parquet_path.name.split(".")[0]: parquet_path
    for parquet_path in (pathlib.Path.cwd().parent / "data" / "datalake" / "bronze").rglob("train*parquet")
}
sorted(train_paths)

['train_applprev_1_0',
 'train_applprev_1_1',
 'train_applprev_2',
 'train_base',
 'train_credit_bureau_a_1_0',
 'train_credit_bureau_a_1_1',
 'train_credit_bureau_a_1_2',
 'train_credit_bureau_a_1_3',
 'train_credit_bureau_a_2_0',
 'train_credit_bureau_a_2_1',
 'train_credit_bureau_a_2_10',
 'train_credit_bureau_a_2_2',
 'train_credit_bureau_a_2_3',
 'train_credit_bureau_a_2_4',
 'train_credit_bureau_a_2_5',
 'train_credit_bureau_a_2_6',
 'train_credit_bureau_a_2_7',
 'train_credit_bureau_a_2_8',
 'train_credit_bureau_a_2_9',
 'train_credit_bureau_b_1',
 'train_credit_bureau_b_2',
 'train_debitcard_1',
 'train_deposit_1',
 'train_other_1',
 'train_person_1',
 'train_person_2',
 'train_static_0_0',
 'train_static_0_1',
 'train_static_cb_0',
 'train_tax_registry_a_1',
 'train_tax_registry_b_1',
 'train_tax_registry_c_1']

In [2]:
# Names (keys of `train_paths`, not paths) of the tables that expose a
# "num_group2" column.
depth_2_paths: list[str] = [
    table_name
    for table_name, table_path in train_paths.items()
    if "num_group2" in pl.scan_parquet(table_path).columns
]
depth_2_paths

['train_credit_bureau_b_2',
 'train_credit_bureau_a_2_6',
 'train_credit_bureau_a_2_3',
 'train_credit_bureau_a_2_9',
 'train_credit_bureau_a_2_5',
 'train_credit_bureau_a_2_2',
 'train_applprev_2',
 'train_credit_bureau_a_2_8',
 'train_person_2',
 'train_credit_bureau_a_2_10',
 'train_credit_bureau_a_2_1',
 'train_credit_bureau_a_2_7',
 'train_credit_bureau_a_2_0',
 'train_credit_bureau_a_2_4']

## Functions

In [3]:
def get_date_categorized_transforms(dataframe: pl.LazyFrame) -> list[pl.Expr]:
    """Build one bucketing expression per column whose name ends with "D".

    Each expression overwrites the column (same alias) with a string label
    chosen by comparing its value against 180, 365 and 365 * 5:

    * ``> 365 * 5``            -> "MORE_THAN_5_YEARS"
    * ``(365, 365 * 5]``       -> "BETWEEN_1_AND_5_YEARS"
    * ``(180, 365]``           -> "BETWEEN_6_MONTHS_AND_1_YEAR"
    * ``<= 180`` and not null  -> "LESS_THAN_6_MONTH"
    * anything else            -> "UNKNOWN"

    NOTE(review): the thresholds suggest the columns hold day counts (as
    produced by ``get_date_transforms``) — confirm before using on raw dates.

    Args:
        dataframe: Frame whose column names are inspected; it is not modified.

    Returns:
        The list of expressions, in column order.
    """
    half_year = 180
    one_year = 365
    five_years = 365 * 5

    def bucketize(name: str) -> pl.Expr:
        days = pl.col(name)
        return (
            pl.when(days > five_years)
            .then(pl.lit("MORE_THAN_5_YEARS"))
            .when((days <= five_years) & (days > one_year))
            .then(pl.lit("BETWEEN_1_AND_5_YEARS"))
            .when((days <= one_year) & (days > half_year))
            .then(pl.lit("BETWEEN_6_MONTHS_AND_1_YEAR"))
            .when((days <= half_year) & (days.is_not_null()))
            .then(pl.lit("LESS_THAN_6_MONTH"))
            .otherwise(pl.lit("UNKNOWN"))
            .alias(name)
        )

    return [bucketize(name) for name in dataframe.columns if name.endswith("D")]


def get_date_transforms(dataframe: pl.LazyFrame) -> list[pl.Expr]:
    """Build expressions that turn every "...D" column into a day difference.

    For each column whose name ends with "D", the expression casts both
    ``date_decision`` and the column to ``pl.Date`` and yields
    ``date_decision - column`` in whole days, aliased back to the column name.

    Args:
        dataframe: Frame whose column names are inspected; it must also
            contain a ``date_decision`` column when the expressions are run.

    Returns:
        The list of expressions, in column order.
    """
    decision_date = pl.col("date_decision").cast(pl.Date)
    return [
        (decision_date - pl.col(name).cast(pl.Date)).dt.total_days().alias(name)
        for name in dataframe.columns
        if name.endswith("D")
    ]


def get_group_by_transforms(dataframe: pl.LazyFrame, group_by_cols: list[str]) -> list[pl.Expr]:
    """Pick one aggregation expression per non-key column, based on its dtype.

    * numeric columns -> mean, then ``shrink_dtype()``
    * string columns  -> first non-null value of ``mode()``
    * date columns    -> first value
    * any other dtype -> no expression (the column is left out)

    Args:
        dataframe: Frame whose schema (names and dtypes) is inspected.
        group_by_cols: Grouping keys; these columns are skipped.

    Returns:
        Expressions suitable for ``group_by(...).agg(...)``, in column order.
    """
    transforms: list[pl.Expr] = []
    for col, col_dtype in zip(dataframe.columns, dataframe.dtypes):
        if col in group_by_cols:
            continue

        # `is_numeric()` and `==` work whether the schema hands back dtype
        # classes or dtype instances, and avoid the deprecated
        # `pl.NUMERIC_DTYPES` constant.
        if col_dtype.is_numeric():
            transforms.append(pl.col(col).mean().shrink_dtype())

        elif col_dtype == pl.String:
            transforms.append(pl.col(col).mode().drop_nulls().first())

        elif col_dtype == pl.Date:
            transforms.append(pl.col(col).first())

    return transforms


def get_columns_with_more_85_perc_nulls(dataframe: pl.LazyFrame, threshold: float = 0.85) -> list[str]:
    """Return the columns whose fraction of null values exceeds ``threshold``.

    Only a single-row aggregate (null count / row count per column) is
    collected, so the full frame is never materialised and no unrelated
    ``describe`` statistics are computed.

    Args:
        dataframe: Lazy frame to inspect.
        threshold: Exclusive lower bound on the null fraction (default 0.85).

    Returns:
        Column names, in frame order, with ``null fraction > threshold``.
        A frame without rows or without columns yields an empty list.
    """
    # Both wildcards expand to the same column, giving one ratio per column.
    null_fractions = dataframe.select(pl.all().null_count() / pl.all().len()).collect()
    if null_fractions.height == 0:
        return []

    fractions_by_column = null_fractions.row(0, named=True)
    # A 0 / 0 ratio (frame with no rows) is NaN, which compares False below.
    return [
        name
        for name, fraction in fractions_by_column.items()
        if fraction is not None and fraction > threshold
    ]


def binarize_columns_with_more_85_perc_nulls(columns: list[str]) -> list[pl.Expr]:
    """Build presence flags for the given columns.

    Each expression replaces the column (same alias) with a boolean that is
    ``False`` where the original value is null and ``True`` otherwise.

    Args:
        columns: Names of the columns to convert.

    Returns:
        One expression per name, in the given order.
    """
    return [pl.col(name).is_not_null().alias(name) for name in columns]


## Load data

In [4]:
# Lazily scan the inputs. Only the case id and decision date are kept from the base table.
base_train: pl.LazyFrame = pl.scan_parquet(train_paths.get("train_base")).select("case_id", "date_decision")
persons: pl.LazyFrame = pl.scan_parquet(train_paths.get("train_person_2"))
applprev: pl.LazyFrame = pl.scan_parquet(train_paths.get("train_applprev_2"))
# NOTE(review): range(10) below would cover suffixes _0 .. _9 only, while the file
# listing printed by the first cell also contains train_credit_bureau_a_2_10.
# credit_bureau: pl.LazyFrame = pl.concat([pl.scan_parquet(train_paths.get(f"train_credit_bureau_a_2_{i}")) for i in range(10)])

In [5]:
JOIN_COLUMNS = ["case_id", "num_group1", "num_group2"]

# Left-join so every base case is kept: persons are attached per case, then
# previous applications are matched on the case and both group indices.
# NOTE(review): this assumes num_group1 / num_group2 identify the same
# entities in person_2 and applprev_2 — confirm against the data description.
data = (
    base_train
    .join(persons, on="case_id", how="left")
    .join(applprev, on=JOIN_COLUMNS, how="left")
)

In [6]:
# cols_to_binarize = get_columns_with_more_85_perc_nulls(data)
# 
# data = data.with_columns(*binarize_columns_with_more_85_perc_nulls(cols_to_binarize))

In [7]:
# Peek at the rows that actually carry an employment start date.
(
    data
    .select("case_id", "empls_employedfrom_796D")
    .drop_nulls()
    .collect()
)

case_id,empls_employedfrom_796D
i64,str
22,"""2018-06-15"""
246,"""2011-08-15"""
987,"""1994-05-15"""
1128,"""2013-01-15"""
1217,"""2014-09-15"""
1277,"""2010-06-15"""
1463,"""2018-09-04"""
1527,"""2015-02-15"""
1618,"""2008-06-15"""
1620,"""2007-09-15"""


In [8]:
# Frequency of each non-null contact type, most common first.
(
    data
    .select("conts_type_509L")
    .drop_nulls()
    .collect()
    .to_series()
    .value_counts()
    .sort("count", descending=True)
)

conts_type_509L,count
str,u32
"""PRIMARY_MOBILE…",976126
"""PHONE""",118393
"""HOME_PHONE""",62356
"""EMPLOYMENT_PHO…",41769
"""PRIMARY_EMAIL""",3567
"""SECONDARY_MOBI…",1240
"""ALTERNATIVE_PH…",109
"""WHATSAPP""",1


In [9]:
# Show how contact types are spread over the two group indices of each case.
(
    data
    .select("case_id", "num_group1", "num_group2", "conts_type_509L")
    .drop_nulls()
    .sort("case_id")
    .collect()
)

case_id,num_group1,num_group2,conts_type_509L
i64,i64,i64,str
5,0,0,"""PRIMARY_MOBILE…"
6,0,0,"""PRIMARY_MOBILE…"
6,1,0,"""PRIMARY_MOBILE…"
6,1,1,"""PHONE"""
10,0,0,"""PRIMARY_MOBILE…"
13,0,0,"""PRIMARY_MOBILE…"
14,0,0,"""PRIMARY_MOBILE…"
16,0,0,"""PRIMARY_MOBILE…"
17,0,0,"""PRIMARY_MOBILE…"
18,0,0,"""PRIMARY_MOBILE…"


In [10]:
# Collapse to one row per case: keep the most frequent non-null employment
# start date. `mode()` can return several values when there is a tie, in no
# guaranteed order, so sort first to make `last()` deterministic (it then
# picks the greatest of the tied values).
data: pl.LazyFrame = data.group_by(["case_id", "date_decision"]).agg(
    pl.col("empls_employedfrom_796D").drop_nulls().mode().sort().last()
)

In [11]:
# Replace every "...D" column by its distance in days to the decision date,
# after which the decision date itself is no longer needed.
data = data.with_columns(get_date_transforms(data)).drop("date_decision")
# data = data.with_columns(*get_date_categorized_transforms(data))
# data = data.with_columns(pl.col(pl.Boolean).fill_null(False).cast(pl.Int8))
# data = data.with_columns(
#     # pl.col("target").cast(pl.Boolean),
#     # pl.col("num_group1").cast(pl.Int16),
#     # pl.col("num_group2").cast(pl.Int16),
# ).drop("date_decision")

# data.sort("case_id").head().collect()

In [12]:
# `fill_nulls` comes from the project-local `columns_fill_nulls` module, which
# is imported together with the other imports at the top of the notebook.
data = fill_nulls(data)

In [13]:
def save_dataframe_to_disk(group_by_col: str, num_percentiles: int = 10, depth: int = 2) -> None:
    """Write the module-level ``data`` frame to one parquet file, slice by slice.

    ``data`` is cut into consecutive ranges of ``group_by_col`` whose
    boundaries are the ``0/n, 1/n, ..., (n-1)/n`` percentiles plus the maximum
    (``n = num_percentiles``). Each range is collected and written on its own
    to ``../data/datalake/silver/depth_{depth}/slice_{i}.parquet``; the slices
    are then merged into ``../data/datalake/silver/depth_{depth}.parquet`` and
    the temporary slice files and their directory are removed.

    Args:
        group_by_col: Column used both to compute the boundaries and to filter
            the slices.
        num_percentiles: Number of percentile boundaries (and slices).
        depth: Used only to build the output directory and file names.
    """
    SLICE_OUTPUT_BASE_DIR: pathlib.Path = pathlib.Path(f"../data/datalake/silver/depth_{depth}")
    SLICE_OUTPUT_BASE_DIR.mkdir(parents=True, exist_ok=True)

    percentiles = [x / num_percentiles for x in range(num_percentiles)]
    is_boundary_row = (pl.col("statistic").str.contains("%")) | (pl.col("statistic").str.contains("max"))
    # Boundaries are computed on the same column that is used for slicing.
    describe: dict[str, list[str | float]] = (
        data.select(group_by_col)
        .describe(percentiles=percentiles)
        .filter(is_boundary_row)
        .to_dict(as_series=False)
    )
    boundaries = sorted(describe[group_by_col])

    key = pl.col(group_by_col)
    last_index = len(boundaries) - 2
    for i in range(len(boundaries) - 1):
        lower, upper = boundaries[i], boundaries[i + 1]
        # Slices are half-open [lower, upper) so no row is written twice; the
        # last one is closed so rows equal to the maximum are not lost.
        within_upper = (key <= upper) if i == last_index else (key < upper)
        slice_data: pl.LazyFrame = data.filter((key >= lower) & within_upper)
        # slice_data.group_by(group_by_col).agg(*get_group_by_transforms(data, [group_by_col])).collect().write_parquet(SLICE_OUTPUT_BASE_DIR / f"slice_{i}.parquet")
        slice_data.collect().write_parquet(SLICE_OUTPUT_BASE_DIR / f"slice_{i}.parquet")

    # merge files and delete slices
    pl.scan_parquet(SLICE_OUTPUT_BASE_DIR / "slice_*.parquet").collect().write_parquet(f"../data/datalake/silver/depth_{depth}.parquet")
    for path in pathlib.Path(SLICE_OUTPUT_BASE_DIR).glob("slice*.parquet"):
        path.unlink(missing_ok=True)

    SLICE_OUTPUT_BASE_DIR.rmdir()

# Writing to disk is switched off; flip the flag to export the frame.
SAVE_TO_DISK = False
if SAVE_TO_DISK:
    save_dataframe_to_disk(group_by_col="case_id")