In [2]:
import pathlib
from collections.abc import Collection

import plotly.express as px
import polars as pl

In [3]:
# Training parquet files are resolved relative to the *parent* of the current
# working directory, so these paths depend on where the kernel was started.
BASE_PATH = pathlib.Path.cwd().parent.joinpath("data", "raw", "parquet_files", "train")

# Debit-card table explored in this notebook.
DEBITCARD_PATH = BASE_PATH.joinpath("train_debitcard_1.parquet")

In [4]:
def get_transforms(
    data: pl.LazyFrame,
    median_fill_cols: Collection[str] = (),
    false_fill_cols: Collection[str] = (),
) -> list[pl.Expr]:
    """Build one cleaning expression per column of ``data``, chosen by name and dtype.

    The first matching rule wins:

    1. Name ends with ``"D"``: replaced by a Boolean flag that is ``True`` where
       the original value is present and ``False`` where it is null.
    2. String dtype: nulls become ``"UNKNOWN"``, then cast to ``Categorical``.
    3. ``Float64`` / ``Float32`` dtype: nulls filled with the column median,
       then cast to ``Float32``.
    4. Name listed in ``median_fill_cols``: nulls filled with the column median,
       then ``shrink_dtype()``.
    5. Name listed in ``false_fill_cols``: nulls filled with ``False``,
       then ``shrink_dtype()``.

    Columns matching none of the rules get no expression.

    Args:
        data: Frame whose schema drives the rule selection.
        median_fill_cols: Column names for rule 4. Empty by default, which
            disables the rule (the previous hard-coded behaviour).
        false_fill_cols: Column names for rule 5. Empty by default, which
            disables the rule (the previous hard-coded behaviour).

    Returns:
        The expressions, in schema order; each keeps its source column's name.
    """
    # Resolve the schema a single time instead of once for the names and once
    # for the dtypes. `collect_schema` is used when the installed polars offers
    # it; otherwise fall back to the `.schema` mapping.
    schema = data.collect_schema() if hasattr(data, "collect_schema") else data.schema

    transforms: list[pl.Expr] = []
    for col, dtype in schema.items():
        if col.endswith("D"):
            # Presence flag: True where a value exists, False where it is null.
            transforms.append(pl.col(col).is_not_null())
        elif dtype == pl.String:
            transforms.append(
                pl.col(col).fill_null("UNKNOWN").cast(pl.Categorical)
            )
        elif dtype in (pl.Float64, pl.Float32):
            transforms.append(
                pl.col(col).fill_null(pl.col(col).median()).cast(pl.Float32)
            )
        elif col in median_fill_cols:
            transforms.append(
                pl.col(col).fill_null(pl.col(col).median()).shrink_dtype()
            )
        elif col in false_fill_cols:
            transforms.append(
                pl.col(col).fill_null(pl.lit(False)).shrink_dtype()
            )
    return transforms


In [6]:
# Lazily scan the debit-card parquet file; nothing is read until `.collect()`.
debitcard_data: pl.LazyFrame = pl.scan_parquet(DEBITCARD_PATH)

# Preview the first five rows ordered by case and by position within the case.
debitcard_data.sort(["case_id", "num_group1"]).head(5).collect()

case_id,last180dayaveragebalance_704A,last180dayturnover_1134A,last30dayturnover_651A,num_group1,openingdate_857D
i64,f64,f64,f64,i64,str
225,,,,0,"""2016-08-16"""
331,,,,0,"""2015-03-19"""
358,,,,0,"""2014-09-02"""
390,,,,0,"""2014-07-23"""
390,,,,1,"""2015-10-01"""


In [9]:
# Highest `num_group1` value per case, largest first.
(
    debitcard_data
    .group_by("case_id")
    .agg(pl.max("num_group1"))
    .sort(by="num_group1", descending=True)
    .collect()
)

case_id,num_group1
i64,i64
1377353,65
1494474,32
151842,31
783268,31
246503,31
1306349,31
1590262,28
216742,28
160829,28
1617931,28


In [16]:
# Drill into case 1377353 (the case with the largest `num_group1` in the
# per-case maxima) and keep only rows that report a 180-day average balance,
# highest balance first.
(
    debitcard_data
    .filter(
        pl.col("case_id").eq(1377353)
        & pl.col("last180dayaveragebalance_704A").is_not_null()
    )
    .sort("last180dayaveragebalance_704A", descending=True)
    .collect()
)

case_id,last180dayaveragebalance_704A,last180dayturnover_1134A,last30dayturnover_651A,num_group1,openingdate_857D
i64,f64,f64,f64,i64,str
1377353,16.052046,,,0,
