In [34]:
import pathlib
import polars as pl
import plotly.express as px

In [35]:
# Training parquet files live under <parent of the working directory>/data/raw/parquet_files/train.
BASE_PATH = pathlib.Path.cwd().parent.joinpath("data", "raw", "parquet_files", "train")

# Deposit table used throughout this notebook.
DEPOSIT_PATH = BASE_PATH.joinpath("train_deposit_1.parquet")

In [36]:
def get_transforms(
    data: pl.LazyFrame,
    *,
    median_fill_cols: frozenset[str] = frozenset(),
    false_fill_cols: frozenset[str] = frozenset(),
) -> list[pl.Expr]:
    """Build per-column cleaning expressions from the schema of ``data``.

    Each column is checked against the rules below in order; the first rule
    that matches wins, and a column matching no rule gets no expression.

    1. Name ends with ``"D"``: replaced by a Boolean presence flag
       (``True`` where the original value is non-null).
    2. ``pl.String`` dtype: nulls become ``"UNKNOWN"``, then the column is
       cast to ``pl.Categorical``.
    3. ``pl.Float64`` / ``pl.Float32`` dtype: nulls become the column median,
       then the column is cast to ``pl.Float32``.
    4. Name in ``median_fill_cols``: nulls become the column median, then
       ``shrink_dtype()`` is applied.
    5. Name in ``false_fill_cols``: nulls become ``False``, then
       ``shrink_dtype()`` is applied.

    Args:
        data: Frame whose schema drives the choice of expressions. Only the
            schema is inspected; no rows are collected here.
        median_fill_cols: Column names handled by rule 4. Any container that
            supports ``in`` works. Empty by default, so rule 4 is inactive
            unless names are supplied.
        false_fill_cols: Column names handled by rule 5. Same conventions as
            ``median_fill_cols``. A name listed in both is handled by rule 4.

    Returns:
        Expressions suitable for ``data.with_columns(...)``, each keeping the
        name of the column it transforms.
    """
    # Resolve the schema once. `collect_schema` is the non-deprecated accessor
    # on recent polars; fall back to `.schema` where it does not exist.
    schema = data.collect_schema() if hasattr(data, "collect_schema") else data.schema

    transforms: list[pl.Expr] = []
    for col, dtype in schema.items():
        if col.endswith("D"):
            # Presence flag; `is_not_null` already yields Boolean under the same name.
            transforms.append(pl.col(col).is_not_null())
        elif dtype == pl.String:
            transforms.append(
                pl.col(col).fill_null("UNKNOWN").cast(pl.Categorical)
            )
        elif dtype in (pl.Float64, pl.Float32):
            transforms.append(
                pl.col(col).fill_null(pl.col(col).median()).cast(pl.Float32)
            )
        elif col in median_fill_cols:
            transforms.append(
                pl.col(col).fill_null(pl.col(col).median()).shrink_dtype()
            )
        elif col in false_fill_cols:
            transforms.append(
                pl.col(col).fill_null(pl.lit(False)).shrink_dtype()
            )
    return transforms


In [37]:
# Build a lazy query over the deposit parquet file; rows are materialised by .collect().
deposit_data: pl.LazyFrame = pl.scan_parquet(DEPOSIT_PATH)

# Preview the first five rows ordered by (case_id, num_group1).
deposit_data.sort(["case_id", "num_group1"]).head(5).collect()

case_id,amount_416A,contractenddate_991D,num_group1,openingdate_313D
i64,f64,str,i64,str
225,0.0,,0,"""2016-08-16"""
331,260.374,"""2018-03-18""",0,"""2015-03-19"""
358,0.0,,0,"""2014-09-02"""
390,211748.53,"""2017-07-22""",0,"""2014-07-23"""
390,203.602,"""2017-09-30""",1,"""2015-10-01"""


In [38]:
# Summary statistics (count, null_count, mean, std, min, quartiles, max) for every column.
deposit_data.describe()

statistic,case_id,amount_416A,contractenddate_991D,num_group1,openingdate_313D
str,f64,f64,str,f64,str
"""count""",145086.0,145086.0,"""65404""",145086.0,"""145086"""
"""null_count""",0.0,0.0,"""79682""",0.0,"""0"""
"""mean""",1466200.0,8422.304482,,0.522531,
"""std""",886528.958911,86232.120476,,1.620954,
"""min""",225.0,-40000.0,"""2002-02-27""",0.0,"""2001-11-19"""
"""25%""",660041.0,0.0,,0.0,
"""50%""",1556939.0,223.658,,0.0,
"""75%""",2530539.0,478.34,,1.0,
"""max""",2703453.0,12213286.0,"""2020-03-18""",64.0,"""2017-07-31"""


In [39]:
# Flag rows that carry a contract end date (True when contractenddate_991D is non-null).
# NOTE(review): a non-null end date is treated as "ended" here; confirm that the
# date lying in the future is not a case that needs separate handling.
has_end_date = pl.col("contractenddate_991D").is_not_null()
deposit_data = deposit_data.with_columns(
    has_end_date.alias("is_deposit_contract_ended")
)

In [40]:
# Mean deposit amount per case, split by whether a contract end date is present.
# NOTE: this rebinds `deposit_data` to the aggregated frame, so re-running this
# cell without re-running the scan cell would average the already-averaged values.
deposit_data = deposit_data.group_by("case_id", "is_deposit_contract_ended").agg(
    pl.col("amount_416A").mean()
)

# group_by does not guarantee row order and one case can have both flag values,
# so sort on both keys to keep the preview deterministic.
deposit_data.sort("case_id", "is_deposit_contract_ended").head().collect()

case_id,is_deposit_contract_ended,amount_416A
i64,bool,f64
225,False,0.0
331,True,260.374
358,False,0.0
390,False,223.68001
390,True,105976.066
