In [9]:
import pathlib
import polars as pl
import plotly.express as px

In [10]:
# Directory holding the raw training parquet files, resolved relative to the
# parent of the current working directory.
BASE_PATH = pathlib.Path.cwd().parent.joinpath("data", "raw", "parquet_files", "train")

# Parquet file with the "other" training table.
OTHER_PATH = BASE_PATH.joinpath("train_other_1.parquet")

In [11]:
def get_transforms(
    data: pl.LazyFrame,
    *,
    median_fill_cols: tuple[str, ...] = (),
    false_fill_cols: tuple[str, ...] = (),
) -> list[pl.Expr]:
    """Build one cleaning expression per recognised column of ``data``.

    Each column is checked against the rules below in order; the first rule
    that matches wins, and a column matching no rule gets no expression.

    1. Name ends with ``"D"``: replaced by a Boolean flag that is ``True``
       where the original value is non-null. (NOTE(review): presumably these
       are date columns -- confirm against the dataset's naming scheme.)
    2. ``pl.String`` dtype: nulls become ``"UNKNOWN"``, then the column is
       cast to ``pl.Categorical``.
    3. ``pl.Float32`` / ``pl.Float64`` dtype: nulls become the column median,
       then the column is cast to ``pl.Float32``.
    4. Name listed in ``median_fill_cols``: nulls become the column median,
       then ``shrink_dtype()`` is applied.
    5. Name listed in ``false_fill_cols``: nulls become ``False``, then
       ``shrink_dtype()`` is applied.

    Args:
        data: Frame whose schema drives the choice of expressions. Only the
            schema is inspected; no rows are collected here.
        median_fill_cols: Names of columns to handle with rule 4. Defaults to
            empty, in which case rule 4 never fires.
        false_fill_cols: Names of columns to handle with rule 5. Defaults to
            empty, in which case rule 5 never fires.

    Returns:
        Expressions in schema order, each keeping the name of the column it
        was built from, suitable for ``data.with_columns(...)``.
    """
    median_names = frozenset(median_fill_cols)
    false_names = frozenset(false_fill_cols)

    # Resolve the schema once. Newer polars exposes `collect_schema()` and
    # warns about `.columns` / `.dtypes` on a LazyFrame; older versions only
    # have the `.schema` mapping.
    if hasattr(data, "collect_schema"):
        schema = data.collect_schema()
    else:
        schema = data.schema

    transforms: list[pl.Expr] = []
    for col, dtype in schema.items():
        if col.endswith("D"):
            # Presence flag: True where a value exists, False where it is null.
            transforms.append(pl.col(col).is_not_null())
        elif dtype == pl.String:
            transforms.append(
                pl.col(col).fill_null("UNKNOWN").cast(pl.Categorical)
            )
        elif dtype in (pl.Float64, pl.Float32):
            transforms.append(
                pl.col(col).fill_null(pl.col(col).median()).cast(pl.Float32)
            )
        elif col in median_names:
            transforms.append(
                pl.col(col).fill_null(pl.col(col).median()).shrink_dtype()
            )
        elif col in false_names:
            transforms.append(
                pl.col(col).fill_null(pl.lit(False)).shrink_dtype()
            )
    return transforms


In [12]:
# Lazily scan the parquet file; rows are only materialised by `.collect()`.
other_data: pl.LazyFrame = pl.scan_parquet(OTHER_PATH)

# Preview the first five rows ordered by case_id, then num_group1.
(
    other_data
    .sort(by=["case_id", "num_group1"])
    .head(5)
    .collect()
)

case_id,amtdebitincoming_4809443A,amtdebitoutgoing_4809440A,amtdepositbalance_4809441A,amtdepositincoming_4809444A,amtdepositoutgoing_4809442A,num_group1
i64,f64,f64,f64,f64,f64,i64
43801,12466.601,12291.2,914.2,0.0,304.80002,0
43991,3333.4001,3273.4001,0.0,0.0,0.0,0
44001,10000.0,10000.0,0.0,0.0,0.0,0
44053,0.0,0.0,2586.4001,0.0,88.8,0
44130,63.8,60.8,0.0,0.0,0.0,0


In [13]:
# Summary statistics for every column, at the quartile percentiles.
other_data.describe(percentiles=(0.25, 0.5, 0.75))

statistic,case_id,amtdebitincoming_4809443A,amtdebitoutgoing_4809440A,amtdepositbalance_4809441A,amtdepositincoming_4809444A,amtdepositoutgoing_4809442A,num_group1
str,f64,f64,f64,f64,f64,f64,f64
"""count""",51109.0,51109.0,51109.0,51109.0,51109.0,51109.0,51109.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",1419500.0,7552.901686,7462.384278,9967.412999,2949.3959,3586.875118,0.0
"""std""",924509.490925,34625.705832,35065.286854,89393.421442,41467.726075,48274.936439,0.0
"""min""",43801.0,0.0,0.0,-335718.0,0.0,0.0,0.0
"""25%""",242241.0,0.0,0.0,0.0,0.0,0.0,0.0
"""50%""",1811468.0,0.0,0.0,0.0,0.0,1.8000001,0.0
"""75%""",1916206.0,8000.0,7740.0,288.0,0.0,5.4,0.0
"""max""",2703453.0,4957852.0,5168004.5,4256314.5,4180150.5,4622917.5,0.0
