In [29]:
import pathlib
import polars as pl
import plotly.express as px

In [30]:
# Training parquet files are looked up relative to the parent of the current
# working directory: <cwd>/../data/raw/parquet_files/train
BASE_PATH = pathlib.Path.cwd().parent.joinpath("data", "raw", "parquet_files", "train")

# Person table parquet file inside the training directory.
PERSON_PATH = BASE_PATH.joinpath("train_person_1.parquet")

In [31]:
def get_transforms(
    data: pl.LazyFrame,
    median_fill_cols: tuple[str, ...] = (),
    false_fill_cols: tuple[str, ...] = (),
) -> list[pl.Expr]:
    """Build one cleaning expression per recognised column of ``data``.

    Each column is matched against the rules below; the first matching rule
    wins and columns matching no rule get no expression at all:

    1. Name ends with ``"D"``: replaced by a Boolean flag that is ``True``
       where the original value is non-null and ``False`` where it is null.
    2. ``pl.String`` dtype: nulls become ``"UNKNOWN"``, then cast to
       ``pl.Categorical``.
    3. ``pl.Float64`` / ``pl.Float32`` dtype: nulls are filled with the
       column median, then cast to ``pl.Float32``.
    4. Name listed in ``median_fill_cols``: nulls are filled with the column
       median, then ``shrink_dtype()`` is applied.
    5. Name listed in ``false_fill_cols``: nulls are filled with ``False``,
       then ``shrink_dtype()`` is applied.

    Args:
        data: Frame whose schema (column names and dtypes) drives the rules.
        median_fill_cols: Column names handled by rule 4. Empty by default,
            in which case the rule never fires.
        false_fill_cols: Column names handled by rule 5. Empty by default,
            in which case the rule never fires.

    Returns:
        The expressions in schema order, each keeping its source column name.
    """
    # Resolve the schema once. ``collect_schema()`` is the non-deprecated way
    # to do this on a LazyFrame in polars >= 1.0; older releases only expose
    # the ``schema`` property, so fall back to it when the method is missing.
    schema = data.collect_schema() if hasattr(data, "collect_schema") else data.schema

    median_fill = frozenset(median_fill_cols)
    false_fill = frozenset(false_fill_cols)

    transforms: list[pl.Expr] = []
    for col, dtype in schema.items():
        if col.endswith("D"):
            # Presence flag: True where a value exists, False where it is null.
            transforms.append(pl.col(col).is_not_null().alias(col))
        elif dtype == pl.String:
            transforms.append(
                pl.col(col).fill_null("UNKNOWN").cast(pl.Categorical)
            )
        elif dtype in (pl.Float64, pl.Float32):
            transforms.append(
                pl.col(col).fill_null(pl.col(col).median()).cast(pl.Float32)
            )
        elif col in median_fill:
            transforms.append(
                pl.col(col).fill_null(pl.col(col).median()).shrink_dtype()
            )
        elif col in false_fill:
            transforms.append(
                pl.col(col).fill_null(pl.lit(False)).shrink_dtype()
            )
    return transforms


In [41]:
# Lazily scan the person parquet file, keeping only the columns listed below.
person_data: pl.LazyFrame = pl.scan_parquet(PERSON_PATH).select(
    "case_id",
    "num_group1",
    "birth_259D",
    "incometype_1044T",
    "role_1084L",
    "empl_employedfrom_271D",
    "empl_industry_691L",
    "mainoccupationinc_384A",
)

# Materialise only the first five rows as a quick preview.
person_data.head(5).collect()

case_id,num_group1,birth_259D,incometype_1044T,role_1084L,empl_employedfrom_271D,empl_industry_691L,mainoccupationinc_384A
i64,i64,str,str,str,str,str,f64
0,0,"""1986-07-01""","""SALARIED_GOVT""","""CL""","""2017-09-15""","""OTHER""",10800.0
0,1,,,"""EM""",,,
0,2,,,"""PE""",,,
0,3,,,"""PE""",,,
1,0,"""1957-08-01""","""SALARIED_GOVT""","""CL""","""2008-10-29""","""OTHER""",10000.0


## Analyze birth columns

`birthdate_87D` contiene valori null, mentre `birth_259D` non ne contiene; quindi `birthdate_87D` si può eliminare.

In [33]:
# Keep the case identifier plus every column whose name contains "birth".
birth_data = person_data.select(
    ["case_id"] + [name for name in person_data.columns if "birth" in name]
)

# Smallest birth_259D value per case, followed by summary statistics.
# birth_259D is the only birth column aggregated here; birthdate_87D is not
# part of this aggregation.
(
    birth_data
    .group_by("case_id")
    .agg(pl.col("birth_259D").min())
    .describe()
)

statistic,case_id,birth_259D
str,f64,str
"""count""",1526659.0,"""1526659"""
"""null_count""",0.0,"""0"""
"""mean""",1286100.0,
"""std""",718946.592285,
"""min""",0.0,"""1943-03-01"""
"""25%""",766198.0,
"""50%""",1357358.0,
"""75%""",1739023.0,
"""max""",2703454.0,"""1999-10-01"""
