In [1]:
import pathlib
import polars as pl
import plotly.express as px

pl.Config.set_fmt_float("full")

polars.config.Config

In [2]:
BASE_PATH = pathlib.Path.cwd().parent / "data" / "raw" / "parquet_files" / "train"

BASE_TRAIN_PATH = BASE_PATH / "train_base.parquet"

APPLPREV0_PATH = BASE_PATH / "train_applprev_1_0.parquet"
APPLPREV1_PATH = BASE_PATH / "train_applprev_1_1.parquet"

DEBITCARD_PATH = BASE_PATH / "train_debitcard_1.parquet"
DEPOSIT_PATH = BASE_PATH / "train_deposit_1.parquet"
OTHER_PATH = BASE_PATH / "train_other_1.parquet"
PERSON_PATH = BASE_PATH / "train_person_1.parquet"

## Functions

In [3]:
def get_date_categorized_transforms(dataframe: pl.LazyFrame) -> list[pl.Expr]:
    date_transforms = []
    
    for col in dataframe.columns:
        if not col.endswith("D"):
            continue

        transform = (
            pl.when(pl.col(col) > 365 * 5).then(pl.lit("MORE_THAN_5_YEARS")).otherwise(
                pl.when((pl.col(col) <= 365 * 5) & (pl.col(col) > 365)).then(pl.lit("BETWEEN_1_AND_5_YEARS")).otherwise(
                    pl.when((pl.col(col) <= 365) & (pl.col(col) > 180)).then(pl.lit("BETWEEN_6_MONTHS_AND_1_YEAR")).otherwise(
                        pl.when((pl.col(col) <= 180) & (pl.col(col).is_not_null())).then(pl.lit("LESS_THAN_6_MONTH")).otherwise(pl.lit("UNKNOWN"))
                    )
                )
            ).alias(col))
        date_transforms.append(transform)
        
    return date_transforms


def get_date_transforms(dataframe: pl.LazyFrame) -> list[pl.Expr]:
    date_transforms = []
    for col in dataframe.columns:
        if col == "date_decision":
            date_transforms.append(pl.col(col).cast(pl.Date))
            date_transforms.extend([
                ((pl.col("date_decision").cast(pl.Date) - pl.col("birth_259D").cast(pl.Date)).dt.total_days() // 365).alias("age"),
                ((pl.col("date_decision").cast(pl.Date) - pl.col("birth_259D").cast(pl.Date)).dt.total_days() % 365).sin().alias("sin(age_days)"),
                ((pl.col("date_decision").cast(pl.Date) - pl.col("birth_259D").cast(pl.Date)).dt.total_days() % 365).cos().alias("cos(age_days)")
            ])
        
        if not col.endswith("D"):
            continue
        
        date_transforms.append(
            (pl.col("date_decision").cast(pl.Date) - pl.col(col).cast(pl.Date)).dt.total_days().alias(col)
        )
        
    return date_transforms


def get_group_by_transforms(dataframe: pl.LazyFrame, group_by_col: str) -> list[pl.Expr]:
    transforms = []
    for col, col_dtype in zip(dataframe.columns, dataframe.dtypes):
        if col == group_by_col:
            continue
        
        if isinstance(col_dtype, tuple(pl.NUMERIC_DTYPES)):
            transforms.append(pl.col(col).mean().shrink_dtype())
            
        elif isinstance(col_dtype, pl.String):
            transforms.append(pl.col(col).mode().drop_nulls().first())

        elif isinstance(col_dtype, pl.Date):
            transforms.append(pl.col(col).first())
        
    return transforms


def get_columns_with_more_85_perc_nulls(dataframe: pl.LazyFrame, threshold: float = 0.85) -> list[str]:
    dataframe = dataframe.collect()
    length_dataframe: int = dataframe.shape[0]

    describe = dataframe.describe(percentiles=[]).filter(pl.col("statistic") == "null_count")
    describe = describe.transpose(include_header=True, header_name="column", column_names=["null_count"]).slice(1).with_columns(pl.col("null_count").cast(pl.Float32) / length_dataframe)
    
    return describe.filter(pl.col("null_count") > threshold).select("column").to_series().to_list()
    
    
def binarize_columns_with_more_85_perc_nulls(columns: list[str]) -> list[pl.Expr]:
    transforms = []
    for col in columns:
        transforms.append(
            pl.when(pl.col(col).is_null()).then(pl.lit(False)).otherwise(pl.lit(True)).alias(col)
        )
    return transforms


## Load data

In [4]:
JOIN_COLS = ["case_id", "num_group1"]

In [5]:
base_train: pl.LazyFrame = pl.scan_parquet(BASE_TRAIN_PATH).select(["case_id", "date_decision", "target"])
applprev_data: pl.LazyFrame = pl.concat([pl.scan_parquet(APPLPREV0_PATH), pl.scan_parquet(APPLPREV1_PATH)]).drop(["childnum_21L", "education_1138M", "familystate_726L"])
# debitcard_data: pl.LazyFrame = pl.scan_parquet(DEBITCARD_PATH)
deposit_data: pl.LazyFrame = pl.scan_parquet(DEPOSIT_PATH)
other_data: pl.LazyFrame = pl.scan_parquet(OTHER_PATH)
person_data: pl.LazyFrame = pl.scan_parquet(PERSON_PATH).select([
    "case_id",
    "num_group1",
    "birth_259D",
    "incometype_1044T",
    "role_1084L",
    "empl_employedfrom_271D",
    "empl_industry_691L",
    "mainoccupationinc_384A",
])

data: pl.LazyFrame = (applprev_data
        .join(deposit_data, how="left", on=JOIN_COLS)
        .join(other_data, how="left", on=JOIN_COLS)
        .join(person_data, how="left", on=JOIN_COLS))

data = base_train.select("case_id", "date_decision").join(data, how="left", on="case_id")

data.sort(JOIN_COLS).head().collect()

case_id,date_decision,actualdpd_943P,annuity_853A,approvaldate_319D,byoccupationinc_3656910L,cancelreason_3545846M,creationdate_885D,credacc_actualbalance_314A,credacc_credlmt_575A,credacc_maxhisbal_375A,credacc_minhisbal_90A,credacc_status_367L,credacc_transactions_402L,credamount_590A,credtype_587L,currdebt_94A,dateactivated_425D,district_544M,downpmt_134A,dtlastpmt_581D,dtlastpmtallstes_3545839D,employedfrom_700D,firstnonzeroinstldate_307D,inittransactioncode_279L,isbidproduct_390L,isdebitcard_527L,mainoccupationinc_437A,maxdpdtolerance_577P,num_group1,outstandingdebt_522A,pmtnum_8L,postype_4733339M,profession_152M,rejectreason_755M,rejectreasonclient_4145042M,revolvingaccount_394A,status_219L,tenor_203L,amount_416A,contractenddate_991D,openingdate_313D,amtdebitincoming_4809443A,amtdebitoutgoing_4809440A,amtdepositbalance_4809441A,amtdepositincoming_4809444A,amtdepositoutgoing_4809442A,birth_259D,incometype_1044T,role_1084L,empl_employedfrom_271D,empl_industry_691L,mainoccupationinc_384A
i64,str,f64,f64,str,f64,str,str,f64,f64,f64,f64,str,f64,f64,str,f64,str,str,f64,str,str,str,str,str,bool,bool,f64,f64,i64,f64,f64,str,str,str,str,f64,str,f64,f64,str,str,f64,f64,f64,f64,f64,str,str,str,str,str,f64
0,"""2019-01-03""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,"""2019-01-03""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,"""2019-01-04""",0.0,640.2,,,"""a55475b1""","""2013-04-03""",,0.0,,,,,10000.0,"""CAL""",,,"""P136_108_173""",0.0,,,"""2010-02-15""","""2013-05-04""","""CASH""",False,,8200.0,,0.0,,24.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""D""",24.0,,,,,,,,,"""1974-12-01""","""EMPLOYED""","""EM""","""2010-02-15""","""OTHER""",14000.0
2,"""2019-01-04""",0.0,1682.4,,,"""a55475b1""","""2013-04-03""",,0.0,,,,,16000.0,"""CAL""",,,"""P136_108_173""",0.0,,,"""2010-02-15""","""2013-05-04""","""CASH""",False,,8200.0,,1.0,,12.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""D""",12.0,,,,,,,,,,,"""CL""",,,
3,"""2019-01-03""",0.0,6140.0,,,"""P94_109_143""","""2019-01-07""",,0.0,,,,,59999.8,"""CAL""",,,"""P131_33_167""",0.0,,,"""2018-05-15""","""2019-02-07""","""CASH""",False,,11000.0,,0.0,,12.0,"""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""",,"""D""",12.0,,,,,,,,,"""1993-08-01""","""EMPLOYED""","""CL""","""2018-05-15""","""OTHER""",10000.0


In [6]:
cols_to_binarize = get_columns_with_more_85_perc_nulls(data)

data = data.with_columns(*binarize_columns_with_more_85_perc_nulls(cols_to_binarize))

In [7]:
data = data.with_columns(*get_date_transforms(data)).drop("birth_259D")
data = data.with_columns(*get_date_categorized_transforms(data))
data = data.with_columns(pl.col(pl.Boolean).fill_null(False).cast(pl.Int8))
data = data.with_columns(
    # pl.col("target").cast(pl.Boolean),
    pl.col("age").cast(pl.Int16),
    pl.col("num_group1").cast(pl.Int16),
)

data.sort("case_id").head().collect()

case_id,date_decision,actualdpd_943P,annuity_853A,approvaldate_319D,byoccupationinc_3656910L,cancelreason_3545846M,creationdate_885D,credacc_actualbalance_314A,credacc_credlmt_575A,credacc_maxhisbal_375A,credacc_minhisbal_90A,credacc_status_367L,credacc_transactions_402L,credamount_590A,credtype_587L,currdebt_94A,dateactivated_425D,district_544M,downpmt_134A,dtlastpmt_581D,dtlastpmtallstes_3545839D,employedfrom_700D,firstnonzeroinstldate_307D,inittransactioncode_279L,isbidproduct_390L,isdebitcard_527L,mainoccupationinc_437A,maxdpdtolerance_577P,num_group1,outstandingdebt_522A,pmtnum_8L,postype_4733339M,profession_152M,rejectreason_755M,rejectreasonclient_4145042M,revolvingaccount_394A,status_219L,tenor_203L,amount_416A,contractenddate_991D,openingdate_313D,amtdebitincoming_4809443A,amtdebitoutgoing_4809440A,amtdepositbalance_4809441A,amtdepositincoming_4809444A,amtdepositoutgoing_4809442A,incometype_1044T,role_1084L,empl_employedfrom_271D,empl_industry_691L,mainoccupationinc_384A,age,sin(age_days),cos(age_days)
i64,date,f64,f64,str,f64,str,str,i8,f64,i8,i8,i8,i8,f64,str,f64,str,str,f64,str,str,str,str,str,i8,i8,f64,f64,i16,f64,f64,str,str,str,str,i8,str,f64,i8,str,str,i8,i8,i8,i8,i8,str,str,str,i8,f64,i16,f64,f64
0,2019-01-03,,,"""UNKNOWN""",,,"""UNKNOWN""",0,,0,0,0,0,,,,"""UNKNOWN""",,,"""UNKNOWN""","""UNKNOWN""","""UNKNOWN""","""UNKNOWN""",,0,0,,,,,,,,,,0,,,0,"""MORE_THAN_5_YE…","""MORE_THAN_5_YE…",0,0,0,0,0,,,"""MORE_THAN_5_YE…",0,,,,
1,2019-01-03,,,"""UNKNOWN""",,,"""UNKNOWN""",0,,0,0,0,0,,,,"""UNKNOWN""",,,"""UNKNOWN""","""UNKNOWN""","""UNKNOWN""","""UNKNOWN""",,0,0,,,,,,,,,,0,,,0,"""MORE_THAN_5_YE…","""MORE_THAN_5_YE…",0,0,0,0,0,,,"""MORE_THAN_5_YE…",0,,,,
2,2019-01-04,0.0,640.2,"""UNKNOWN""",,"""a55475b1""","""MORE_THAN_5_YE…",0,0.0,0,0,0,0,10000.0,"""CAL""",,"""UNKNOWN""","""P136_108_173""",0.0,"""UNKNOWN""","""UNKNOWN""","""MORE_THAN_5_YE…","""MORE_THAN_5_YE…","""CASH""",0,0,8200.0,,0.0,,24.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",0,"""D""",24.0,0,"""MORE_THAN_5_YE…","""MORE_THAN_5_YE…",0,0,0,0,0,"""EMPLOYED""","""EM""","""MORE_THAN_5_YE…",1,14000.0,44.0,0.8509035245341184,0.5253219888177297
2,2019-01-04,0.0,1682.4,"""UNKNOWN""",,"""a55475b1""","""MORE_THAN_5_YE…",0,0.0,0,0,0,0,16000.0,"""CAL""",,"""UNKNOWN""","""P136_108_173""",0.0,"""UNKNOWN""","""UNKNOWN""","""MORE_THAN_5_YE…","""MORE_THAN_5_YE…","""CASH""",0,0,8200.0,,1.0,,12.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",0,"""D""",12.0,0,"""MORE_THAN_5_YE…","""MORE_THAN_5_YE…",0,0,0,0,0,,"""CL""","""MORE_THAN_5_YE…",0,,,,
3,2019-01-03,0.0,6140.0,"""UNKNOWN""",,"""P94_109_143""","""LESS_THAN_6_MO…",0,0.0,0,0,0,0,59999.8,"""CAL""",,"""UNKNOWN""","""P131_33_167""",0.0,"""UNKNOWN""","""UNKNOWN""","""BETWEEN_6_MONT…","""LESS_THAN_6_MO…","""CASH""",0,0,11000.0,,0.0,,12.0,"""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""",0,"""D""",12.0,0,"""MORE_THAN_5_YE…","""MORE_THAN_5_YE…",0,0,0,0,0,"""EMPLOYED""","""CL""","""MORE_THAN_5_YE…",1,10000.0,25.0,-0.702407785577371,-0.7117747556357236


In [8]:
data: pl.LazyFrame = data.group_by("case_id").agg(*get_group_by_transforms(data, "case_id"))

In [None]:
data.collect().write_parquet(BASE_PATH / "train_depth_1_preprocessed.parquet")

In [ ]:
10 / 0

In [None]:
data.select("case_id", *[col for col in data.columns if col.endswith("D") or "age" in col]).head().collect()

## Analyze date columns for aggregation

In [None]:
date_columns = data.select("case_id", *[col for col in data.columns if col.endswith("D")]).group_by("case_id").agg(    
    *[pl.col(col).mean() for col in data.columns if col.endswith("D") ]
).sort("case_id").collect()

In [None]:
date_columns = date_columns.with_columns(
    *get_date_transforms(data)
)

## Analyze Masked columns for aggregation

In [None]:
masked_columns_data: pl.LazyFrame = data.select("case_id", *[col for col in data.columns if col.endswith("M")])
masked_columns_data.sort("case_id").head().collect()

### La moda puo' ritornare piu' valori, quindi si sceglie il primo

In [None]:
masked_columns_data.group_by("case_id").agg(
    *[pl.col(col).mode().drop_nulls().first() for col in data.columns if col.endswith("M")]
).collect()

## Analyze A columns for aggregation
Basta fare la media

In [None]:
amount_columns_data: pl.LazyFrame = data.select("case_id", *[col for col in data.columns if col.endswith("A")]).cast(pl.Float32)
amount_columns_data.group_by("case_id").mean().collect()

## Analyze P columns for aggregation
Sono float, basta fare la media

In [None]:
dpd_columns_data: pl.LazyFrame = data.select("case_id", *[col for col in data.columns if col.endswith("P")])
print(dpd_columns_data.dtypes)
dpd_columns_data.group_by("case_id").mean().collect()

## Analyze T columns for aggregation
Calcolare la moda, rimuovere i null e prendere il primo valore

In [None]:
t_columns_data: pl.LazyFrame = data.select("case_id", *[col for col in data.columns if col.endswith("T")])
t_columns_data.group_by("case_id").agg(
    *[pl.col(col).mode().drop_nulls().first() for col in data.columns if col.endswith("T")]
).sort("case_id").collect()

## Analyze L columns for aggregation
Calcolare la moda, rimuovere i null e prendere il primo valore

- Colonne string, calcolare la moda, rimuovere i null e prendere il primo valore
- int e float calcolare la media o la mediana
- boolean, si puo' calcolare any e all, ma prima bisogna riempire i null altrimenti vengono considerati come true

In [None]:
l_columns_data: pl.LazyFrame = data.select("case_id", *[col for col in data.columns if col.endswith("L")])
l_columns_data.dtypes

In [None]:
l_columns_data.select("case_id", pl.col(pl.Boolean)).fill_null(False).group_by("case_id").agg(
    *[pl.col(col).all() for col in l_columns_data.columns if col.startswith("is")]
).sort("case_id").head().collect()

In [None]:
l_columns_data.select("case_id", pl.col(pl.Boolean)).sort("case_id").head().collect()

In [None]:
get_group_by_transforms(l_columns_data)