In [46]:
import pathlib
import polars as pl
import plotly.express as px

# Display setting: select polars' "full" float format for rendered values.
# The call returns the Config class, which is what this cell echoes as its output.
pl.Config.set_fmt_float("full")

polars.config.Config

In [47]:
# Training parquet files live under <parent of the working directory>/data/raw/parquet_files/train.
BASE_PATH = pathlib.Path.cwd().parent.joinpath("data", "raw", "parquet_files", "train")

# The previous-application table is stored as two parquet shards.
APPLPREV0_PATH = BASE_PATH.joinpath("train_applprev_1_0.parquet")
APPLPREV1_PATH = BASE_PATH.joinpath("train_applprev_1_1.parquet")

In [48]:
def get_transforms(
    data: pl.LazyFrame,
    median_fill_cols: frozenset[str] = frozenset(),
    false_fill_cols: frozenset[str] = frozenset(),
) -> list[pl.Expr]:
    """Build the per-column cleaning expressions for ``data``.

    Each column is matched against the rules below, first match wins:

    1. name ends with ``"D"``: replaced by a Boolean flag that is ``True``
       where the original value is present and ``False`` where it is null;
    2. ``pl.String`` dtype: nulls become ``"UNKNOWN"``, then cast to
       ``pl.Categorical``;
    3. ``pl.Float64`` / ``pl.Float32`` dtype: nulls become the column median,
       then cast to ``pl.Float32``;
    4. name listed in ``median_fill_cols``: nulls become the column median,
       then ``shrink_dtype()``;
    5. name listed in ``false_fill_cols``: nulls become ``False``, then
       ``shrink_dtype()``.

    Columns matching no rule get no expression, so passing the result to
    ``with_columns`` leaves them untouched.

    Args:
        data: Lazy frame whose schema (column names and dtypes) drives the rules.
        median_fill_cols: Names of additional columns to median-fill (rule 4).
            Defaults to empty, which reproduces the previous behaviour where
            this rule never fired.
        false_fill_cols: Names of additional columns to fill with ``False``
            (rule 5). Defaults to empty, as above.

    Returns:
        One expression per matched column, in schema order. Every expression
        keeps the name of the column it was built from.
    """
    transforms: list[pl.Expr] = []
    # Resolve the schema once; it yields (name, dtype) pairs in column order.
    for col, dtype in data.schema.items():
        if col.endswith("D"):
            # Presence flag: equivalent to when(is_null).then(False).otherwise(True).
            transforms.append(pl.col(col).is_not_null())
        elif dtype == pl.String:
            transforms.append(
                pl.col(col).fill_null("UNKNOWN").cast(pl.Categorical)
            )
        elif dtype in (pl.Float64, pl.Float32):
            transforms.append(
                pl.col(col).fill_null(pl.col(col).median()).cast(pl.Float32)
            )
        elif col in median_fill_cols:
            transforms.append(
                pl.col(col).fill_null(pl.col(col).median()).shrink_dtype()
            )
        elif col in false_fill_cols:
            transforms.append(
                pl.col(col).fill_null(pl.lit(False)).shrink_dtype()
            )
    return transforms


## Load data

In [52]:
# Feature dictionary (columns "Variable" and "Description", per the outputs below).
# Located relative to the working directory, like the parquet paths, instead of
# through a machine-specific absolute path, so the notebook runs on other checkouts.
features_meaning: pl.DataFrame = pl.read_csv(
    pathlib.Path.cwd().parent / "data" / "raw" / "feature_definitions.csv"
)

In [54]:
# Lazily scan both shards of the previous-applications table.
applprev_0_data: pl.LazyFrame = pl.scan_parquet(APPLPREV0_PATH)
applprev_1_data: pl.LazyFrame = pl.scan_parquet(APPLPREV1_PATH)

# Stack the shards vertically and discard three columns that are not analysed here.
applprev_data: pl.LazyFrame = pl.concat(
    [applprev_0_data, applprev_1_data]
).drop(
    ["childnum_21L", "education_1138M", "familystate_726L"]
)

# Every column whose name ends in "D" is cast to a proper Date dtype.
applprev_data = applprev_data.with_columns(
    *[
        pl.col(name).cast(pl.Date)
        for name in applprev_data.columns
        if name.endswith("D")
    ]
)

In [53]:
# Descriptions of the features that exist as columns in applprev_data.
# NOTE(review): depends on applprev_data from the loading cell; run cells top to
# bottom so this listing reflects the columns dropped there.
features_meaning.filter(
    pl.col("Variable").is_in(applprev_data.columns)
)

Variable,Description
str,str
"""actualdpd_943P…","""Days Past Due …"
"""annuity_853A""","""Monthly annuit…"
"""approvaldate_3…","""Approval Date …"
"""byoccupationin…","""Applicant's in…"
"""cancelreason_3…","""Application ca…"
"""childnum_21L""","""Number of chil…"
"""creationdate_8…","""Date when prev…"
"""credacc_actual…","""Actual balance…"
"""credacc_credlm…","""Credit card cr…"
"""credacc_maxhis…","""Maximal histor…"


In [55]:
# Preview the first rows after ordering by case_id and then num_group1.
(
    applprev_data
    .sort("case_id", "num_group1")
    .head()
    .collect()
)

case_id,actualdpd_943P,annuity_853A,approvaldate_319D,byoccupationinc_3656910L,cancelreason_3545846M,creationdate_885D,credacc_actualbalance_314A,credacc_credlmt_575A,credacc_maxhisbal_375A,credacc_minhisbal_90A,credacc_status_367L,credacc_transactions_402L,credamount_590A,credtype_587L,currdebt_94A,dateactivated_425D,district_544M,downpmt_134A,dtlastpmt_581D,dtlastpmtallstes_3545839D,employedfrom_700D,firstnonzeroinstldate_307D,inittransactioncode_279L,isbidproduct_390L,isdebitcard_527L,mainoccupationinc_437A,maxdpdtolerance_577P,num_group1,outstandingdebt_522A,pmtnum_8L,postype_4733339M,profession_152M,rejectreason_755M,rejectreasonclient_4145042M,revolvingaccount_394A,status_219L,tenor_203L
i64,f64,f64,date,f64,str,date,f64,f64,f64,f64,str,f64,f64,str,f64,date,str,f64,date,date,date,date,str,bool,bool,f64,f64,i64,f64,f64,str,str,str,str,f64,str,f64
2,0,640.2,,,"""a55475b1""",2013-04-03,,0.0,,,,,10000.0,"""CAL""",,,"""P136_108_173""",0.0,,,2010-02-15,2013-05-04,"""CASH""",False,,8200,,0,,24.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""D""",24.0
2,0,1682.4,,,"""a55475b1""",2013-04-03,,0.0,,,,,16000.0,"""CAL""",,,"""P136_108_173""",0.0,,,2010-02-15,2013-05-04,"""CASH""",False,,8200,,1,,12.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""D""",12.0
3,0,6140.0,,,"""P94_109_143""",2019-01-07,,0.0,,,,,59999.8,"""CAL""",,,"""P131_33_167""",0.0,,,2018-05-15,2019-02-07,"""CASH""",False,,11000,,0,,12.0,"""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""",,"""D""",12.0
4,0,2556.6,,,"""P24_27_36""",2019-01-08,,0.0,,,,,40000.0,"""CAL""",,,"""P194_82_174""",0.0,,,,2019-02-08,"""CASH""",False,,16000,,0,,24.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""T""",24.0
5,0,,,,"""P85_114_140""",2019-01-16,,,,,,,,,,,"""P54_133_26""",,,,,,,False,,62000,,0,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""T""",


In [58]:
# Descriptions of the Date-typed columns only.
features_meaning.filter(
    pl.col("Variable").is_in(
        applprev_data.select(pl.col(pl.Date)).columns
    )
)

Variable,Description
str,str
"""approvaldate_3…","""Approval Date …"
"""creationdate_8…","""Date when prev…"
"""dateactivated_…","""Contract activ…"
"""dtlastpmt_581D…","""Date of last p…"
"""dtlastpmtallst…","""Date of the ap…"
"""employedfrom_7…","""Employment sta…"
"""firstnonzeroin…","""Date of first …"


In [61]:
# Summary statistics for the "reject" columns together with all Date columns.
applprev_data.select(
    *[name for name in applprev_data.columns if "reject" in name],
    pl.col(pl.Date),
).describe()

statistic,rejectreason_755M,rejectreasonclient_4145042M,approvaldate_319D,creationdate_885D,dateactivated_425D,dtlastpmt_581D,dtlastpmtallstes_3545839D,employedfrom_700D,firstnonzeroinstldate_307D
str,str,str,str,str,str,str,str,str,str
"""count""","""6525979""","""6525979""","""3515685""","""6525913""","""3384226""","""1775595""","""2482358""","""2639501""","""5873497"""
"""null_count""","""0""","""0""","""3010294""","""66""","""3141753""","""4750384""","""4043621""","""3886478""","""652482"""
"""mean""",,,"""2016-06-11""","""2016-07-22""","""2016-05-29""","""2018-06-12""","""2018-10-23""","""2008-09-26""","""2016-06-12"""
"""std""",,,,,,,,,
"""min""","""P121_60_164""","""P129_162_80""","""2005-12-30""","""2005-12-26""","""2006-01-01""","""2008-07-04""","""2008-07-04""","""1961-09-15""","""2006-01-26"""
"""25%""",,,"""2014-09-29""","""2014-10-05""","""2014-09-09""","""2017-08-10""","""2018-01-14""","""2005-10-15""","""2014-07-20"""
"""50%""",,,"""2017-08-09""","""2017-08-21""","""2017-08-01""","""2018-08-06""","""2019-01-28""","""2010-05-15""","""2017-06-29"""
"""75%""",,,"""2018-10-29""","""2018-12-01""","""2018-10-24""","""2019-05-03""","""2019-09-16""","""2013-09-01""","""2018-12-06"""
"""max""","""a55475b1""","""a55475b1""","""2020-10-19""","""2020-10-19""","""2020-10-19""","""2020-10-19""","""2020-10-19""","""2020-07-15""","""2020-11-19"""


In [70]:
# Frequency of each value of rejectreasonclient_4145042M, most common first.
(
    applprev_data
    .select("rejectreasonclient_4145042M")
    .collect()
    .to_series()
    .value_counts()
    .sort("count", descending=True)
)

rejectreasonclient_4145042M,count
str,u32
"""a55475b1""",4952742
"""P94_109_143""",1417017
"""P30_86_84""",78013
"""P52_67_90""",30538
"""P69_72_116""",20398
"""P129_162_80""",15536
"""P84_14_61""",5831
"""P64_121_167""",2746
"""P19_25_34""",1383
"""P5_143_178""",1247


In [71]:
# Frequency of each value of rejectreason_755M, most common first.
(
    applprev_data
    .select("rejectreason_755M")
    .collect()
    .to_series()
    .value_counts()
    .sort("count", descending=True)
)

rejectreason_755M,count
str,u32
"""a55475b1""",4663656
"""P94_109_143""",810325
"""P99_56_166""",713198
"""P45_84_106""",155196
"""P198_131_9""",153879
"""P30_86_84""",7849
"""P48_22_32""",7239
"""P52_67_90""",4267
"""P196_88_176""",3459
"""P121_60_164""",3014
