In [1]:
import pathlib
import duckdb
import polars as pl

# Print floats with full precision instead of polars' default shortened format.
pl.Config.set_fmt_float("full")

# Map every training parquet file found under ../data (relative to the current
# working directory) to its path, keyed by the file name up to the first dot.
# Built in one step so `train_paths` only ever has a single type (dict).
train_paths: dict[str, pathlib.Path] = {
    parquet_file.name.split(".")[0]: parquet_file
    for parquet_file in (pathlib.Path.cwd().parent / "data").rglob("train*parquet")
}
# Iterating a dict yields its keys, so this displays the sorted table names.
sorted(train_paths)

['train_applprev_1_0',
 'train_applprev_1_1',
 'train_applprev_2',
 'train_base',
 'train_credit_bureau_a_1_0',
 'train_credit_bureau_a_1_1',
 'train_credit_bureau_a_1_2',
 'train_credit_bureau_a_1_3',
 'train_credit_bureau_a_2_0',
 'train_credit_bureau_a_2_1',
 'train_credit_bureau_a_2_10',
 'train_credit_bureau_a_2_2',
 'train_credit_bureau_a_2_3',
 'train_credit_bureau_a_2_4',
 'train_credit_bureau_a_2_5',
 'train_credit_bureau_a_2_6',
 'train_credit_bureau_a_2_7',
 'train_credit_bureau_a_2_8',
 'train_credit_bureau_a_2_9',
 'train_credit_bureau_b_1',
 'train_credit_bureau_b_2',
 'train_debitcard_1',
 'train_deposit_1',
 'train_other_1',
 'train_person_1',
 'train_person_2',
 'train_static_0_0',
 'train_static_0_1',
 'train_static_cb_0',
 'train_tax_registry_a_1',
 'train_tax_registry_b_1',
 'train_tax_registry_c_1']

## Helper functions

In [2]:
def find_root_path() -> pathlib.Path:
    """Return the closest ancestor of the working directory holding a ``.gitignore``.

    The search starts at the current working directory itself and then walks
    up one parent at a time.

    Returns:
        The first directory on that path that contains a ``.gitignore`` entry.

    Raises:
        FileNotFoundError: If the filesystem root is reached without finding one.
    """
    start: pathlib.Path = pathlib.Path.cwd()
    # `parents` stops at the filesystem root, so the search always terminates.
    # Stepping with `.parent` in a while-loop never ends there, because the
    # root directory is its own parent.
    for directory in (start, *start.parents):
        if (directory / ".gitignore").exists():
            return directory
    raise FileNotFoundError(f"No .gitignore found in {start} or any of its parents")

In [3]:
def get_cols_to_keep(table: duckdb.DuckDBPyRelation, max_null_fraction: float = 0.5) -> list[str]:
    """List the columns of a DuckDB relation whose share of nulls is below a limit.

    The relation is materialised as a polars DataFrame, the null count of each
    column is divided by the number of rows, and the names of the columns
    whose fraction is strictly below ``max_null_fraction`` are returned.

    Args:
        table: Relation to inspect; it is fully converted with ``table.pl()``.
        max_null_fraction: Exclusive upper bound on the fraction of nulls a
            column may have to be kept. Defaults to 0.5.

    Returns:
        Names of the columns that pass the filter.
    """
    dataframe: pl.DataFrame = table.pl()
    null_fractions: pl.DataFrame = dataframe.null_count() / dataframe.height
    return (
        null_fractions
        # One row per original column: its name in "column", its fraction in "nulls_perc".
        .transpose(include_header=True, column_names=["nulls_perc"])
        .filter(pl.col("nulls_perc") < max_null_fraction)
        .get_column("column")
        .to_list()
    )

def get_cols_to_keep_polars(dataframe: "pl.LazyFrame | pl.DataFrame", max_null_fraction: float = 0.5) -> list[str]:
    """List the columns of a polars frame whose share of nulls is below a limit.

    A LazyFrame is collected first; any other input is used as-is. The null
    count of each column is divided by the number of rows, and the names of
    the columns whose fraction is strictly below ``max_null_fraction`` are
    returned.

    Args:
        dataframe: Frame to inspect. A LazyFrame is fully collected in memory.
        max_null_fraction: Exclusive upper bound on the fraction of nulls a
            column may have to be kept. Defaults to 0.5.

    Returns:
        Names of the columns that pass the filter.
    """
    materialized: pl.DataFrame = dataframe.collect() if isinstance(dataframe, pl.LazyFrame) else dataframe
    null_fractions: pl.DataFrame = materialized.null_count() / materialized.height
    return (
        null_fractions
        # One row per original column: its name in "column", its fraction in "nulls_perc".
        .transpose(include_header=True, column_names=["nulls_perc"])
        .filter(pl.col("nulls_perc") < max_null_fraction)
        .get_column("column")
        .to_list()
    )

def get_categories(dataframe: pl.LazyFrame) -> dict[str, list[str]]:
    """Gather the distinct values of every String and Categorical column.

    Each selected column is collected on its own, so the lazy query is
    executed once per column.

    Args:
        dataframe: Lazy frame whose String/Categorical columns are inspected.

    Returns:
        Mapping from column name to the list of unique values found in it.
    """
    text_columns = dataframe.select(pl.col(pl.String), pl.col(pl.Categorical)).columns
    return {
        name: dataframe.select(name).unique().collect().to_series().to_list()
        for name in text_columns
    }

In [4]:
# Database file lives at <dir returned by find_root_path()>/data/duckdb/database.db.
duckdb_file: pathlib.Path = find_root_path().joinpath("data", "duckdb", "database.db")
# Make sure the containing folder exists; does nothing when it is already there.
duckdb_file.parent.mkdir(parents=True, exist_ok=True)

# Connect to the database stored at that path.
connection: duckdb.DuckDBPyConnection = duckdb.connect(database=str(duckdb_file))

### Get columns to keep

In [5]:
# NOTE(review): disabled cell. It would call get_cols_to_keep_polars on every
# training table, which collects each one in full — presumably switched off
# for cost reasons; confirm, then re-enable or delete.
# cols_to_keep = {k: get_cols_to_keep_polars(pl.scan_parquet(v)) for k, v in train_paths.items()}

### Analyze base train

In [39]:
# Lazily scan the base table. The mapping is indexed directly so that a missing
# file fails with a KeyError naming the table instead of passing None to polars.
train_base = pl.scan_parquet(train_paths["train_base"])
# Preview the first rows only.
train_base.head().collect()

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1


## Analyze the depth-0 files first

### Static_0

In [7]:
# Columns of train_static_0_0 that are cast to Date below.
static_0_date_columns: list[str] = [
    "datefirstoffer_1144D",
    "datelastinstal40dpd_247D",
    "datelastunpaid_3546854D",
    "firstdatedue_489D",
    "lastactivateddate_801D",
    "lastapplicationdate_877D",
    "lastapprdate_640D",
    "lastdelinqdate_224D",
    "lastrejectdate_50D",
    "lastrepayingdate_696D",
    "maxdpdinstldate_3546855D",
    "validfrom_1069D",
    "payvacationpostpone_4187118D",
    "firstclxcampaign_1125D",
    "dtlastpmtallstes_4499206D",
]

# Index the mapping directly so that a missing file fails with a KeyError naming
# the table instead of passing None to polars.
static_0: pl.LazyFrame = pl.scan_parquet(train_paths["train_static_0_0"]).sort("case_id")

# A single expression over the list casts every date column in place.
static_0 = static_0.with_columns(pl.col(static_0_date_columns).cast(pl.Date))
# Remaining strings: nulls become the literal "UNKNOWN", then the column is made
# Categorical. Float64 columns are narrowed to Float32.
static_0 = static_0.with_columns(
    pl.col(pl.String).fill_null("UNKNOWN").cast(pl.Categorical),
    pl.col(pl.Float64).cast(pl.Float32),
)
# Keep only Categorical, Date and Float32 columns; every other dtype is dropped.
# NOTE(review): if case_id is an integer column here it is dropped as well —
# confirm that is intended.
static_0 = static_0.select(pl.col(pl.Categorical), pl.col(pl.Date), pl.col(pl.Float32))

In [38]:
# Fraction of nulls per column of static_0: null count divided by row count.
nulls_count = static_0.collect().pipe(lambda frame: frame.null_count() / frame.height)
# One row per column, highest null fraction first.
(
    nulls_count
    .transpose(include_header=True, column_names=["nulls_perc"])
    .sort("nulls_perc", descending=True)
)

column,nulls_perc
str,f64
"""clientscnt_136…",0.9996044859462998
"""payvacationpos…",0.9985384908897273
"""lastrepayingda…",0.9984010074151413
"""lastotherlnsex…",0.9978371259179264
"""lastotherinc_9…",0.99778033926538
"""interestrategr…",0.9834790691372514
"""lastdependents…",0.9750357905349601
"""maxannuity_407…",0.950065603527547
"""avglnamtstart2…",0.9304542832578004
"""datelastinstal…",0.9287745938509021


### static_cb_0

In [8]:
def _risk_range_to_fractions(bounds: "dict[str, str | None]") -> "dict[str, float | None]":
    """Turn the string fields of one risk-range struct into fractions.

    Each non-empty value is parsed as an integer and divided by 100; empty or
    missing values become None.
    """
    return {field: int(raw) / 100.0 if raw else None for field, raw in bounds.items()}


# Columns of train_static_cb_0 that are cast to Date below.
static_cb_0_date_columns: list[str] = [
    "assignmentdate_238D",
    "assignmentdate_4527235D",
    "assignmentdate_4955616D",
    "birthdate_574D",
    "dateofbirth_342D",
    "dateofbirth_337D",
    "responsedate_1012D",
    "responsedate_4527233D",
    "responsedate_4917613D",
]

# Index the mapping directly so that a missing file fails with a KeyError naming
# the table instead of passing None to polars.
static_cb_0 = pl.scan_parquet(train_paths["train_static_cb_0"])
static_cb_0 = static_cb_0.with_columns(
    # A single expression over the list casts every date column in place.
    pl.col(static_cb_0_date_columns).cast(pl.Date),
    # Strip the '%' signs, split on " - " into a two-field struct (min, max) and
    # convert both fields to fractions. The struct keeps the original column
    # name so that unnest() below can expand it into the two new columns.
    pl.col("riskassesment_302T")
    .str.replace_all("%", "")
    .str.split(" - ")
    .list.to_struct(fields=["min_riskassesment_302T", "max_riskassesment_302T"])
    .map_elements(_risk_range_to_fractions),
).unnest("riskassesment_302T")

# Materialise once and go back to lazy, so later cells reuse the computed frame
# instead of re-running the scan and the Python-level parsing.
static_cb_0 = static_cb_0.collect().lazy()

In [10]:
# Preview the first five rows of the prepared table.
static_cb_0.head(5).collect()

case_id,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,min_riskassesment_302T,max_riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
i64,date,date,date,date,f64,date,date,f64,f64,f64,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,date,date,date,f64,f64,f64,f64,f64
357,,,,1988-04-01,,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,6,6301.4,,2019-01-25,,,,,,,
381,,,,1973-11-01,,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,6,4019.6,,2019-01-25,,,,,,,
388,,,,1989-04-01,,1989-04-01,,6.0,8.0,2.0,10.0,4.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,,,,,,,,,,,,,,,,,6.0,"""a55475b1""","""a55475b1""",10.0,,,,,,,6,14548.0,,2019-01-28,,,,,,3.0,5.0
405,,,,1974-03-01,,1974-03-01,,0.0,0.0,0.0,1.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,,,,,,,,,,,,,,,,,4.0,"""a55475b1""","""a55475b1""",1.0,,,,,,,6,10498.24,,2019-01-21,,,,,,2.0,0.0
409,,,,1993-06-01,,1993-06-01,,2.0,3.0,0.0,3.0,1.0,"""a55475b1""","""717ddd49""","""a55475b1""",4.0,,,,,,,,,,,,,,,,,1.0,"""a7fcb6e5""","""a55475b1""",3.0,,,,,,,7,6344.8804,,2019-01-21,,,,,,0.0,4.0


In [34]:
# The three birth-date columns that are compared pairwise below.
birthdate_574 = pl.col("birthdate_574D")
dateofbirth_342 = pl.col("dateofbirth_342D")
dateofbirth_337 = pl.col("dateofbirth_337D")

# A comparison yields null when either side is null; fill_null(False) turns that
# into False, so each flag is True only when both dates are present and equal.
birth_date = static_cb_0.select("birthdate_574D", "dateofbirth_342D", "dateofbirth_337D").with_columns(
    (birthdate_574 == dateofbirth_342).fill_null(False).alias("birthdate_574D_equals_dateofbirth_342D"),
    (birthdate_574 == dateofbirth_337).fill_null(False).alias("birthdate_574D_equals_dateofbirth_337D"),
    (dateofbirth_342 == dateofbirth_337).fill_null(False).alias("dateofbirth_342D_equals_dateofbirth_337D"),
    # Two pairwise equalities are enough: when both hold, all three dates are
    # non-null and identical, so the third pair is equal as well.
    ((dateofbirth_342 == dateofbirth_337) & (birthdate_574 == dateofbirth_337)).fill_null(False).alias("all_equals"),
)

In [37]:
# Fraction of nulls per column of static_cb_0: null count divided by row count.
nulls_count = static_cb_0.collect().pipe(lambda frame: frame.null_count() / frame.height)
# One row per column, highest null fraction first.
(
    nulls_count
    .transpose(include_header=True, column_names=["nulls_perc"])
    .sort("nulls_perc", descending=True)
)

column,nulls_perc
str,f64
"""dateofbirth_34…",0.9756743859948444
"""for3years_128L…",0.9756650556223492
"""for3years_504L…",0.9756650556223492
"""for3years_584L…",0.9756650556223492
"""formonth_118L""",0.9756650556223492
"""formonth_206L""",0.9756650556223492
"""formonth_535L""",0.9756650556223492
"""forquarter_101…",0.9756650556223492
"""forquarter_462…",0.9756650556223492
"""forquarter_634…",0.9756650556223492


## Birth dates are not used