In [1]:
import datetime
import os
import pathlib

import holidays
import lightgbm as lgb
import mlflow
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
from dateutil.relativedelta import relativedelta
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
# Display settings: polars prints floats in full, pandas with three decimals.
pl.Config(set_fmt_float="full")
pd.options.display.float_format = '{:.3f}'.format

# MLflow run store. The MLFLOW_TRACKING_URI environment variable takes precedence so the
# notebook is not tied to one machine; the original local path remains the fallback.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", "/home/paolo/git/home-credit/data/mlflow_runs")
)

In [3]:
# Random seed constant.
SEED = 666

# Input parquet file; the path is relative to the working directory.
INPUT_PATH = pathlib.Path("../data/datalake/gold/preprocessed.parquet")

# Every US holiday date for the years 2000 through 2023.
HOLIDAYS = [
    *holidays.country_holidays("US", years=range(2000, 2024)).keys()
]

In [4]:
def split_train_validation(dataframe: pl.LazyFrame, test_months: float = 0.2) -> tuple[pl.LazyFrame, pl.LazyFrame]:
    """Split a frame chronologically on ``date_decision`` into train and validation parts.

    Parameters
    ----------
    dataframe
        Lazy or eager polars frame containing a ``date_decision`` date column.
    test_months
        Fraction of the covered time span (measured in 30-day months) that is held out
        at the end of the period. Despite its name this is a ratio, not a month count.

    Returns
    -------
    tuple
        ``(train, validation)``: the rows strictly before the cut-off date and the rows
        on or after it, both with ``date_decision`` dropped. The parts have the same
        frame type (lazy or eager) as the input.

    Raises
    ------
    ValueError
        If ``date_decision`` has no non-null value, so no date range can be derived.
    """
    date_col = pl.col("date_decision")

    # A single query yields both bounds, so a lazy source is scanned only once.
    # ``.lazy()`` lets the same code path serve eager and lazy inputs.
    min_date, max_date = (
        dataframe.lazy()
        .select(date_col.min().alias("min_date"), date_col.max().alias("max_date"))
        .collect()
        .row(0)
    )
    if min_date is None or max_date is None:
        raise ValueError("'date_decision' has no non-null values; cannot split by date")

    total_months: int = (max_date - min_date).days // 30 + 1

    # Kept under its own name so the float parameter is not rebound to an int.
    holdout_months: int = round(total_months * test_months)

    start_test_date: datetime.date = max_date - relativedelta(months=holdout_months)

    train = dataframe.filter(date_col < start_test_date).drop("date_decision")
    validation = dataframe.filter(date_col >= start_test_date).drop("date_decision")
    return train, validation


def convert_object_to_categorical(data: pd.DataFrame) -> pd.DataFrame:
    """Cast every ``object``-dtype column of ``data`` to ``category``.

    The frame is modified in place and the same object is returned, so the call can be
    used either for its side effect or inside a chain.
    """
    # Collect the names first, then convert; columns of any other dtype are untouched.
    object_columns = [name for name in data.columns if data[name].dtype == "object"]
    for name in object_columns:
        data[name] = data[name].astype("category")

    return data

def get_columns_with_more_x_perc_nulls(dataframe: pl.LazyFrame, threshold: float = 0.5) -> list[str]:
    """Return the columns whose fraction of null rows is strictly above ``threshold``.

    The fraction is ``null_count / total_rows``. It is computed directly rather than from
    ``describe()``: there ``count`` is the number of non-null values, so
    ``null_count / count`` is nulls per non-null value (1.0 at 50% nulls), not a fraction
    of rows. Going direct also avoids computing the unused describe statistics.

    Parameters
    ----------
    dataframe
        Lazy or eager polars frame.
    threshold
        Null fraction in ``[0, 1]`` that a column must exceed to be reported.

    Returns
    -------
    list[str]
        Matching column names, in the frame's column order. A frame without columns or
        without rows yields an empty list.
    """
    columns = dataframe.columns
    if not columns:
        return []

    # One aggregation per column, evaluated in a single query. ``len`` counts all rows
    # (nulls included); on a zero-row frame the ratio is NaN, which never exceeds the
    # threshold in the Python comparison below.
    null_fractions = (
        dataframe.lazy()
        .select([(pl.col(name).null_count() / pl.col(name).len()).alias(name) for name in columns])
        .collect()
        .row(0)
    )
    return [name for name, fraction in zip(columns, null_fractions) if fraction > threshold]

def binarize_columns_with_more_x_perc_nulls(columns: list[str]) -> list[pl.Expr]:
    """Build expressions that turn each listed column into a presence flag.

    Every expression overwrites its column (same name) with ``False`` where the value is
    null and ``True`` otherwise. Nothing is evaluated here; the caller applies the
    expressions, e.g. through ``with_columns``.
    """
    return [
        pl.when(pl.col(name).is_null())
        .then(pl.lit(False))
        .otherwise(pl.lit(True))
        .alias(name)
        for name in columns
    ]


def optimize_datatype(dataframe: pl.LazyFrame) -> list[pl.Expr]:
    """Build cast expressions that narrow the dtypes of ``dataframe``'s columns.

    Rules, checked in this order for every column:

    * the column named ``age``  -> ``Int16`` followed by ``shrink_dtype``
    * float columns             -> ``Float32`` followed by ``shrink_dtype``
    * integer columns           -> ``Int32`` followed by ``shrink_dtype``
    * string columns            -> ``Categorical``

    Columns matching none of the rules get no expression. Each expression keeps the
    name of the column it rewrites.
    """
    float_types = tuple(pl.FLOAT_DTYPES)
    integer_types = tuple(pl.INTEGER_DTYPES)

    casts = []
    for name, dtype in zip(dataframe.columns, dataframe.dtypes):
        column = pl.col(name)
        if name == "age":
            narrowed = column.cast(pl.Int16).shrink_dtype()
        elif isinstance(dtype, float_types):
            narrowed = column.cast(pl.Float32).shrink_dtype()
        elif isinstance(dtype, integer_types):
            narrowed = column.cast(pl.Int32).shrink_dtype()
        elif isinstance(dtype, pl.String):
            narrowed = column.cast(pl.Categorical)
        else:
            # Any other dtype is left as it is.
            continue
        casts.append(narrowed.alias(name))

    return casts


def get_fill_null_transforms(dataframe: pl.LazyFrame) -> list[pl.Expr]:
    """Build null-filling expressions chosen by column dtype.

    * numeric columns: nulls are replaced by the column median, then ``shrink_dtype``
      is applied
    * Boolean / Categorical columns: nulls are replaced by the first mode of the
      non-null values

    Columns of any other dtype get no expression. Each expression keeps the name of
    the column it rewrites.
    """
    numeric_types = tuple(pl.NUMERIC_DTYPES)
    mode_filled_types = (pl.Boolean, pl.Categorical)

    fills = []
    for name, dtype in zip(dataframe.columns, dataframe.dtypes):
        column = pl.col(name)
        if isinstance(dtype, numeric_types):
            fills.append(column.fill_null(column.median()).shrink_dtype().alias(name))
        elif isinstance(dtype, mode_filled_types):
            most_frequent = column.drop_nulls().mode().first()
            fills.append(column.fill_null(most_frequent).alias(name))
    return fills

In [5]:
# Scan the parquet lazily and cast the decision date to Date.
data: pl.LazyFrame = pl.scan_parquet(INPUT_PATH).with_columns(pl.col("date_decision").cast(pl.Date))

# Remove every column whose name contains "num_group", then narrow the remaining dtypes.
num_group_columns = [name for name in data.columns if "num_group" in name]
data = data.drop(num_group_columns)
data = data.with_columns(*optimize_datatype(data))

# Calendar features derived from the decision date; WEEK_NUM and MONTH are dropped afterwards.
decision_date = pl.col("date_decision")
is_weekend = decision_date.dt.weekday().is_in([6, 7])  # polars weekday is ISO: 6 = Saturday, 7 = Sunday
data = data.with_columns(
    year=decision_date.dt.year(),
    month=decision_date.dt.month(),
    day_of_year=decision_date.dt.ordinal_day(),
    decision_is_on_weekend=pl.when(is_weekend).then(pl.lit(1)).otherwise(pl.lit(0)),
).drop(["WEEK_NUM", "MONTH"])

In [6]:
# Columns are dropped when their name contains any of these fragments.
unwanted_fragments = ("clientscnt", "district", "age_days")
cols_to_drop = [
    name
    for name in data.columns
    if any(fragment in name for fragment in unwanted_fragments)
]

# The case identifier is removed together with the matched columns.
data = data.drop(cols_to_drop).drop(["case_id"])

## Encode categorical to labels

In [7]:
# For every Categorical column build two lookups: label -> code and code -> label,
# where the code is the position of the label in the column's category list.
categories_mapping = {}
inverse_categories_mapping = {}

categorical_columns = data.select(pl.col(pl.Categorical)).columns
for column in categorical_columns:
    # Each column triggers its own collect() of the lazy query.
    labels = data.select(pl.col(column).cat.get_categories()).collect().to_series().to_list()
    label_to_code = dict(zip(labels, range(len(labels))))
    categories_mapping[column] = label_to_code
    inverse_categories_mapping[column] = {code: label for label, code in label_to_code.items()}


In [8]:
# Swap each categorical label for its Int32 code; values absent from the mapping become null.
transforms = [
    pl.col(name).replace(mapping, default=None, return_dtype=pl.Int32)
    for name, mapping in categories_mapping.items()
]

data = data.with_columns(*transforms)

In [9]:
# Replace the decision date by its numeric epoch value; the column name is kept.
data = data.with_columns(pl.col("date_decision").dt.epoch())

## Impute nulls with IterativeImputer

In [10]:
from sklearn.preprocessing import StandardScaler

# Standardise every column; the scaler is configured to return a polars DataFrame.
scaler = StandardScaler().set_output(transform="polars")

# The lazy query is materialised here, right before fitting.
data_scaled = scaler.fit_transform(data.collect())

In [11]:
%%time

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np


imputer = IterativeImputer(
    missing_values=np.nan,
    max_iter=5,
    n_nearest_features=5,
    skip_complete=True,
    tol=1e-2,
    verbose=1,
    random_state=666,
    add_indicator=True,
    keep_empty_features=False
)
imputer.set_output(transform="polars")
print("start data imputing")
data_imputed: pl.DataFrame = imputer.fit_transform(data_scaled)

start data imputing
[IterativeImputer] Completing matrix with shape (1526659, 264)
[IterativeImputer] Change: 150069541253369.72, scaled tolerance: 6.14723753096631 
[IterativeImputer] Change: 1.0372303396224494e+66, scaled tolerance: 6.14723753096631 
[IterativeImputer] Change: 6.782937232482271e+77, scaled tolerance: 6.14723753096631 
[IterativeImputer] Change: 1.2628329036716468e+98, scaled tolerance: 6.14723753096631 
[IterativeImputer] Change: 2.0217184895750865e+143, scaled tolerance: 6.14723753096631 




CPU times: user 16min 28s, sys: 33min 38s, total: 50min 7s
Wall time: 5min 11s


In [ ]:
# Preview the first rows of the imputed frame (a bare select() returns an empty frame).
data_imputed.head()