In [1]:
import datetime

import pandas as pd
import polars as pl
import pathlib
import numpy as np
import lightgbm as lgb
import holidays
from dateutil.relativedelta import relativedelta 
import mlflow
import plotly.graph_objects as go

In [2]:
SEED = 666
mlflow.set_tracking_uri("/home/paolo/git/home-credit/data/mlflow_runs")

In [3]:
def get_learning_curves(model: lgb.LGBMModel, metric: str = "binary_logloss") -> go.Figure:
    keys = list(model.evals_result_.keys())
    train_key, valid_key = "", ""
    for key in keys:
        if "train" in key:
            train_key = key
        elif "valid" in key:
            valid_key = key
    
    results = pd.DataFrame({"train": model.evals_result_[train_key][metric], "valid": model.evals_result_[valid_key][metric]})

    fig = go.Figure()
    for col in results.columns:
        fig.add_trace(go.Scatter(x=results.index+1, y=results[col], mode='lines', name=f"{col}"))

    fig.update_layout(title='Learning curves',
                      xaxis_title='Iteration',
                      yaxis_title='Log Loss',
                      margin=dict(l=0,r=0,b=0,t=30),
                      )
    return fig

In [4]:
STATIC_0_PATH = pathlib.Path("/home/paolo/git/home-credit/data/raw/parquet_files/train/train_static_0_0.parquet")
STATIC_1_PATH = pathlib.Path("/home/paolo/git/home-credit/data/raw/parquet_files/train/train_static_0_1.parquet")
BASE_PATH = pathlib.Path("/home/paolo/git/home-credit/data/raw/parquet_files/train/train_base.parquet")

In [5]:
HOLIDAYS = list(holidays.country_holidays("US", years=range(2000, 2024)).keys())

In [6]:
train_base: pl.LazyFrame = pl.scan_parquet(BASE_PATH).with_columns(pl.col("date_decision").cast(pl.Date))
static_0: pl.LazyFrame = pl.scan_parquet(STATIC_0_PATH)
static_1: pl.LazyFrame = pl.scan_parquet(STATIC_1_PATH)
data: pl.LazyFrame = pl.concat([static_0, static_1]).lazy()

In [7]:
train_base = train_base.with_columns(
    pl.col("date_decision").dt.year().alias("year"),
    (2 * np.pi * pl.col("date_decision").dt.month() / 12).sin().cast(pl.Float32).alias("sin(month)"),
    (2 * np.pi * pl.col("date_decision").dt.month() / 12).cos().cast(pl.Float32).alias("cos(month)"),
    pl.when(pl.col("date_decision").dt.is_leap_year())
    .then(np.pi * pl.col("date_decision").dt.day() / 366)
    .otherwise(np.pi * pl.col("date_decision").dt.day() / 365)
    .sin()
    .cast(pl.Float32)
    .alias("sin(day_of_year)"),
    pl.when(pl.col("date_decision").dt.is_leap_year())
    .then(np.pi * pl.col("date_decision").dt.day() / 366)
    .otherwise(np.pi * pl.col("date_decision").dt.day() / 365)
    .cos()
    .cast(pl.Float32)
    .alias("cos(day_of_year)"),
    # pl.col("datetime").dt.quarter().alias("quarter"),
    pl.col("date_decision")
    .dt.strftime("%Y-%m-%d")
    .is_in([x.strftime("%Y-%m-%d") for x in HOLIDAYS])
    .alias("is_holiday"),
    (2 * np.pi * pl.col("WEEK_NUM") / 52).sin().cast(pl.Float32).alias("sin(WEEK_NUM)"),
    (2 * np.pi * pl.col("WEEK_NUM") / 52).cos().cast(pl.Float32).alias("cos(WEEK_NUM)")
).drop(["WEEK_NUM", "MONTH"])

In [8]:
data = data.drop([col for col in data.columns if "birth" in col])

In [9]:
transforms = []
for col, dtype in zip(data.columns, data.dtypes):
    if col.endswith("D"):
        transforms.append(
            pl.when(pl.col(col).is_null()).then(pl.lit(False)).otherwise(pl.lit(True)).cast(pl.Boolean).alias(col)
        )
    elif dtype == pl.String and not col.endswith("D"):
        transforms.append(
            pl.col(col).fill_null("UNKNOWN").cast(pl.Categorical)
        )
    elif dtype in [pl.Float64, pl.Float32]:
        transforms.append(
            pl.col(col).fill_null(pl.col(col).median()).cast(pl.Float32)
        )
    elif col in {"opencred_647L"}:
        transforms.append(
            pl.col(col).fill_null(pl.col(col).median()).shrink_dtype()
        )
    elif col in {"isdebitcard_729L", "isbidproductrequest_292L", "equalitydataagreement_891L", "equalityempfrom_62L"}:
        transforms.append(
            pl.col(col).fill_null(pl.lit(False)).shrink_dtype()
        )

data = data.with_columns(*transforms)

In [10]:
data = train_base.join(other=data, on=["case_id"], how="inner").drop("case_id")

In [11]:
def split_train_validation(dataframe: pl.LazyFrame, test_months: float = 0.2) -> tuple[pl.LazyFrame, pl.LazyFrame]:
    min_date: datetime.date = dataframe.select("date_decision").min().collect().item(0,0)
    max_date: datetime.date = dataframe.select("date_decision").max().collect().item(0,0)
    total_months: int = (max_date - min_date).days // 30 + 1
    
    test_months: int = round(total_months * test_months)
    
    start_test_date: datetime.date = max_date - relativedelta(months=test_months)
    return dataframe.filter(pl.col("date_decision") < start_test_date).drop("date_decision"), dataframe.filter(pl.col("date_decision") >= start_test_date).drop("date_decision")


train, valid = split_train_validation(data)
y_train, y_valid = train.select("target").collect().to_series().to_pandas(), valid.select("target").collect().to_series().to_pandas()
x_train, x_valid = train.drop("target"), valid.drop("target")

In [12]:
def convert_object_to_categorical(data: pd.DataFrame) -> pd.DataFrame:
    for col in data.columns:
        if data[col].dtype == "object":
            data[col] = data[col].astype("category")
    
    return data

x_train = convert_object_to_categorical(x_train.collect().to_pandas())
x_valid = convert_object_to_categorical(x_valid.collect().to_pandas())

In [None]:
mlflow.set_experiment("simple_model_static")

eval_results = {}

mlflow.lightgbm.autolog(
    log_input_examples=False, 
    log_model_signatures=True, 
    log_models=True, 
    log_datasets=False, 
    disable=False, 
    exclusive=False, 
    disable_for_unsupported_versions=False, 
    silent=False, 
    registered_model_name=None, 
    extra_tags=None    
)

with mlflow.start_run():
    model = lgb.LGBMClassifier(
        random_state=SEED,
        n_jobs=-1,
        boosting_type='gbdt', 
        num_leaves=31, 
        max_depth=-1, 
        learning_rate=0.1, 
        n_estimators=10_000, 
        subsample_for_bin=200_000, 
        objective="binary", 
        class_weight="balanced", 
        min_split_gain=0.0, 
        min_child_weight=0.001, 
        min_child_samples=20, 
        subsample=0.5, 
        subsample_freq=0, 
        colsample_bytree=0.5, 
        reg_alpha=0.0, 
        reg_lambda=0.0, 
        importance_type='split'
    )
    
    model.fit(
        X=x_train,
        y=y_train,
        eval_set=[(x_train, y_train), (x_valid, y_valid)],
        eval_metric=["binary_logloss", "auc"],
        callbacks=[
            lgb.log_evaluation(),
            lgb.record_evaluation(eval_results),
            lgb.early_stopping(stopping_rounds=100, first_metric_only=True),
        ],
    )

    fig = get_learning_curves(model)
    mlflow.log_figure(fig, artifact_file="consumer_model.png")
    
    

[LightGBM] [Info] Number of positive: 43866, number of negative: 1292863
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.172335 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19086
[LightGBM] [Info] Number of data points in the train set: 1336729, number of used features: 166
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[1]	training's binary_logloss: 0.679905	training's auc: 0.711383	valid_1's binary_logloss: 0.680316	valid_1's auc: 0.741526
Training until validation scores don't improve for 100 rounds
[2]	training's binary_logloss: 0.667766	training's auc: 0.733749	valid_1's binary_logloss: 0.668078	valid_1's auc: 0.761123
[3]	training's binary_logloss: 0.658037	training's auc: 0.739202	valid_1's binary_logloss: 0.659632	valid_1's auc: 0.767298
[4]	

In [None]:
get_learning_curves(model, metric="auc").show()