In [1]:
!pip install polars
!pip install lightgbm
!pip install --upgrade dask pandas

# standard library imports
import gc
import os
import random
import sys
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# third party imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import polars as pl
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    log_loss,
    SCORERS, 
    get_scorer,
    classification_report, 
    ConfusionMatrixDisplay, 
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC

from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, f1_score, log_loss, get_scorer
from sklearn.metrics import confusion_matrix
import pandas as pd

# will probably need these later:

# import torch
# from adapters import AutoAdapterModel
# from joblib import dump, load
# from pytorch_lightning import Trainer, LightningModule

# from torch.utils.data import DataLoader, Dataset
# from tqdm import tqdm
# from transformers import (
#     AdamW, 
#     AutoTokenizer, 
#     RobertaForSequenceClassification, 
#     get_linear_schedule_with_warmup
# )

dataPath = ""

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
df = pd.read_csv("~/public/home-credit-credit-risk-model-stability/csv_files/train/train_base.csv")
df_test = pd.read_csv("~/public/home-credit-credit-risk-model-stability/csv_files/test/test_base.csv")

In [3]:
df.head()

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target
0,0,2019-01-03,201901,0,0
1,1,2019-01-03,201901,0,0
2,2,2019-01-04,201901,0,0
3,3,2019-01-03,201901,0,0
4,4,2019-01-04,201901,0,1


#### Model Development: Training LightGBM

In [4]:
# Define the pipeline for LightGBM
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer,  ['case_id', 'MONTH', 'WEEK_NUM']),
        ('cat', categorical_transformer,  [])])  # Since there are no categorical features in this example

lgbm_pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ("clf", LGBMClassifier(random_state=0)),
    ]
)

X_train = df[['case_id', 'MONTH', 'WEEK_NUM']]  # Remove 'target' from X_train
y_train = df['target']

# Train LightGBM model
lgbm_pipe.fit(X_train, y_train)

# Function to generate report using LightGBM model
def generate_report(mdl, df, cols, extra_metrics=None):
    '''Takes in a trained model, pandas dataframe of a subset of the test data, 
    and list of columns to be used in the model.
    If necessary, can include a list of other metrics to be outputted, but default is None.
    Returns a dictionary of the following breakdown of scores: AUC, F1-Score, and any extra metrics
    '''
    X_test, y_test = df[cols], df["target"]
    y_pred = mdl.predict(X_test)
    
    res = dict()
    res['AUC'] = roc_auc_score(y_test, y_pred)
    res['Macro F-1'] = f1_score(y_test, y_pred, average='macro')
        
    for metric in (extra_metrics or []):
        if metric == 'log_loss':
            res[metric] = log_loss(y_test, y_pred)
            print(f"Log Loss: {log_loss(y_test, y_pred)}")
        else:
            res[metric] = get_scorer(metric)._score_func(y_test, y_pred)
            print(f"{metric}: {get_scorer(metric)._score_func(y_test, y_pred)}")
        
    # Results
    print(f"AUC: {roc_auc_score(y_test, y_pred)}")
    print(f"Macro-Averaged F1-Score: {f1_score(y_test, y_pred, average='macro')}")

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)
        
    return res

# Generate report using LightGBM model
generate_report(lgbm_pipe, df, ['case_id', 'MONTH', 'WEEK_NUM'], ['log_loss'])

[LightGBM] [Info] Number of positive: 47994, number of negative: 1478665
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005604 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 371
[LightGBM] [Info] Number of data points in the train set: 1526659, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031437 -> initscore=-3.427819
[LightGBM] [Info] Start training from score -3.427819
Log Loss: 1.0858050385170126
AUC: 0.5
Macro-Averaged F1-Score: 0.4920151704109108
Confusion Matrix:
[[1478665       0]
 [  47994       0]]


{'AUC': 0.5, 'Macro F-1': 0.4920151704109108, 'log_loss': 1.0858050385170126}

#### Evaluation / Test