In [None]:
"""
author: Michael Munz

workflow applying re-sampling and avoiding data leakage use imblearn (Imbalanced-learn)
"""


'\nauthor: Michael Munz\n\nworkflow applying re-sampling and avoiding data leakage use imblearn (Imbalanced-learn)\n'

In [20]:
# import
import pandas as pd
import numpy as np

import sys
sys.path.append( '../../library' )
import gc_storage as gcs
import data_preprocessing_pipeline as dpp

from joblib import load

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from xgboost import XGBClassifier

import lightgbm as lgb



In [None]:
# init GCS
bucket_name='sep25-bds-road-accidents'
key_path='../../auth/fiery-glass-478009-t8-18a81c8cbe63.json'

bucket = gcs.init_bucket( bucket=bucket_name,
                          json_key_path=key_path )

Initialized sep25-bds-road-accidents


In [None]:
# listing all blobs in GCS
gcs.list_bucket( bucket=bucket,
                 remote_folder='2_preprocessing' )


Number of blobs: [34]
data/processed/2_preprocessing/
data/processed/2_preprocessing/0.1-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.2-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.3-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.4-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/1.0-becker-data-preprocessing_usagers.joblib
data/processed/2_preprocessing/1.0-leibold-data-preprocessing_aggr.joblib
data/processed/2_preprocessing/1.0-leibold-data-preprocessing_vehicles.joblib
data/processed/2_preprocessing/1.0-munz-acc-municipality_X_test_uniques_lookup_table.gc
data/processed/2_preprocessing/1.0-munz-acc-municipality_X_train_uniques_lookup_table.gc
data/processed/2_preprocessing/1.0-munz-preprocessing-X_test.gc
data/processed/2_preprocessing/1.0-munz-preprocessing-X_test_processed.gc
data/processed/2_preprocessing/1.0-munz-preprocessing-X_train_num.gc
data/processed/2_preprocessing

In [24]:
# load data
df = load( '../../data/processed/2_preprocessing/1.0-leibold-data-preprocessing_aggr.gc' )



In [25]:
# 1 split features / target

# explanatory vars :X
X = df.drop( columns='ind_severity',
             axis=1 )

# target var y: :ind_severity
# important: var is unbalanced
y = df.ind_severity

print( f"target distribution:\n{y.value_counts()}\n" )
print( f"target distribution:\n{y.value_counts(normalize=True).round(3)}" )

target distribution:
ind_severity
1    285859
2    280987
3    106958
4     18355
Name: count, dtype: int64

target distribution:
ind_severity
1    0.413
2    0.406
3    0.155
4    0.027
Name: proportion, dtype: float64


In [26]:
# 2 split { :training_set, :test_set }
# data splitting with stratification
# split into training set, test set BEFORE applying pipeline + resampling
# stratify=y -> stratified split; prevents bias
#               ensures class distribution (proportions of each target class)
#               in { :y_train, :y_test } matches original :y
# stratify=n -> random split
X_train, X_test, y_train, y_test = train_test_split( X, 
                                                     y, 
                                                     test_size=0.3, 
                                                     random_state=369, 
                                                     stratify=y )

print( f"train shape: {X_train.shape}" )
print( f"test shape: {X_test.shape}" )


train shape: (484511, 44)
test shape: (207648, 44)


In [31]:
# 3 pre-processing pipeline
preprocessing_pipeline = dpp.build_default_full_pipeline()

# apply fit & transorm on :X_train, :y_train
X_train_processed = preprocessing_pipeline.fit_transform( X_train,
                                                          y_train )

# apply transform on :X_test (same transformations, NO LEAKAGE)
X_test_processed = preprocessing_pipeline.transform( X_test )

print( f"processed train shape: {X_train_processed.shape}" )
print( f"processed test shape: {X_test_processed.shape}" )




No missing values found in acc_year




No missing values found in acc_year




processed train shape: (484511, 131)
processed test shape: (207648, 131)




In [None]:
# 4 model training
# unbalanced
model_rfc = RandomForestClassifier( n_estimators=200,
                                    random_state=369,
                                    n_jobs=-1 )

# apply fit on :X_train, :y_train
model_rfc.fit( X_train_processed,
               y_train )

# make prediction on :X_test
y_pred = model_rfc.predict( X_test_processed )

# classification report
print( classification_report(y_test, 
                             y_pred) )

              precision    recall  f1-score   support

           1       0.73      0.82      0.77     85758
           2       0.64      0.68      0.66     84296
           3       0.51      0.35      0.42     32087
           4       0.38      0.06      0.10      5507

    accuracy                           0.67    207648
   macro avg       0.57      0.47      0.49    207648
weighted avg       0.65      0.67      0.65    207648



In [34]:
# note: can only work with pure numeric feature matrices

# 1 under-sample majorities
# 1 + 2 down-sampling
sampling_strategy_majority = {
    1: 180_000,
    2: 180_000
}

# 2 over-sample minorities on reduced set
# 3 + 4 up-sampling
sampling_strategy_minority = {
    3: 150_000,
    4: 140_000
}



In [35]:
resampler = Pipeline(
    steps=[('under', RandomUnderSampler( sampling_strategy=sampling_strategy_majority,
                                         random_state=369 )),
           ('over', RandomOverSampler( sampling_strategy=sampling_strategy_minority,
                                       random_state=369 ))]
)

# apply :fit_resample on :X_train, :y_train
X_rs, y_rs = resampler.fit_resample( X_train_processed,
                                     y_train )

In [37]:
# check new class distribution
print( f"Original Training Class Counts:\n{ y_train.value_counts() }\n" )
print( f"Training Class Counts: { y_rs.value_counts() }" )

# interpretation
# over-corrected & over-sampled rarest class (4: :killed)
# original class -> heavily imbalanced; majority 1-2
# after SMOTEENN -> skewed towards 3-4; classes 1-2 are now minority classes
# next steps: trying SMOTE and SMOTE+Tomek to target milder balance

Original Training Class Counts:
ind_severity
1    200101
2    196691
3     74871
4     12848
Name: count, dtype: int64

Training Class Counts: ind_severity
1    180000
2    180000
3    150000
4    140000
Name: count, dtype: int64


In [38]:
# optimize for classes 3-4 -> using GridSearchCV

# custom scorer
# macro F1 over classes { :3, :4 }
def f1_34(y_true, y_pred):
    return f1_score( y_true,
                     y_pred,
                     labels=[3, 4],
                     average='macro' )
 
minority_f1 = make_scorer( f1_34 )



In [None]:
# model training
# on resampled data
# init :RandomForestClassifier
model_rf = RandomForestClassifier( n_jobs=-1,
                                   random_state=369 )

# params
param_grid = {
    "n_estimators": [300, 600],
    "max_depth": [None, 20, 10],
    "min_samples_split": [2, 10],
    "min_samples_leaf": [1, 5, 10],
    "max_features": ["sqrt", 0.5],
    "class_weight": [None, "balanced"]
}

# :GridSearchCV
# focus on classes 3–4
model_gscv = GridSearchCV( model_rf,
                     param_grid=param_grid,
                     scoring=minority_f1,
                     cv=3,
                     n_jobs=-1,
                     verbose=1 )

# apply fit on re-sampled training set :X_rs, :y_rs
model_gscv.fit( X=X_rs,
                y=y_rs )

# :best_estimators_
best_rf = model_gscv.best_estimator_

# predict on :X_test
# make prediction -> on original untouched test_set
y_pred_gscv = best_rf.predict( X_test_processed )



Fitting 3 folds for each of 144 candidates, totalling 432 fits


In [None]:
# model eval
# use robust metrics beyond simple accuracy, such as f1-score, precision, recall

# classification report
print( classification_report(y_test, 
                             y_pred_gscv) )

# confusion matrix
cm = pd.crosstab( y_test, 
                  y_pred_gscv, 
                  rownames=['Actual Class'], 
                  colnames=['Predicted Class'] )

print( f"Confusion Matrix:\n{ cm }" )

              precision    recall  f1-score   support

           1       0.76      0.84      0.80     95219
           2       0.73      0.65      0.69     93114
           3       0.52      0.60      0.56     34672
           4       0.53      0.14      0.22      5795

    accuracy                           0.71    228800
   macro avg       0.64      0.56      0.57    228800
weighted avg       0.71      0.71      0.70    228800

Confusion Matrix:
Predicted Class      1      2      3    4
Actual Class                             
1                79799  11592   3770   58
2                21089  60227  11604  194
3                 3772   9790  20633  477
4                  504   1027   3441  823


In [None]:
# :GradientBoostingClassifier
# has no direct :class_weight
# workaround: passing sample weights computed from class frequencies
classes = np.unique( y_rs )

class_w = compute_class_weight( 'balanced',
                                classes=classes,
                                y=y_rs )

class_weight_dict = dict( zip(classes, class_w) )

# per-sample weights from :class_weight_dict
sample_weight = np.array( [class_weight_dict[c] for c in y_rs] )

# init :GradientBoostingClassifier
model_gbc = GradientBoostingClassifier( n_estimators=400,
                                        learning_rate=0.05,
                                        max_depth=3,
                                        random_state=369 )

# apply fit
model_gbc.fit( X=X_rs,
               y=y_rs,
               sample_weight=sample_weight )

# predict :X_test
y_pred_gbc = model_gbc.predict( X_test )




In [None]:
# classification report
print( classification_report(y_test, 
                             y_pred_gbc) )

# confusion matrix
cm = pd.crosstab( y_test,
                  y_pred_gbc,
                  rownames=['Actual Class'],
                  colnames=['Predicted Class'] )

print( f"Confusion Matrix:\n{ cm }" )


              precision    recall  f1-score   support

           1       0.74      0.79      0.77     95219
           2       0.73      0.49      0.59     93114
           3       0.40      0.44      0.42     34672
           4       0.14      0.62      0.23      5795

    accuracy                           0.61    228800
   macro avg       0.50      0.59      0.50    228800
weighted avg       0.67      0.61      0.63    228800

Confusion Matrix:
Predicted Class      1      2      3      4
Actual Class                               
1                75279  10670   5066   4204
2                22822  45916  16630   7746
3                 3127   6247  15228  10070
4                  324    474   1379   3618


In [None]:
# :XGBoost with sample weights (multi-class)
# multi-class imbalance -> using :sample_weight (one weight per instance)
# expects lables to start at '0' -> :y_rs labels are 1-4

# re-mapping labels
# { 1, 2, 3, 4} -> { 0, 1, 2, 3 }
y_rs_xgbc_encoded = y_rs - 1
y_test_xgbc_encoded = y_test - 1

# calc sample weights on encoded labels
sample_weight = compute_sample_weight( 'balanced', 
                                       y_rs_xgbc_encoded )

# init :XGBC
model_xgbc = XGBClassifier(
    objective="multi:softprob",
    num_class=4,
    eval_metric="mlogloss",
    n_estimators=600,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    random_state=369,
    n_jobs=-1
)

# apply fit on :X_rs, :y_rs_xgbc_encoded
model_xgbc.fit( X_rs, 
                y_rs_xgbc_encoded, 
                sample_weight=sample_weight )

# predict on :X_test
# :XGBC will see classes [0,.., 3] as required
y_pred_xgbc_encoded = model_xgbc.predict( X_test )

# mapping back to original lables
# evaluating everyting in original severity labels 1-4
# 0..3 -> 1..4
y_pred_xgbc = y_pred_xgbc_encoded + 1



In [18]:
# classification report
print( classification_report(y_test, 
                             y_pred_xgbc) )

# confusion matrix
cm = pd.crosstab( y_test,
                  y_pred_xgbc,
                  rownames=['Actual Class'],
                  colnames=['Predicted Class'] )

print( f"Confusion Matrix:\n{ cm }" )



              precision    recall  f1-score   support

           1       0.76      0.79      0.77     95219
           2       0.73      0.52      0.61     93114
           3       0.41      0.47      0.44     34672
           4       0.16      0.63      0.25      5795

    accuracy                           0.63    228800
   macro avg       0.51      0.60      0.52    228800
weighted avg       0.68      0.63      0.64    228800

Confusion Matrix:
Predicted Class      1      2      3     4
Actual Class                              
1                75511  11129   5282  3297
2                21326  48144  16958  6686
3                 2845   6063  16424  9340
4                  275    450   1403  3667


In [19]:
# :LightGBM with class weights
# supports class weights natively
classes = np.unique( y_rs )

class_w = compute_class_weight( 'balanced',
                                classes=classes, 
                                y=y_rs )

class_weight = dict( zip(classes,
                         class_w) )

lgbm = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=4,
    n_estimators=600,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight=class_weight,
    random_state=369,
    n_jobs=-1
)

# apply fit on :X_rs, :y_rs
lgbm.fit( X_rs, 
          y_rs )

# predict on :X_test
y_pred_lgbm = lgbm.predict( X_test )





[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1997
[LightGBM] [Info] Number of data points in the train set: 650000, number of used features: 40
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294


In [20]:
# classification report
print( classification_report(y_test, 
                             y_pred_lgbm) )

# confusion matrix
cm = pd.crosstab( y_test,
                  y_pred_lgbm,
                  rownames=['Actual Class'],
                  colnames=['Predicted Class'] )

print( f"Confusion Matrix:\n{ cm }" )


              precision    recall  f1-score   support

           1       0.76      0.79      0.78     95219
           2       0.73      0.53      0.62     93114
           3       0.42      0.49      0.45     34672
           4       0.17      0.62      0.27      5795

    accuracy                           0.64    228800
   macro avg       0.52      0.61      0.53    228800
weighted avg       0.68      0.64      0.65    228800

Confusion Matrix:
Predicted Class      1      2      3     4
Actual Class                              
1                75107  11777   5420  2915
2                20302  49765  16993  6054
3                 2712   6202  17093  8665
4                  253    454   1495  3593


In [28]:
# :LightGBM with focal loss
# stronger focus on minority samples
# requires custom objective
def focal_loss_lgb_multiclass(alpha=0.25, gamma=2.0):
    
    # returns a LightGBM-compatible loss function
    def loss(y_true, preds):
        # y_true: (n,), labels [ 1,..,4 ]
        # preds: raw scores, shape (n * K,)
        n_classes = 4
        
        # convert labels to int indices 0..K-1
        # 1..4 -> 0..3
        y_true = y_true.astype( int ) - 1
        
        # re-shape preds to (n, K)
        preds = preds.reshape( n_classes, 
                               -1 ).T  
        
        # softmax to probabilities
        exp_preds = np.exp( preds - preds.max(axis=1, 
                                              keepdims=True) )
        
        # (n, K)
        prob = exp_preds / exp_preds.sum( axis=1, 
                                          keepdims=True )

        # one-hot labels (n, K)
        y_onehot = np.eye( n_classes )[y_true]
        
        # focal loss terms
        # (n, )
        pt = ( prob * y_onehot ).sum( axis=1 )
        pt = np.clip( pt,
                      1e-12,
                      1.0 )
        
        grad_factor = alpha * (gamma * (1 - pt) ** (gamma - 1) * (-np.log(pt)) + (1 - pt) ** gamma) / ( pt )

        # gradients
        # (n, K)
        grad = grad_factor[:, None] * ( prob - y_onehot )
        
        # hessians
        # simple approx
        hess = np.ones_like( grad )

        return grad.T.reshape(-1), hess.T.reshape(-1)
    return loss

# init :LGBMClassifier
lgbm_focal = lgb.LGBMClassifier(
    num_class=4,
    objective=focal_loss_lgb_multiclass(alpha=0.25, gamma=2.0),
    n_estimators=600,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=369,
    n_jobs=-1
)

# apply fit on :X_rs, :y_rs
# :y_rs -> re-mapping done by objective
lgbm_focal.fit( X=X_rs, 
                y=y_rs )

# predict on :X_test
# get raw scores (n_samples, n_classes)
# for multi-class this is already (n, K)
proba = lgbm_focal.predict( X_test )

# convert to predicted class indices 0..3
y_pred_idx = proba.argmax( axis=1 )

# mapping back
# 0..3 -> 1..4
y_pred_lgbm_focal = y_pred_idx + 1




[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1997
[LightGBM] [Info] Number of data points in the train set: 650000, number of used features: 40
[LightGBM] [Info] Using self-defined objective function


Returning raw scores instead.


In [29]:
# classification report
print( classification_report(y_test, 
                             y_pred_lgbm_focal) )

# confusion matrix
cm = pd.crosstab( y_test,
                  y_pred_lgbm_focal,
                  rownames=['Actual Class'],
                  colnames=['Predicted Class'] )

print( f"Confusion Matrix:\n{ cm }" )


              precision    recall  f1-score   support

           1       0.20      0.15      0.17     95219
           2       0.42      0.15      0.23     93114
           3       0.37      0.29      0.33     34672
           4       0.00      0.07      0.01      5795

    accuracy                           0.17    228800
   macro avg       0.25      0.17      0.18    228800
weighted avg       0.31      0.17      0.21    228800

Confusion Matrix:
Predicted Class      1      2      3      4
Actual Class                               
1                14028   4935   5410  70846
2                47180  14384   8487  23063
3                 7753  13159  10181   3579
4                  700   1457   3218    420


In [None]:
# ---------------------------
# save to google cloud bucket
# ---------------------------
gc_storage.upload( bucket=bucket,
                   obj=df,
                   local_folder='2_preprocessing',
                   file_name='1.0-munz-data-preprocessing_resampling.joblib' )