In [1]:
"""
notebook: 1.0.-leibold-data-pipeline.jpynb

author: Christian Leibold

created/updated at: 2025-12-01

intention: create preprocessing pipeline to prepare data from concat joblib. 
           

content:
---------
-> reads in joblib with concatenated data from prior process stept with csv-files for 2019-2024 as base for this notebook 
-> create a pipeline with most important preprocessing steps

"""

'\nnotebook: 1.5.-leibold-data-modeling_randomForest.jpynb\n\nauthor: Christian Leibold\n\ncreated/updated at: 2025-11-30\n\nintention: create preprocessing pipeline to prepare data from concat joblib. \n           use random forest model with smote to create a first baseline model. \n\ncontent:\n---------\n-> reads in joblib with concatenated data from prior process stept with csv-files for 2019-2024 as base for this notebook \n-> create a pipeline with most important preprocessing steps\n-> NEW: use GridSearchCV to optimize hyperparamethers\n-> create random forest model + smote resampling\n-> save model to local storage\n\n'

In [1]:
# classic packages
import pandas as pd
import numpy as np
from joblib import dump, load

# make custom libraries importable
import sys
sys.path.append('../../library')

# pipeline compatibel classes
from road_accidents_pipeline import RemoveIrrelevantCols         # removes some unnecessary columns
from road_accidents_pipeline import RearrangeCatCols             # bundle veh classes together to higher-tier classes
from road_accidents_pipeline import ConditionalMultiQuantImputer # impute missing values of quantiative variables via distr. on cat var
from road_accidents_pipeline import ConditionalCatImputer        # impute missing values of categorcial variables on target distribution
from road_accidents_pipeline import AggrLocaSplit                # aggregation of loca intersections incl. oneHotEncoding 
from road_accidents_pipeline import SupervisedEncoderWrapper     # wrapper for supervised encoders like CatBoostEncoder or TargetEncoder to make it work in pipeline
from road_accidents_pipeline import RemoveIdCols                 # removes ind_temp_id and acc_num after aggregation
from road_accidents_pipeline import TrigonometricEncoder         # encoding for cyclic variables lice acc_month or acc_hour
from road_accidents_pipeline import SafeColumnSelector           # makes sure only to use existing columns in pipeline

# scaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# encoder
from sklearn.preprocessing import OneHotEncoder, SplineTransformer
from sklearn.preprocessing import FunctionTransformer
from category_encoders import TargetEncoder, CatBoostEncoder

# train_test_split
from sklearn.model_selection import train_test_split

# pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# little helper
from cleaning_utils import distinguish_cols, print_col_categories
import time



In [3]:
# -------------------------------------------------------------------------------------------------
# read in joblib with concatenated data
# -------------------------------------------------------------------------------------------------
df_ori = load(r'..\..\..\temp_data\1.2-leibold-data-preprocessing_concat.joblib')
#df = df_ori.loc[(df['acc_year']==2024) | (df['acc_year']==2023)]
#df = df_ori.sample(n=10000)

print(df_ori.shape)
display(df_ori.head(3))

(762666, 44)


Unnamed: 0,acc_num,ind_place,ind_cat,ind_severity,ind_sex,ind_trip,ind_secu1,ind_secu2,ind_location,ind_action,...,veh_id,loca_road_cat,loca_traffic_circul,loca_road_lanes,loca_road_gradient,loca_road_view,loca_road_surface_cond,loca_accident,loca_max_speed,loca_is_intersection
0,201900000001,2,2,2,2,0,1,0,,,...,138 306 524,1,3,10.0,1,2.0,1.0,1,70.0,0
1,201900000001,1,1,2,2,5,1,0,,,...,138 306 524,1,3,10.0,1,2.0,1.0,1,70.0,0
2,201900000001,1,1,1,1,0,1,0,,,...,138 306 525,1,3,10.0,1,2.0,1.0,1,70.0,0


In [4]:
# -------------------------------------------------------------------------------------------------
# create sample dataframe respecting ind_severity distribution and using rows from all years
# -------------------------------------------------------------------------------------------------
# Example proportion
prop = 0.1 

# Filter years 2019–2023 -> 2024 then shall be used to pretend to be "new" data
df_filtered = df_ori[df_ori['acc_year'].between(2019, 2023)]

# Compute target sample size
target_size = int(len(df_filtered) * prop)

# Step 1: Split by year
samples = []
n_years = df_filtered['acc_year'].nunique()

for year, df_year in df_filtered.groupby('acc_year'):
    # Equal share per year
    year_target_size = target_size // n_years

    # Step 2: Stratified sampling within year by ind_severity
    df_year_sample = (
        df_year
        .groupby('ind_severity', group_keys=False)
        .sample(
            n=None,  # let pandas compute based on frac
            frac=year_target_size / len(df_year),
            random_state=42
        )
    )
    samples.append(df_year_sample)

# Concatenate all year samples
df = pd.concat(samples)

print(df['acc_year'].value_counts())                     # equal counts per year
print(df['ind_severity'].value_counts(normalize=True))   # distribution preserved
print(df_ori['ind_severity'].value_counts(normalize=True))
print(df.shape)

acc_year
2019    12255
2020    12255
2021    12255
2022    12255
2023    12255
Name: count, dtype: int64
ind_severity
1    0.415031
2    0.406446
3    0.152950
4    0.025573
Name: proportion, dtype: float64
ind_severity
1    0.416165
2    0.406967
3    0.151538
4    0.025330
Name: proportion, dtype: float64
(61275, 44)


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 61275 entries, 128040 to 628509
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   acc_num                 61275 non-null  int64         
 1   ind_place               61275 non-null  int64         
 2   ind_cat                 61275 non-null  int64         
 3   ind_severity            61275 non-null  int64         
 4   ind_sex                 61275 non-null  int64         
 5   ind_trip                61275 non-null  int64         
 6   ind_secu1               61223 non-null  Int64         
 7   ind_secu2               37439 non-null  Int64         
 8   ind_location            33586 non-null  Int64         
 9   ind_action              36465 non-null  Int64         
 10  ind_year                61275 non-null  int64         
 11  ind_age                 61275 non-null  Int64         
 12  ind_age_group           61275 non-null  Int64

In [6]:
# -------------------------------------------------------------------------------------------------
# data split
# -------------------------------------------------------------------------------------------------
# Separate features and target
X = df.drop(columns=['ind_severity'])   # predictors
y = df['ind_severity']                  # target

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# check shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (49020, 43)
y_train shape: (49020,)
X_test shape: (12255, 43)
y_test shape: (12255,)


In [7]:
# -------------------------------------------------------------------------------------------------
# check missing values
# -------------------------------------------------------------------------------------------------
missing_vars = X_train.isna().sum()
missing_vars = missing_vars[missing_vars > 0]
print(missing_vars)

ind_secu1                    41
ind_secu2                 19026
ind_location              22283
ind_action                19957
acc_ambient_lightning         1
acc_atmosphere                5
acc_intersection              1
acc_collision_type          377
veh_impact                   15
veh_maneuver               2742
loca_road_lanes             975
loca_road_view               26
loca_road_surface_cond       26
loca_max_speed             1174
dtype: int64


In [8]:
# -------------------------------------------------------------------------------------------------
# define columns to be operated: read in columns from dictionary for later use in pipeline
# -------------------------------------------------------------------------------------------------
cols_dict = distinguish_cols(df)

cat_cols = cols_dict["categorical"] # Get all categorical variables from cols_dict
missing_cat_cols = [col for col in cat_cols if col in X_train.columns and X_train[col].isna().any()] # Restrict to those with missing values

ohe_cols = cols_dict['oneHot_encoder']
valid_ohe_cols = [col for col in ohe_cols if col in X_train.columns] # Restrict to existing columns


⚠️ Missing columns (ignored): ['loca_road_count']


In [9]:
# -------------------------------------------------------------------------------------------------
# Full preprocessing pipeline
# -------------------------------------------------------------------------------------------------
# Pipeline with most important preprocessing steps. can be reused in other notebooks. 
# needs concatenated data as input.
# for the whole dataset takes about 11 minutes -> so try with smaller data sample first. 

# steps:
# --------------------------------------------------------
# - RemoveIrrelevantCols: Drops columns not useful for prediction.
# - ConditionalMultiQuantImputer: Imputes missing numeric values based on road category.
# - ConditionalCatImputer: Fills missing categorical values conditionally on target distribution.
# - AggrLocaSplit: Aggregates location features for intersections.
# - RemoveIdCols: Removes identifier columns used in aggregation.
# - RearrangeCatCols: merge some classes in veh categorical columns to reduce cardinatliy.
# - ColumnTransformer: Applies spline encoding, one-hot, CatBoost encoding, and scaling to features.


# Start timer
start_time = time.time()

# instantiate encoders
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype='int')
catboost_encoder = SupervisedEncoderWrapper(CatBoostEncoder(handle_unknown='value', handle_missing='value'), columns=["acc_municipality"])
cyclical_transformer = TrigonometricEncoder({"acc_hour": 24, "acc_month": 12})

# instantiate Scaler
scaler = MinMaxScaler()

# Combine everything into one pipeline
preprocessing_pipeline = Pipeline([
    # data preparation
    ("remove_cols", RemoveIrrelevantCols(verbose=True)),

    # imputing
    ("impute_quant", ConditionalMultiQuantImputer(quant_vars=["loca_max_speed", "loca_road_lanes"],cat_var="loca_road_cat")),
    ("impute_cat", ConditionalCatImputer(var_list=cat_cols)),

    # aggregation
    ("aggregate", AggrLocaSplit(agg_features="max", agg_target="max")),
    ("remove_id_cols", RemoveIdCols(verbose=True)),

    # encoding & scaling
    ("rearrange_categories", RearrangeCatCols(verbose=True)),
    ("encode_scale", ColumnTransformer([
        ("cyclical", cyclical_transformer, ["acc_hour", "acc_month"]),  
       ("onehot", Pipeline([
            ("select", SafeColumnSelector(cols_dict['oneHot_encoder'])),
            ("encode", ohe_encoder)
        ]), slice(None)),

        ("catboost", catboost_encoder, ["acc_municipality"]),
        ("scale", scaler, cols_dict["quantitative"])
    ], remainder="passthrough"))
    
], verbose=True)


# Fit the pipeline
preprocessing_pipeline.fit(X_train, y_train)

# Transform training and test data
X_train_piped = preprocessing_pipeline.transform(X_train)
X_test_piped = preprocessing_pipeline.transform(X_test)

# Access the fitted aggregation step
aggr_step = preprocessing_pipeline.named_steps["aggregate"]

# Transform y consistently (because the aggregation step reduces row count in X -> so y must also be reduced to avoid mismatch error)
y_train_piped = aggr_step.transform_y(X_train, y_train)
y_test_piped  = aggr_step.transform_y(X_test, y_test)


# End timer
end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60
print(f"⏱️ Preprocessing Pipeline fit completed in {elapsed_minutes:.2f} minutes")


[Pipeline] ....... (step 1 of 7) Processing remove_cols, total=   0.0s
[Pipeline] ...... (step 2 of 7) Processing impute_quant, total=   1.0s
ℹ️ ConditionalCatImputer -> Column 'ind_location' not found, skipping.
ℹ️ ConditionalCatImputer -> Column 'ind_action' not found, skipping.
ℹ️ ConditionalCatImputer -> Column 'ind_secu2' not found, skipping.
[Pipeline] ........ (step 3 of 7) Processing impute_cat, total=   1.6s
[Pipeline] ......... (step 4 of 7) Processing aggregate, total=   1.1s
[Pipeline] .... (step 5 of 7) Processing remove_id_cols, total=   0.0s
[Pipeline]  (step 6 of 7) Processing rearrange_categories, total=   0.0s
[Pipeline] ...... (step 7 of 7) Processing encode_scale, total=   0.3s
⏱️ Preprocessing Pipeline fit completed in 0.15 minutes


In [14]:
# --------------------------------------------------------------------------------------------------
# safe pipeline
# --------------------------------------------------------------------------------------------------

dump(preprocessing_pipeline, "../../models/preprocessing_pipeline.pkl")


['../../models/preprocessing_pipeline.pkl']

In [18]:
# --------------------------------------------------------------------------------------------------
# check loading pipeline
# --------------------------------------------------------------------------------------------------
loaded_pipeline = load("../../models/preprocessing_pipeline.pkl")

loaded_pipeline.fit(X_train, y_train)

[Pipeline] ....... (step 1 of 7) Processing remove_cols, total=   0.0s
[Pipeline] ...... (step 2 of 7) Processing impute_quant, total=   1.1s
ℹ️ ConditionalCatImputer -> Column 'ind_location' not found, skipping.
ℹ️ ConditionalCatImputer -> Column 'ind_action' not found, skipping.
ℹ️ ConditionalCatImputer -> Column 'ind_secu2' not found, skipping.
[Pipeline] ........ (step 3 of 7) Processing impute_cat, total=   1.7s
[Pipeline] ......... (step 4 of 7) Processing aggregate, total=   1.2s
[Pipeline] .... (step 5 of 7) Processing remove_id_cols, total=   0.0s
[Pipeline]  (step 6 of 7) Processing rearrange_categories, total=   0.0s
[Pipeline] ...... (step 7 of 7) Processing encode_scale, total=   0.3s


0,1,2
,steps,"[('remove_cols', ...), ('impute_quant', ...), ...]"
,transform_input,
,memory,
,verbose,True

0,1,2
,cols_to_drop,"['veh_id', 'ind_year', ...]"
,verbose,True

0,1,2
,quant_vars,"['loca_max_speed', 'loca_road_lanes']"
,cat_var,'loca_road_cat'
,random_state,42

0,1,2
,var_list,"['acc_year', 'acc_municipality', ...]"
,random_state,42
,verbose,True

0,1,2
,agg_features,'max'
,agg_target,'max'
,keys,"['acc_num', 'ind_temp_id']"
,random_state,42
,verbose,True

0,1,2
,cols_to_drop,"['ind_temp_id', 'acc_num']"
,verbose,True

0,1,2
,verbose,True

0,1,2
,transformers,"[('cyclical', ...), ('onehot', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,col_periods,"{'acc_hour': 24, 'acc_month': 12}"

0,1,2
,candidate_cols,"['acc_ambient_lightning', 'acc_urbanization_level', ...]"

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,'int'
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'
,random_state,
,sigma,
,a,1

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False
