In [None]:
"""
author: Michael Munz

Use pipeline in Jupyter notebook to streamline data processing.
Follow these steps:
- import pipeline factory functions
- load raw data into pd.DataFrame
- init pipeline
- fit & transform data via fit_transform()
- transform new data (:validation_set, :test_set) via transform() to
apply same preprocessing steps based on training fit.
"""

In [1]:
# import
import pandas as pd
import numpy as np

import sys
sys.path.append( '../../library' )
import gc_storage as gcs
import data_preprocessing_utils as dpu
import data_preprocessing_pipeline as dpp

from joblib import load
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


In [None]:
# init google cloud storage (GCS)
bucket_name='sep25-bds-road-accidents'
key_path='../../auth/fiery-glass-478009-t8-18a81c8cbe63.json'

bucket = gcs.init_bucket( bucket=bucket_name,
                          json_key_path=key_path )



In [None]:
# list all files in GCS
gcs.list_bucket( bucket=bucket,
                 remote_folder='2_preprocessing' )


In [None]:
# download from GCS
df = gcs.download( bucket=bucket,
                   remote_path='2_preprocessing/1.0-leibold-data-preprocessing_aggr.joblib')



In [2]:
# load data
df = load( '../../data/processed/2_preprocessing/1.0-leibold-data-preprocessing_aggr.gc' )



In [None]:
# data exploration & pipeline config for ML pre-processing

# get ALL column categories filtered to EXISTING cols
col_categories = dpu.categorize_dataframe_columns(df)

# create summary table
dpu.display_column_categories( col_categories )



In [3]:
# explanatory vars :X
X = df.drop( columns='ind_severity',
             axis=1 )

# target var y: :ind_severity
# important: var is unbalanced
y = df.ind_severity

print( f"target distribution:\n{y.value_counts()}\n" )
print( f"target distribution:\n{y.value_counts(normalize=True).round(3)}" )

target distribution:
ind_severity
1    285859
2    280987
3    106958
4     18355
Name: count, dtype: int64

target distribution:
ind_severity
1    0.413
2    0.406
3    0.155
4    0.027
Name: proportion, dtype: float64


In [4]:
# data splitting with stratification
# split into training set, test set BEFORE applying pipeline + resampling
# stratify=y -> stratified split; prevents bias
#               ensures class distribution (proportions of each target class)
#               in { :y_train, :y_test } matches original :y
# stratify=n -> random split
X_train, X_test, y_train, y_test = train_test_split( X, 
                                                     y, 
                                                     test_size=0.3, 
                                                     random_state=369, 
                                                     stratify=y )

print( f"train shape: {X_train.shape}" )
print( f"test shape: {X_test.shape}" )

train shape: (484511, 44)
test shape: (207648, 44)


In [None]:
# data exploration & pipeline config for ML pre-processing

# get ALL column categories filtered to EXISTING cols
X_train_col_categories = dpu.categorize_dataframe_columns( X_train )

# create summary table
dpu.display_column_categories( X_train_col_categories )


In [6]:
# using small sample of training set
pipeline = dpp.build_default_full_pipeline()

# fit full pipeline :X_train, :y_train
pipeline.fit( X_train,
              y_train )

preprocessor = pipeline.named_steps['preparation']

# use small sample
X_sample = X_train.sample( 200,
                           random_state=369 )

# :get_feature_names_out()
# # opt 1 - manual
# preprocessor = pipeline.named_steps[ 'preparation' ]

# # last step (ColumnTransformer)
# encoding = preprocessor.named_steps[ 'encoding' ]

# feature_names = encoding.get_feature_names_out()

# print( len(feature_names) )
# print( feature_names[:10] )

# opt 2 - via method
feature_names = dpp.get_full_feature_names_from_preprocessor( X=X_sample,
                                                              preprocessor=preprocessor )


# inspect
display( "No. of features: {len(feature_names)}" )
display( feature_names[:10] )





ValueError: Shape of passed values is (200, 79), indices imply (200, 77)

In [7]:
X_train_processed = pipeline.transform( X_train )

X_train_processed_df = pd.DataFrame( X_train_processed,
                                     columns=feature_names,
                                     index=X_train.index )

print( X_train_processed_df.shape )
print( X_train_processed_df.head() )




NameError: name 'feature_names' is not defined

In [None]:
# build fully configured pipeline with defaults
pipeline = dpp.build_default_full_pipeline()
pipeline.fit( X_train,
              y_train )

X_train_processed = pipeline.transform( X_train )

n_features = X_train_processed.shape[1]

feature_names = [ f"{i}" for i in range(n_features) ]

X_train_processed_df = pd.DataFrame(
    X_train_processed,
    columns=feature_names,
    index=X_train.index
)

print( f"processed train shape: {X_train_processed.shape}" )
display( X_train_processed_df.head() )


In [None]:
display( X_train_processed_df.head() )



In [None]:
# train baseline model
# opt 1: use processed data directly with simple model to validate pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

clf = RandomForestClassifier(
    n_estimators=200,
    random_state=369,
    n_jobs=-1
)

# 1 apply fit
clf.fit( X_train_processed,
         y_train )

# 2 make prediction
y_pred = clf.predict( X_test_processed )

# 3 classification report
print( classification_report(y_test,
                             y_pred) )



In [None]:
# opt 2: wrap model + preprocessing into single pipeline
preprocessing_pipeline = dpp.build_default_full_pipeline()

# single pipeline
model_rfc = Pipeline([
    ( 'preprocessing',
      preprocessing_pipeline ),
    ( 'model',
      RandomForestClassifier(n_estimators=200,
                             random_state=369,
                             n_jobs=-1) )
])

# apply fit on :X_train, :y_train
model_rfc.fit( X_train,
               y_train )

# make prediction on :X_test
y_pred = model_rfc.predict( X_test )

# classification report
print( classification_report(y_test, 
                             y_pred) )




In [None]:
# persist pre-processed :training_set, :test_set

# :X_train_processed
gcs.upload( bucket=bucket,
            obj=X_train_processed,
            local_folder='2_preprocessing',
            file_name='1.0-munz-preprocessing-X_train_processed.gc')



In [None]:
# :y_train
gcs.upload( bucket=bucket,
            obj=y_train,
            local_folder='2_preprocessing',
            file_name='1.0-munz-preprocessing-y_train.gc')

In [None]:
# :X_test_processed
gcs.upload( bucket=bucket,
            obj=X_test_processed,
            local_folder='2_preprocessing',
            file_name='1.0-munz-preprocessing-X_test_processed.gc')



In [None]:
# :y_test
gcs.upload( bucket=bucket,
            obj=y_test,
            local_folder='2_preprocessing',
            file_name='1.0-munz-preprocessing-y_test.gc')