In [None]:
# author: Michael Munz
# workflow: prepare the data for re-sampling with imblearn (imbalanced-learn) while avoiding data leakage
# imblearn is fully compatible with the scikit-learn ecosystem
# necessary steps:
#   1 defining explanatory vars :X
#   2 defining target var :y (:ind_severity)
#   3 splitting
#   4 imputing
#   5 encoding
#   6 saving the prepared sets to Google Cloud storage




In [1]:
# import
import pandas as pd
import numpy as np

import sys
sys.path.append( '../../library' )
import gc_storage
import missing_values_utils

from joblib import load

from sklearn.model_selection import train_test_split


In [2]:
# Connect to the project's Google Cloud storage bucket.
# NOTE(review): the path to the service-account key file is hard-coded here;
# consider reading it from an environment variable instead.
bucket_name = 'sep25-bds-road-accidents'
key_path = '../../auth/fiery-glass-478009-t8-18a81c8cbe63.json'

bucket = gc_storage.init_bucket(bucket=bucket_name, json_key_path=key_path)

Initialized sep25-bds-road-accidents


In [3]:
# Show which artefacts already exist under '2_preprocessing' in the bucket.
gc_storage.list_bucket(bucket=bucket, remote_folder='2_preprocessing')


Number of blobs: [28]
data/processed/2_preprocessing/
data/processed/2_preprocessing/0.1-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.2-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.3-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.4-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/1.0-becker-data-preprocessing_usagers.joblib
data/processed/2_preprocessing/1.0-leibold-data-preprocessing_vehicles.joblib
data/processed/2_preprocessing/1.0-munz-acc-municipality_X_test_uniques_lookup_table.gc
data/processed/2_preprocessing/1.0-munz-acc-municipality_X_train_uniques_lookup_table.gc
data/processed/2_preprocessing/1.0-munz-preprocessing-X_train_num.gc
data/processed/2_preprocessing/1.0-simmler-data-preprocessing_accidents.joblib
data/processed/2_preprocessing/1.0.1-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/1.0.2-munz-data-preprocessing_locations.joblib
data/pr

In [6]:
# Fetch the concatenated pre-processing artefact from Google Cloud storage.
# NOTE(review): the return value of gc_storage.download is bound to `df`, but `df`
# is re-assigned by the joblib load in the following cell -- confirm what
# download returns and whether this binding is needed.
df = gc_storage.download(
    bucket=bucket,
    remote_path='2_preprocessing/1.2-leibold-data-preprocessing_concat.joblib',
)

Downloaded data/processed/2_preprocessing/1.2-leibold-data-preprocessing_concat.joblib to
 ../../data/processed/2_preprocessing/1.2-leibold-data-preprocessing_concat.joblib


In [4]:
# Read the data set from its local copy with joblib.
# NOTE(review): this reads the '.gc' file while the download cell fetches the
# '.joblib' variant of the same name -- confirm that the '.gc' file exists locally.
df = load(
    '../../data/processed/2_preprocessing/1.2-leibold-data-preprocessing_concat.gc'
)



In [5]:
# preview: dimensions, column dtypes and number of missing values per column
display( df.shape )

# DataFrame.info() prints its report itself and returns None;
# wrapping it in display() only adds a stray "None" to the output
df.info()

display( df.isnull().sum() )

(762666, 44)

<class 'pandas.core.frame.DataFrame'>
Index: 762666 entries, 0 to 807331
Data columns (total 44 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   acc_num                 762666 non-null  int64         
 1   ind_place               762666 non-null  int64         
 2   ind_cat                 762666 non-null  int64         
 3   ind_severity            762666 non-null  int64         
 4   ind_sex                 762666 non-null  int64         
 5   ind_trip                762666 non-null  int64         
 6   ind_secu1               761592 non-null  Int64         
 7   ind_secu2               459219 non-null  Int64         
 8   ind_location            410831 non-null  Int64         
 9   ind_action              440404 non-null  Int64         
 10  ind_year                762666 non-null  int64         
 11  ind_age                 762666 non-null  Int64         
 12  ind_age_group           762666 non-

None

acc_num                        0
ind_place                      0
ind_cat                        0
ind_severity                   0
ind_sex                        0
ind_trip                       0
ind_secu1                   1074
ind_secu2                 303447
ind_location              351835
ind_action                322262
ind_year                       0
ind_age                        0
ind_age_group                  0
ind_temp_id                    0
acc_date                       0
acc_year                       0
acc_month                      0
acc_hour                       0
acc_department                 0
acc_municipality               0
acc_metro                      0
acc_long                       0
acc_lat                        0
acc_ambient_lightning         10
acc_atmosphere                37
acc_urbanization_level         0
acc_intersection              29
acc_collision_type          3769
veh_cat                        0
veh_fixed_obstacle             0
veh_moving

In [6]:
# columns that are not needed from here on
delete = ['acc_num', 'ind_temp_id', 'veh_id']

# re-bind instead of mutating in place
df = df.drop(columns=delete)



In [7]:
# target var :y -> :ind_severity (unbalanced target variable)
y = df['ind_severity']

# explanatory vars :X -> every column except the target
X = df.drop(columns=['ind_severity'])


In [8]:
# Stratified split into training set (70 %) and test set (30 %).
# The split happens BEFORE any re-sampling, so re-sampling is applied to the training set only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=369,
    stratify=y,
)



In [9]:
# class distribution of the target in the training set
train_class_counts = y_train.value_counts()
print( f"Original Training Class Counts:\n{ train_class_counts }" )


Original Training Class Counts:
ind_severity
1    222176
2    217266
3     80901
4     13523
Name: count, dtype: int64


In [10]:
# missing value mgmt -> impute
#
# 1 impute quantitative (numerical) variables that contain NaNs
#   numerical = [ :loca_max_speed, :loca_road_lanes ]
target_col1 = 'loca_max_speed'
target_col2 = 'loca_road_lanes'

# columns handed to the imputation helper as predictors
predictor_cols = [
    # accident-level (time, geo) context
    "acc_urbanization_level", "acc_year", "acc_month", "acc_hour",
    "acc_metro", "acc_long", "acc_lat",

    # vehicle-level context
    "veh_cat", "veh_fixed_obstacle", "veh_moving_obstacle", "veh_motor",

    # location / road context
    "loca_road_cat", "loca_traffic_circul", "loca_road_gradient",
    "loca_accident", "loca_is_intersection",

    # individual-level context
    "ind_place", "ind_cat", "ind_sex", "ind_trip",
    "ind_year", "ind_age", "ind_age_group",
]

# :X_train -> impute one target column after the other (same order as listed above)
for numerical_target in (target_col1, target_col2):
    X_train = missing_values_utils.impute_numerical_by_regression(
        df=X_train,
        target_col=numerical_target,
        predictor_cols=predictor_cols,
    )



In [11]:
# :X_test -> same numerical imputation as for :X_train, one target column at a time
# NOTE(review): the helper only receives X_test here; if it fits its regression on
# the frame it is given, the test set is imputed by a model fitted on test data
# rather than on X_train -- confirm against missing_values_utils.
for numerical_target in (target_col1, target_col2):
    X_test = missing_values_utils.impute_numerical_by_regression(
        df=X_test,
        target_col=numerical_target,
        predictor_cols=predictor_cols,
    )


In [12]:
# 2 impute qualitative (categorical) variables that contain NaNs

# 2.1 columns whose NaNs are replaced by 0: :unknown
categorical_cols_0 = [
    # accident-level (time, geo) context
    'acc_ambient_lightning', 'acc_atmosphere', 'acc_urbanization_level',
    'acc_intersection', 'acc_collision_type',

    # vehicle-level context
    'veh_impact', 'veh_maneuver',

    # road-context
    'loca_road_view', 'loca_road_surface_cond',

    # individual-context
    'ind_location', 'ind_action',
]

# 2.2 columns whose NaNs are replaced by 8: :unknown
categorical_cols_8 = [
    # individual-level context
    'ind_secu1', 'ind_secu2',
]

# :X_train -> apply 2.1 first, then 2.2
for unknown_cols, unknown_label in ((categorical_cols_0, 0), (categorical_cols_8, 8)):
    X_train = missing_values_utils.impute_categorical_by_category(
        X_train,
        unknown_cols,
        new_category_label=unknown_label,
    )



In [13]:
# :X_test -> same categorical imputation as for :X_train
#   NaN -> 0 for the columns in categorical_cols_0, then NaN -> 8 for categorical_cols_8
for unknown_cols, unknown_label in ((categorical_cols_0, 0), (categorical_cols_8, 8)):
    X_test = missing_values_utils.impute_categorical_by_category(
        X_test,
        unknown_cols,
        new_category_label=unknown_label,
    )

In [14]:
# encode column :acc_date (e.g. 2024-08-13 00:00:00)
# target break-down of :acc_date -> { :acc_year, :acc_month,
#                                     :acc_dayofweek, :acc_dayofyear, :acc_hour }
# this cell adds the two derived parts
#   :acc_dayofweek -> { 0: :Mon, ..., 6: :Sun }
#   :acc_dayofyear
# and drops the raw :acc_date column afterwards
X_train = (
    X_train
    .assign(
        acc_dayofweek=lambda frame: frame['acc_date'].dt.dayofweek,
        acc_dayofyear=lambda frame: frame['acc_date'].dt.dayofyear,
    )
    .drop(columns=['acc_date'])
)


In [15]:
# :X_test -> derive :acc_dayofweek and :acc_dayofyear from :acc_date,
# then drop the raw :acc_date column
X_test = (
    X_test
    .assign(
        acc_dayofweek=lambda frame: frame['acc_date'].dt.dayofweek,
        acc_dayofyear=lambda frame: frame['acc_date'].dt.dayofyear,
    )
    .drop(columns=['acc_date'])
)



In [16]:
# verify: first two rows of all date-related columns
date_part_cols = ['acc_year', 'acc_month', 'acc_dayofweek', 'acc_dayofyear', 'acc_hour']
display( X_train.head(2)[date_part_cols] )




Unnamed: 0,acc_year,acc_month,acc_dayofweek,acc_dayofyear,acc_hour
650463,2024,8,1,226,19
432105,2022,12,5,344,17


In [17]:
# encode column :acc_department (object)
# departments become one integer feature, which avoids a huge sparse one-hot matrix
# the alpha-numeric codes [ '2A', '2B' ] have no integer form and get their own integers

# Corsica codes -> integers NOT used by mainland France codes
corsica = {'2A': 201, '2B': 202}

# apply the mapping to both sets, then cast ALL values to integer
for split_frame in (X_train, X_test):
    split_frame['acc_department'] = (
        split_frame['acc_department']
        .replace(corsica)
        .astype(int)
    )



In [18]:
# verify: number of distinct department codes and the codes themselves
display(
    X_train['acc_department'].nunique(),
    X_train['acc_department'].unique(),
)



96

array([ 93,  35,  75,  17,  33,  37,  13,  62,  91,   9,  94,  87,  88,
        29,  67,  95,  21,  64,  48,  69,  57,  38,  49,  59,  54,  16,
        11,  92,  14,  89,   3,  23,  63,  22,  31,  47,  77,  26,  85,
        83,  78,  71,  74,   1,  61,  72,  10,  45,  56,  68,  80,  65,
        41,  52,  81,  25,  60,  15,   7,  79,  34,  30,  82,  39,  28,
        58,   5,  86,  51,  66,   6,  50,  27,   8,  73,  53,  44, 202,
        36,  18,  19,  84,  24,  43,  76,  32,  42,  55,  70, 201,   2,
        12,  40,   4,  46,  90])

In [19]:
# encode column :acc_municipality (object)
# every municipality code becomes ONE integer ID (categorical ID, NOT one-hot)
# codes can contain non-digits, such as '2B010', so a plain integer cast is not possible
#
# "factorize" assigns an integer to each distinct municipality label:
#   codes   -> numeric encoding used as feature
#   uniques -> lookup table to map a numeric code back to the original municipality code
X_train_codes, X_train_uniques = X_train['acc_municipality'].factorize()

# :X_train -> replace the labels by their integer codes
X_train['acc_municipality'] = X_train_codes.astype(int)



In [20]:
# :X_test
# Encode the test set with the lookup table learned on the TRAINING set.
# Factorizing X_test on its own numbers the municipalities in order of appearance
# in X_test, so the same municipality would get a different integer in train and
# test and the feature would be meaningless for a model fitted on X_train.
#
# get_indexer returns the position of each label in the train lookup table and
# -1 for municipalities that do not occur in the training set.
X_test_codes = pd.Index( X_train_uniques ).get_indexer( X_test.acc_municipality )

# report how many test rows fall outside the train lookup table
print( f"Test rows with a municipality unseen in training: { int( ( X_test_codes == -1 ).sum() ) }" )

# kept for the later upload cell: the test set now shares the train lookup table
X_test_uniques = X_train_uniques

X_test['acc_municipality'] = X_test_codes.astype( int )

In [21]:
# persist the municipality lookup table of the training set (:X_train_uniques)
# in the google cloud bucket
gc_storage.upload(
    bucket=bucket,
    obj=X_train_uniques,
    local_folder='2_preprocessing',
    file_name='1.0-munz-acc-municipality_X_train_uniques_lookup_table.gc',
)


Uploaded ../../data/processed/2_preprocessing/1.0-munz-acc-municipality_X_train_uniques_lookup_table.gc to
 gs://sep25-bds-road-accidents/data/processed/2_preprocessing/1.0-munz-acc-municipality_X_train_uniques_lookup_table.gc


In [22]:
# persist the municipality lookup table of the test set (:X_test_uniques)
# in the google cloud bucket
gc_storage.upload(
    bucket=bucket,
    obj=X_test_uniques,
    local_folder='2_preprocessing',
    file_name='1.0-munz-acc-municipality_X_test_uniques_lookup_table.gc',
)




Uploaded ../../data/processed/2_preprocessing/1.0-munz-acc-municipality_X_test_uniques_lookup_table.gc to
 gs://sep25-bds-road-accidents/data/processed/2_preprocessing/1.0-munz-acc-municipality_X_test_uniques_lookup_table.gc


In [23]:
# verify: number of distinct municipality codes and the codes themselves
display(
    X_train['acc_municipality'].nunique(),
    X_train['acc_municipality'].unique(),
)


23076

array([    0,     1,     2, ..., 23073, 23074, 23075], shape=(23076,))

In [24]:
# validate the prepared training set
# preview: dimensions and column dtypes
display( X_train.shape )

# DataFrame.info() prints its report itself and returns None;
# wrapping it in display() only adds a stray "None" to the output
X_train.info()

# missing values: total over the whole frame, then per column
display( X_train.isnull().sum().sum() )
display( X_train.isnull().sum() )


(533866, 41)

<class 'pandas.core.frame.DataFrame'>
Index: 533866 entries, 650463 to 547672
Data columns (total 41 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ind_place               533866 non-null  int64  
 1   ind_cat                 533866 non-null  int64  
 2   ind_sex                 533866 non-null  int64  
 3   ind_trip                533866 non-null  int64  
 4   ind_secu1               533866 non-null  Int64  
 5   ind_secu2               533866 non-null  Int64  
 6   ind_location            533866 non-null  Int64  
 7   ind_action              533866 non-null  Int64  
 8   ind_year                533866 non-null  int64  
 9   ind_age                 533866 non-null  Int64  
 10  ind_age_group           533866 non-null  Int64  
 11  acc_year                533866 non-null  int64  
 12  acc_month               533866 non-null  int64  
 13  acc_hour                533866 non-null  int64  
 14  acc_department      

None

np.int64(0)

ind_place                 0
ind_cat                   0
ind_sex                   0
ind_trip                  0
ind_secu1                 0
ind_secu2                 0
ind_location              0
ind_action                0
ind_year                  0
ind_age                   0
ind_age_group             0
acc_year                  0
acc_month                 0
acc_hour                  0
acc_department            0
acc_municipality          0
acc_metro                 0
acc_long                  0
acc_lat                   0
acc_ambient_lightning     0
acc_atmosphere            0
acc_urbanization_level    0
acc_intersection          0
acc_collision_type        0
veh_cat                   0
veh_fixed_obstacle        0
veh_moving_obstacle       0
veh_impact                0
veh_maneuver              0
veh_motor                 0
loca_road_cat             0
loca_traffic_circul       0
loca_road_lanes           0
loca_road_gradient        0
loca_road_view            0
loca_road_surface_co

In [25]:
# SMOTEENN is fed a purely numeric, float-compatible matrix:
# BEFORE re-sampling, build a copy of X_train in which ALL columns are 'float64'
X_train_num = X_train.astype( np.float64 )



In [26]:
# :X_train_num -> ready for re-sampling
# save to google cloud bucket
gc_storage.upload(
    bucket=bucket,
    obj=X_train_num,
    local_folder='2_preprocessing',
    file_name='1.0-munz-preprocessing-X_train_num.gc',
)


Uploaded ../../data/processed/2_preprocessing/1.0-munz-preprocessing-X_train_num.gc to
 gs://sep25-bds-road-accidents/data/processed/2_preprocessing/1.0-munz-preprocessing-X_train_num.gc


In [28]:
# :y_train -> target of the training set
# save to google cloud bucket
gc_storage.upload(
    bucket=bucket,
    obj=y_train,
    local_folder='2_preprocessing',
    file_name='1.0-munz-preprocessing-y_train.gc',
)



Uploaded ../../data/processed/2_preprocessing/1.0-munz-preprocessing-y_train.gc to
 gs://sep25-bds-road-accidents/data/processed/2_preprocessing/1.0-munz-preprocessing-y_train.gc


In [27]:
# :X_test -> explanatory vars of the test set
# save to google cloud bucket
# NOTE(review): the training set is stored as 'float64' (X_train_num) while X_test
# is stored with its current dtypes -- confirm the consumers handle both.
gc_storage.upload(
    bucket=bucket,
    obj=X_test,
    local_folder='2_preprocessing',
    file_name='1.0-munz-preprocessing-X_test.gc',
)




Uploaded ../../data/processed/2_preprocessing/1.0-munz-preprocessing-X_test.gc to
 gs://sep25-bds-road-accidents/data/processed/2_preprocessing/1.0-munz-preprocessing-X_test.gc


In [29]:
# :y_test -> target of the test set
# save to google cloud bucket
gc_storage.upload(
    bucket=bucket,
    obj=y_test,
    local_folder='2_preprocessing',
    file_name='1.0-munz-preprocessing-y_test.gc',
)



Uploaded ../../data/processed/2_preprocessing/1.0-munz-preprocessing-y_test.gc to
 gs://sep25-bds-road-accidents/data/processed/2_preprocessing/1.0-munz-preprocessing-y_test.gc
