In [None]:
"""
notebook: 1.1.-leibold-data-preprocessing_concatenated.ipynb

author: Christian Leibold

created/updated at: 2025-11-19

intention: concatenating the prepared ind, loca, veh and acc datasets.
           Caution: includes multiplied lines due to intersections in the loca dataset!!

content:
---------
-> reads in the joblib files prepared in the prior process step for ind, loca, acc, veh
-> adds new variable "loca_is_intersection", helpful for later aggregation and maybe relevant for the model
-> concat ind, loca, acc, veh
-> remove rows with metro=0 (accidents outside France mainland)
-> remove id columns veh_num, ind_vehID
-> export to joblib
NEW: -> export to google storage included

"""

In [42]:
import pandas as pd
import numpy as np
from joblib import dump, load

import seaborn as sns
import matplotlib as plt

from scipy.stats import chi2_contingency
import scipy.stats as stats

In [43]:
#----------------------------------------------------------------------------------------------------------------------------------
# import joblibs from prior step / show first infos about datasets
#----------------------------------------------------------------------------------------------------------------------------------
# Forward slashes are used on purpose: they resolve on Windows, Linux and macOS alike, whereas
# backslash-separated relative paths are only understood on Windows.
PREPROCESSING_DIR = '../../data/processed/2_preprocessing'

df_acc = load(f'{PREPROCESSING_DIR}/1.1-simmler-data-preprocessing_accidents.joblib')
df_loca = load(f'{PREPROCESSING_DIR}/1.1-munz-data-preprocessing_locations.joblib')
df_ind = load(f'{PREPROCESSING_DIR}/1.3-becker-data-preprocessing_usagers.joblib')
df_veh = load(f'{PREPROCESSING_DIR}/1.3-leibold-data-preprocessing_vehicles.joblib')

# row / column counts of every input table
print("accidents shape:", df_acc.shape)
print("locactions shape:", df_loca.shape)
print("individuals shape:", df_ind.shape)
print("vehicles shape:", df_veh.shape)

# first three rows of every input table
display(df_acc.head(3))
display(df_loca.head(3))
display(df_ind.head(3))
display(df_veh.head(3))

# dtypes and non-null counts of every input table
df_acc.info()
df_loca.info()
df_ind.info()
df_veh.info()


accidents shape: (327628, 15)
locactions shape: (359510, 9)
individuals shape: (733875, 15)
vehicles shape: (559847, 8)


Unnamed: 0,acc_num,acc_date,acc_year,acc_month,acc_hour,acc_department,acc_municipality,acc_metro,acc_long,acc_lat,acc_ambient_lightning,acc_atmosphere,acc_urbanization_level,acc_intersection,acc_collision_type
0,201900000001,2019-11-30,2019,11,1,93,93053,1,2.47012,48.89621,4.0,1.0,1,1.0,2.0
1,201900000002,2019-11-30,2019,11,2,93,93066,1,2.3688,48.9307,3.0,1.0,1,1.0,6.0
2,201900000003,2019-11-28,2019,11,15,92,92036,1,2.319174,48.935872,1.0,1.0,1,1.0,4.0


Unnamed: 0,acc_num,loca_road_cat,loca_traffic_circul,loca_road_lanes,loca_road_gradient,loca_road_view,loca_road_surface_cond,loca_accident,loca_max_speed
0,201900000001,1,3,10.0,1,2.0,1.0,1,70.0
1,201900000002,1,1,2.0,4,2.0,1.0,1,70.0
2,201900000003,1,3,8.0,1,3.0,1.0,1,90.0


Unnamed: 0,acc_num,ind_vehID,veh_num,ind_place,ind_cat,ind_severity,ind_sex,ind_trip,ind_secu1,ind_secu2,ind_location,ind_action,ind_year,ind_age,ind_age_group
0,201900000001,138 306 524,B01,2,2,2,2,0,1,0,,,2019,17,1
1,201900000001,138 306 524,B01,1,1,2,2,5,1,0,,,2019,26,3
2,201900000001,138 306 525,A01,1,1,1,1,0,1,0,,,2019,60,4


Unnamed: 0,acc_num,veh_cat,veh_fixed_obstacle,veh_moving_obstacle,veh_impact,veh_maneuver,veh_motor,veh_id
0,201900000001,7,0,2,5.0,23.0,1,138 306 524
1,201900000001,17,1,0,3.0,11.0,1,138 306 525
2,201900000002,7,4,0,1.0,,1,138 306 523


<class 'pandas.core.frame.DataFrame'>
Index: 327628 entries, 0 to 54401
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   acc_num                 327628 non-null  int64         
 1   acc_date                327628 non-null  datetime64[ns]
 2   acc_year                327628 non-null  int64         
 3   acc_month               327628 non-null  int64         
 4   acc_hour                327628 non-null  int64         
 5   acc_department          327628 non-null  object        
 6   acc_municipality        327628 non-null  object        
 7   acc_metro               327628 non-null  int64         
 8   acc_long                327628 non-null  float64       
 9   acc_lat                 327628 non-null  float64       
 10  acc_ambient_lightning   327619 non-null  float64       
 11  acc_atmosphere          327602 non-null  float64       
 12  acc_urbanization_level  327628 non-n

In [44]:
#----------------------------------------------------------------------------------------------------------------------------------
# check missing values
#----------------------------------------------------------------------------------------------------------------------------------
# For each input table, list only the columns that contain at least one missing value.
frames_to_check = {
    'df_acc': df_acc,
    'df_ind': df_ind,
    'df_loca': df_loca,
    'df_veh': df_veh,
}

for position, (frame_name, frame) in enumerate(frames_to_check.items()):
    # every report except the first is separated from the previous one by a blank line
    separator = "" if position == 0 else "\n"
    print(f"{separator}Missing values in {frame_name}:")

    na_per_column = frame.isna().sum()
    print(na_per_column[na_per_column > 0])


Missing values in df_acc:
acc_ambient_lightning       9
acc_atmosphere             26
acc_intersection           14
acc_collision_type       1619
dtype: int64

Missing values in df_ind:
ind_secu1          889
ind_secu2       292536
ind_location    331748
ind_action      300547
dtype: int64

Missing values in df_loca:
loca_road_lanes           10664
loca_road_view              238
loca_road_surface_cond      310
loca_max_speed            11599
dtype: int64

Missing values in df_veh:
veh_impact        299
veh_maneuver    35808
dtype: int64


In [30]:
#----------------------------------------------------------------------------------------------------------------------------------
# check duplicates
#----------------------------------------------------------------------------------------------------------------------------------
# display(df_acc[df_acc.duplicated(keep=False)])
# display(df_loca[df_loca.duplicated(keep=False)])
# display(df_ind[df_ind.duplicated(keep=False)])
# display(df_veh[df_veh.duplicated(keep=False)])


In [45]:
#---------------------------------------------------------------------------------------------------------------------------------------
# loca_is_intersection as new binary variable to identify intersection accidents
#---------------------------------------------------------------------------------------------------------------------------------------
# flag every row whose acc_num occurs more than once in df_loca (1 = repeated, 0 = unique)
acc_num_repeated = df_loca.duplicated(subset='acc_num', keep=False)
df_loca['loca_is_intersection'] = acc_num_repeated.astype(int)

# share of flagged vs. unflagged location rows
df_loca['loca_is_intersection'].value_counts(normalize=True)


loca_is_intersection
0    0.823657
1    0.176343
Name: proportion, dtype: float64

In [46]:
#---------------------------------------------------------------------------------------------------------------------------------------
# concat ind, acc, veh, loca: starting with ind dataset
#---------------------------------------------------------------------------------------------------------------------------------------
# Not merged in the order given in the French government documentation, to avoid additional lines.
# E.g. if two vehicles are involved in an accident but only one person is known and has data,
# we only want to keep the lines where we have info on our target variable.

n_individuals = len(df_ind)
print("IND shape:", df_ind.shape)

# one accident row per individual: the row count must not change
df = pd.merge(df_ind, df_acc, on = 'acc_num', how='left')
print("IND + ACC shape:", df.shape)
assert len(df) == n_individuals, "IND + ACC merge added rows: acc_num is not unique in df_acc"

# one vehicle row per individual (matched on accident number + vehicle id): the row count must not change
df = pd.merge(df, df_veh, left_on = ['acc_num', 'ind_vehID'], right_on = ['acc_num', 'veh_id'], how='left')
print("IND + ACC + VEH shape:", df.shape)
assert len(df) == n_individuals, "IND + ACC + VEH merge added rows: (acc_num, veh_id) is not unique in df_veh"

# df_loca can hold several rows per acc_num, so this merge multiplies rows on purpose
df = pd.merge(df, df_loca, on = 'acc_num', how='left').reset_index(drop=True)
print("IND + ACC + VEH + LOCA shape:", df.shape)


IND shape: (733875, 15)
IND + ACC shape: (733875, 29)
IND + ACC + VEH shape: (733875, 36)
IND + ACC + VEH + LOCA shape: (807332, 45)


In [22]:
# #---------------------------------------------------------------------------------------------------------------------------------------
# # concat ind, acc, veh, loca: starting with acc dataset following documentation order for merges
# #---------------------------------------------------------------------------------------------------------------------------------------
# print("ACC shape:", df_acc.shape)

# df2 = pd.merge(df_acc, df_loca, on='acc_num', how='left')  # Unfall + Ort
# print("ACC + LOCA shape:", df2.shape)

# df2 = pd.merge(df2, df_veh, on='acc_num', how='left')       # + Fahrzeuge
# print("ACC + LOCA + VEH shape:", df2.shape)

# df2 = pd.merge(df2, df_ind, left_on=['acc_num', 'veh_id'], right_on=['acc_num', 'ind_vehID'], how='left')  # + Personen
# print("ACC + LOCA + VEH + IND shape:", df2.shape)


ACC shape: (327628, 15)
ACC + LOCA shape: (359510, 24)
ACC + LOCA + VEH shape: (617566, 31)
ACC + LOCA + VEH + IND shape: (820050, 45)


In [39]:
##---------------------------------------------------------------------------------------------------------------------------------------
## compare row counts for both merge orders
##---------------------------------------------------------------------------------------------------------------------------------------
# # Example: group by acc_num and count rows
# df_counts = df.groupby("acc_num").size().reset_index(name="count_df")
# df2_counts = df2.groupby("acc_num").size().reset_index(name="count_df2")

# # Merge the two count tables on acc_num
# merged_counts = pd.merge(df_counts, df2_counts, on="acc_num", how="outer")

# # Keep only acc_num where counts differ
# diff_counts = merged_counts[merged_counts["count_df"] != merged_counts["count_df2"]]

# print(diff_counts)

In [40]:
# #---------------------------------------------------------------------------------------------------------------------------------------
# # check raw data to be sure not having mistake in year-csv-concatenated file
# #---------------------------------------------------------------------------------------------------------------------------------------
# df_ind_raw = load(r'..\..\data\processed\1_exploration\1.0-becker-data-exploration-raw_usagers.joblib')
# df_veh_raw = load(r'..\..\data\processed\1_exploration\1.0-leibold-data-exploration_vehicles.joblib')
# #df_ind_raw.head()
# display(df_ind_raw[df_ind_raw['Num_Acc']==acc_num_])
# display(df_veh_raw[df_veh_raw['Num_Acc']==acc_num_])


In [41]:
# #---------------------------------------------------------------------------------------------------------------------------------------
# # check single accident
# #---------------------------------------------------------------------------------------------------------------------------------------
# acc_num_ = 201900000058

# display(df_acc[df_acc['acc_num']==acc_num_])
# display(df_loca[df_loca['acc_num']==acc_num_])
# display(df_veh[df_veh['acc_num']==acc_num_])
# display(df_ind[df_ind['acc_num']==acc_num_])

# display(df[df['acc_num']==acc_num_])
# display(df2[df2['acc_num']==acc_num_])

In [47]:
#---------------------------------------------------------------------------------------------------------------------------------------
# keep only accidents in France mainland (acc_metro == 1)
#---------------------------------------------------------------------------------------------------------------------------------------
#df['acc_metro'].value_counts()
print("df shape BEFORE removing accidents out of France mainland:", df.shape)

# boolean mask of the rows to keep; the original index labels are retained (no reset)
is_mainland = df['acc_metro'].eq(1)
df = df.loc[is_mainland]

print("df shape AFTER removing accidents out of France mainland:", df.shape)

df shape BEFORE removing accidents out of France mainland: (807332, 45)
df shape AFTER removing accidents out of France mainland: (762666, 45)


In [48]:
#---------------------------------------------------------------------------------------------------------------------------------------
# remove irrelevant columns (if not already removed in prior joblib files)
#---------------------------------------------------------------------------------------------------------------------------------------
# id columns to discard; names that are absent from df are skipped silently
cols_remove = ['veh_num', 'ind_vehID']
df = df.drop(labels=cols_remove, axis='columns', errors='ignore')

In [49]:
# Summarise dtypes and non-null counts of the merged, filtered and column-reduced frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 762666 entries, 0 to 807331
Data columns (total 43 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   acc_num                 762666 non-null  int64         
 1   ind_place               762666 non-null  int64         
 2   ind_cat                 762666 non-null  int64         
 3   ind_severity            762666 non-null  int64         
 4   ind_sex                 762666 non-null  int64         
 5   ind_trip                762666 non-null  int64         
 6   ind_secu1               761592 non-null  Int64         
 7   ind_secu2               459219 non-null  Int64         
 8   ind_location            410831 non-null  Int64         
 9   ind_action              440404 non-null  Int64         
 10  ind_year                762666 non-null  int64         
 11  ind_age                 762666 non-null  Int64         
 12  ind_age_group           762666 non-

In [65]:
# -------------------------------------------------------------------------------------------------
# export final dataframe to joblib in local folder
# -------------------------------------------------------------------------------------------------
from joblib import dump

# Forward slashes keep the target path valid on every OS and consistent with the
# '../../../temp_data/' folder the upload cell reads the file from.
dump(df, '../../../temp_data/1.2-leibold-data-preprocessing_concat.joblib')
#dump(df, '../../data/processed/2_preprocessing/1.1-leibold-data-preprocessing_concatenated.joblib')


['..\\..\\..\\temp_data\\1.1-leibold-data-preprocessing_concat.joblib']

In [61]:
# -------------------------------------------------------------------------------------------------
# init Google storage
# -------------------------------------------------------------------------------------------------
import os
import sys

# make the project-local helper module importable; the guard avoids piling up
# duplicate sys.path entries when this cell is re-run
library_dir = '../../library'
if library_dir not in sys.path:
    sys.path.append(library_dir)
import gc_storage

# init Google Cloud storage
bucket_name = 'sep25-bds-road-accidents'
# The key file location can be overridden through the GOOGLE_APPLICATION_CREDENTIALS environment
# variable so it does not have to live at a fixed place inside the repository; the previous
# hard-coded path remains the fallback.
# NOTE(review): assumes gc_storage.init_bucket accepts any service-account JSON key path - confirm.
key_path = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', '../../fiery-glass-478009-t8-18a81c8cbe63.json')

bucket = gc_storage.init_bucket(bucket = bucket_name, json_key_path=key_path)

Initialized sep25-bds-road-accidents


In [62]:
# -------------------------------------------------------------------------------------------------
# list content in Google storage
# -------------------------------------------------------------------------------------------------
# blobs = bucket.list_blobs( prefix='data/processed/2_preprocessing/' )
# for blob in blobs:
#     if blob.name.endswith( '.joblib' ):
#         print( blob.name )

# #gc_storage.list_bucket(bucket=bucket, remote_folder='2_preprocessing')

data/processed/2_preprocessing/0.1-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.2-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.3-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.4-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/1.0-becker-data-preprocessing_usagers.joblib
data/processed/2_preprocessing/1.0-leibold-data-preprocessing_vehicles.joblib
data/processed/2_preprocessing/1.0-simmler-data-preprocessing_accidents.joblib
data/processed/2_preprocessing/1.0.1-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/1.0.2-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/1.0.3-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/1.1-becker-data-preprocessing_usagers.joblib
data/processed/2_preprocessing/1.1-leibold-data-preprocessing_concat.joblib
data/processed/2_preprocessing/1.1-leibold-data-preprocessing_vehicles.joblib

In [67]:
# -------------------------------------------------------------------------------------------------
# upload locally saved file to Google storage
# -------------------------------------------------------------------------------------------------
# NOTE(review): the recorded run of this cell ended in a RetryError (write timed out after 120 s);
# presumably the file is too large for the helper's default timeout - check gc_storage.upload.
upload_args = {
    'bucket': bucket,
    'obj': df,
    'local_folder': r'../../../temp_data/',
    'file_name': '1.2-leibold-data-preprocessing_concat.joblib',
}
gc_storage.upload(**upload_args)


RetryError: Timeout of 120.0s exceeded, last exception: ('Connection aborted.', TimeoutError('The write operation timed out'))

In [63]:
# df_test = gc_storage.download(bucket = bucket, 
#                       remote_path = '2_preprocessing/1.1-munz-data-preprocessing_locations.joblib')
# df_test.head()

NotFound: 404 GET https://storage.googleapis.com/download/storage/v1/b/sep25-bds-road-accidents/o/data%5Cprocessed%5C2_preprocessing%5C1.1-munz-data-preprocessing_locations.joblib?alt=media: No such object: sep25-bds-road-accidents/data\processed\2_preprocessing\1.1-munz-data-preprocessing_locations.joblib: ('Request failed with status code', 404, 'Expected one of', <HTTPStatus.OK: 200>, <HTTPStatus.PARTIAL_CONTENT: 206>)