In [None]:
"""
notebook: 1.0.-leibold-data-preprocessing_aggr.ipynb

author: Christian Leibold

created/updated at: 2025-12-05

intention: create a joblib data file containing the merged data from acc, loca, veh, ind,
           with intersections in the loca data already reduced to one row per accident

content:
---------
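-> reads in the preprocessed acc, loca, veh, ind data (joblib files from the prior step)
-> reduces intersections in the locations data to one row (keeps the road with the highest max_speed)
-> merges ind, acc, veh and loca data and keeps only accidents in mainland France
-> saves the data in a joblib file for reuse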

"""

In [16]:
import itertools
from pathlib import Path

# NOTE(review): the next line binds the top-level matplotlib package to `plt`, not
# matplotlib.pyplot - confirm whether `import matplotlib.pyplot as plt` was intended.
import matplotlib as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from joblib import dump, load
from scipy.stats import chi2_contingency

In [2]:
#----------------------------------------------------------------------------------------------------------------------------------
# import joblibs from prior step / show first infos about datasets
#----------------------------------------------------------------------------------------------------------------------------------
# The input paths are built with pathlib so that the notebook also runs on non-Windows systems
# (raw strings with backslash separators only resolve on Windows).
PREPROCESSED_DIR = Path("..") / ".." / "data" / "processed" / "2_preprocessing"

df_acc = load(PREPROCESSED_DIR / "1.1-simmler-data-preprocessing_accidents.joblib")
df_loca = load(PREPROCESSED_DIR / "1.1-munz-data-preprocessing_locations.joblib")
df_ind = load(PREPROCESSED_DIR / "1.3-becker-data-preprocessing_usagers.joblib")
df_veh = load(PREPROCESSED_DIR / "1.3-leibold-data-preprocessing_vehicles.joblib")

# caption -> frame, in the order in which the overview is printed
frames_by_caption = {
    "accidents shape:": df_acc,
    "locactions shape:": df_loca,
    "individuals shape:": df_ind,
    "vehicles shape:": df_veh,
}

# first all shapes, then all previews, then all column summaries
for caption, frame in frames_by_caption.items():
    print(caption, frame.shape)

for frame in frames_by_caption.values():
    display(frame.head(3))

for frame in frames_by_caption.values():
    frame.info()


accidents shape: (327628, 15)
locactions shape: (359510, 9)
individuals shape: (733875, 15)
vehicles shape: (559847, 8)


Unnamed: 0,acc_num,acc_date,acc_year,acc_month,acc_hour,acc_department,acc_municipality,acc_metro,acc_long,acc_lat,acc_ambient_lightning,acc_atmosphere,acc_urbanization_level,acc_intersection,acc_collision_type
0,201900000001,2019-11-30,2019,11,1,93,93053,1,2.47012,48.89621,4.0,1.0,1,1.0,2.0
1,201900000002,2019-11-30,2019,11,2,93,93066,1,2.3688,48.9307,3.0,1.0,1,1.0,6.0
2,201900000003,2019-11-28,2019,11,15,92,92036,1,2.319174,48.935872,1.0,1.0,1,1.0,4.0


Unnamed: 0,acc_num,loca_road_cat,loca_traffic_circul,loca_road_lanes,loca_road_gradient,loca_road_view,loca_road_surface_cond,loca_accident,loca_max_speed
0,201900000001,1,3,10.0,1,2.0,1.0,1,70.0
1,201900000002,1,1,2.0,4,2.0,1.0,1,70.0
2,201900000003,1,3,8.0,1,3.0,1.0,1,90.0


Unnamed: 0,acc_num,ind_vehID,veh_num,ind_place,ind_cat,ind_severity,ind_sex,ind_trip,ind_secu1,ind_secu2,ind_location,ind_action,ind_year,ind_age,ind_age_group
0,201900000001,138 306 524,B01,2,2,2,2,0,1,0,,,2019,17,1
1,201900000001,138 306 524,B01,1,1,2,2,5,1,0,,,2019,26,3
2,201900000001,138 306 525,A01,1,1,1,1,0,1,0,,,2019,60,4


Unnamed: 0,acc_num,veh_cat,veh_fixed_obstacle,veh_moving_obstacle,veh_impact,veh_maneuver,veh_motor,veh_id
0,201900000001,7,0,2,5.0,23.0,1,138 306 524
1,201900000001,17,1,0,3.0,11.0,1,138 306 525
2,201900000002,7,4,0,1.0,,1,138 306 523


<class 'pandas.core.frame.DataFrame'>
Index: 327628 entries, 0 to 54401
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   acc_num                 327628 non-null  int64         
 1   acc_date                327628 non-null  datetime64[ns]
 2   acc_year                327628 non-null  int64         
 3   acc_month               327628 non-null  int64         
 4   acc_hour                327628 non-null  int64         
 5   acc_department          327628 non-null  object        
 6   acc_municipality        327628 non-null  object        
 7   acc_metro               327628 non-null  int64         
 8   acc_long                327628 non-null  float64       
 9   acc_lat                 327628 non-null  float64       
 10  acc_ambient_lightning   327619 non-null  float64       
 11  acc_atmosphere          327602 non-null  float64       
 12  acc_urbanization_level  327628 non-n

In [3]:
#----------------------------------------------------------------------------------------------------------------------------------
# keep only one road per acc_num in df_loca for intersections
#----------------------------------------------------------------------------------------------------------------------------------
# Step 1: compute group-level metadata (number of location rows and speed-limit range per accident)
meta = (
    df_loca.groupby("acc_num")
           .agg(
               loca_road_counts=("acc_num", "size"),
               loca_max_speed_min=("loca_max_speed", "min"),
               loca_max_speed_max=("loca_max_speed", "max")
           )
           .reset_index()
)

# Step 2: add derived variables
meta["loca_is_intersection"] = (meta["loca_road_counts"] > 1).astype(int)
meta["loca_max_speed_dif"] = meta["loca_max_speed_max"] - meta["loca_max_speed_min"]

# Step 3: select representative row per acc_num
# - keep row with max loca_max_speed (missing speeds sort last by default, so such a row is
#   only chosen when no speed is known for the accident)
# - if tie, keep first occurrence
# The rows are taken directly from the sorted frame instead of being looked up again through
# their index labels: a label lookup (`df_loca.loc[idx]`) returns every row that shares a label
# and would therefore duplicate accidents if df_loca carried a non-unique index.
df_representative = (
    df_loca.sort_values(["acc_num", "loca_max_speed"], ascending=[True, False])
           .drop_duplicates(subset="acc_num", keep="first")
           .reset_index(drop=True)
)

# Step 4: merge metadata back
df_loca_no_intersec = df_representative.merge(
    meta[["acc_num", "loca_is_intersection", "loca_road_counts", "loca_max_speed_dif"]],
    on="acc_num", how="left"
)

# the later merge onto the individuals relies on exactly one location row per accident
assert df_loca_no_intersec["acc_num"].is_unique, "expected exactly one location row per acc_num"

In [4]:
#----------------------------------------------------------------------------------------------------------------------------------
# compare the shape of the original locations frame with the reduced one
#----------------------------------------------------------------------------------------------------------------------------------
for caption, frame in (
    ("locactions shape:", df_loca),
    ("loca_no_intersec shape:", df_loca_no_intersec),
):
    print(caption, frame.shape)

# preview and column summary of the reduced frame
display(df_loca_no_intersec.head(3))

df_loca_no_intersec.info()



locactions shape: (359510, 9)
loca_no_intersec shape: (327628, 12)


Unnamed: 0,acc_num,loca_road_cat,loca_traffic_circul,loca_road_lanes,loca_road_gradient,loca_road_view,loca_road_surface_cond,loca_accident,loca_max_speed,loca_is_intersection,loca_road_counts,loca_max_speed_dif
0,201900000001,1,3,10.0,1,2.0,1.0,1,70.0,0,1,0.0
1,201900000002,1,1,2.0,4,2.0,1.0,1,70.0,0,1,0.0
2,201900000003,1,3,8.0,1,3.0,1.0,1,90.0,0,1,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327628 entries, 0 to 327627
Data columns (total 12 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   acc_num                 327628 non-null  int64  
 1   loca_road_cat           327628 non-null  int64  
 2   loca_traffic_circul     327628 non-null  int64  
 3   loca_road_lanes         322798 non-null  float64
 4   loca_road_gradient      327628 non-null  int64  
 5   loca_road_view          327585 non-null  float64
 6   loca_road_surface_cond  327550 non-null  float64
 7   loca_accident           327628 non-null  int64  
 8   loca_max_speed          322482 non-null  float64
 9   loca_is_intersection    327628 non-null  int64  
 10  loca_road_counts        327628 non-null  int64  
 11  loca_max_speed_dif      322482 non-null  float64
dtypes: float64(5), int64(7)
memory usage: 30.0 MB


In [5]:
#----------------------------------------------------------------------------------------------------------------------------------
# list all intersection accidents to check examples
#----------------------------------------------------------------------------------------------------------------------------------
# acc_num values that occur on more than one location row
rows_per_acc = df_loca["acc_num"].value_counts()
multi_acc_nums = rows_per_acc.loc[rows_per_acc.gt(1)].index

# keep only the rows of those accidents, ordered by accident and speed limit for readability
df_loca_multi = (
    df_loca.loc[df_loca["acc_num"].isin(multi_acc_nums)]
           .sort_values(["acc_num", "loca_max_speed"])
           .reset_index(drop=True)
)

# Show result
display(df_loca_multi)

Unnamed: 0,acc_num,loca_road_cat,loca_traffic_circul,loca_road_lanes,loca_road_gradient,loca_road_view,loca_road_surface_cond,loca_accident,loca_max_speed
0,202300000001,5,1,2.0,1,1.0,2.0,1,30.0
1,202300000001,5,1,1.0,1,1.0,2.0,1,30.0
2,202300000003,4,2,4.0,1,1.0,2.0,1,50.0
3,202300000003,4,2,4.0,1,1.0,2.0,1,50.0
4,202300000005,5,1,1.0,1,1.0,2.0,1,30.0
...,...,...,...,...,...,...,...,...,...
63392,202400054388,5,1,2.0,1,1.0,1.0,1,50.0
63393,202400054393,4,1,1.0,1,1.0,1.0,7,30.0
63394,202400054393,4,2,4.0,1,1.0,1.0,1,50.0
63395,202400054398,4,2,3.0,1,1.0,2.0,4,50.0


In [6]:
#----------------------------------------------------------------------------------------------------------------------------------
# check single examples: original location rows next to the reduced row of the same accident
#----------------------------------------------------------------------------------------------------------------------------------
for example_acc_num in (202400054388, 202400054393):
    display(df_loca.loc[df_loca['acc_num'] == example_acc_num])
    display(df_loca_no_intersec.loc[df_loca_no_intersec['acc_num'] == example_acc_num])

Unnamed: 0,acc_num,loca_road_cat,loca_traffic_circul,loca_road_lanes,loca_road_gradient,loca_road_view,loca_road_surface_cond,loca_accident,loca_max_speed
359494,202400054388,5,1,2.0,1,1.0,1.0,1,50.0
359495,202400054388,5,2,,1,1.0,1.0,1,30.0


Unnamed: 0,acc_num,loca_road_cat,loca_traffic_circul,loca_road_lanes,loca_road_gradient,loca_road_view,loca_road_surface_cond,loca_accident,loca_max_speed,loca_is_intersection,loca_road_counts,loca_max_speed_dif
327613,202400054388,5,1,2.0,1,1.0,1.0,1,50.0,1,2,20.0


Unnamed: 0,acc_num,loca_road_cat,loca_traffic_circul,loca_road_lanes,loca_road_gradient,loca_road_view,loca_road_surface_cond,loca_accident,loca_max_speed
359500,202400054393,4,2,4.0,1,1.0,1.0,1,50.0
359501,202400054393,4,1,1.0,1,1.0,1.0,7,30.0


Unnamed: 0,acc_num,loca_road_cat,loca_traffic_circul,loca_road_lanes,loca_road_gradient,loca_road_view,loca_road_surface_cond,loca_accident,loca_max_speed,loca_is_intersection,loca_road_counts,loca_max_speed_dif
327618,202400054393,4,2,4.0,1,1.0,1.0,1,50.0,1,2,20.0


In [7]:
#---------------------------------------------------------------------------------------------------------------------------------------
# check missing values (only columns that actually contain NaN are listed)
#---------------------------------------------------------------------------------------------------------------------------------------
for caption, frame in (
    ("\nMissing values in df_loca:", df_loca),
    ("\nMissing values in df_loca_no_intersec:", df_loca_no_intersec),
):
    na_counts = frame.isna().sum()
    print(caption)
    print(na_counts[na_counts > 0])


Missing values in df_loca:
loca_road_lanes           10664
loca_road_view              238
loca_road_surface_cond      310
loca_max_speed            11599
dtype: int64

Missing values in df_loca_no_intersec:
loca_road_lanes           4830
loca_road_view              43
loca_road_surface_cond      78
loca_max_speed            5146
loca_max_speed_dif        5146
dtype: int64


In [8]:
#---------------------------------------------------------------------------------------------------------------------------------------
# concat ind, acc, veh, loca: starting with ind dataset
#---------------------------------------------------------------------------------------------------------------------------------------
# Not merged in the order given in the documentation from the French government, to avoid additional rows:
# if e.g. two vehicles are involved in an accident but only one person is known and has data, we only
# want to keep the rows where we have info on our target variable.
# Every step is therefore a left join onto the individuals. The row count is asserted after each join,
# because duplicate keys on the right-hand side would otherwise silently multiply rows.

n_ind_rows = len(df_ind)
print("IND shape:", df_ind.shape)

df = pd.merge(df_ind, df_acc, on = 'acc_num', how='left')
assert len(df) == n_ind_rows, "merge with df_acc changed the number of rows (duplicate acc_num in df_acc?)"
print("IND + ACC shape:", df.shape)

df = pd.merge(df, df_veh, left_on = ['acc_num', 'ind_vehID'], right_on = ['acc_num', 'veh_id'], how='left')
assert len(df) == n_ind_rows, "merge with df_veh changed the number of rows (duplicate acc_num/veh_id in df_veh?)"
print("IND + ACC + VEH shape:", df.shape)

df = pd.merge(df, df_loca_no_intersec, on = 'acc_num', how='left').reset_index(drop=True)
assert len(df) == n_ind_rows, "merge with df_loca_no_intersec changed the number of rows (duplicate acc_num?)"
print("IND + ACC + VEH + LOCA shape:", df.shape)


IND shape: (733875, 15)
IND + ACC shape: (733875, 29)
IND + ACC + VEH shape: (733875, 36)
IND + ACC + VEH + LOCA shape: (733875, 47)


In [9]:
#---------------------------------------------------------------------------------------------------------------------------------------
# remove rows with accidents outside France mainland
#---------------------------------------------------------------------------------------------------------------------------------------
# (the value distribution can be inspected with df['acc_metro'].value_counts())
print("df shape BEFORE removing accidents out of France mainland:", df.shape)

# keep only the rows whose acc_metro flag equals 1
is_mainland = df['acc_metro'].eq(1)
df = df[is_mainland]

print("df shape AFTER removing accidents out of France mainland:", df.shape)

df shape BEFORE removing accidents out of France mainland: (733875, 47)
df shape AFTER removing accidents out of France mainland: (692159, 47)


In [10]:
#---------------------------------------------------------------------------------------------------------------------------------------
# remove irrelevant columns (if not already removed in prior joblib files)
#---------------------------------------------------------------------------------------------------------------------------------------
cols_remove = ['veh_num', 'ind_vehID']

# errors='ignore' skips names that are no longer present instead of raising
df = df.drop(cols_remove, axis='columns', errors='ignore')

In [11]:
# column overview of the merged dataframe: names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 692159 entries, 0 to 733874
Data columns (total 45 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   acc_num                 692159 non-null  int64         
 1   ind_place               692159 non-null  int64         
 2   ind_cat                 692159 non-null  int64         
 3   ind_severity            692159 non-null  int64         
 4   ind_sex                 692159 non-null  int64         
 5   ind_trip                692159 non-null  int64         
 6   ind_secu1               691305 non-null  Int64         
 7   ind_secu2               415207 non-null  Int64         
 8   ind_location            379506 non-null  Int64         
 9   ind_action              409030 non-null  Int64         
 10  ind_year                692159 non-null  int64         
 11  ind_age                 692159 non-null  Int64         
 12  ind_age_group           692159 non-

In [22]:
# -------------------------------------------------------------------------------------------------
# export final dataframe to joblib in local folder
# -------------------------------------------------------------------------------------------------
from joblib import dump

# NOTE(review): absolute, machine-specific destination - the notebook is not reproducible on
# another machine until this points to a shared / project-relative location.
export_path = 'C:/Users/Christian/Documents/1.0-leibold-data-preprocessing_aggr.joblib'

dump(df, export_path)

['C:/Users/Christian/Documents/1.0-leibold-data-preprocessing_aggr.joblib']

In [17]:
# ---------------------------------------------------------------------------------------
# check correlation between variables
# ---------------------------------------------------------------------------------------
# Categorical variables, grouped by column prefix
acc_cat_cols = [
    'acc_year', 'acc_month', 'acc_hour', 'acc_ambient_lightning',
    'acc_urbanization_level', 'acc_intersection', 'acc_atmosphere', 'acc_collision_type',
]
ind_cat_cols = [
    'ind_place', 'ind_cat', 'ind_sex', 'ind_trip', 'ind_location',
    'ind_action', 'ind_secu1', 'ind_secu2', 'ind_age_group',
]
loca_cat_cols = [
    'loca_road_cat', 'loca_traffic_circul', 'loca_road_gradient', 'loca_road_view',
    'loca_road_surface_cond', 'loca_accident', 'loca_is_intersection',
]
veh_cat_cols = [
    'veh_cat', 'veh_fixed_obstacle', 'veh_moving_obstacle',
    'veh_impact', 'veh_maneuver', 'veh_motor',
]

# full list in the order acc -> ind -> loca -> veh
cols_cat = acc_cat_cols + ind_cat_cols + loca_cat_cols + veh_cat_cols

# alias used by the pairwise computation
cat_vars = cols_cat

# Define Cramér's V function
def cramers_v(x, y):
    """Return Cramér's V, a measure of association between two categorical variables.

    Parameters
    ----------
    x, y : pandas.Series
        The two variables. Missing values are dropped from each series; the
        contingency table is then built from the observations present in both
        (``pd.crosstab`` aligns the two series on their index).

    Returns
    -------
    float
        ``sqrt(chi2 / (n * min(rows - 1, cols - 1)))`` of the contingency table.
        ``nan`` if the statistic is undefined, i.e. when there are no common
        observations or when one of the variables has fewer than two categories.
    """
    contingency = pd.crosstab(x.dropna(), y.dropna())

    n = contingency.to_numpy().sum()
    n_rows, n_cols = contingency.shape
    min_dim = min(n_rows, n_cols) - 1

    # Degenerate table: the formula would divide by zero (or chi2_contingency would raise).
    if n == 0 or min_dim < 1:
        return np.nan

    # correction=False: Cramér's V is based on the plain Pearson chi-squared statistic.
    # The default Yates continuity correction (applied to 2x2 tables) would bias V downwards,
    # e.g. a perfect 2x2 association would no longer yield 1.
    chi2 = stats.chi2_contingency(contingency, correction=False)[0]
    phi2 = chi2 / n
    return np.sqrt(phi2 / min_dim)

    
# Compute Cramér's V for all unique pairs
# (cramers_v drops missing values itself, so the columns are passed as they are)
results = [
    {
        'Variable 1': var1,
        'Variable 2': var2,
        'Cramér\'s V': round(cramers_v(df[var1], df[var2]), 4),
    }
    for var1, var2 in itertools.combinations(cat_vars, 2)
]

# Convert to DataFrame
cramers_df = pd.DataFrame(results)

# Show all rows. option_context restores the previous display setting when the block is left,
# also if display() raises, instead of resetting the option to the library default.
with pd.option_context('display.max_rows', None):
    display(cramers_df.sort_values(by="Cramér's V", ascending=False))


Unnamed: 0,Variable 1,Variable 2,Cramér's V
204,ind_place,ind_cat,0.9969
228,ind_cat,ind_action,0.6925
227,ind_cat,ind_location,0.6921
122,acc_urbanization_level,loca_road_cat,0.6517
229,ind_cat,ind_secu1,0.5525
424,veh_cat,veh_motor,0.5106
152,acc_intersection,loca_is_intersection,0.4852
241,ind_cat,veh_moving_obstacle,0.4846
22,acc_year,loca_is_intersection,0.4733
57,acc_hour,acc_ambient_lightning,0.427
