In [None]:
"""
notebook: 1.3.-leibold-data-preprocessing_vehicles.jpynb

author: Christian Leibold

created/updated at: 2025-11-14

intention: based on csv-files first data preparation for vehicle data as prior step for concatenating with ind, loca and acc in the next step. 

content:
---------
-> reads in joblib from prior process stept with csv-files for 2019-2024 vehicle data as base for this notebook 
-> check and remove duplicates (not occuring so far)
-> rename columns
-> remove completetly irrelevant variables 
        - veh_num: key column for merge is veh_id
        - veh_traffic_direction: irrelvant per definition
        - veh_public_occupants: Number of occupants in public transport is not relevant as almost no frequency for trains, busses, etc. 
-> replace -1 with 0 values in categorical variables 
-> regroup classes removed, will come after train test split to avoid data leackage
-> check missing values after -1 -> nan() replacement
-> check if value_counts() are fine and no corrupt values exist.
-> export to joblib data\processed\2_preprocessing\1.2-leibold-data-preprocessing_vehicles.joblib'
-> veh_motor is kept
"""

In [1]:
import pandas as pd
import numpy as np
from joblib import dump, load
import seaborn as sns

In [2]:
df_veh = load(r'..\..\data\processed\1_exploration\1.1-leibold-data-exploration_vehicles.joblib')
df_veh.head(3)

Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv,num_veh,motor,id_vehicule
0,201900000001,2,7,,0,2,5,23,B01,1,138 306 524
1,201900000001,2,17,,1,0,3,11,A01,1,138 306 525
2,201900000002,1,7,,4,0,1,0,A01,1,138 306 523


In [3]:
# --------------------------------------------------------------------------------------------------------------------------------
# check and drop duplicates
# --------------------------------------------------------------------------------------------------------------------------------
print("vehicles duplicates:", df_veh.duplicated().sum())

df_veh.drop_duplicates(inplace=True)

print("vehicles duplicates AFTER cleaning:", df_veh.duplicated().sum())



vehicles duplicates: 0
vehicles duplicates AFTER cleaning: 0


In [4]:
# --------------------------------------------------------------------------------------------------------------------------------
# rename columns
# --------------------------------------------------------------------------------------------------------------------------------
rename_map_vehicles = {
    'Num_Acc': 'acc_num',
    'senc': 'veh_traffic_direction',
    'catv': 'veh_cat',
    'occutc': 'veh_public_occupants',
    'obs': 'veh_fixed_obstacle',
    'obsm': 'veh_moving_obstacle',
    'choc': 'veh_impact',
    'manv': 'veh_maneuver',
    'num_veh': 'veh_num',
    'motor': 'veh_motor',
    'id_vehicule': 'veh_id'
}
df_veh.rename(columns=rename_map_vehicles, inplace=True)



In [7]:
# --------------------------------------------------------------------------------------------------------------------------------
# remove absolute irrelvant columns
# --------------------------------------------------------------------------------------------------------------------------------
df_veh.drop(['veh_public_occupants','veh_traffic_direction','veh_num'], axis=1, inplace=True, errors='ignore')

df_veh.head(3)


Unnamed: 0,acc_num,veh_cat,veh_fixed_obstacle,veh_moving_obstacle,veh_impact,veh_maneuver,veh_motor,veh_id
0,201900000001,7,0,2,5,23,1,138 306 524
1,201900000001,17,1,0,3,11,1,138 306 525
2,201900000002,7,4,0,1,0,1,138 306 523


In [12]:
# --------------------------------------------------------------------------------------------------------------------------------
# replace -1 with nan() in categorical variables 
# --------------------------------------------------------------------------------------------------------------------------------
#df_veh['veh_traffic_direction'] = df_veh['veh_traffic_direction'].replace([-1], np.nan) # cat 0 has no info here
#df_veh['veh_cat'] = df_veh['veh_cat'].replace([-1], np.nan) # 0 kept as own category (not used anyway)
#df_veh['veh_fixed_obstacle'] = df_veh['veh_fixed_obstacle'].replace([-1], np.nan) # 0 kept as own category
#df_veh['veh_moving_obstacle'] = df_veh['veh_moving_obstacle'].replace([-1], np.nan) # 0 kept as own category
df_veh['veh_impact'] = df_veh['veh_impact'].replace([-1], np.nan) # 0 kept as own category
df_veh['veh_maneuver'] = df_veh['veh_maneuver'].replace([-1,0], np.nan) # cat 0 has no info here
#df_veh['veh_motor'] = df_veh['veh_motor'].replace([-1,0], np.nan) # cat 0 has no info here

# --------------------------------------------------------------------------------------------------------------------------------
# replace -1 to 0 values in categorical variables
# --------------------------------------------------------------------------------------------------------------------------------
#df_veh['veh_traffic_direction'] = df_veh['veh_traffic_direction'].replace(-1, 0) 
df_veh['veh_cat'] = df_veh['veh_cat'].replace(-1, 0) 
df_veh['veh_fixed_obstacle'] = df_veh['veh_fixed_obstacle'].replace(-1, 0) 
df_veh['veh_moving_obstacle'] = df_veh['veh_moving_obstacle'].replace(-1, 0) 
#df_veh['veh_impact'] = df_veh['veh_impact'].replace(-1, 0) 
#df_veh['veh_maneuver'] = df_veh['veh_maneuver'].replace(-1, 0) 
df_veh['veh_motor'] = df_veh['veh_motor'].replace(-1, 0) 



In [13]:
# --------------------------------------------------------------------------------------------------------------------------------
# check missing values after replacement of -1/0 catagories
# --------------------------------------------------------------------------------------------------------------------------------
missing_counts = df_veh.isna().sum()
missing_percent = df_veh.isna().sum() / len(df_veh)

missing_summary = pd.DataFrame({
    'Missing Count': missing_counts,
    'Missing %': missing_percent.round(4)
})

print(missing_summary)

                     Missing Count  Missing %
acc_num                          0     0.0000
veh_cat                          0     0.0000
veh_fixed_obstacle               0     0.0000
veh_moving_obstacle              0     0.0000
veh_impact                     299     0.0005
veh_maneuver                 35808     0.0640
veh_motor                        0     0.0000
veh_id                           0     0.0000


In [14]:
df_veh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559847 entries, 0 to 559846
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   acc_num              559847 non-null  int64  
 1   veh_cat              559847 non-null  int64  
 2   veh_fixed_obstacle   559847 non-null  int64  
 3   veh_moving_obstacle  559847 non-null  int64  
 4   veh_impact           559548 non-null  float64
 5   veh_maneuver         524039 non-null  float64
 6   veh_motor            559847 non-null  int64  
 7   veh_id               559847 non-null  object 
dtypes: float64(2), int64(5), object(1)
memory usage: 34.2+ MB


In [15]:
# --------------------------------------------------------------------------------------------------------------------------------
# check all unique values
# --------------------------------------------------------------------------------------------------------------------------------
excluded_cols = ['acc_num', 'veh_id']

for col in [c for c in df_veh.columns if c not in excluded_cols]:
    print(f"\nValue counts for '{col}':")
    print(df_veh[col].value_counts(dropna=False))


Value counts for 'veh_cat':
veh_cat
7     329344
33     44871
10     39508
1      31117
2      20478
30     15981
32     12180
50     10378
31     10082
34      5799
15      5530
14      4161
37      3871
17      3739
80      3336
3       3248
43      3172
99      2414
13      2230
0       1593
21      1486
60      1259
36      1007
38       990
40       713
20       592
16       206
39       191
42       137
35       137
41        97
Name: count, dtype: int64

Value counts for 'veh_fixed_obstacle':
veh_fixed_obstacle
0     477987
1      13214
13      9915
2       8523
4       7842
3       7425
8       6552
6       6428
12      4550
14      4150
15      3191
9       2719
16      2425
7       1682
5       1045
11       983
10       609
17       607
Name: count, dtype: int64

Value counts for 'veh_moving_obstacle':
veh_moving_obstacle
2    391713
0    104588
1     52082
9      9087
6      1352
4       555
5       470
Name: count, dtype: int64

Value counts for 'veh_impact':
veh_impact
1

In [16]:
# -------------------------------------------------------------------------------------------------
# export final dataframe to joblib
# -------------------------------------------------------------------------------------------------
from joblib import dump

dump(df_veh, (r'..\..\data\processed\2_preprocessing\1.3-leibold-data-preprocessing_vehicles.joblib'))

['..\\..\\data\\processed\\2_preprocessing\\1.3-leibold-data-preprocessing_vehicles.joblib']