# Version 1.1 – Preprocessing of BAAC Usagers Data

This notebook documents the continuation of preprocessing for the BAAC "usagers" dataset, building on the cleaned and standardized version 1.0. The goal is to produce a modeling-ready version 1.1 with the following enhancements:

- Column renaming to follow the `ind_*` naming convention
- Aggregation of rare safety equipment categories (`secu` variables: 5, 6, 7 → "9")
- convert types of columns. 
- delete column 'ind_id'
- define age bins


The final dataset will be saved as `1.1-becker-data-preprocessing_usagers` and will serve as the foundation for merging.

In [30]:
import pandas as pd
import numpy as np
import joblib

usagers_v10 = joblib.load("1.0-becker-data-preprocessing_usagers")
print("Shape v1.0:", usagers_v10.shape)

Shape v1.0: (744575, 17)


In [31]:
# rename columns v.1.1 convention
rename_map_v11 = {
    "individual_place":"ind_place",
    "individual_cat":"ind_cat",
    "individual_severity":"ind_severity",
    "individual_sex":"ind_sex",
    "individual_trip":"ind_trip",
    "individual_safety":"ind_safety",
    "individual_location":"ind_location",
    "individual_action":"ind_action",
    "individual_companionship":"ind_companionship",
    "veh_num":"veh_num",
    "individual_vehID":"ind_vehID",
    "individual_secu1":"ind_secu1",
    "individual_secu2":"ind_secu2",
    "user_secu3":"ind_secu3",
    "individual_id":"ind_id"
}

usagers_v11 = usagers_v10.rename(columns={k: v for k, v in rename_map_v11.items() if k in usagers_v10.columns})


def simplify_secu(col):
    return col.replace({5: "9", 6: "9", 7: "9"})

for col in ["ind_secu1", "ind_secu2", "ind_secu3"]:
    if col in usagers_v11.columns:
        usagers_v11[col] = simplify_secu(usagers_v11[col])

usagers_v11=usagers_v11.drop('ind_id',axis=1)
usagers_v11.reset_index()
usagers_v11.head()


Unnamed: 0,acc_num,ind_vehID,veh_num,ind_place,ind_cat,ind_severity,ind_sex,ind_trip,ind_secu1,ind_secu2,ind_secu3,ind_location,ind_action,ind_companionship,year,age
0,201900000001,138 306 524,B01,2,2,2,2,0,1,0,-1,-1,-1,-1,2019,17
1,201900000001,138 306 524,B01,1,1,2,2,5,1,0,-1,-1,-1,-1,2019,26
2,201900000001,138 306 525,A01,1,1,1,1,0,1,0,-1,-1,-1,-1,2019,60
3,201900000002,138 306 523,A01,1,1,2,2,0,1,0,-1,-1,-1,-1,2019,25
4,201900000003,138 306 520,A01,1,1,1,1,0,1,0,-1,-1,0,-1,2019,23


In [32]:
missing_vals=(usagers_v11['age'].isna().sum())/744575
print(missing_vals) # missing values < 5% -> delete NA in Age. 
before = len(usagers_v11)
usagers_v11=usagers_v11[usagers_v11['age'].notna()].copy()
after=len(usagers_v11)
print(f"Removed {before-after} rows with missing age. Remaining rows: {after}")

usagers_v11.info()


0.01437061410871974
Removed 10700 rows with missing age. Remaining rows: 733875
<class 'pandas.core.frame.DataFrame'>
Index: 733875 entries, 0 to 744574
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   acc_num            733875 non-null  int64 
 1   ind_vehID          733875 non-null  object
 2   veh_num            733875 non-null  object
 3   ind_place          733875 non-null  int64 
 4   ind_cat            733875 non-null  int64 
 5   ind_severity       733875 non-null  int64 
 6   ind_sex            733875 non-null  int64 
 7   ind_trip           733875 non-null  int64 
 8   ind_secu1          733875 non-null  object
 9   ind_secu2          733875 non-null  object
 10  ind_secu3          733875 non-null  object
 11  ind_location       733875 non-null  int64 
 12  ind_action         733875 non-null  object
 13  ind_companionship  733875 non-null  int64 
 14  year               733875 non-null  int64

In [33]:
# create new age_group 
#
usagers_v11["age_group"] = "Unknown"

# Schritt 2: Altersgruppen manuell zuweisen
usagers_v11.loc[usagers_v11["age"] <= 17, "age_group"] = "0–17"
usagers_v11.loc[(usagers_v11["age"] >= 18) & (usagers_v11["age"] <= 24), "age_group"] = "18–24"
usagers_v11.loc[(usagers_v11["age"] >= 25) & (usagers_v11["age"] <= 44), "age_group"] = "25–44"
usagers_v11.loc[(usagers_v11["age"] >= 45) & (usagers_v11["age"] <= 64), "age_group"] = "45–64"
usagers_v11.loc[usagers_v11["age"] >= 65, "age_group"] = "65+"


In [34]:
usagers_v11.head()
joblib.dump(usagers_v11, "1.1-becker-data-preprocessing_usagers.joblib")

['1.1-becker-data-preprocessing_usagers.joblib']