In [12]:
import pandas as pd


In [13]:
# -------------------------------------------------------------------------------------------------
# 1. read initival csv files
# -------------------------------------------------------------------------------------------------
df_vehicles_2019 = pd.read_csv('../data/gov_vehicles_2019.csv', sep=';')
df_vehicles_2020 = pd.read_csv('../data/gov_vehicles_2020.csv', sep=';')
df_vehicles_2021 = pd.read_csv('../data/gov_vehicles_2021.csv', sep=';')
df_vehicles_2022 = pd.read_csv('../data/gov_vehicles_2022.csv', sep=';')
df_vehicles_2023 = pd.read_csv('../data/gov_vehicles_2023.csv', sep=';')
df_vehicles_2024 = pd.read_csv('../data/gov_vehicles_2024.csv', sep=';')

In [14]:
print('2019:', df_vehicles_2019.shape)
print('2020:', df_vehicles_2020.shape)
print('2021:', df_vehicles_2021.shape)
print('2022:', df_vehicles_2022.shape)
print('2023:', df_vehicles_2023.shape)
print('2024:', df_vehicles_2024.shape)

2019: (100710, 11)
2020: (81066, 11)
2021: (97315, 11)
2022: (94493, 11)
2023: (93585, 11)
2024: (92678, 11)


In [15]:
# -------------------------------------------------------------------------------------------------
# 2. rename differing column names
# -------------------------------------------------------------------------------------------------
df_vehicles_2024 = df_vehicles_2024.rename(columns={'id_vehicule\n': 'id_vehicule'})

In [16]:
# -------------------------------------------------------------------------------------------------
# 3. reorder columns to kaggle data structure
# -------------------------------------------------------------------------------------------------
ordered_cols = ['Num_Acc','senc','catv','occutc','obs','obsm','choc','manv','num_veh','motor', 'id_vehicule']

df_vehicles_2019 = df_vehicles_2019[ordered_cols]
df_vehicles_2020 = df_vehicles_2020[ordered_cols]
df_vehicles_2021 = df_vehicles_2021[ordered_cols]
df_vehicles_2022 = df_vehicles_2022[ordered_cols]
df_vehicles_2023 = df_vehicles_2023[ordered_cols]
df_vehicles_2024 = df_vehicles_2024[ordered_cols]

In [17]:
# -------------------------------------------------------------------------------------------------
# 4. check new shape and column structure
# -------------------------------------------------------------------------------------------------
# check the shape of all dataframes
print('2019:', df_vehicles_2019.shape)
print('2020:', df_vehicles_2020.shape)
print('2021:', df_vehicles_2021.shape)
print('2022:', df_vehicles_2022.shape)
print('2023:', df_vehicles_2023.shape)
print('2024:', df_vehicles_2024.shape)

print()

# data insight
display("2019:", df_vehicles_2019.head(3))
display("2020:", df_vehicles_2020.head(3))
display("2021:", df_vehicles_2021.head(3))
display("2022:", df_vehicles_2022.head(3))
display("2023:", df_vehicles_2023.head(3))
display("2024:", df_vehicles_2024.head(3))

2019: (100710, 11)
2020: (81066, 11)
2021: (97315, 11)
2022: (94493, 11)
2023: (93585, 11)
2024: (92678, 11)



'2019:'

Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv,num_veh,motor,id_vehicule
0,201900000001,2,7,,0,2,5,23,B01,1,138Â 306Â 524
1,201900000001,2,17,,1,0,3,11,A01,1,138Â 306Â 525
2,201900000002,1,7,,4,0,1,0,A01,1,138Â 306Â 523


'2020:'

Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv,num_veh,motor,id_vehicule
0,202000000001,1,7,,0,2,2,15,B01,1,154Â 742Â 274
1,202000000001,1,33,,0,2,1,2,A01,1,154Â 742Â 275
2,202000000002,3,7,,0,1,2,26,A01,1,154Â 742Â 273


'2021:'

Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv,num_veh,motor,id_vehicule
0,202100000001,1,1,,0,2,1,1,B01,5,201Â 764
1,202100000001,1,7,,0,9,3,17,A01,1,201Â 765
2,202100000002,0,7,,2,2,1,1,A01,0,201Â 762


'2022:'

Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv,num_veh,motor,id_vehicule
0,202200000001,1,2,,0,2,1,9,A01,1,813Â 952
1,202200000001,1,7,,0,2,2,1,B01,1,813Â 953
2,202200000002,2,7,,0,2,8,15,B01,1,813Â 950


'2023:'

Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv,num_veh,motor,id_vehicule
0,202300000001,1,30,,0,0,5,1,A01,1,155Â 680Â 557
1,202300000002,2,7,,0,1,1,1,A01,1,155Â 680Â 556
2,202300000003,1,2,,0,2,1,16,B01,1,155Â 680Â 554


'2024:'

Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv,num_veh,motor,id_vehicule
0,202400000001,1,7,,0,2,1,13,A01,1,155Â 781Â 758
1,202400000001,2,14,,0,2,2,21,B01,1,155Â 781Â 759
2,202400000002,1,10,,0,1,3,15,A01,1,155Â 781Â 757


In [18]:
# -------------------------------------------------------------------------------------------------
# 5. check dtypes over the year
# -------------------------------------------------------------------------------------------------
# List of years and corresponding DataFrames
years = list(range(2019, 2025))
dfs = {year: globals()[f'df_vehicles_{year}'] for year in years}

# Collect column-dtype pairs per year
records = []
for year, df in dfs.items():
    for col in df.columns:
        records.append({'Year': year, 'Column': col, 'Dtype': df[col].dtype.name})

# Create DataFrame from records
df_summary = pd.DataFrame(records)

# Crosstab: Column vs. Year, showing dtype
crosstab = pd.crosstab(df_summary['Column'], df_summary['Year'], values=df_summary['Dtype'], aggfunc='first')

crosstab

Year,2019,2020,2021,2022,2023,2024
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Num_Acc,int64,int64,int64,int64,int64,int64
catv,int64,int64,int64,int64,int64,int64
choc,int64,int64,int64,int64,int64,int64
id_vehicule,object,object,object,object,object,object
manv,int64,int64,int64,int64,int64,int64
motor,int64,int64,int64,int64,int64,int64
num_veh,object,object,object,object,object,object
obs,int64,int64,int64,int64,int64,int64
obsm,int64,int64,int64,int64,int64,int64
occutc,float64,float64,float64,float64,float64,float64


In [19]:
# -------------------------------------------------------------------------------------------------
# 6. check number of non-null records per year and variable
# -------------------------------------------------------------------------------------------------
# List of years and corresponding DataFrames
years = list(range(2019, 2025))
dfs = {year: globals()[f'df_vehicles_{year}'] for year in years}

# Collect non-null counts per column per year
records = []
for year, df in dfs.items():
    for col in df.columns:
        non_null_count = df[col].notna().sum()
        records.append({'Year': year, 'Column': col, 'NonNullCount': non_null_count})

# Create DataFrame from records
df_counts = pd.DataFrame(records)

# Crosstab: Column vs. Year, showing non-null counts
crosstab = pd.crosstab(df_counts['Column'], df_counts['Year'], values=df_counts['NonNullCount'], aggfunc='first')

crosstab

Year,2019,2020,2021,2022,2023,2024
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Num_Acc,100710,81066,97315,94493,93585,92678
catv,100710,81066,97315,94493,93585,92678
choc,100710,81066,97315,94493,93585,92678
id_vehicule,100710,81066,97315,94493,93585,92678
manv,100710,81066,97315,94493,93585,92678
motor,100710,81066,97315,94493,93585,92678
num_veh,100710,81066,97315,94493,93585,92678
obs,100710,81066,97315,94493,93585,92678
obsm,100710,81066,97315,94493,93585,92678
occutc,892,621,744,817,838,949


In [20]:
# -------------------------------------------------------------------------------------------------
# 7. concatenate 2019 - 2024 (same variable structure)
# -------------------------------------------------------------------------------------------------
df_vehicles_2019_2024 = pd.concat([df_vehicles_2019, df_vehicles_2020, df_vehicles_2021, df_vehicles_2022, df_vehicles_2023, df_vehicles_2024], axis=0)

print(df_vehicles_2019_2024.shape)
print(df_vehicles_2019_2024.info())
display(df_vehicles_2019_2024.describe())

(559847, 11)
<class 'pandas.core.frame.DataFrame'>
Index: 559847 entries, 0 to 92677
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Num_Acc      559847 non-null  int64  
 1   senc         559847 non-null  int64  
 2   catv         559847 non-null  int64  
 3   occutc       4861 non-null    float64
 4   obs          559847 non-null  int64  
 5   obsm         559847 non-null  int64  
 6   choc         559847 non-null  int64  
 7   manv         559847 non-null  int64  
 8   num_veh      559847 non-null  object 
 9   motor        559847 non-null  int64  
 10  id_vehicule  559847 non-null  object 
dtypes: float64(1), int64(8), object(2)
memory usage: 51.3+ MB
None


Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv,motor
count,559847.0,559847.0,559847.0,4861.0,559847.0,559847.0,559847.0,559847.0,559847.0
mean,202149500000.0,1.5698,13.264917,1.887883,1.020977,1.660629,2.889757,7.042964,1.27814
std,171653200.0,0.824839,13.997204,3.508071,3.126948,1.254219,2.420351,7.968915,1.109368
min,201900000000.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,202000000000.0,1.0,7.0,1.0,0.0,1.0,1.0,1.0,1.0
50%,202200000000.0,1.0,7.0,1.0,0.0,2.0,2.0,2.0,1.0
75%,202300000000.0,2.0,10.0,1.0,0.0,2.0,4.0,15.0,1.0
max,202400100000.0,3.0,99.0,65.0,17.0,9.0,9.0,26.0,6.0


In [21]:
# -------------------------------------------------------------------------------------------------
# 7. export final dataframe to joblib
# -------------------------------------------------------------------------------------------------
from joblib import dump

dump(df_vehicles_2019_2024, '1.0-leibold-data-exploration_vehicles.joblib')


['1.0-leibold-data-exploration_vehicles.joblib']