In [1]:
import pandas as pd
import numpy as np
from joblib import dump, load

In [2]:
"""
notebook: 1.1-simmler-data-preprocessing_accidents
---------------------------------------------------------------------
load untouched accident data from csv files as base

dropping duplicates

add date variable acc_date, transformed hrmn to acc_hour

add metro variable to distinguish France mainland and other associated countries

transformed lat/long to float

columns renamed

irrelevant columns dropped: 'acc_adress', 'acc_municipality', 'acc_department'
update: keep 'acc_municipality', 'acc_department'

replaced -1 value in categorical variables with nan()

"""

"\nnotebook: 1.1-simmler-data-preprocessing_accidents\n---------------------------------------------------------------------\nload untouched accident data from csv files as base\n\ndropping duplicates\n\nadd date variable acc_date, transformed hrmn to acc_hour\n\nadd metro variable to distinguish France mainland and other associated countries\n\ntransformed lat/long to float\n\ncolumns renamed\n\nirrelevant columns dropped: 'acc_adress', 'acc_municipality', 'acc_department'\nupdate: keep 'acc_municipality', 'acc_department'\n\nreplaced -1 value in categorical variables with nan()\n\n"

In [3]:
#------------------------------------------------------------------------------------------------
# Import joblib file with 2019-2024 untouched data for accidents
#------------------------------------------------------------------------------------------------
df_acc = load(r'..\..\data\processed\1_exploration\1.0-leibold-data-exploration_accidents.joblib')

display(df_acc.head(3))
df_acc.info()

Unnamed: 0,Num_Acc,jour,mois,an,hrmn,lum,dep,com,agg,int,atm,col,adr,lat,long
0,201900000001,30,11,2019,01:30,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200
1,201900000002,30,11,2019,02:50,3,93,93066,1,1,1,6,AUTOROUTE A1,489307000,23688000
2,201900000003,28,11,2019,15:15,1,92,92036,1,1,1,4,AUTOROUTE A86,489358718,23191744


<class 'pandas.core.frame.DataFrame'>
Index: 327628 entries, 0 to 54401
Data columns (total 15 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Num_Acc  327628 non-null  int64 
 1   jour     327628 non-null  int64 
 2   mois     327628 non-null  int64 
 3   an       327628 non-null  int64 
 4   hrmn     327628 non-null  object
 5   lum      327628 non-null  int64 
 6   dep      327628 non-null  object
 7   com      327628 non-null  object
 8   agg      327628 non-null  int64 
 9   int      327628 non-null  int64 
 10  atm      327628 non-null  int64 
 11  col      327628 non-null  int64 
 12  adr      321339 non-null  object
 13  lat      327628 non-null  object
 14  long     327628 non-null  object
dtypes: int64(9), object(6)
memory usage: 40.0+ MB


In [4]:
# -------------------------------------------------------------------------------------------------
# todo: we have recurring index, for each year, should the index be reset?
# -------------------------------------------------------------------------------------------------

#df_acc = df_acc.reset_index(drop=True)
#df_acc


In [5]:
# --------------------------------------------------------------------------------------------------------------------------------
# check and drop duplicates
# --------------------------------------------------------------------------------------------------------------------------------
print("accidents duplicates:", df_acc.duplicated().sum())

df_acc.drop_duplicates(inplace=True)

print("accidents duplicates AFTER cleaning:", df_acc.duplicated().sum())

accidents duplicates: 0
accidents duplicates AFTER cleaning: 0


In [6]:
# --------------------------------------------------------------------------------------------------------------------------------
# transform date time columns
# --------------------------------------------------------------------------------------------------------------------------------
def transform_datetime_cols(char):
    
    if 'date' not in char:
        print('transform datetime columns')
        
        #df['date_timestamp'] = pd.to_datetime(df[['year', 'month', 'day']])
        
        format = '%Y-%m-%d %H:%M'
        char['datetime'] = pd.to_datetime(char['an'].astype(str) + '-' + 
                                          char['mois'].astype(str) + '-' + 
                                          char['jour'].astype(str) + ' ' + 
                                          char['hrmn'], format=format)
        
        char['date'] = char['datetime'].dt.strftime('%Y-%m-%d')
        char['year'] = char['an']
        char['month'] = char['mois']
        char['hour'] = char['hrmn'].str[:2]
        char['hour'] = char['hour'].astype(int)
        char['date'] = pd.to_datetime(char['date'])
        
        char.drop(['datetime', 'jour', 'an', 'mois', 'hrmn'], axis=1, inplace=True)
        #char = char.drop(['datetime', 'jour', 'an', 'mois', 'hrmn'], axis=1)
    
    else:
        print('datetime columns have aleady been transformed')
    
    display(char.head())
    
transform_datetime_cols(df_acc)

transform datetime columns


Unnamed: 0,Num_Acc,lum,dep,com,agg,int,atm,col,adr,lat,long,date,year,month,hour
0,201900000001,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200,2019-11-30,2019,11,1
1,201900000002,3,93,93066,1,1,1,6,AUTOROUTE A1,489307000,23688000,2019-11-30,2019,11,2
2,201900000003,1,92,92036,1,1,1,4,AUTOROUTE A86,489358718,23191744,2019-11-28,2019,11,15
3,201900000004,5,94,94069,1,1,1,4,A4,488173295,24281502,2019-11-30,2019,11,20
4,201900000005,3,94,94028,1,1,1,2,A86 INT,487763620,24332540,2019-11-30,2019,11,4


In [7]:
#------------------------------------------------------------------------------------------------
# fill department numbers with zeros
#------------------------------------------------------------------------------------------------
def zfill_str_column(db, col, number):
    
    if db[col].dtype == 'object':
        db[col] = db[col].str.zfill(number) #.astype(str)
    return db

dep_unique_1 = df_acc['dep'].unique()
zfill_str_column(df_acc, 'dep', 2)
dep_unique_2 = df_acc['dep'].unique()

print('fill dep with zeros to avoid 1, 01 formats')

print('before zfill:')
display(dep_unique_1)
      
print('after zfill:')
display(dep_unique_2)


fill dep with zeros to avoid 1, 01 formats
before zfill:


array(['93', '92', '94', '87', '69', '38', '34', '13', '988', '976',
       '974', '972', '2B', '91', '86', '83', '80', '78', '77', '76', '72',
       '71', '67', '66', '64', '60', '51', '50', '49', '45', '37', '35',
       '33', '31', '30', '29', '22', '19', '18', '17', '74', '81', '2',
       '59', '95', '63', '62', '973', '2A', '84', '9', '73', '43', '10',
       '36', '16', '7', '21', '40', '24', '4', '85', '27', '28', '52',
       '68', '42', '82', '11', '987', '44', '61', '14', '56', '58', '54',
       '47', '41', '3', '75', '1', '57', '32', '39', '15', '23', '6', '5',
       '26', '48', '986', '971', '89', '25', '12', '88', '65', '53', '70',
       '46', '90', '8', '79', '977', '55', '978', '975', '06', '03', '07',
       '01', '08', '02', '05', '04', '09'], dtype=object)

after zfill:


array(['93', '92', '94', '87', '69', '38', '34', '13', '988', '976',
       '974', '972', '2B', '91', '86', '83', '80', '78', '77', '76', '72',
       '71', '67', '66', '64', '60', '51', '50', '49', '45', '37', '35',
       '33', '31', '30', '29', '22', '19', '18', '17', '74', '81', '02',
       '59', '95', '63', '62', '973', '2A', '84', '09', '73', '43', '10',
       '36', '16', '07', '21', '40', '24', '04', '85', '27', '28', '52',
       '68', '42', '82', '11', '987', '44', '61', '14', '56', '58', '54',
       '47', '41', '03', '75', '01', '57', '32', '39', '15', '23', '06',
       '05', '26', '48', '986', '971', '89', '25', '12', '88', '65', '53',
       '70', '46', '90', '08', '79', '977', '55', '978', '975'],
      dtype=object)

In [8]:
#------------------------------------------------------------------------------------------------
# rename of columns
#------------------------------------------------------------------------------------------------
rename_map_acc = {
    'Num_Acc': 'acc_num',
    #'jour': 'acc_day',
    #'mois': 'acc_month',
    #'an': 'acc_year',    
    #'hrmn': 'acc_hour',
    'lum': 'acc_ambient_lightning',
    'dep': 'acc_department',
    'com': 'acc_municipality',
    'agg': 'acc_urbanization_level',    
    'int': 'acc_intersection',
    'atm': 'acc_atmosphere',
    'col': 'acc_collision_type',
    'adr': 'acc_adress',
    'lat': 'acc_lat',
    'long': 'acc_long',
    'date': 'acc_date',
    'year': 'acc_year',
    'month': 'acc_month',
    'hour': 'acc_hour'
}

df_acc.rename(columns=rename_map_acc, inplace=True)

df_acc.info()
display(df_acc.head(3))

<class 'pandas.core.frame.DataFrame'>
Index: 327628 entries, 0 to 54401
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   acc_num                 327628 non-null  int64         
 1   acc_ambient_lightning   327628 non-null  int64         
 2   acc_department          327628 non-null  object        
 3   acc_municipality        327628 non-null  object        
 4   acc_urbanization_level  327628 non-null  int64         
 5   acc_intersection        327628 non-null  int64         
 6   acc_atmosphere          327628 non-null  int64         
 7   acc_collision_type      327628 non-null  int64         
 8   acc_adress              321339 non-null  object        
 9   acc_lat                 327628 non-null  object        
 10  acc_long                327628 non-null  object        
 11  acc_date                327628 non-null  datetime64[ns]
 12  acc_year                327628 non-n

Unnamed: 0,acc_num,acc_ambient_lightning,acc_department,acc_municipality,acc_urbanization_level,acc_intersection,acc_atmosphere,acc_collision_type,acc_adress,acc_lat,acc_long,acc_date,acc_year,acc_month,acc_hour
0,201900000001,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200,2019-11-30,2019,11,1
1,201900000002,3,93,93066,1,1,1,6,AUTOROUTE A1,489307000,23688000,2019-11-30,2019,11,2
2,201900000003,1,92,92036,1,1,1,4,AUTOROUTE A86,489358718,23191744,2019-11-28,2019,11,15


In [9]:
#------------------------------------------------------------------------------------------------
# add column 'acc_metro' to indicate mainland vs overseas departments for further use in this notebook
#------------------------------------------------------------------------------------------------
def add_metro_col(char):
    
    if 'acc_metro' not in char:
        print('add a column acc_metro to indicate an overseas department')
        char['acc_metro'] = char['acc_department'].map(lambda x: int(x) < 900 if (x != '2A') & (x != '2B') else True)
        char['acc_metro'] = char['acc_metro'].astype(int)
    else:
        print('column acc_metro has already been added')
    
    display(char.head())

add_metro_col(df_acc)


add a column acc_metro to indicate an overseas department


Unnamed: 0,acc_num,acc_ambient_lightning,acc_department,acc_municipality,acc_urbanization_level,acc_intersection,acc_atmosphere,acc_collision_type,acc_adress,acc_lat,acc_long,acc_date,acc_year,acc_month,acc_hour,acc_metro
0,201900000001,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200,2019-11-30,2019,11,1,1
1,201900000002,3,93,93066,1,1,1,6,AUTOROUTE A1,489307000,23688000,2019-11-30,2019,11,2,1
2,201900000003,1,92,92036,1,1,1,4,AUTOROUTE A86,489358718,23191744,2019-11-28,2019,11,15,1
3,201900000004,5,94,94069,1,1,1,4,A4,488173295,24281502,2019-11-30,2019,11,20,1
4,201900000005,3,94,94028,1,1,1,2,A86 INT,487763620,24332540,2019-11-30,2019,11,4,1


In [10]:
#------------------------------------------------------------------------------------------------
# tranform acc_long and acc_lat to float
#------------------------------------------------------------------------------------------------
def transform_long_lat_cols(char, drop=True):
    
    if (char['acc_long'].dtype == 'object') & (char['acc_lat'].dtype == 'object'): 
        
        print('change long/lat cols to type float and clean up values')
        
        char['long_f'] = char['acc_long'].str.replace(',', '.', regex=False).astype(float)
        char['lat_f'] = char['acc_lat'].str.replace(',', '.', regex=False).astype(float)
        
        #swap long > 40 for overseas dep
        filter_condition_1 = (char['long_f'] > 40) & (char['acc_metro'] == 1)
        cols_to_swap = ['long_f', 'lat_f']
        char.loc[filter_condition_1, cols_to_swap] = char.loc[filter_condition_1, cols_to_swap[::-1]].to_numpy()
        
        #correct lat for 2B
        #2B #2B037 Biguglia in Corsica
        #Decimal Coordinates: 42.626 N 09.42 E
        #char.loc[25733, 'lat_f'] = 42.61103  #46.61103
        filter_condition_2 = (char['long_f'] > 9) & (char['lat_f'] > 46) & (char['acc_metro'] == 1)
        char[filter_condition_2]
        char.loc[filter_condition_2, 'lat_f'] = char.loc[filter_condition_2, 'lat_f'] - 4
        
        #v1.1: correct outlier 201900033874 11.0 11.0
        #dep 93 paris LEON JOUHAUX (RUE)
        filter_condition_11 = (char['lat_f'] == 11.0) & (char['long_f'] == 11.0) & (char['acc_num'] == 201900033874)
        df_filter_11 = df_acc.loc[filter_condition_11]
        if len(df_filter_11) == 1:
            df_acc.loc[filter_condition_11]
            df_acc.loc[filter_condition_11, 'lat_f'] = 48.9518347
            df_acc.loc[filter_condition_11, 'long_f'] = 2.4797433
        
        #v1.1: check for remaining outliers
        filter_condition_long = ((char['long_f'] < -10) | (char['long_f'] > 15))
        filter_condition_lat = ((char['lat_f'] < 40) | (char['lat_f'] > 55))
        filter_condition_11 = ((char['long_f'] == 11.0) & (char['lat_f'] == 11.0))
        filter_condition_metro = (char['acc_metro'] == 1)
        filter_condition = (filter_condition_long | filter_condition_lat | filter_condition_11) & filter_condition_metro
        df_filter = char.loc[filter_condition]
        
        if len(df_filter) > 0:
            print('warning: there are long/lat outliers which havent been handled')
            display(df_filter)
        
        #drop org, rename
        if drop:
            char.drop(['acc_long', 'acc_lat'], axis=1, inplace=True)
            char.rename(columns={'long_f': 'acc_long', 'lat_f': 'acc_lat'}, inplace=True)
    
    else:
        print('long/lat cols have already been transformed')
    
    display(char.head(3))


transform_long_lat_cols(df_acc)

change long/lat cols to type float and clean up values


Unnamed: 0,acc_num,acc_ambient_lightning,acc_department,acc_municipality,acc_urbanization_level,acc_intersection,acc_atmosphere,acc_collision_type,acc_adress,acc_date,acc_year,acc_month,acc_hour,acc_metro,acc_long,acc_lat
0,201900000001,4,93,93053,1,1,1,2,AUTOROUTE A3,2019-11-30,2019,11,1,1,2.47012,48.89621
1,201900000002,3,93,93066,1,1,1,6,AUTOROUTE A1,2019-11-30,2019,11,2,1,2.3688,48.9307
2,201900000003,1,92,92036,1,1,1,4,AUTOROUTE A86,2019-11-28,2019,11,15,1,2.319174,48.935872


In [11]:
#------------------------------------------------------------------------------------------------
# visualization of acc_long and acc_lat clean-up
#------------------------------------------------------------------------------------------------
#
import seaborn as sns
import matplotlib.pyplot as plt

def show_metro_accidents_map(char, year=''):
    
    data = char.loc[char['acc_metro'] == 1]
    
    plt.figure(figsize=(8, 8))
    sns.scatterplot(data=data, x='acc_long', y='acc_lat', s=0.5) #0.8 s=0.1 s=0.5
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title(f"Road Accidents in France {year}") 
    plt.tight_layout()
    plt.show()

#show_metro_accidents_map(df_acc)


In [12]:
#------------------------------------------------------------------------------------------------
# drop irrelvant columns
#------------------------------------------------------------------------------------------------
# adress is represented by long/lat
# department is no longer needed -> engineered feature "acc_metro" can be used to filter France mainland
#
# v1.1: 'acc_department': keep department for now, contains information re distribution imbalance
# update: keep 'acc_municipality', 'acc_department'

df_acc.drop(['acc_adress'], axis=1, inplace=True)

df_acc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 327628 entries, 0 to 54401
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   acc_num                 327628 non-null  int64         
 1   acc_ambient_lightning   327628 non-null  int64         
 2   acc_department          327628 non-null  object        
 3   acc_municipality        327628 non-null  object        
 4   acc_urbanization_level  327628 non-null  int64         
 5   acc_intersection        327628 non-null  int64         
 6   acc_atmosphere          327628 non-null  int64         
 7   acc_collision_type      327628 non-null  int64         
 8   acc_date                327628 non-null  datetime64[ns]
 9   acc_year                327628 non-null  int64         
 10  acc_month               327628 non-null  int64         
 11  acc_hour                327628 non-null  int64         
 12  acc_metro               327628 non-n

In [13]:
#------------------------------------------------------------------------------------------------
# reorder columns
#------------------------------------------------------------------------------------------------
def reorder_cols(char):

    #reorder:
    #0   acc_num                 327628 non-null  int64
    
    #1   acc_date                327628 non-null  datetime64[ns]
    #2   acc_year                327628 non-null  int64
    #3   acc_month               327628 non-null  int64
    #4   acc_hour                327628 non-null  int64
    
    #5   acc_department          327628 non-null  object
    #6   acc_municipality        327628 non-null  object
    #7   acc_metro               327628 non-null  int64
    #8   acc_long                327628 non-null  float64
    #9   acc_lat                 327628 non-null  float64
    
    #10   acc_ambient_lightning   327628 non-null  int64
    #11   acc_atmosphere          327628 non-null  int64
    #12   acc_urbanization_level  327628 non-null  int64
    #13   acc_intersection        327628 non-null  int64
    #14   acc_collision_type      327628 non-null  int64
    
    #col_order = ['Num_Acc','date','year','month','hour','dep','com','adr','metro','long','lat','lum','atm','agg','int','col']
    #col_order = ['Num_Acc','date','year','month','hour','dep','com','metro','long','lat','lum','atm','agg','int','col']
    
    col_order = ['acc_num',
                 'acc_date','acc_year','acc_month','acc_hour',
                 'acc_department','acc_municipality','acc_metro','acc_long','acc_lat',
                 'acc_ambient_lightning','acc_atmosphere','acc_urbanization_level','acc_intersection','acc_collision_type']
    
    char = char[col_order]
    
    #set index
    #char.set_index('acc_num', inplace=True)
    
    display(char.head())
    return char


df_acc = reorder_cols(df_acc)

Unnamed: 0,acc_num,acc_date,acc_year,acc_month,acc_hour,acc_department,acc_municipality,acc_metro,acc_long,acc_lat,acc_ambient_lightning,acc_atmosphere,acc_urbanization_level,acc_intersection,acc_collision_type
0,201900000001,2019-11-30,2019,11,1,93,93053,1,2.47012,48.89621,4,1,1,1,2
1,201900000002,2019-11-30,2019,11,2,93,93066,1,2.3688,48.9307,3,1,1,1,6
2,201900000003,2019-11-28,2019,11,15,92,92036,1,2.319174,48.935872,1,1,1,1,4
3,201900000004,2019-11-30,2019,11,20,94,94069,1,2.42815,48.817329,5,1,1,1,4
4,201900000005,2019-11-30,2019,11,4,94,94028,1,2.433254,48.776362,3,1,1,1,2


In [14]:
# --------------------------------------------------------------------------------------------------------------------------------
# check missing values before replacement of -1/0 catagories
# --------------------------------------------------------------------------------------------------------------------------------
missing_counts = df_acc.isna().sum()
missing_percent = df_acc.isna().sum() / len(df_acc)

missing_summary_before_replacement = pd.DataFrame({
    'Missing Count': missing_counts,
    'Missing %': missing_percent.round(4)
})

print(missing_summary_before_replacement)

                        Missing Count  Missing %
acc_num                             0        0.0
acc_date                            0        0.0
acc_year                            0        0.0
acc_month                           0        0.0
acc_hour                            0        0.0
acc_department                      0        0.0
acc_municipality                    0        0.0
acc_metro                           0        0.0
acc_long                            0        0.0
acc_lat                             0        0.0
acc_ambient_lightning               0        0.0
acc_atmosphere                      0        0.0
acc_urbanization_level              0        0.0
acc_intersection                    0        0.0
acc_collision_type                  0        0.0


In [15]:
#------------------------------------------------------------------------------------------------
# check for -1 values in categorical variables
#------------------------------------------------------------------------------------------------
#
c_amb = df_acc[df_acc['acc_ambient_lightning'] == -1].groupby('acc_year')['acc_ambient_lightning'].count()
print('acc_ambient_lightning')
print(c_amb)
print()
c_atm = df_acc[df_acc['acc_atmosphere'] == -1].groupby('acc_year')['acc_atmosphere'].count()
print('acc_atmosphere')
print(c_atm)
print()
c_agg = df_acc[df_acc['acc_urbanization_level'] == -1].groupby('acc_year')['acc_urbanization_level'].count()
print('acc_urbanization_level')
print(c_agg)
print()
c_int = df_acc[df_acc['acc_intersection'] == -1].groupby('acc_year')['acc_intersection'].count()
print('acc_intersection')
print(c_int)
print()
c_col = df_acc[df_acc['acc_collision_type'] == -1].groupby('acc_year')['acc_collision_type'].count()
print('acc_collision_type')
print(c_col)


acc_ambient_lightning
acc_year
2020    5
2022    2
2023    2
Name: acc_ambient_lightning, dtype: int64

acc_atmosphere
acc_year
2019     1
2020    12
2021     8
2022     1
2023     4
Name: acc_atmosphere, dtype: int64

acc_urbanization_level
Series([], Name: acc_urbanization_level, dtype: int64)

acc_intersection
acc_year
2020    1
2022    7
2023    6
Name: acc_intersection, dtype: int64

acc_collision_type
acc_year
2019       2
2020    1515
2021      20
2022      63
2023      13
2024       6
Name: acc_collision_type, dtype: int64


In [16]:
#------------------------------------------------------------------------------------------------
# replace -1 value within categorical variables with nan()
#
# this will convert nan columns to float
#------------------------------------------------------------------------------------------------
df_acc['acc_ambient_lightning'] = df_acc['acc_ambient_lightning'].replace([-1], np.nan) 
df_acc['acc_urbanization_level'] = df_acc['acc_urbanization_level'].replace([-1], np.nan) 
df_acc['acc_intersection'] = df_acc['acc_intersection'].replace([-1], np.nan) 
df_acc['acc_atmosphere'] = df_acc['acc_atmosphere'].replace([-1], np.nan) 
df_acc['acc_collision_type'] = df_acc['acc_collision_type'].replace([-1], np.nan) 

df_acc.info()


<class 'pandas.core.frame.DataFrame'>
Index: 327628 entries, 0 to 54401
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   acc_num                 327628 non-null  int64         
 1   acc_date                327628 non-null  datetime64[ns]
 2   acc_year                327628 non-null  int64         
 3   acc_month               327628 non-null  int64         
 4   acc_hour                327628 non-null  int64         
 5   acc_department          327628 non-null  object        
 6   acc_municipality        327628 non-null  object        
 7   acc_metro               327628 non-null  int64         
 8   acc_long                327628 non-null  float64       
 9   acc_lat                 327628 non-null  float64       
 10  acc_ambient_lightning   327619 non-null  float64       
 11  acc_atmosphere          327602 non-null  float64       
 12  acc_urbanization_level  327628 non-n

In [17]:
# --------------------------------------------------------------------------------------------------------------------------------
# check missing values after replacement of -1/0 catagories
# --------------------------------------------------------------------------------------------------------------------------------
missing_counts = df_acc.isna().sum()
missing_percent = df_acc.isna().sum() / len(df_acc)

missing_summary = pd.DataFrame({
    'Missing Count': missing_counts,
    'Missing %': missing_percent.round(4)
})

print(missing_summary)


                        Missing Count  Missing %
acc_num                             0     0.0000
acc_date                            0     0.0000
acc_year                            0     0.0000
acc_month                           0     0.0000
acc_hour                            0     0.0000
acc_department                      0     0.0000
acc_municipality                    0     0.0000
acc_metro                           0     0.0000
acc_long                            0     0.0000
acc_lat                             0     0.0000
acc_ambient_lightning               9     0.0000
acc_atmosphere                     26     0.0001
acc_urbanization_level              0     0.0000
acc_intersection                   14     0.0000
acc_collision_type               1619     0.0049


In [18]:
# --------------------------------------------------------------------------------------------------------------------------------
# explore missing values
# --------------------------------------------------------------------------------------------------------------------------------
# most of the missing values are in variable 'acc_collision_type' (col) of 2020
#
def get_missing_for_year(df, category, year):
    
    #year_counts = df_acc.acc_year.value_counts()
    #print(year_counts)
    
    #c_year_cat = df_acc[df_acc['acc_year'] == year][category].value_counts()
    #print(c_year_cat)
    
    #df_acc.groupby('acc_year')['acc_collision_type'].apply(lambda x: x.isnull().sum())
    
    col_isnull_counts = df_acc.groupby('acc_year')[category].apply(lambda x: x.isnull().sum())
    print(f"missing values of {category} per year:")
    print(col_isnull_counts)
    print()
    
    c_year_cat_missing = df_acc[df_acc['acc_year'] == year][category].isnull().sum()
    c_year = df_acc[df_acc['acc_year'] == year].shape[0]
    
    c_year_cat_missing_rel = (c_year_cat_missing / c_year) * 100
    print(f"percent of missing values of {category} for {year}:")
    print(c_year_cat_missing_rel)


get_missing_for_year(df_acc, 'acc_collision_type', 2020)


missing values of acc_collision_type per year:
acc_year
2019       2
2020    1515
2021      20
2022      63
2023      13
2024       6
Name: acc_collision_type, dtype: int64

percent of missing values of acc_collision_type for 2020:
3.1731735924932973


In [19]:
df_acc.head()

Unnamed: 0,acc_num,acc_date,acc_year,acc_month,acc_hour,acc_department,acc_municipality,acc_metro,acc_long,acc_lat,acc_ambient_lightning,acc_atmosphere,acc_urbanization_level,acc_intersection,acc_collision_type
0,201900000001,2019-11-30,2019,11,1,93,93053,1,2.47012,48.89621,4.0,1.0,1,1.0,2.0
1,201900000002,2019-11-30,2019,11,2,93,93066,1,2.3688,48.9307,3.0,1.0,1,1.0,6.0
2,201900000003,2019-11-28,2019,11,15,92,92036,1,2.319174,48.935872,1.0,1.0,1,1.0,4.0
3,201900000004,2019-11-30,2019,11,20,94,94069,1,2.42815,48.817329,5.0,1.0,1,1.0,4.0
4,201900000005,2019-11-30,2019,11,4,94,94028,1,2.433254,48.776362,3.0,1.0,1,1.0,2.0


In [20]:
# -------------------------------------------------------------------------------------------------
# export final dataframe to joblib
# -------------------------------------------------------------------------------------------------
dump(df_acc, (r'..\..\data\processed\2_preprocessing\1.1-simmler-data-preprocessing_accidents.joblib'))


['..\\..\\data\\processed\\2_preprocessing\\1.1-simmler-data-preprocessing_accidents.joblib']