In [76]:
import pandas as pd
import numpy as np
from joblib import dump, load

In [None]:
"""
notebook: 1.0-simmler-data-preprocessing_accidents
---------------------------------------------------------------------
load untouched accident data from csv files as base

dropping duplicates

add date variable acc_date, transformed hrmn to acc_hour

add metro variable to distinguish France mainland and other associated countries

transformed lat/long to float

columns renamed

irrelevant columns dropped: 'acc_adress','acc_department','acc_municipality'

replaced -1 value in categorical variables with nan()

"""

In [77]:
#------------------------------------------------------------------------------------------------
# Import joblib file with 2019-2024 untouched data for accidents
#------------------------------------------------------------------------------------------------
df_acc = load(r'..\..\data\processed\1_exploration\1.0-leibold-data-exploration_accidents.joblib')

display(df_acc.head(3))
df_acc.info()

Unnamed: 0,Num_Acc,jour,mois,an,hrmn,lum,dep,com,agg,int,atm,col,adr,lat,long
0,201900000001,30,11,2019,01:30,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200
1,201900000002,30,11,2019,02:50,3,93,93066,1,1,1,6,AUTOROUTE A1,489307000,23688000
2,201900000003,28,11,2019,15:15,1,92,92036,1,1,1,4,AUTOROUTE A86,489358718,23191744


<class 'pandas.core.frame.DataFrame'>
Index: 327628 entries, 0 to 54401
Data columns (total 15 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Num_Acc  327628 non-null  int64 
 1   jour     327628 non-null  int64 
 2   mois     327628 non-null  int64 
 3   an       327628 non-null  int64 
 4   hrmn     327628 non-null  object
 5   lum      327628 non-null  int64 
 6   dep      327628 non-null  object
 7   com      327628 non-null  object
 8   agg      327628 non-null  int64 
 9   int      327628 non-null  int64 
 10  atm      327628 non-null  int64 
 11  col      327628 non-null  int64 
 12  adr      321339 non-null  object
 13  lat      327628 non-null  object
 14  long     327628 non-null  object
dtypes: int64(9), object(6)
memory usage: 40.0+ MB


In [78]:
# --------------------------------------------------------------------------------------------------------------------------------
# check and drop duplicates
# --------------------------------------------------------------------------------------------------------------------------------
print("accidents duplicates:", df_acc.duplicated().sum())

df_acc.drop_duplicates(inplace=True)

print("accidents duplicates AFTER cleaning:", df_acc.duplicated().sum())

accidents duplicates: 0
accidents duplicates AFTER cleaning: 0


In [79]:
def transform_datetime_cols(char):
    
    if 'date' not in char:
        print('transform datetime columns')
        
        #df['date_timestamp'] = pd.to_datetime(df[['year', 'month', 'day']])
        
        format = '%Y-%m-%d %H:%M'
        char['datetime'] = pd.to_datetime(char['an'].astype(str) + '-' + 
                                          char['mois'].astype(str) + '-' + 
                                          char['jour'].astype(str) + ' ' + 
                                          char['hrmn'], format=format)
        
        char['date'] = char['datetime'].dt.strftime('%Y-%m-%d')
        char['year'] = char['an']
        char['month'] = char['mois']
        char['hour'] = char['hrmn'].str[:2]
        char['hour'] = char['hour'].astype(int)
        char['date'] = pd.to_datetime(char['date'])
        
        char.drop(['datetime', 'jour', 'an', 'mois', 'hrmn'], axis=1, inplace=True)
        #char = char.drop(['datetime', 'jour', 'an', 'mois', 'hrmn'], axis=1)
    
    else:
        print('datetime columns have aleady been transformed')
    
    display(char.head())
    
transform_datetime_cols(df_acc)

transform datetime columns


Unnamed: 0,Num_Acc,lum,dep,com,agg,int,atm,col,adr,lat,long,date,year,month,hour
0,201900000001,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200,2019-11-30,2019,11,1
1,201900000002,3,93,93066,1,1,1,6,AUTOROUTE A1,489307000,23688000,2019-11-30,2019,11,2
2,201900000003,1,92,92036,1,1,1,4,AUTOROUTE A86,489358718,23191744,2019-11-28,2019,11,15
3,201900000004,5,94,94069,1,1,1,4,A4,488173295,24281502,2019-11-30,2019,11,20
4,201900000005,3,94,94028,1,1,1,2,A86 INT,487763620,24332540,2019-11-30,2019,11,4


In [80]:
#------------------------------------------------------------------------------------------------
# rename of columns
#------------------------------------------------------------------------------------------------
rename_map_acc = {
    'Num_Acc': 'acc_num',
    #'jour': 'acc_day',
    #'mois': 'acc_month',
    #'an': 'acc_year',    
    #'hrmn': 'acc_hour',
    'lum': 'acc_ambient_lightning',
    'dep': 'acc_department',
    'com': 'acc_municipality',
    'agg': 'acc_urbanization_level',    
    'int': 'acc_intersection',
    'atm': 'acc_atmosphere',
    'col': 'acc_collision_type',
    'adr': 'acc_adress',
    'lat': 'acc_lat',
    'long': 'acc_long',
    'date': 'acc_date',
    'year': 'acc_year',
    'month': 'acc_month',
    'hour': 'acc_hour'    
}

df_acc.rename(columns=rename_map_acc, inplace=True)

df_acc.info()
display(df_acc.head(3))

<class 'pandas.core.frame.DataFrame'>
Index: 327628 entries, 0 to 54401
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   acc_num                 327628 non-null  int64         
 1   acc_ambient_lightning   327628 non-null  int64         
 2   acc_department          327628 non-null  object        
 3   acc_municipality        327628 non-null  object        
 4   acc_urbanization_level  327628 non-null  int64         
 5   acc_intersection        327628 non-null  int64         
 6   acc_atmosphere          327628 non-null  int64         
 7   acc_collision_type      327628 non-null  int64         
 8   acc_adress              321339 non-null  object        
 9   acc_lat                 327628 non-null  object        
 10  acc_long                327628 non-null  object        
 11  acc_date                327628 non-null  datetime64[ns]
 12  acc_year                327628 non-n

Unnamed: 0,acc_num,acc_ambient_lightning,acc_department,acc_municipality,acc_urbanization_level,acc_intersection,acc_atmosphere,acc_collision_type,acc_adress,acc_lat,acc_long,acc_date,acc_year,acc_month,acc_hour
0,201900000001,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200,2019-11-30,2019,11,1
1,201900000002,3,93,93066,1,1,1,6,AUTOROUTE A1,489307000,23688000,2019-11-30,2019,11,2
2,201900000003,1,92,92036,1,1,1,4,AUTOROUTE A86,489358718,23191744,2019-11-28,2019,11,15


In [81]:
#------------------------------------------------------------------------------------------------
# add column acc_metro for further use in this notebook
#------------------------------------------------------------------------------------------------
def add_metro_col(char):
    
    if 'metro' not in char:
        print('add a column metro to indicate an overseas department')
        char['acc_metro'] = char['acc_department'].map(lambda x: int(x) < 900 if (x != '2A') & (x != '2B') else True)
        char['acc_metro'] = char['acc_metro'].astype(int)
    else:
        print('column metro has already been added')
    
    display(char.head())

add_metro_col(df_acc)

add a column metro to indicate an overseas department


Unnamed: 0,acc_num,acc_ambient_lightning,acc_department,acc_municipality,acc_urbanization_level,acc_intersection,acc_atmosphere,acc_collision_type,acc_adress,acc_lat,acc_long,acc_date,acc_year,acc_month,acc_hour,acc_metro
0,201900000001,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200,2019-11-30,2019,11,1,1
1,201900000002,3,93,93066,1,1,1,6,AUTOROUTE A1,489307000,23688000,2019-11-30,2019,11,2,1
2,201900000003,1,92,92036,1,1,1,4,AUTOROUTE A86,489358718,23191744,2019-11-28,2019,11,15,1
3,201900000004,5,94,94069,1,1,1,4,A4,488173295,24281502,2019-11-30,2019,11,20,1
4,201900000005,3,94,94028,1,1,1,2,A86 INT,487763620,24332540,2019-11-30,2019,11,4,1


In [82]:
#------------------------------------------------------------------------------------------------
# tranform acc_long and acc_lat to float
#------------------------------------------------------------------------------------------------
def transform_long_lat_cols(char):
    
    if (char['acc_long'].dtype == 'object') & (char['acc_lat'].dtype == 'object'): 
        
        print('change long/lat cols to type float and clean up values')
        
        char['long_f'] = char['acc_long'].str.replace(',', '.', regex=False).astype(float)
        char['lat_f'] = char['acc_lat'].str.replace(',', '.', regex=False).astype(float)
        
        #swap long > 40 for overseas dep
        filter_condition_1 = (char['long_f'] > 40) & (char['acc_metro'] == 1)
        cols_to_swap = ['long_f', 'lat_f']
        char.loc[filter_condition_1, cols_to_swap] = char.loc[filter_condition_1, cols_to_swap[::-1]].to_numpy()
        
        #correct lat for 2B
        filter_condition_2 = (char['long_f'] > 9) & (char['lat_f'] > 46) & (char['acc_metro'] == 1)
        char[filter_condition_2]
        char.loc[filter_condition_2, 'lat_f'] = char.loc[filter_condition_2, 'lat_f'] - 4
        
        #2B #2B037 Biguglia in Corsica
        #Decimal Coordinates: 42.626 N 09.42 E
        #char.loc[25733, 'lat_f'] = 42.61103  #46.61103
        
        #drop org, rename
        char.drop(['acc_long', 'acc_lat'], axis=1, inplace=True)
        char.rename(columns={'long_f': 'acc_long', 'lat_f': 'acc_lat'}, inplace=True)
        
        #char = char.drop(['long', 'lat'], axis=1)
        #char = char.rename(columns={'long_f': 'long', 'lat_f': 'lat'})
    
    else:
        print('long/lat cols have already been transformed')
    
    display(char.head(3))


transform_long_lat_cols(df_acc)


change long/lat cols to type float and clean up values


Unnamed: 0,acc_num,acc_ambient_lightning,acc_department,acc_municipality,acc_urbanization_level,acc_intersection,acc_atmosphere,acc_collision_type,acc_adress,acc_date,acc_year,acc_month,acc_hour,acc_metro,acc_long,acc_lat
0,201900000001,4,93,93053,1,1,1,2,AUTOROUTE A3,2019-11-30,2019,11,1,1,2.47012,48.89621
1,201900000002,3,93,93066,1,1,1,6,AUTOROUTE A1,2019-11-30,2019,11,2,1,2.3688,48.9307
2,201900000003,1,92,92036,1,1,1,4,AUTOROUTE A86,2019-11-28,2019,11,15,1,2.319174,48.935872


In [83]:
#------------------------------------------------------------------------------------------------
# drop irrelvant columns
#------------------------------------------------------------------------------------------------
# adress is represented by long/lat
# department is no longer needed -> engineered feature "acc_metro" can be used to filter France mainland
df_acc.drop(['acc_adress','acc_department','acc_municipality'], axis=1, inplace=True)

df_acc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 327628 entries, 0 to 54401
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   acc_num                 327628 non-null  int64         
 1   acc_ambient_lightning   327628 non-null  int64         
 2   acc_urbanization_level  327628 non-null  int64         
 3   acc_intersection        327628 non-null  int64         
 4   acc_atmosphere          327628 non-null  int64         
 5   acc_collision_type      327628 non-null  int64         
 6   acc_date                327628 non-null  datetime64[ns]
 7   acc_year                327628 non-null  int64         
 8   acc_month               327628 non-null  int64         
 9   acc_hour                327628 non-null  int64         
 10  acc_metro               327628 non-null  int64         
 11  acc_long                327628 non-null  float64       
 12  acc_lat                 327628 non-n

In [84]:
#------------------------------------------------------------------------------------------------
# replace -1 value within categorical variables with nan()
#------------------------------------------------------------------------------------------------
df_acc['acc_ambient_lightning'] = df_acc['acc_ambient_lightning'].replace([-1], np.nan) 
df_acc['acc_urbanization_level'] = df_acc['acc_urbanization_level'].replace([-1], np.nan) 
df_acc['acc_intersection'] = df_acc['acc_intersection'].replace([-1], np.nan) 
df_acc['acc_atmosphere'] = df_acc['acc_atmosphere'].replace([-1], np.nan) 
df_acc['acc_collision_type'] = df_acc['acc_collision_type'].replace([-1], np.nan) 


In [85]:
# --------------------------------------------------------------------------------------------------------------------------------
# check missing values after replacement of -1/0 catagories
# --------------------------------------------------------------------------------------------------------------------------------
missing_counts = df_acc.isna().sum()
missing_percent = df_acc.isna().sum() / len(df_acc)

missing_summary = pd.DataFrame({
    'Missing Count': missing_counts,
    'Missing %': missing_percent.round(4)
})

print(missing_summary)

                        Missing Count  Missing %
acc_num                             0     0.0000
acc_ambient_lightning               9     0.0000
acc_urbanization_level              0     0.0000
acc_intersection                   14     0.0000
acc_atmosphere                     26     0.0001
acc_collision_type               1619     0.0049
acc_date                            0     0.0000
acc_year                            0     0.0000
acc_month                           0     0.0000
acc_hour                            0     0.0000
acc_metro                           0     0.0000
acc_long                            0     0.0000
acc_lat                             0     0.0000


In [86]:
# -------------------------------------------------------------------------------------------------
# export final dataframe to joblib
# -------------------------------------------------------------------------------------------------
dump(df_acc, (r'..\..\data\processed\2_preprocessing\1.0-simmler-data-preprocessing_accidents.joblib'))

['..\\..\\data\\processed\\2_preprocessing\\1.0-simmler-data-preprocessing_accidents.joblib']