In [None]:
# author: Michael Munz
#
# in -> 1.1-munz-data-exploration_locations
# drop duplicates
# correct outliers :vma cut-off at 200 using factor 10
# out <- 1.0.1-munz-data-preprocessing_locations
# correcting data types :pr, :pr1, :lartpc, :larrout
# out <- 1.0.2-munz-data-preprocessing_locations
# renaming cols
# out <- 1.0.3-munz-data-preprocessing_locations

In [46]:
# import
import pandas as pd
from joblib import dump, load

In [63]:
# ----
# load
# ----
# Input of this notebook: the locations frame stored by the exploration stage.
exploration_path = '../../data/processed/1_exploration/1.1-munz-data-exploration_locations.joblib'
df = load( exploration_path )

In [64]:
# --------------
# data cleaning
# --------------

# 1 duplicate handling
# rows that are identical in every column are reduced to their first occurrence
df.drop_duplicates( inplace=True, keep='first' )

# verify: shows whether any fully duplicated row is still present
remaining_duplicates = df.duplicated().any()
display( remaining_duplicates )

np.False_

In [65]:
# outlier handling

# key numerical var speed (:vma)

# 1 correcting outliers being off by factor 10
# from data exploration & analysis 200 is reasonable max speed
max_speed = 200

# Vectorised replacement for the former row-wise `apply`:
# values above the cut-off are divided by 10, every other value is kept.
# Missing values compare as False against the cut-off and therefore stay missing.
df['vma'] = df['vma'].mask( df['vma'] > max_speed, df['vma'] / 10 )



In [66]:
# ----
# save
# ----
# Checkpoint 1.0.1: frame after duplicate removal and :vma outlier correction.
checkpoint_path = '../../data/processed/2_preprocessing/1.0.1-munz-data-preprocessing_locations.joblib'
dump( df, checkpoint_path )

['../../data/processed/2_preprocessing/1.0.1-munz-data-preprocessing_locations.joblib']

In [None]:
# ----
# load
# ----
# Resume from checkpoint 1.0.1 so the data-type step can be run on its own.
checkpoint_path = '../../data/processed/2_preprocessing/1.0.1-munz-data-preprocessing_locations.joblib'
df = load( checkpoint_path )


In [67]:
# correcting data types

# to int = { :pr, :pr1 }

# pr = [ '6', '3', '(1)', ... ]
# keep only the first run of digits found in each value
pr_digits = df['pr'].str.extract( r'(\d+)', expand=False )

# why chaining float -> Int64:
# .astype(int) raises an error on a column with missing values, whereas
# float and the nullable pandas Int64 dtype can both represent them
df['pr'] = pr_digits.astype( float ).astype( 'Int64' )



In [68]:
# pr1 = [ '900', '845', '500', ... ]
# first run of digits -> float -> nullable Int64 (missing values stay missing)
pr1_digits = df['pr1'].str.extract( r'(\d+)', expand=False )
df['pr1'] = pr1_digits.astype( float ).astype( 'Int64' )



In [69]:
# to float = { :lartpc, :larrout }

# lartpc = [ 0.0, 50.0, 2.59999, 3.7999, '0', '3,2' '42' ... ]
# The column mixes real numbers with numeric strings, some written with a
# decimal comma. The former `.str.extract( r'(\d+)' )` set every non-string
# value to NaN and kept only the integer part of the strings ('3,2' -> 3.0).
# Normalise everything to text first, then parse the complete number.
lartpc_text = df['lartpc'].astype( str ).str.strip().str.replace( ',', '.', regex=False )

# errors='coerce': text that is not a number (incl. missing values) becomes NaN
df['lartpc'] = pd.to_numeric( lartpc_text, errors='coerce' ).astype( float )



In [70]:
# larrout = [ 65.0, 7.0, 2.8, 6.3, '-1', '5', '3,5', ... ]
# The column mixes real numbers with numeric strings. The former
# `.str.extract( r'(\d+)' )` set every non-string value to NaN, dropped the
# sign ('-1' -> 1.0) and cut off decimals ('3,5' -> 3.0).
# Normalise everything to text first, then parse the complete signed number.
larrout_text = df['larrout'].astype( str ).str.strip().str.replace( ',', '.', regex=False )

# errors='coerce': text that is not a number (incl. missing values) becomes NaN
df['larrout'] = pd.to_numeric( larrout_text, errors='coerce' ).astype( float )


In [25]:
# ----
# save
# ----
# Checkpoint 1.0.2: frame after the data-type corrections.
checkpoint_path = '../../data/processed/2_preprocessing/1.0.2-munz-data-preprocessing_locations.joblib'
dump( df, checkpoint_path )

['../../data/processed/2_preprocessing/1.0.2-munz-data-preprocessing_locations.joblib']

In [None]:
# ----
# load
# ----
# Resume from checkpoint 1.0.2 so the renaming step can be run on its own.
checkpoint_path = '../../data/processed/2_preprocessing/1.0.2-munz-data-preprocessing_locations.joblib'
df = load( checkpoint_path )

In [71]:
# renaming column names

# listing cols as they are named before the rename
current_columns = df.columns
display( current_columns )

Index(['Num_Acc', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof',
       'pr', 'pr1', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ',
       'vma'],
      dtype='object')

In [72]:
# renaming column names
# Mapping of ( current name, new name ) pairs.
# Every location attribute gets the prefix `loca_`; the accident number becomes `acc_num`.
dic = dict( [
    ( 'Num_Acc', 'acc_num' ),
    ( 'catr', 'loca_road_cat' ),
    ( 'voie', 'loca_road_id' ),
    ( 'v1', 'loca_road_idx1' ),
    ( 'v2', 'loca_road_idx2' ),
    ( 'circ', 'loca_traffic_circul' ),
    ( 'nbv', 'loca_road_lanes' ),
    ( 'vosp', 'loca_reserved_lane' ),
    ( 'prof', 'loca_road_gradient' ),
    ( 'pr', 'loca_ref_point' ),
    ( 'pr1', 'loca_ref_point_dist' ),
    ( 'plan', 'loca_road_view' ),
    ( 'lartpc', 'loca_land_strip' ),
    ( 'larrout', 'loca_road_width' ),
    ( 'surf', 'loca_road_surface_cond' ),
    ( 'infra', 'loca_infrastruc' ),
    ( 'situ', 'loca_accident' ),
    ( 'vma', 'loca_max_speed' ),
    ( 'voie_number', 'loca_road_no' ),
    ( 'voie_name', 'loca_road_name' ),
] )

# rename (in place)
df.rename( inplace=True, columns=dic )


In [73]:
# drop the columns that are not carried over into the final frame
# ( :voie / loca_road_id and eight further location columns )
cols = [
    'loca_road_id', 'loca_road_idx1', 'loca_road_idx2',
    'loca_ref_point', 'loca_ref_point_dist',
    'loca_reserved_lane', 'loca_land_strip', 'loca_road_width',
    'loca_infrastruc',
]

# `columns=` already selects the column axis, so no separate `axis` is passed
df.drop( columns=cols, inplace=True )

In [74]:
# verify: column names that are left in the frame
remaining_columns = df.columns
display( remaining_columns )

Index(['acc_num', 'loca_road_cat', 'loca_traffic_circul', 'loca_road_lanes',
       'loca_road_gradient', 'loca_road_view', 'loca_road_surface_cond',
       'loca_accident', 'loca_max_speed'],
      dtype='object')

In [75]:
# shape as ( rows, columns )
display( df.shape )

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
df.info()

(359510, 9)

<class 'pandas.core.frame.DataFrame'>
Index: 359510 entries, 0 to 359511
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   acc_num                 359510 non-null  int64  
 1   loca_road_cat           359510 non-null  int64  
 2   loca_traffic_circul     359510 non-null  int64  
 3   loca_road_lanes         359405 non-null  float64
 4   loca_road_gradient      359510 non-null  int64  
 5   loca_road_view          359510 non-null  int64  
 6   loca_road_surface_cond  359510 non-null  int64  
 7   loca_accident           359510 non-null  int64  
 8   loca_max_speed          359510 non-null  float64
dtypes: float64(2), int64(7)
memory usage: 27.4 MB


None

In [76]:
# ----
# save
# ----
# Checkpoint 1.0.3: frame with renamed columns, reduced to the columns that are kept.
checkpoint_path = '../../data/processed/2_preprocessing/1.0.3-munz-data-preprocessing_locations.joblib'
dump( df, checkpoint_path )

['../../data/processed/2_preprocessing/1.0.3-munz-data-preprocessing_locations.joblib']