In [None]:
# author: Michael Munz
#
# in -> 1.0.3-munz-data-preprocessing_locations
# changing road category to ordinal (ranked) hierarchy
# replacing -1 (missing) with NaN in 6 cols
# replacing -1 (missing) with 0 (unknown) in 3 cols
# out <- 1.1-munz-data-preprocessing_locations

In [None]:
# import
import pandas as pd
import numpy as np
import sys
sys.path.append( '../../library' )
import gc_storage



In [2]:
# -------------------------
# init Google Cloud storage
# -------------------------
bucket_name='sep25-bds-road-accidents'
key_path='../../auth/fiery-glass-478009-t8-18a81c8cbe63.json'

bucket = gc_storage.init_bucket( bucket=bucket_name,
                                 json_key_path=key_path )

Initialized sep25-bds-road-accidents


In [3]:
# ---------------------------------------
# listing joblibs in Google Cloud storage
# ---------------------------------------
gc_storage.list_bucket( bucket=bucket,
                        remote_folder='2_preprocessing' )


data/processed/2_preprocessing/0.1-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.2-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.3-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/0.4-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/1.0-becker-data-preprocessing_usagers.joblib
data/processed/2_preprocessing/1.0-leibold-data-preprocessing_vehicles.joblib
data/processed/2_preprocessing/1.0-simmler-data-preprocessing_accidents.joblib
data/processed/2_preprocessing/1.0.1-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/1.0.2-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/1.0.3-munz-data-preprocessing_locations.joblib
data/processed/2_preprocessing/1.1-becker-data-preprocessing_usagers.joblib
data/processed/2_preprocessing/1.1-leibold-data-preprocessing_concat.joblib
data/processed/2_preprocessing/1.1-leibold-data-preprocessing_vehicles.joblib

In [4]:
# ----------------------------------
# download from Google Cloud storage
# ----------------------------------
df = gc_storage.download( bucket=bucket,
                          remote_path='2_preprocessing/1.1-munz-data-preprocessing_locations.joblib' )

Downloaded data/processed/2_preprocessing/1.1-munz-data-preprocessing_locations.joblib to
 ../../data/processed/2_preprocessing/1.1-munz-data-preprocessing_locations.joblib


In [None]:
# :loca_road_cat
# is ranked hierarcy
# re-ordering road classification system from most important/accessible to
# less important/accessible
# mapping dictionary
mapping = {
    1: 1,  # highway
    2: 2,  # national_road
    7: 3,  # urban_metropolitan_road
    3: 4,  # county_road
    4: 5,  # communal_road
    5: 6,  # private_restricted_road
    6: 7,  # public_parking_lot
    9: 8   # others
}

# apply mapping
# replace applies mapping in single pass
df[ 'loca_road_cat' ] = df.loca_road_cat.replace( mapping )


In [None]:
display( df.shape )
display( df.info() )

In [None]:
# replacing modality=-1 (missing) with NaNs
# listing columns
cols_nan = [ 'loca_road_lanes',
             #'loca_reserved_lane',
             'loca_road_view',
             'loca_road_surface_cond',
             #'loca_infrastruc',
             'loca_max_speed' ]

# applying
df[ cols_nan ] = df[ cols_nan ].replace( -1, np.nan )

# verifying
display( df[ (df[cols_nan] == -1) ].any().any() )


In [None]:
# replacing modality=-1 (missing) with modality=0 (unknown)
cols_0 = [ 'loca_traffic_circul',
           'loca_road_gradient',
           'loca_accident' ]

# applying
df[ cols_0 ] = df[ cols_0 ].replace( -1, 0 )

# verifying
display( df[ (df[cols_nan] == -1) ].any().any() )

In [None]:
# modality 7 doesn't exist
# make modality 8 to 7
df[ 'loca_accident' ] = df.loca_accident.replace( 8, 7 )

# verify
display( df[ (df.loca_accident == 8 ) ].any().any() )


In [5]:
# ---------------------------
# save to google cloud bucket
# ---------------------------
gc_storage.upload( bucket=bucket,
                   obj=df,
                   local_folder='2_preprocessing',
                   file_name='1.1-munz-data-preprocessing_locations.joblib' )

Uploaded ../../data/processed/2_preprocessing/1.1-munz-data-preprocessing_locations.joblib to
 gs://sep25-bds-road-accidents/data/processed/2_preprocessing/1.1-munz-data-preprocessing_locations.joblib
