In [1]:
import sys, os
sys.path.insert(1, 'viroco/src')
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import folium
import json

from sklearn.preprocessing import MinMaxScaler


In [7]:
# Directory scanned for per-file CSV inputs (presumably one file per recorded
# drive, judging by the variable names -- confirm).
root = 'data/preprocessed'

# os.listdir() returns entries in arbitrary order, so the paths are sorted to
# make the row order of the concatenated frame reproducible across machines.
# Sub-directories (e.g. a stray .ipynb_checkpoints) are skipped because
# pd.read_csv cannot open them.
drive_pths = sorted(
    pth
    for pth in (os.path.join(root, fn) for fn in os.listdir(root))
    if os.path.isfile(pth)
)
feature_cols = ['target_speed', 'speed_osrm', 'way_maxspeed', 'elevation', 'fwd_azimuth', 'way_type', 'way_surface', 'node:intersection', 'node:railway', 'node:crossing', 'node:highway', 'node:stop', 'start_stop', 'azimuth_diff', 'elevation_diff']

# Read every file, keep only the feature columns (in feature_cols order) and
# stack the rows into one frame with a fresh 0..n-1 index.
drives = [pd.read_csv(df_pth, index_col='index')[feature_cols] for df_pth in drive_pths]
df = pd.concat(drives, ignore_index=True)
del drives  # the per-file frames are no longer needed once concatenated

In [8]:
# Mapping read from JSON; later cells use its keys as categorical column names
# and its values as the accepted values for each of those columns.
with open('data/valid_categories.json', 'r') as categories_file:
    valid_categories = json.load(categories_file)

In [9]:
# unify units: divide way_maxspeed by 3.6 (presumably km/h -> m/s -- confirm).
# NOTE(review): re-running this cell divides again; run it once per fresh load.
df['way_maxspeed'] = df['way_maxspeed'].div(3.6)

In [10]:
# The categorical columns are exactly the keys of valid_categories. A concrete
# list (instead of a live dict_keys view) is used because the same sequence is
# passed to pandas as a column selection below and iterated again later.
categorical_cols = list(valid_categories)
for cc in categorical_cols:
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on df[cc]: that chained in-place form is deprecated and has no effect
    # on df once pandas Copy-on-Write is active.
    df[cc] = df[cc].fillna('null')  # fill NaN values with 'null' string
    df.loc[~df[cc].isin(valid_categories[cc]), cc] = 'null'  # set values not in valid_categories to 'null' string

# encode categorical columns to one-hot and swap them in for the raw columns
encoded_categoricals = pd.concat([pd.get_dummies(df[cc], prefix=cc) for cc in categorical_cols], axis=1)
df = pd.concat([df.drop(columns=categorical_cols), encoded_categoricals], axis=1)
del encoded_categoricals

# Width of the centred triangular window and the zero padding needed on each
# side so that the smoothed column keeps the original number of rows.
window_size = 201
pad_size = window_size // 2

for category in categorical_cols:
    for value in valid_categories[category]:
        dummy_col = f'{category}_{value}'

        # ensure all columns: a valid category value that never occurred in
        # the data still gets an (all-zero) column
        if dummy_col not in df:
            df[dummy_col] = 0

        # only 'node:*' and 'start_stop*' indicator columns are smoothed
        if not dummy_col.startswith(('node:', 'start_stop')):
            continue

        # Zero-pad both ends, take a centred triangular-weighted rolling sum,
        # then drop the NaN edges so the result lines up with df's rows again.
        # NOTE(review): df is the concatenation of all input files, so the
        # window also spans the boundaries between files -- confirm intended.
        zero_padded = np.pad(df[dummy_col], pad_size, 'constant', constant_values=0)
        smoothed = (
            pd.Series(zero_padded)
            .rolling(window_size, win_type='triang', center=True)
            .sum()
            .dropna()
            .reset_index(drop=True)
        )
        df[dummy_col] = smoothed

# drop complementary columns
null_cols = [name for name in df.columns if name.endswith('_null')]
df.drop(columns=null_cols + ['start_stop_0'], inplace=True)

In [11]:
# reorder columns in DataFrame: 'target_speed' first, every other column in
# alphabetical order. The target is excluded by name rather than by assuming
# it sits at position 0, so no other column can be dropped (and the target
# duplicated) if the column order ever changes upstream.
ordered_columns = ['target_speed'] + sorted(col for col in df.columns if col != 'target_speed')
df = df[ordered_columns]

In [12]:
def min_max_scale(df, scaler_fn=None):
    """Fit a MinMaxScaler on ``df`` and return the scaled data as a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; every column is passed to the scaler.
    scaler_fn : str or path-like, optional
        If given (and truthy), the fitted scaler is pickled to this path.

    Returns
    -------
    pandas.DataFrame
        The transformed values with the same column labels as ``df``. The
        frame is rebuilt from the transformed array, so it carries a fresh
        default index rather than the index of ``df``.
    """
    # The notebook never imports pickle at the top, so import it here;
    # without this the scaler_fn branch fails with NameError.
    import pickle

    scaler = MinMaxScaler()
    scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    if scaler_fn:
        # NOTE: pickle files can execute arbitrary code when loaded; only
        # load this file back from a trusted location.
        with open(scaler_fn, 'wb') as out:
            pickle.dump(scaler, out)

    return scaled

In [13]:

# Min-max scale all columns and ask min_max_scale to pickle the fitted scaler
# to the file 'min_max_scaler'.
df = min_max_scale(df, scaler_fn='min_max_scaler')

# Write the scaled frame to CSV, labelling the index column 'index'.
df.to_csv('test.csv', index_label='index')