## Integration

This notebook integrates several input sources to build a feature vector for each geographical region. The following sources are used:

* Traffic
* Weather
* Points of Interest 

These feature categories are time-variant and describe a 15-minute time interval for a geographical region R of size 5 km x 5 km. The final vector also includes time-related features. 

In [1]:
import pandas as pd
import numpy as np

import random
import os
import sys
import psutil

import matplotlib
import matplotlib.pyplot as plt
import math
from multiprocessing import cpu_count,Pool 
import multiprocessing

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import MinMaxScaler
import pickle


### 1- Refine POI Vector for each Geohash (or geographical region)

In [2]:
# Load the per-geohash POI table and min-max scale every POI column to [0, 1].
geohash_map = pd.read_csv("geohash_to_poi_vec.csv")
poi_columns = ['Amenity', 'Bump', 'Crossing', 'Give_Way',
               'Junction', 'Noexit', 'Railway', 'Roundabout', 'Station', 'Stop',
               'Traffic_Calming', 'Traffic_Signal', 'Turning_Circle',
               'Turning_Loop']

scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(geohash_map[poi_columns])

# Build a fresh float frame instead of assigning into a slice of geohash_map:
# the slice assignment raises SettingWithCopyWarning and writes floats into the
# original (possibly integer) columns in place.
geohash_vec = pd.DataFrame(scaled_values, columns=poi_columns, index=geohash_map.index)

# Map each geohash to its scaled POI vector. Rows are paired positionally, so
# this does not depend on the frame's index labels being 0..n-1. If a geohash
# appears more than once, the last row wins.
geohash_dict = {
    geohash: vector.copy()
    for geohash, vector in zip(geohash_map['Geohash'], scaled_values)
}

with open("geo_vect_dict.pkl", "wb") as f:
    pickle.dump(geohash_dict, f)

# Assign every distinct geohash an integer code, in order of first appearance.
geo_dict = {
    geohash: code
    for code, geohash in enumerate(geohash_map['Geohash'].unique())
}
with open("geo_dict.pkl", "wb") as f:
    pickle.dump(geo_dict, f)

### 2- Refine Description2Vector data for each Geohash (or geographical region)

In [3]:
# Load the per-geohash text-description vectors.
# assumes the CSV has 'Geohash' and 'vec' columns, with 'vec' holding
# space-separated numbers -- TODO confirm against the file
NLP_map = pd.read_csv("geohash_to_text_vec.csv")

# Iterate over NLP_map itself: the POI table loaded earlier is a different
# file, and looping over it here left NLP_map unused.
NLP_dict = {
    geohash: np.array([float(x) for x in vec.split(' ')])
    for geohash, vec in zip(NLP_map['Geohash'], NLP_map['vec'])
}

with open("NLP_vect_dict.pkl", "wb") as f:
    pickle.dump(NLP_dict, f)

### 3- Data Cleaning Steps for Integration

In [4]:
# Traffic (T-*) and weather (W-*) columns carried through every stored stage.
_TRAFFIC_WEATHER_COLS = [
    'T-BrokenVehicle', 'T-Congestion', 'T-Construction', 'T-Event',
    'T-FlowIncident', 'T-Other', 'T-RoadBlocked', 'W-Humidity',
    'W-Precipitation', 'W-Pressure', 'W-Temperature', 'W-Visibility',
    'W-WindSpeed', 'W-Rain', 'W-Snow', 'W-Fog', 'W-Hail',
]


def _week_day(DOW):
    """Return 1 when DOW < 5, otherwise 0."""
    # presumably DOW is 0-based with Monday == 0, so 1 means "weekday" -- TODO confirm
    return 1 if DOW < 5 else 0


def _time_interval(HOD):
    """Bucket an hour-of-day value into one of five periods (codes 0-4)."""
    if 6 <= HOD < 10:
        return 0
    if 10 <= HOD < 15:
        return 1
    if 15 <= HOD < 18:
        return 2
    if 18 <= HOD < 22:
        return 3
    # Everything outside [6, 22), including missing values, lands here.
    return 4


def _make_binary(d):
    """Return 1 for a strictly positive value, otherwise 0."""
    return 1 if d > 0 else 0


def _add_next_step_label(df):
    """Add 'predicted_accident' = the next row's 'T-Accident' within each Geohash.

    The last row of every Geohash group has no successor and is dropped.
    Groups are concatenated in groupby key order.
    """
    # NOTE(review): "next row" is only the next time step if rows are already
    # in chronological order within each Geohash -- confirm against the input.
    shifted_parts = []
    for _, part in df.groupby('Geohash'):
        # Work on a copy so the groupby sub-frame is never mutated in place.
        part = part.copy()
        part['predicted_accident'] = part['T-Accident'].shift(-1)
        # Drop the last row by position, not by index label.
        shifted_parts.append(part.iloc[:-1])
    return pd.concat(shifted_parts)


def clean_data(filepath,storename):
    """Clean one city's CSV and store three progressively refined tables.

    Everything is written to ``storename + '.h5'`` under three keys:

    * ``set1`` - the selected time, traffic and weather columns as read.
    * ``set2`` - ``set1`` plus ``geohash_code``, looked up in ``geo_dict.pkl``.
    * ``set3`` - ``DOW``/``HOD`` replaced by ``DOW_cat``/``HOD_cat``,
      ``T-Accident`` binarised, and ``predicted_accident`` added as the next
      row's ``T-Accident`` within the same Geohash.

    Parameters
    ----------
    filepath : str
        Path of the input CSV file.
    storename : str
        Base name (without extension) of the HDF5 output file.

    Raises
    ------
    KeyError
        If a Geohash in the CSV is missing from ``geo_dict.pkl``.
    """
    store_path = storename + '.h5'

    df = pd.read_csv(filepath)
    display(df.head())
    print(df.columns)

    # Stage 1: keep only the columns used downstream.
    base_cols = ['TimeStep', 'T-Accident', 'Geohash', 'HOD', 'DOW', 'DayLight'] + _TRAFFIC_WEATHER_COLS
    temp_df = df[base_cols]
    temp_df.to_hdf(store_path, key='set1')
    display(temp_df.head())

    # Share of rows with no accident.
    print("zero accident =", float((df['T-Accident'] == 0).sum()) / df.shape[0])

    # Stage 2: attach the integer code of each geohash.
    # geo_dict.pkl is expected to be the file written earlier in this notebook;
    # never unpickle a file from an untrusted source.
    with open("geo_dict.pkl", "rb") as f:
        geo_dict = pickle.load(f)

    # __getitem__ keeps the original fail-fast KeyError for unknown geohashes
    # (mapping with the dict itself would silently produce NaN).
    df['geohash_code'] = df['Geohash'].map(geo_dict.__getitem__)
    coded_cols = ['TimeStep', 'T-Accident', 'Geohash', 'geohash_code', 'HOD', 'DOW', 'DayLight'] + _TRAFFIC_WEATHER_COLS
    df[coded_cols].to_hdf(store_path, key='set2')

    # Continue from what was just persisted under 'set2'.
    df = pd.read_hdf(store_path, key='set2')
    display(df.head())

    # Stage 3: categorical time features, binary accident flag, next-step label.
    # Series.map avoids building a row object per record as apply(axis=1) does.
    df['DOW_cat'] = df['DOW'].map(_week_day)
    df['HOD_cat'] = df['HOD'].map(_time_interval)
    df['T-Accident'] = df['T-Accident'].map(_make_binary)
    df = _add_next_step_label(df)

    final_cols = ['TimeStep', 'predicted_accident', 'Geohash', 'geohash_code',
                  'HOD_cat', 'DOW_cat', 'T-Accident', 'DayLight'] + _TRAFFIC_WEATHER_COLS
    df[final_cols].to_hdf(store_path, key='set3')

In [5]:
# Cities to process; each one has its own input CSV and its own HDF5 store.
cities = [
    'Atlanta',
    'Austin',
    'Charlotte',
    'Dallas',
    'Houston',
    'LosAngeles',
]

for city in cities:
    # The input file name starts with the city name; the city also names the store.
    raw_csv = "{}_geo2vec_201861-2018831.csv".format(city)
    clean_data(raw_csv, city)

In [6]:
# Read the 'set3' table stored for every city, then stack them into one frame.
list_app = [pd.read_hdf(city + '.h5', key='set3') for city in cities]
final_df = pd.concat(list_app)

In [7]:
# Persist the combined frame under the same key the per-city stores use.
all_cities_store = 'all_cities' + '.h5'
final_df.to_hdf(all_cities_store, key='set3')