In [1]:
import pandas as pd
import numpy as np

import random
import os
import sys
import psutil

import matplotlib
import matplotlib.pyplot as plt
import math
from multiprocessing import cpu_count,Pool 
import multiprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

import pickle


In [5]:
# Load the pickled lookup dictionaries used when building feature vectors.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load these files when they come from a trusted source.
# The `with` blocks guarantee each file handle is closed even if unpickling fails.

# Passed to create_sequences as the geohash -> vector map (`geomap`).
with open("geo_vect_dict.pkl", "rb") as f:
    geohash_dict = pickle.load(f)

# Indexed by geohash inside create_train_set_aug_geo (`geo_code`).
with open("geo_dict.pkl", "rb") as f:
    geo_dict = pickle.load(f)

# Indexed by geohash inside create_train_set_aug_geo (`NLP_code`).
with open("NLP_vect_dict.pkl", "rb") as f:
    NLP_dict = pickle.load(f)

In [8]:
def onhot_enoceder(train):
    """One-hot encode the ``HOD_cat`` column of ``train``.

    The encoder is fitted on the frame that is passed in, so the mapping from
    category to output column depends on the categories present in that frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing a ``HOD_cat`` column.

    Returns
    -------
    pandas.DataFrame
        ``train`` with a fresh 0..n-1 index, ``HOD_cat`` removed, and the
        columns ``HOD_en0`` .. ``HOD_en4`` appended on the right.

    Raises
    ------
    ValueError
        Expected from the DataFrame constructor when ``HOD_cat`` does not hold
        exactly five distinct values, because the five output column names are
        fixed.
    """
    hod_values = train['HOD_cat'].values.reshape(-1, 1)

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword; try the new spelling first and fall back for old releases.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    encoded = encoder.fit_transform(hod_values)

    encoded_frame = pd.DataFrame(
        encoded,
        columns=['HOD_en0', 'HOD_en1', 'HOD_en2', 'HOD_en3', 'HOD_en4'])

    # reset_index(drop=True) realigns the rows with encoded_frame's 0..n-1
    # index without materialising a throw-away 'index' column.
    # `columns=` replaces the positional axis argument that pandas 2.0 removed.
    base = train.reset_index(drop=True).drop(columns='HOD_cat')
    return pd.concat([base, encoded_frame], axis=1)

In [14]:
def create_train_set_aug_geo(frame_list,geomap):
    """Build feature vectors and binary labels from per-geohash frames.

    For every frame, each row ``i >= 8`` is a candidate sample. Its feature
    vector is the concatenation of:

    * the flattened values of columns 4 onward for the 8 preceding rows
      (``i-8 .. i-1``),
    * ``geomap[geohash]``,
    * ``NLP_dict[geohash]`` (100 zeros when the geohash is missing),
    * ``geo_dict[geohash]``.

    Rows whose column 1 is ``> 0`` are always kept with label 1. Every other
    row is kept with label 0 only when a uniform random draw exceeds 0.98
    (negative sampling for non-accident cases).

    Relies on the module-level ``geo_dict`` and ``NLP_dict`` dictionaries.

    Parameters
    ----------
    frame_list : iterable of pandas.DataFrame
        Each frame is expected to hold a single geohash; only the ``Geohash``
        value of its first row is used for the lookups.
    geomap : mapping
        Geohash -> vector appended to every sample of that geohash.

    Returns
    -------
    (list, list)
        ``X_train`` (list of 1-D arrays) and ``y_train`` (list of 0/1 ints).

    Raises
    ------
    KeyError
        If a geohash is missing from ``geomap`` or ``geo_dict``.
    """
    # Use the process *name* instead of parsing repr(process): the repr format
    # differs between Python versions and has no "-N" suffix in the main
    # process, which made the old parsing raise.
    process_name = multiprocessing.current_process().name
    suffix = process_name.rsplit('-', 1)[-1]
    worker_id = int(suffix) if suffix.isdigit() else process_name
    print("process ",worker_id," started")

    X_train = []
    y_train = []
    print ("process list with length of ",len(frame_list))
    for frame in frame_list:
        training_set = frame.values
        # make sure there is unique geohash per frame
        geohash = frame.Geohash.iloc[0]
        geo_vec = geomap[geohash]
        geo_code = geo_dict[geohash]
        try:
            NLP_code = NLP_dict[geohash]
        except KeyError:
            # No NLP vector for this geohash: fall back to a zero vector.
            NLP_code = np.zeros(100)

        for i in range(8, training_set.shape[0]):
            is_accident = training_set[i, 1] > 0
            # `or` short-circuits, so the random draw is only consumed for
            # non-accident rows (negative sampling keeps roughly 2% of them).
            if is_accident or random.uniform(0, 1) > 0.98:
                a = np.concatenate((training_set[i-8:i,4:].flatten(),geo_vec),axis=0)
                a = np.concatenate((a,NLP_code),axis=0)
                a = np.append(a, geo_code)
                X_train.append(a)
                y_train.append(1 if is_accident else 0)
    return X_train, y_train

In [15]:
def create_sequences(df,geohash_dict):
    """Build the sample/label arrays for ``df`` using a worker pool.

    ``df`` is split into one frame per distinct ``Geohash`` value; the frames
    are handed to ``applyParallel`` together with ``create_train_set_aug_geo``
    and the per-worker results are concatenated.

    NOTE(review): ``cores``, ``partitions`` and ``applyParallel`` are not
    defined in the visible part of this notebook; they must already exist in
    the kernel namespace when this function is called. ``applyParallel`` is
    presumably returning an iterable of ``(X, y)`` pairs -- confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Geohash`` column.
    geohash_dict : dict
        Geohash -> vector map; a shallow copy is passed on as ``geomap``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X_train`` and ``y_train`` built from all worker results.
    """
    frame_list = [frame for _, frame in df.groupby(df.Geohash)]

    pool = Pool(cores)
    try:
        partition = int(np.ceil(float(len(frame_list))/partitions))
        train_set = applyParallel (frame_list,create_train_set_aug_geo,pool,partition,{'geomap':geohash_dict.copy()})
    except BaseException:
        # Do not leave worker processes behind when the parallel step fails.
        pool.terminate()
        pool.join()
        raise
    pool.close()
    pool.join()

    X_train = []
    y_train = []
    for set_ in train_set:
        X_train.extend(set_[0])
        y_train.extend(set_[1])

    return np.array(X_train), np.array(y_train)

In [16]:
def train_data(filename):
    """Build and save the train/test arrays for one dataset.

    Steps:

    1. Read ``<filename>.h5`` (key ``'set3'``).
    2. Split on ``TimeStep``: rows up to 5/6 of the maximum time step form the
       training split, the remaining rows the test split.
    3. Min-max scale every column from ``'T-BrokenVehicle'`` to the last
       column; the scaler is fitted on the training split only.
    4. One-hot encode ``HOD_cat`` in both splits.
    5. Build the samples with ``create_sequences`` and save them under
       ``train_set/`` as ``X_train_``, ``y_train_``, ``X_test_`` and
       ``y_test_`` + ``filename`` (``np.save`` appends ``.npy``).

    NOTE(review): ``onhot_enoceder`` fits a new encoder on whatever frame it
    receives, so the test split is encoded with an encoder fitted on the test
    data rather than the one fitted on the training data -- confirm this is
    intended.

    Parameters
    ----------
    filename : str
        Dataset name without the ``.h5`` extension; also used as the suffix of
        the saved array files.
    """
    df = pd.read_hdf(filename+'.h5',key='set3')
    display(df.head())

    # Chronological split; explicit copies so the assignments below modify
    # independent frames instead of slices of `df` (avoids
    # SettingWithCopyWarning).
    split_step = df.TimeStep.max()*5/6
    train = df[df.TimeStep <= split_step].copy()
    test = df[df.TimeStep > split_step].copy()

    feature_cols = train.loc[:,'T-BrokenVehicle':].columns
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(train[feature_cols])
    # Whole-column assignment replaces the columns, so the scaled float values
    # are stored whatever dtype the columns had before.
    train[feature_cols] = scaler.transform(train[feature_cols])
    test[feature_cols] = scaler.transform(test[feature_cols])
    display(test.head())

    train = onhot_enoceder(train)
    test = onhot_enoceder(test)

    display(test.head())

    X_train, y_train = create_sequences(train,geohash_dict)
    X_test, y_test = create_sequences(test,geohash_dict)

    # np.save does not create missing directories.
    if not os.path.isdir('train_set'):
        os.makedirs('train_set')

    outputs = (('X_train', X_train), ('y_train', y_train),
               ('X_test', X_test), ('y_test', y_test))
    for prefix, array in outputs:
        np.save('train_set/'+prefix+'_'+filename,array)
        print (array.shape)

In [1]:
# Datasets to process: train_data reads '<city>.h5' for each name and writes
# the corresponding arrays under train_set/.
cities = [
    'Atlanta',
    'Austin',
    'Charlotte',
    'Dallas',
    'Houston',
    'LosAngeles',
]

# Process the cities one after another, in list order.
for city in cities:
    train_data(city)