In [1]:
import pandas as pd
import numpy as np

import random
import os
import sys
import psutil

import matplotlib
import matplotlib.pyplot as plt
import math
from multiprocessing import cpu_count,Pool 
import multiprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE

# Parallel Functions

In [2]:
class WithExtraArgs(object):
    def __init__(self, func, **args):
        self.func = func
        self.args = args
    def __call__(self, df):
        return self.func(df, **self.args)

def applyParallel(data, func,pool,partition, kwargs):
    data_split = [data[i:i + partition] for i in range(0, len(data), partition)]
    #data_split = np.array_split(data, min(partitions,data.shape[0]))
    data =pool.map(WithExtraArgs(func, **kwargs), data_split)
    #data = pd.concat(pool.map(WithExtraArgs(func, **kwargs), data_split))
    return data

def parallelize(data, func,pool,partition):
    data_split = [data[i:i + partition] for i in range(0, len(data), partition)]
    #data_split = np.array_split(data, partitions)
    data =pool.map(func, data_split)
    return data

In [3]:
cores = cpu_count() #Number of CPU cores on your system
partitions = cores
partitions

28

In [4]:
#!python -c 'import tensorflow as tf; print(tf.__version__)'
#!python -c 'import keras as kr; print(kr.__version__)'

In [5]:
import pickle
f = open("geo_vect_dict.pkl","rb")
geohash_dict = pickle.load(f)
f.close()

In [6]:
import pickle
f = open("geo_dict.pkl","rb")
geo_dict = pickle.load(f)
f.close()

In [7]:
import pickle
f = open("NLP_vect_dict.pkl","rb")
NLP_dict = pickle.load(f)
f.close()

# geohash feature vectors

In [8]:
def onhot_enoceder(train):
    myEncoder = OneHotEncoder(sparse=False)
    myEncoder.fit(train['HOD_cat'].values.reshape(-1, 1))

    onehot_encode = pd.concat([train.reset_index().drop('HOD_cat',1),
                pd.DataFrame(myEncoder.transform(train['HOD_cat'].values.reshape(-1, 1)),
                             columns=['HOD_en0','HOD_en1','HOD_en2','HOD_en3','HOD_en4'])], axis=1).reindex()
    return onehot_encode.drop('index',1)

In [108]:
def one_hot_check(X_res):
    X_res[:,0:10] = np.round(X_res[:,0:10])
        
    X_res[:,20:] = np.round(X_res[:,20:])
    
    return X_res
    

In [123]:
def create_train_set_aug_geo(frame_list,geomap):
    process_name = str(multiprocessing.current_process())
    id = int(process_name.split(',')[0].split('-')[1])
    print("process ",id," started")
    
    f_X_train = []
    f_y_train = []
    print ("process list with length of ",len(frame_list))
    for frame in frame_list:
        X_train = []
        y_train = []
        training_set = frame.values
        #display(frame.head()) 
        #make sure there is unique geohash per frame
        #print frame.Geohash.iloc[0]
        geo_vec = geomap[frame.Geohash.iloc[0]]
        geo_code = geo_dict[frame.Geohash.iloc[0]]
        try:
            NLP_code = NLP_dict[frame.Geohash.iloc[0]]
        except:
            NLP_code = np.zeros(100)
        for i in range(8, training_set.shape[0]):
            if training_set[i, 1] > 0 :
                sequence = training_set[i-8:i,4:].flatten()
                #a = np.concatenate((training_set[i-8:i,4:].flatten(),geo_vec),axis=0)
                #a = np.concatenate((a,NLP_code),axis=0)
                #a = np.append(a, geo_code)
                X_train.append(sequence)
                y_train.append(1)
                
            elif random.uniform(0, 1) > 0.98:
            #else:
                sequence = training_set[i-8:i,4:].flatten()
                #a = np.concatenate((training_set[i-8:i,4:].flatten(),geo_vec),axis=0)
                #a = np.concatenate((a,NLP_code),axis=0)
                #a = np.append(a, geo_code)
                X_train.append(sequence)
                y_train.append(0)
        #SMOTE
        sm = SMOTE(random_state=42)
        try:
            X_res, y_res = sm.fit_resample(np.array(X_train), np.array(y_train))
        except:
            X_res, y_res = np.array(X_train), np.array(y_train)
        
        X_res = X_res.astype(np.float)
        X_res = one_hot_check(X_res)
        a = np.concatenate((geo_vec,NLP_code),axis=0)
        a = np.append(a, geo_code)
        a = np.tile(a,(X_res.shape[0],1))
        X_res = np.concatenate((X_res,a),axis=1)
        f_X_train.extend(X_res)
        f_y_train.extend(y_res)
    #return X_train, y_train
    return f_X_train,f_y_train

In [124]:
# for logistic regression
def create_sequences(df,geohash_dict):
    #df  = df.head(4000)
    frame_list=[]
    for idx, frame in df.groupby(df.Geohash):
        frame_list.append(frame)
    
    pool = Pool(cores)
    partition = int(np.ceil(float(len(frame_list))/partitions))
    #train_set = applyParallel (frame_list,create_train_set,pool,partition,{'geomap':geohash_dict.copy()})
    train_set = applyParallel (frame_list,create_train_set_aug_geo,pool,partition,{'geomap':geohash_dict.copy()})
    pool.close()
    pool.join()
    X_train = []
    y_train = []
    for set_ in train_set:
        X_train.extend(set_[0])
        y_train.extend(set_[1])

    X_train, y_train = np.array(X_train), np.array(y_train)    
    #X_train.shape
    return X_train,y_train

In [129]:
def train_data(filename):
    df = pd.read_hdf(filename+'.h5',key='set3')
    display(df.head())
    df_normalize = df.copy()
    train = df_normalize[df_normalize.TimeStep <= df_normalize.TimeStep.max()*5/6]
    #test = df_normalize[df_normalize.TimeStep > df_normalize.TimeStep.max()*5/6]
    
    
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(train.loc[:,'T-BrokenVehicle':]) 
    scaled_values = scaler.transform(train.loc[:,'T-BrokenVehicle':]) 
    train.loc[:,'T-BrokenVehicle':] = scaled_values
    #scaled_values = scaler.transform(test.loc[:,'T-BrokenVehicle':]) 
    #test.loc[:,'T-BrokenVehicle':] = scaled_values
    #display(test.head())
    
    train = onhot_enoceder(train)
    #test = onhot_enoceder(test)

    display(train.columns)
    
    X_train, y_train = create_sequences(train,geohash_dict)
    #print (X_train.shape)
    #X_test, y_test = create_sequences(test,geohash_dict)

    
    np.save('train_set/X_train_smote_'+filename,X_train)
    print (X_train.shape)
    np.save('train_set/y_train_smote_'+filename,y_train)
    print( y_train.shape)
    """
    np.save('train_set/X_test_'+filename,X_test)
    print (X_test.shape)
    np.save('train_set/y_test_'+filename,y_test)
    print (y_test.shape)
    """

In [130]:
train_data('Atlanta')

Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
185451,0,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185452,1,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185453,2,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185454,3,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185455,4,0.0,djgz7,98,4,1,0,0,0,0,...,81.5,0.0,30.005,74.0,10.0,10.4,0,0,0,0


  return self.partial_fit(X, y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Index(['TimeStep', 'predicted_accident', 'Geohash', 'geohash_code', 'DOW_cat',
       'T-Accident', 'DayLight', 'T-BrokenVehicle', 'T-Congestion',
       'T-Construction', 'T-Event', 'T-FlowIncident', 'T-Other',
       'T-RoadBlocked', 'W-Humidity', 'W-Precipitation', 'W-Pressure',
       'W-Temperature', 'W-Visibility', 'W-WindSpeed', 'W-Rain', 'W-Snow',
       'W-Fog', 'W-Hail', 'HOD_en0', 'HOD_en1', 'HOD_en2', 'HOD_en3',
       'HOD_en4'],
      dtype='object')

process  1040  started
process list with length of  3
process  1041  started
process list with length of  3
process  1042  started
process list with length of  3
process  1043  started
process list with length of  3
process  1045  started
process list with length of  3
process  1044  started
process list with length of  3
process  1046  started
process list with length of  3
process  1047  started
process list with length of  3
process  1048  started
process list with length of  3
process  1049  started
process list with length of  3
process  1050  started
process list with length of  3
process  1051  started
process list with length of  3
process  1052  started
process list with length of  3
process  1053  started
process list with length of  3
process  1054  started
process list with length of  3
process  1055  started
process list with length of  3
process  1056  started
process list with length of  3
process  1057  started
process list with length of  3
process  1058  started
proce

In [18]:
train_data('Austin')

Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
900762,0,0.0,9v677,604,4,1,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
900763,1,0.0,9v677,604,4,1,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
900764,2,0.0,9v677,604,4,1,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
900765,3,0.0,9v677,604,4,1,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
900766,4,0.0,9v677,604,4,1,0,0,0,0,...,87.0,0.0,29.87,75.0,10.0,4.6,0,0,0,0


  return self.partial_fit(X, y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
908120,7358,0.0,9v677,604,2,1,0,1,0.0,0.0,...,0.34,0.0,0.986491,0.923993,1.0,0.416667,0.0,0.0,0.0,0.0
908121,7359,0.0,9v677,604,2,1,0,1,0.0,0.0,...,0.34,0.0,0.986491,0.923993,1.0,0.416667,0.0,0.0,0.0,0.0
908122,7360,0.0,9v677,604,2,1,0,1,0.0,0.0,...,0.34,0.0,0.986161,0.915751,1.0,0.376812,0.0,0.0,0.0,0.0
908123,7361,0.0,9v677,604,2,1,0,1,0.0,0.0,...,0.34,0.0,0.986161,0.915751,1.0,0.376812,0.0,0.0,0.0,0.0
908124,7362,0.0,9v677,604,2,1,0,1,0.0,0.0,...,0.34,0.0,0.986161,0.915751,1.0,0.376812,0.0,0.0,0.0,0.0


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail,HOD_en0,HOD_en1,HOD_en2,HOD_en3,HOD_en4
0,7358,0.0,9v677,604,1,0,1,0.0,0.0,0.0,...,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7359,0.0,9v677,604,1,0,1,0.0,0.0,0.0,...,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,7360,0.0,9v677,604,1,0,1,0.0,0.0,0.0,...,0.376812,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,7361,0.0,9v677,604,1,0,1,0.0,0.0,0.0,...,0.376812,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,7362,0.0,9v677,604,1,0,1,0.0,0.0,0.0,...,0.376812,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


process  141  started
process list with length of  5
process list with length of  5
process  142  started
process  143  started
process list with length of  5
process  144  started
process list with length of  5
process  145  started
process list with length of  5
process  146  started
process list with length of  5
process  147  started
process list with length of  5
process  148  started
process list with length of  5
process  149  started
process list with length of  5
process list with length of  5
process  150  started
process  151  started
process list with length of  5
process  152  started
process list with length of  5
process  153  started
process list with length of  5
process  154  started
process list with length of  5
process  155  started
process list with length of  5
process  156  started
process list with length of  5
process  157  started
process list with length of  5
process list with length of  5
process  158  started
process  159  started
process list with length

In [19]:
train_data('Charlotte')

Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
397395,0,0.0,dnnqg,462,4,1,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
397396,1,0.0,dnnqg,462,4,1,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
397397,2,0.0,dnnqg,462,4,1,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
397398,3,0.0,dnnqg,462,4,1,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
397399,4,0.0,dnnqg,462,4,1,0,0,0,0,...,96.0,0.0,29.9,69.1,9.0,4.6,0,0,0,0


  return self.partial_fit(X, y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
404753,7358,0.0,dnnqg,462,2,1,0,1,0.0,0.0,...,0.410256,0.0,0.617647,0.836066,1.0,0.293478,0.0,0.0,0.0,0.0
404754,7359,0.0,dnnqg,462,2,1,0,1,0.0,0.0,...,0.410256,0.0,0.617647,0.836066,1.0,0.293478,0.0,0.0,0.0,0.0
404755,7360,0.0,dnnqg,462,2,1,0,1,0.0,0.0,...,0.333333,0.0,0.588235,0.859485,1.0,0.0,0.0,0.0,0.0,0.0
404756,7361,0.0,dnnqg,462,2,1,0,1,0.0,0.0,...,0.333333,0.0,0.588235,0.859485,1.0,0.0,0.0,0.0,0.0,0.0
404757,7362,0.0,dnnqg,462,2,1,0,1,0.0,0.0,...,0.333333,0.0,0.588235,0.859485,1.0,0.0,0.0,0.0,0.0,0.0


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail,HOD_en0,HOD_en1,HOD_en2,HOD_en3,HOD_en4
0,7358,0.0,dnnqg,462,1,0,1,0.0,0.0,0.0,...,0.293478,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7359,0.0,dnnqg,462,1,0,1,0.0,0.0,0.0,...,0.293478,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,7360,0.0,dnnqg,462,1,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,7361,0.0,dnnqg,462,1,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,7362,0.0,dnnqg,462,1,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


process  197  started
process list with length of  5
process  198  started
process list with length of  5
process  199  started
process list with length of  5
process  200  started
process list with length of  5
process list with length of  5
process  201  started
process  202  started
process list with length of  5
process  203  started
process list with length of  5
process  204  started
process list with length of  5
process  205  started
process list with length of  5
process  206  started
process list with length of  5
process  207  started
process list with length of  5
process  208  started
process list with length of  5
process  209  started
process list with length of  5
process  210  started
process list with length of  5
process  211  started
process list with length of  5
process  212  started
process list with length of  5
process  213  started
process list with length of  5
process  214  started
process list with length of  5
process  215  started
process list with length

In [20]:
train_data('Dallas')

Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
229606,0,0.0,9vfcr,476,4,1,0,0,0,0,...,67.0,0.0,29.78,82.9,10.0,15.0,0,0,0,0
229607,1,0.0,9vfcr,476,4,1,0,0,0,0,...,67.0,0.0,29.78,82.9,10.0,15.0,0,0,0,0
229608,2,0.0,9vfcr,476,4,1,0,0,0,0,...,67.0,0.0,29.78,82.9,10.0,15.0,0,0,0,0
229609,3,0.0,9vfcr,476,4,1,0,0,0,0,...,67.0,0.0,29.78,82.9,10.0,15.0,0,0,0,0
229610,4,0.0,9vfcr,476,4,1,0,0,0,0,...,72.0,0.0,29.78,81.0,10.0,13.8,0,0,0,0


  return self.partial_fit(X, y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
236964,7358,0.0,9vfcr,476,2,1,0,1,0.0,0.0,...,0.4,0.0,0.989755,0.873874,1.0,0.202771,0.0,0.0,0.0,0.0
236965,7359,0.0,9vfcr,476,2,1,0,1,0.0,0.0,...,0.4,0.0,0.989755,0.873874,1.0,0.202771,0.0,0.0,0.0,0.0
236966,7360,0.0,9vfcr,476,2,1,0,1,0.0,0.0,...,0.4,0.0,0.989095,0.873874,1.0,0.15995,0.0,0.0,0.0,0.0
236967,7361,0.0,9vfcr,476,2,1,0,1,0.0,0.0,...,0.4,0.0,0.989095,0.873874,1.0,0.15995,0.0,0.0,0.0,0.0
236968,7362,0.0,9vfcr,476,2,1,0,1,0.0,0.0,...,0.4,0.0,0.989095,0.873874,1.0,0.15995,0.0,0.0,0.0,0.0


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail,HOD_en0,HOD_en1,HOD_en2,HOD_en3,HOD_en4
0,7358,0.0,9vfcr,476,1,0,1,0.0,0.0,0.0,...,0.202771,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7359,0.0,9vfcr,476,1,0,1,0.0,0.0,0.0,...,0.202771,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,7360,0.0,9vfcr,476,1,0,1,0.0,0.0,0.0,...,0.15995,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,7361,0.0,9vfcr,476,1,0,1,0.0,0.0,0.0,...,0.15995,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,7362,0.0,9vfcr,476,1,0,1,0.0,0.0,0.0,...,0.15995,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


process  253  started
process list with length of  6
process  254  started
process list with length of  6
process  255  started
process list with length of  6
process  256  started
process list with length of  6
process list with length of  6
process  257  started
process  258  started
process list with length of  6
process  259  started
process list with length of  6
process  260  started
process list with length of  6
process  261  started
process list with length of  6
process  262  started
process list with length of  6
process  263  started
process list with length of  6
process  264  started
process list with length of  6
process list with length of  6
process  265  started
process list with length of  6
process  266  started
process  267  started
process list with length of  6
process  268  started
process list with length of  6
process  269  started
process list with length of  6
process  270  started
process list with length of  6
process  271  started
process list with length

In [21]:
train_data('Houston')

Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
2004637,0,0.0,9v5zu,613,4,1,0,0,0,0,...,88.0,0.0,29.87,79.0,9.0,6.9,0,0,0,0
2004638,1,0.0,9v5zu,613,4,1,0,0,0,0,...,88.0,0.0,29.87,79.0,9.0,6.9,0,0,0,0
2004639,2,0.0,9v5zu,613,4,1,0,0,0,0,...,88.0,0.0,29.87,79.0,9.0,6.9,0,0,0,0
2004640,3,0.0,9v5zu,613,4,1,0,0,0,0,...,88.0,0.0,29.87,79.0,9.0,6.9,0,0,0,0
2004641,4,0.0,9v5zu,613,4,1,0,0,0,0,...,90.0,0.0,29.86,79.0,9.0,6.9,0,0,0,0


  return self.partial_fit(X, y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
2011995,7358,0.0,9v5zu,613,2,1,0,1,0.0,0.0,...,0.58,0.0,0.993711,0.927567,0.5,0.466667,1.0,0.0,0.0,0.0
2011996,7359,0.0,9v5zu,613,2,1,0,1,0.0,0.0,...,0.7,0.012987,0.993711,0.907876,0.25,0.301449,1.0,0.0,0.0,0.0
2011997,7360,0.0,9v5zu,613,2,1,0,1,0.0,0.0,...,0.57,0.025974,0.99338,0.935302,0.5,0.333333,0.0,0.0,0.0,0.0
2011998,7361,0.0,9v5zu,613,2,1,0,1,0.0,0.0,...,0.57,0.025974,0.99338,0.935302,0.5,0.333333,0.0,0.0,0.0,0.0
2011999,7362,0.0,9v5zu,613,2,1,0,1,0.0,0.0,...,0.57,0.025974,0.99338,0.935302,0.5,0.333333,0.0,0.0,0.0,0.0


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail,HOD_en0,HOD_en1,HOD_en2,HOD_en3,HOD_en4
0,7358,0.0,9v5zu,613,1,0,1,0.0,0.0,0.0,...,0.466667,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7359,0.0,9v5zu,613,1,0,1,0.0,0.0,0.0,...,0.301449,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,7360,0.0,9v5zu,613,1,0,1,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,7361,0.0,9v5zu,613,1,0,1,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,7362,0.0,9v5zu,613,1,0,1,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


process  309  started
process list with length of  9
process  310  started
process list with length of  9
process  311  started
process list with length of  9
process  312  started
process list with length of  9
process list with length of  9
process  313  started
process  314  started
process list with length of  9
process  315  started
process list with length of  9
process  316  started
process list with length of  9
process  317  started
process list with length of  9
process list with length of  9
process  318  started
process  319  started
process list with length of  9
process  320  started
process list with length of  9
process  321  started
process list with length of  9
process  322  started
process list with length of  9
process  323  started
process list with length of  9
process  324  started
process list with length of  9
process list with length of  9
process  325  started
process  326  started
process list with length of  9
process  327  started
process list with length

In [22]:
train_data('LosAngeles')

Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0.0,9mgzc,237,4,1,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
1,1,0.0,9mgzc,237,4,1,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
2,2,0.0,9mgzc,237,4,1,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
3,3,0.0,9mgzc,237,4,1,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
4,4,0.0,9mgzc,237,4,1,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0


  return self.partial_fit(X, y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
7358,7358,0.0,9mgzc,237,2,1,0,1,0.0,0.0,...,0.65,0.0,0.99502,0.618524,0.666667,0.136701,0.0,0.0,0.0,0.0
7359,7359,0.0,9mgzc,237,2,1,0,1,0.0,0.0,...,0.65,0.0,0.99502,0.618524,0.666667,0.136701,0.0,0.0,0.0,0.0
7360,7360,0.0,9mgzc,237,2,1,0,1,0.0,0.0,...,0.65,0.0,0.99502,0.618524,0.666667,0.111441,0.0,0.0,0.0,0.0
7361,7361,0.0,9mgzc,237,2,1,0,1,0.0,0.0,...,0.65,0.0,0.99502,0.618524,0.666667,0.111441,0.0,0.0,0.0,0.0
7362,7362,0.0,9mgzc,237,2,1,0,1,0.0,0.0,...,0.65,0.0,0.99502,0.618524,0.666667,0.111441,0.0,0.0,0.0,0.0


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail,HOD_en0,HOD_en1,HOD_en2,HOD_en3,HOD_en4
0,7358,0.0,9mgzc,237,1,0,1,0.0,0.0,0.0,...,0.136701,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7359,0.0,9mgzc,237,1,0,1,0.0,0.0,0.0,...,0.136701,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,7360,0.0,9mgzc,237,1,0,1,0.0,0.0,0.0,...,0.111441,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,7361,0.0,9mgzc,237,1,0,1,0.0,0.0,0.0,...,0.111441,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,7362,0.0,9mgzc,237,1,0,1,0.0,0.0,0.0,...,0.111441,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


process  365  started
process list with length of  6
process  366  started
process list with length of  6
process  367  started
process list with length of  6
process  368  started
process list with length of  6
process  369  started
process list with length of  6
process  370  started
process list with length of  6
process  371  started
process list with length of  6
process  372  started
process list with length of  6
process  373  started
process list with length of  6
process  374  started
process list with length of  6
process  375  started
process list with length of  6
process  376  started
process list with length of  6
process  377  started
process list with length of  6
process  378  started
process list with length of  6
process  379  started
process list with length of  6
process list with length of  6
process  380  started
process  381  started
process list with length of  6
process  382  started
process list with length of  6
process list with length of  6
process  383  s

In [23]:
train_data('Miami')

Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
176620,0,0.0,dhwf4,546,4,1,0,0,0,0,...,88.0,0.0,30.01,77.0,10.0,3.5,0,0,0,0
176621,1,0.0,dhwf4,546,4,1,0,0,0,0,...,88.0,0.0,30.01,77.0,10.0,3.5,0,0,0,0
176622,2,0.0,dhwf4,546,4,1,0,0,0,0,...,88.0,0.0,30.01,77.0,10.0,3.5,0,0,0,0
176623,3,0.0,dhwf4,546,4,1,0,0,0,0,...,88.0,0.0,30.01,77.0,10.0,3.5,0,0,0,0
176624,4,0.0,dhwf4,546,4,1,0,0,0,0,...,91.0,0.0,30.0,75.9,10.0,3.5,0,0,0,0


  return self.partial_fit(X, y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
183978,7358,0.0,dhwf4,546,2,1,0,1,0.0,0.052632,...,0.59,0.0,0.84375,0.936524,1.0,0.341102,0.0,0.0,0.0,0.0
183979,7359,0.0,dhwf4,546,2,1,0,1,0.0,0.0,...,0.59,0.0,0.84375,0.936524,1.0,0.341102,0.0,0.0,0.0,0.0
183980,7360,0.0,dhwf4,546,2,1,0,1,0.0,0.052632,...,0.59,0.0,0.8125,0.936524,1.0,0.243644,0.0,0.0,0.0,0.0
183981,7361,0.0,dhwf4,546,2,1,0,1,0.0,0.052632,...,0.59,0.0,0.8125,0.936524,1.0,0.243644,0.0,0.0,0.0,0.0
183982,7362,0.0,dhwf4,546,2,1,0,1,0.0,0.052632,...,0.59,0.0,0.8125,0.936524,1.0,0.243644,0.0,0.0,0.0,0.0


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail,HOD_en0,HOD_en1,HOD_en2,HOD_en3,HOD_en4
0,7358,0.0,dhwf4,546,1,0,1,0.0,0.052632,0.0,...,0.341102,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7359,0.0,dhwf4,546,1,0,1,0.0,0.0,0.0,...,0.341102,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,7360,0.0,dhwf4,546,1,0,1,0.0,0.052632,0.0,...,0.243644,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,7361,0.0,dhwf4,546,1,0,1,0.0,0.052632,0.0,...,0.243644,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,7362,0.0,dhwf4,546,1,0,1,0.0,0.052632,0.0,...,0.243644,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


process  421  started
process list with length of  2
process  422  started
process list with length of  2
process  423  started
process list with length of  2
process  424  started
process list with length of  2
process  425  started
process list with length of  2
process list with length of  2
process  426  started
process list with length of  2
process  427  started
process list with length of  2
process  428  started
process list with length of  2
process  429  started
process  430  started
process list with length of  2
process  431  started
process  432  started
process list with length of  2
process list with length of  2
process  433  started
process list with length of  2
process  434  started
process list with length of  2
process  435  started
process list with length of  2
process  436  started
process  437  started
process list with length of  2
process list with length of  2
process  438  started
process list with length of  2
process  439  started
process list with length

In [24]:
train_data('all_cities')

Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
185451,0,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185452,1,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185453,2,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185454,3,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185455,4,0.0,djgz7,98,4,1,0,0,0,0,...,81.5,0.0,30.005,74.0,10.0,10.4,0,0,0,0


  return self.partial_fit(X, y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
192809,7358,0.0,djgz7,98,2,1,0,1,0.0,0.0,...,0.49,0.0,0.991928,0.773895,0.5,0.043091,0.0,0.0,0.0,0.0
192810,7359,0.0,djgz7,98,2,1,0,1,0.0,0.0,...,0.49,0.0,0.991928,0.773895,0.5,0.043091,0.0,0.0,0.0,0.0
192811,7360,0.0,djgz7,98,2,1,0,1,0.0,0.0,...,0.52,0.0,0.991598,0.771207,0.5,0.055721,0.0,0.0,0.0,0.0
192812,7361,0.0,djgz7,98,2,1,0,1,0.0,0.0,...,0.52,0.0,0.991598,0.771207,0.5,0.055721,0.0,0.0,0.0,0.0
192813,7362,0.0,djgz7,98,2,1,0,1,0.0,0.0,...,0.52,0.0,0.991598,0.771207,0.5,0.055721,0.0,0.0,0.0,0.0


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail,HOD_en0,HOD_en1,HOD_en2,HOD_en3,HOD_en4
0,7358,0.0,djgz7,98,1,0,1,0.0,0.0,0.0,...,0.043091,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7359,0.0,djgz7,98,1,0,1,0.0,0.0,0.0,...,0.043091,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,7360,0.0,djgz7,98,1,0,1,0.0,0.0,0.0,...,0.055721,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,7361,0.0,djgz7,98,1,0,1,0.0,0.0,0.0,...,0.055721,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,7362,0.0,djgz7,98,1,0,1,0.0,0.0,0.0,...,0.055721,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


process  477  started
process list with length of  34
process  478  started
process list with length of  34
process  479  started
process list with length of  34
process  480  started
process list with length of  34
process  481  started
process list with length of  34
process  482  started
process list with length of  34
process  483  started
process list with length of  34
process  484  started
process list with length of  34
process  485  started
process list with length of  34
process  486  started
process list with length of  34
process  487  started
process list with length of  34
process  488  started
process list with length of  34
process  489  started
process list with length of  34
process  490  started
process list with length of  34
process  491  started
process list with length of  34
process  492  started
process list with length of  34
process  493  started
process list with length of  34
process  494  started
process list with length of  34
process  495  started
proces