In [1]:
import pandas as pd
import numpy as np

import random
import os
import sys
import psutil

import matplotlib
import matplotlib.pyplot as plt
import math
from multiprocessing import cpu_count,Pool 
import multiprocessing

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import MinMaxScaler
import pickle


# Parallel Functions

In [2]:
class WithExtraArgs(object):
    def __init__(self, func, **args):
        self.func = func
        self.args = args
    def __call__(self, df):
        return self.func(df, **self.args)

def applyParallel(pool,data, func, kwargs):
    data_split = np.array_split(data, min(partitions,data.shape[0]))
    data = pd.concat(pool.map(WithExtraArgs(func, **kwargs), data_split))
    return data

def parallelize(data, func,pool,partition):
    data_split = [frame_list[i:i + partition] for i in xrange(0, len(frame_list), partition)]
    #data_split = np.array_split(data, partitions)
    data =pool.map(func, data_split)
    return data


In [3]:
cores = cpu_count() #Number of CPU cores on your system
partitions = cores
partitions

28

In [4]:
def clean_data(filepath,storename):
    df = pd.read_csv(filepath)
    display (df.head())
    
    list_ = df.columns
    print (list_)
    
    temp_df = df [[u'TimeStep', u'T-Accident',u'Geohash', u'HOD', u'DOW', u'DayLight',
       u'T-BrokenVehicle', u'T-Congestion', u'T-Construction', u'T-Event',
       u'T-FlowIncident', u'T-Other', u'T-RoadBlocked', u'W-Humidity',
       u'W-Precipitation', u'W-Pressure', u'W-Temperature', u'W-Visibility',
       u'W-WindSpeed', u'W-Rain', u'W-Snow', u'W-Fog', u'W-Hail']]
    temp_df.to_hdf(storename+'.h5',key='set1')
    display(temp_df.head())
    
    print ("zero accident =",float(df[df['T-Accident']==0].shape[0])/df.shape[0])
    
    f = open("geo_dict.pkl","rb")
    geo_dict = pickle.load(f)
    f.close()
    
    def fun_hash(geohash):
        return geo_dict[geohash]
    df['geohash_code'] = df.apply(lambda row: fun_hash(row['Geohash']), axis=1) 
    temp_df = df [[u'TimeStep', u'T-Accident',u'Geohash',u'geohash_code', u'HOD', u'DOW', u'DayLight',
       u'T-BrokenVehicle', u'T-Congestion', u'T-Construction', u'T-Event',
       u'T-FlowIncident', u'T-Other', u'T-RoadBlocked', u'W-Humidity',
       u'W-Precipitation', u'W-Pressure', u'W-Temperature', u'W-Visibility',
       u'W-WindSpeed', u'W-Rain', u'W-Snow', u'W-Fog', u'W-Hail']]
    temp_df.to_hdf(storename+'.h5',key='set2')
    
    df = pd.read_hdf(storename+'.h5',key='set2')
    display(df.head())
    
    def week_day(DOW):
        if DOW < 5:
            return 1
        else:
            return 0
    def shift(group):
        df_list=[]
        for idx,df in group:
            df['predicted_accident'] = df['T-Accident'].shift(-1)
            df.drop(df.tail(1).index,inplace=True)
            df_list.append(df)
        return pd.concat(df_list)

    def time_interval(HOD):
        if HOD >=6 and HOD <10:
            return 0
        if HOD >= 10 and HOD<15:
            return 1
        if HOD >=15 and HOD< 18:
            return 2;
        if HOD >=18 and HOD< 22:
            return 3
        else:
            return 4; 
    def make_binary(d):
        if d > 0:
            return 1
        else:
            return 0    
    df['DOW_cat'] = df.apply(lambda row: week_day(row['DOW']), axis=1)   
    df['HOD_cat'] = df.apply(lambda row: time_interval(row['HOD']), axis=1) 
    df['T-Accident'] = df.apply(lambda row: make_binary(row['T-Accident']), axis=1) 
    group = df.groupby('Geohash')
    df = shift(group)
    temp_df = df [[u'TimeStep', u'predicted_accident',u'Geohash',u'geohash_code', u'HOD_cat', u'DOW_cat', u'T-Accident',u'DayLight',
       u'T-BrokenVehicle', u'T-Congestion', u'T-Construction', u'T-Event',
       u'T-FlowIncident', u'T-Other', u'T-RoadBlocked', u'W-Humidity',
       u'W-Precipitation', u'W-Pressure', u'W-Temperature', u'W-Visibility',
       u'W-WindSpeed', u'W-Rain', u'W-Snow', u'W-Fog', u'W-Hail']]
    temp_df.to_hdf(storename+'.h5',key='set3')

# Analyze City

In [5]:
np.__version__

'1.15.4'

In [6]:
clean_data("../Traffic/Accidents/vectors/Atlanta_geo2vec_201861-2018831.csv",'Atlanta')

Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,dn5b5,0,4,0,0,0,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
1,dn5b5,1,4,0,0,0,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
2,dn5b5,2,4,0,0,0,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
3,dn5b5,3,4,0,0,0,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
4,dn5b5,4,4,1,0,0,0,0,0,0,...,90.0,0.0,30.01,73.0,10.0,17.3,0,0,0,0


Index(['Geohash', 'TimeStep', 'DOW', 'HOD', 'DayLight', 'T-Accident',
       'T-BrokenVehicle', 'T-Congestion', 'T-Construction', 'T-Event',
       'T-FlowIncident', 'T-Other', 'T-RoadBlocked', 'W-Humidity',
       'W-Precipitation', 'W-Pressure', 'W-Temperature', 'W-Visibility',
       'W-WindSpeed', 'W-Rain', 'W-Snow', 'W-Fog', 'W-Hail'],
      dtype='object')


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,dn5b5,0,4,0,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
1,1,0,dn5b5,0,4,0,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
2,2,0,dn5b5,0,4,0,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
3,3,0,dn5b5,0,4,0,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
4,4,0,dn5b5,1,4,0,0,0,0,0,...,90.0,0.0,30.01,73.0,10.0,17.3,0,0,0,0


zero accident = 0.9956087179521339


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,dn5b5,343,0,4,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
1,1,0,dn5b5,343,0,4,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
2,2,0,dn5b5,343,0,4,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
3,3,0,dn5b5,343,0,4,0,0,0,0,...,90.0,0.0,29.97,75.0,10.0,0.0,0,0,0,0
4,4,0,dn5b5,343,1,4,0,0,0,0,...,90.0,0.0,30.01,73.0,10.0,17.3,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
clean_data("../Traffic/Accidents/vectors/Austin_geo2vec_201861-2018831.csv",'Austin')

Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,9v6mn,0,4,0,0,0,0,0,0,0,...,77.5,0.0,29.865,78.7,10.0,8.65,0,0,0,0
1,9v6mn,1,4,0,0,0,0,0,0,0,...,77.5,0.0,29.865,78.7,10.0,8.65,0,0,0,0
2,9v6mn,2,4,0,0,0,0,0,0,0,...,77.5,0.0,29.87,78.45,10.0,7.5,0,0,0,0
3,9v6mn,3,4,0,0,0,0,0,0,0,...,78.0,0.0,29.865,78.1,10.0,7.5,0,0,0,0
4,9v6mn,4,4,1,0,0,0,0,0,0,...,80.0,0.0,29.885,77.45,10.0,8.65,0,0,0,0


Index(['Geohash', 'TimeStep', 'DOW', 'HOD', 'DayLight', 'T-Accident',
       'T-BrokenVehicle', 'T-Congestion', 'T-Construction', 'T-Event',
       'T-FlowIncident', 'T-Other', 'T-RoadBlocked', 'W-Humidity',
       'W-Precipitation', 'W-Pressure', 'W-Temperature', 'W-Visibility',
       'W-WindSpeed', 'W-Rain', 'W-Snow', 'W-Fog', 'W-Hail'],
      dtype='object')


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9v6mn,0,4,0,0,0,0,0,...,77.5,0.0,29.865,78.7,10.0,8.65,0,0,0,0
1,1,0,9v6mn,0,4,0,0,0,0,0,...,77.5,0.0,29.865,78.7,10.0,8.65,0,0,0,0
2,2,0,9v6mn,0,4,0,0,0,0,0,...,77.5,0.0,29.87,78.45,10.0,7.5,0,0,0,0
3,3,0,9v6mn,0,4,0,0,0,0,0,...,78.0,0.0,29.865,78.1,10.0,7.5,0,0,0,0
4,4,0,9v6mn,1,4,0,0,0,0,0,...,80.0,0.0,29.885,77.45,10.0,8.65,0,0,0,0


zero accident = 0.9963011978102815


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9v6mn,0,0,4,0,0,0,0,...,77.5,0.0,29.865,78.7,10.0,8.65,0,0,0,0
1,1,0,9v6mn,0,0,4,0,0,0,0,...,77.5,0.0,29.865,78.7,10.0,8.65,0,0,0,0
2,2,0,9v6mn,0,0,4,0,0,0,0,...,77.5,0.0,29.87,78.45,10.0,7.5,0,0,0,0
3,3,0,9v6mn,0,0,4,0,0,0,0,...,78.0,0.0,29.865,78.1,10.0,7.5,0,0,0,0
4,4,0,9v6mn,0,1,4,0,0,0,0,...,80.0,0.0,29.885,77.45,10.0,8.65,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
clean_data("../Traffic/Accidents/vectors/Charlotte_geo2vec_201861-2018831.csv",'Charlotte')

Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,dnnr5,0,4,0,0,0,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
1,dnnr5,1,4,0,0,0,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
2,dnnr5,2,4,0,0,0,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
3,dnnr5,3,4,0,0,0,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
4,dnnr5,4,4,1,0,0,0,0,0,0,...,96.0,0.0,29.9,69.1,9.0,4.6,0,0,0,0


Index(['Geohash', 'TimeStep', 'DOW', 'HOD', 'DayLight', 'T-Accident',
       'T-BrokenVehicle', 'T-Congestion', 'T-Construction', 'T-Event',
       'T-FlowIncident', 'T-Other', 'T-RoadBlocked', 'W-Humidity',
       'W-Precipitation', 'W-Pressure', 'W-Temperature', 'W-Visibility',
       'W-WindSpeed', 'W-Rain', 'W-Snow', 'W-Fog', 'W-Hail'],
      dtype='object')


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,dnnr5,0,4,0,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
1,1,0,dnnr5,0,4,0,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
2,2,0,dnnr5,0,4,0,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
3,3,0,dnnr5,0,4,0,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
4,4,0,dnnr5,1,4,0,0,0,0,0,...,96.0,0.0,29.9,69.1,9.0,4.6,0,0,0,0


zero accident = 0.9947404180250196


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,dnnr5,241,0,4,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
1,1,0,dnnr5,241,0,4,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
2,2,0,dnnr5,241,0,4,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
3,3,0,dnnr5,241,0,4,0,0,0,0,...,100.0,0.0,29.93,69.1,9.0,5.8,0,0,0,0
4,4,0,dnnr5,241,1,4,0,0,0,0,...,96.0,0.0,29.9,69.1,9.0,4.6,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
clean_data("../Traffic/Accidents/vectors/Dallas_geo2vec_201861-2018831.csv",'Dallas')

Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,9vg67,0,4,0,0,0,0,0,0,0,...,67.0,0.0,29.84,81.1,10.0,10.4,0,0,0,0
1,9vg67,1,4,0,0,0,0,0,0,0,...,67.0,0.0,29.84,81.1,10.0,10.4,0,0,0,0
2,9vg67,2,4,0,0,0,0,0,0,0,...,69.0,0.0,29.84,80.6,10.0,10.4,0,0,0,0
3,9vg67,3,4,0,0,0,0,0,0,0,...,70.0,0.0,29.84,80.1,10.0,10.4,0,0,0,0
4,9vg67,4,4,1,0,0,0,0,0,0,...,68.0,0.0,29.84,80.4,10.0,10.4,0,0,0,0


Index(['Geohash', 'TimeStep', 'DOW', 'HOD', 'DayLight', 'T-Accident',
       'T-BrokenVehicle', 'T-Congestion', 'T-Construction', 'T-Event',
       'T-FlowIncident', 'T-Other', 'T-RoadBlocked', 'W-Humidity',
       'W-Precipitation', 'W-Pressure', 'W-Temperature', 'W-Visibility',
       'W-WindSpeed', 'W-Rain', 'W-Snow', 'W-Fog', 'W-Hail'],
      dtype='object')


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9vg67,0,4,0,0,0,0,0,...,67.0,0.0,29.84,81.1,10.0,10.4,0,0,0,0
1,1,0,9vg67,0,4,0,0,0,0,0,...,67.0,0.0,29.84,81.1,10.0,10.4,0,0,0,0
2,2,0,9vg67,0,4,0,0,0,0,0,...,69.0,0.0,29.84,80.6,10.0,10.4,0,0,0,0
3,3,0,9vg67,0,4,0,0,0,0,0,...,70.0,0.0,29.84,80.1,10.0,10.4,0,0,0,0
4,4,0,9vg67,1,4,0,0,0,0,0,...,68.0,0.0,29.84,80.4,10.0,10.4,0,0,0,0


zero accident = 0.9976703464237677


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9vg67,716,0,4,0,0,0,0,...,67.0,0.0,29.84,81.1,10.0,10.4,0,0,0,0
1,1,0,9vg67,716,0,4,0,0,0,0,...,67.0,0.0,29.84,81.1,10.0,10.4,0,0,0,0
2,2,0,9vg67,716,0,4,0,0,0,0,...,69.0,0.0,29.84,80.6,10.0,10.4,0,0,0,0
3,3,0,9vg67,716,0,4,0,0,0,0,...,70.0,0.0,29.84,80.1,10.0,10.4,0,0,0,0
4,4,0,9vg67,716,1,4,0,0,0,0,...,68.0,0.0,29.84,80.4,10.0,10.4,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
clean_data("../Traffic/Accidents/vectors/Houston_geo2vec_201861-2018831.csv",'Houston')

Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,9vk3m,0,4,0,0,0,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
1,9vk3m,1,4,0,0,0,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
2,9vk3m,2,4,0,0,0,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
3,9vk3m,3,4,0,0,0,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
4,9vk3m,4,4,1,0,0,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,6.9,0,0,0,0


Index(['Geohash', 'TimeStep', 'DOW', 'HOD', 'DayLight', 'T-Accident',
       'T-BrokenVehicle', 'T-Congestion', 'T-Construction', 'T-Event',
       'T-FlowIncident', 'T-Other', 'T-RoadBlocked', 'W-Humidity',
       'W-Precipitation', 'W-Pressure', 'W-Temperature', 'W-Visibility',
       'W-WindSpeed', 'W-Rain', 'W-Snow', 'W-Fog', 'W-Hail'],
      dtype='object')


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9vk3m,0,4,0,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
1,1,0,9vk3m,0,4,0,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
2,2,0,9vk3m,0,4,0,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
3,3,0,9vk3m,0,4,0,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
4,4,0,9vk3m,1,4,0,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,6.9,0,0,0,0


zero accident = 0.9973318707965141


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9vk3m,112,0,4,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
1,1,0,9vk3m,112,0,4,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
2,2,0,9vk3m,112,0,4,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
3,3,0,9vk3m,112,0,4,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,4.6,0,0,0,0
4,4,0,9vk3m,112,1,4,0,0,0,0,...,89.0,0.0,29.88,78.8,10.0,6.9,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
clean_data("../Traffic/Accidents/vectors/LosAngeles_geo2vec_201861-2018831.csv",'LosAngeles')

Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,9mgzc,0,4,0,0,0,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
1,9mgzc,1,4,0,0,0,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
2,9mgzc,2,4,0,0,0,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
3,9mgzc,3,4,0,0,0,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
4,9mgzc,4,4,1,0,0,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0


Index(['Geohash', 'TimeStep', 'DOW', 'HOD', 'DayLight', 'T-Accident',
       'T-BrokenVehicle', 'T-Congestion', 'T-Construction', 'T-Event',
       'T-FlowIncident', 'T-Other', 'T-RoadBlocked', 'W-Humidity',
       'W-Precipitation', 'W-Pressure', 'W-Temperature', 'W-Visibility',
       'W-WindSpeed', 'W-Rain', 'W-Snow', 'W-Fog', 'W-Hail'],
      dtype='object')


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9mgzc,0,4,0,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
1,1,0,9mgzc,0,4,0,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
2,2,0,9mgzc,0,4,0,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
3,3,0,9mgzc,0,4,0,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
4,4,0,9mgzc,1,4,0,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0


zero accident = 0.9944858881269143


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9mgzc,237,0,4,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
1,1,0,9mgzc,237,0,4,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
2,2,0,9mgzc,237,0,4,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
3,3,0,9mgzc,237,0,4,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0
4,4,0,9mgzc,237,1,4,0,0,0,0,...,82.0,0.0,29.98,57.2,10.0,0.0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
clean_data("../Traffic/Accidents/vectors/Miami_geo2vec_201861-2018831.csv",'Miami')

Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,dhx58,0,4,0,0,0,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
1,dhx58,1,4,0,0,0,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
2,dhx58,2,4,0,0,0,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
3,dhx58,3,4,0,0,0,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
4,dhx58,4,4,1,0,0,0,0,0,0,...,85.0,0.0,30.02,80.1,10.0,0.0,0,0,0,0


Index(['Geohash', 'TimeStep', 'DOW', 'HOD', 'DayLight', 'T-Accident',
       'T-BrokenVehicle', 'T-Congestion', 'T-Construction', 'T-Event',
       'T-FlowIncident', 'T-Other', 'T-RoadBlocked', 'W-Humidity',
       'W-Precipitation', 'W-Pressure', 'W-Temperature', 'W-Visibility',
       'W-WindSpeed', 'W-Rain', 'W-Snow', 'W-Fog', 'W-Hail'],
      dtype='object')


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,dhx58,0,4,0,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
1,1,0,dhx58,0,4,0,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
2,2,0,dhx58,0,4,0,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
3,3,0,dhx58,0,4,0,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
4,4,0,dhx58,1,4,0,0,0,0,0,...,85.0,0.0,30.02,80.1,10.0,0.0,0,0,0,0


zero accident = 0.9967261792423155


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,dhx58,789,0,4,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
1,1,0,dhx58,789,0,4,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
2,2,0,dhx58,789,0,4,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
3,3,0,dhx58,789,0,4,0,0,0,0,...,84.0,0.0,30.04,79.0,10.0,0.0,0,0,0,0
4,4,0,dhx58,789,1,4,0,0,0,0,...,85.0,0.0,30.02,80.1,10.0,0.0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
cities=['Atlanta','Austin','Charlotte','Dallas','Houston','LosAngeles','Miami']
list_app=[]
for city in cities:
    list_app.append(pd.read_hdf(city+'.h5',key='set3'))
list_app
final_df = pd.concat(list_app)

In [14]:
final_df.head()

Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
185451,0,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185452,1,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185453,2,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185454,3,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
185455,4,0.0,djgz7,98,4,1,0,0,0,0,...,81.5,0.0,30.005,74.0,10.0,10.4,0,0,0,0


In [16]:
final_df.to_hdf('all_cities'+'.h5',key='set3')

In [18]:
final_df.columns

Index(['TimeStep', 'predicted_accident', 'Geohash', 'geohash_code', 'HOD_cat',
       'DOW_cat', 'T-Accident', 'DayLight', 'T-BrokenVehicle', 'T-Congestion',
       'T-Construction', 'T-Event', 'T-FlowIncident', 'T-Other',
       'T-RoadBlocked', 'W-Humidity', 'W-Precipitation', 'W-Pressure',
       'W-Temperature', 'W-Visibility', 'W-WindSpeed', 'W-Rain', 'W-Snow',
       'W-Fog', 'W-Hail'],
      dtype='object')