## Geo-Spatial Features

In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
# import pandas_profiling as pf 

import os
import numpy as np
import seaborn as sns
import matplotlib as plt
pd.options.mode.chained_assignment = None
%matplotlib inline

## Geocode
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter


  from pandas import Panel


## Load DataSet

In [2]:
data = pd.read_csv('Data/train.csv')

data=data.rename(columns = {'Occurrence Local Date Time':'Datetime'})
data=data.rename(columns = {'road_segment_id':'segment_id'})

### Date conversion (make dayfirst=True)
### Clean Bad Data

data['Datetime'] = pd.to_datetime(data['Datetime'], dayfirst=True)

bad_seg = ['-34.0436786939','-33.9622761744','-33.8891283413','-34.0894652753','-33.9680008638']
for seg in bad_seg:
    data.loc[data['segment_id'] == seg , 'longitude']  = data.loc[data['segment_id'] == seg , 'latitude']
    data.loc[data['segment_id'] == seg , 'latitude']  = data.loc[data['segment_id'] == seg , 'segment_id']
    data.loc[data['segment_id'] == seg , 'segment_id']  = np.nan

In [3]:
data[data.segment_id.isna()]

Unnamed: 0,EventId,Datetime,Reporting Agency,Cause,Subcause,Status,longitude,latitude,segment_id
16693,88632,2016-12-09 04:45:00,,Cam,Accident,Single Vehicle,18.5642,-33.9622761744,
23455,100642,2017-05-16 07:22:00,,cdam,Congestion,Any,18.8397,-34.0894652753,
23747,101150,2017-05-21 08:30:00,,cam,Routine Road Maintenance,Any,18.5798,-33.9680008638,
27586,108150,2017-08-30 07:12:00,,cam,Congestion,Any,18.6146,-33.8891283413,
30341,112789,2017-10-21 08:03:00,,cam,Stationary Vehicle,Vehicle On Shoulder,18.7424,-34.0436786939,


In [4]:
data.loc[data['EventId'] == 88632, 'segment_id']   = '6OTRJQF'
data.loc[data['EventId'] == 100642, 'segment_id']   = 'Q03FQ74'
data.loc[data['EventId'] == 101150, 'segment_id']   = 'S2QPOTD'
data.loc[data['EventId'] == 108150, 'segment_id']   = 'LNO3W8J'
data.loc[data['EventId'] == 112789, 'segment_id']   = 'IUTMY1U'

data.longitude = data.longitude.astype(float)
data.latitude = data.latitude.astype(float)

### Extraction for a geo-coordinates

### GeoData Creation

In [5]:
GeoData = data[['segment_id','EventId','latitude', 'longitude']]
display(GeoData.head(2))
print(GeoData.shape)

Unnamed: 0,segment_id,EventId,latitude,longitude
0,S0B3CGQ,60558,-33.888275,18.540896
1,RYJYAPI,60559,-34.140857,18.930756


(53845, 4)


### GeoCoordinates Rounding

In [6]:
# Rounding off to 2 decimal places
GeoData['latitudeX'] = GeoData.latitude.round(3)
GeoData['longitudeX'] = GeoData.longitude.round(3)
print('GeoData size = ',len(GeoData))

GeoData size =  53845


### Dropping duplicates in latitudeX & latitudeX

In [7]:
#Combining latitudeX & latitudeX
GeoData['LatLong'] = GeoData.latitudeX.astype('str')+'+'+GeoData.longitudeX.astype('str')
GeoDataX = GeoData.drop_duplicates(['LatLong'], keep='last')
# Reset index
GeoDataX.reset_index(drop=True, inplace=True)
display(GeoDataX.head(2))
print('GeoDataX size = ',len(GeoDataX))
#Dropping LatLong
del GeoDataX['LatLong']

Unnamed: 0,segment_id,EventId,latitude,longitude,latitudeX,longitudeX,LatLong
0,X4UA382,60564,-33.885498,18.638471,-33.885,18.638,-33.885+18.638
1,SPPGKO2,60578,-33.855022,18.531405,-33.855,18.531,-33.855+18.531


GeoDataX size =  2215


### Extracting the Geo info from Geocordinate

In [8]:
# geolocator = Nominatim(user_agent='WarrieJ',timeout=None)
# def get_suburb(row):
#     pos = str(row['latitudeX']) + ', ' + str(row['longitudeX'])
#     location = geolocator.reverse(pos)
#     return location.raw['address']

geolocator = Nominatim(user_agent='Wale',timeout=None)
def get_suburb(row):
    pos = str(row['latitudeX']) + ', ' + str(row['longitudeX'])
    location = RateLimiter(geolocator.reverse(pos), min_delay_seconds=1)
    return location

In [9]:
len(GeoDataX)/2

1107.5

In [10]:
# GeoData_001to500 = 
GeoData_001to1107=  GeoDataX[:1107]
GeoData_1107tolast=  GeoDataX[1107:]

## Be Careful with this Section
Run just once with a unique machine to get Suburb, then Save output to csv. It runs on rquest error after too many request

### First  1107 

In [15]:
GeoData_001to1107['address'] = GeoData_001to1107[['latitudeX','longitudeX']].progress_apply(get_suburb,axis=1)

# extract dict "address" from  function
GeoData_001to1107['point'] = GeoData_001to1107['address'].apply(lambda loc: loc.func.raw if loc else None)
address = GeoData_001to1107['point'].apply(pd.Series)['address']

# limit the keys of dict to only 8 feat('suburb', 'city', 'county', 'state', 'postcode', 'country','country_code', 'road')
df_address = address.apply(pd.Series).iloc[:, :8]
GeoData_001to1107 =  pd.concat([GeoData_001to1107, df_address],axis=1)  # Merge new address to the datadrame
GeoData_001to1107.shape

#Saving the results
GeoData_001to1107.to_csv('Data/GeoData_001to1107.csv',index=None)
GeoData_001to1107[:2]




  0%|          | 0/1107 [00:00<?, ?it/s][A[A[A


  0%|          | 2/1107 [00:02<23:31,  1.28s/it][A[A[A


  0%|          | 3/1107 [00:07<43:28,  2.36s/it][A[A[A


  0%|          | 4/1107 [00:09<39:56,  2.17s/it][A[A[A


  0%|          | 5/1107 [00:10<36:19,  1.98s/it][A[A[A


  1%|          | 6/1107 [00:13<39:46,  2.17s/it][A[A[A


  1%|          | 7/1107 [00:15<39:16,  2.14s/it][A[A[A


  1%|          | 8/1107 [00:16<34:06,  1.86s/it][A[A[A


  1%|          | 9/1107 [00:17<28:54,  1.58s/it][A[A[A


  1%|          | 10/1107 [00:18<26:19,  1.44s/it][A[A[A


  1%|          | 11/1107 [00:19<23:01,  1.26s/it][A[A[A


  1%|          | 12/1107 [00:20<19:30,  1.07s/it][A[A[A


  1%|          | 13/1107 [00:21<22:24,  1.23s/it][A[A[A


  1%|▏         | 14/1107 [00:22<20:45,  1.14s/it][A[A[A


  1%|▏         | 15/1107 [00:23<17:57,  1.01it/s][A[A[A


  1%|▏         | 16/1107 [00:23<16:01,  1.13it/s][A[A[A


  2%|▏         | 17/1107 [00:24<15:23,

 24%|██▍       | 266/1107 [04:35<08:58,  1.56it/s][A[A[A


 24%|██▍       | 267/1107 [04:35<08:50,  1.58it/s][A[A[A


 24%|██▍       | 268/1107 [04:36<08:52,  1.57it/s][A[A[A


 24%|██▍       | 269/1107 [04:36<08:50,  1.58it/s][A[A[A


 24%|██▍       | 270/1107 [04:37<08:48,  1.58it/s][A[A[A


 24%|██▍       | 271/1107 [04:38<08:45,  1.59it/s][A[A[A


 25%|██▍       | 272/1107 [04:38<08:44,  1.59it/s][A[A[A


 25%|██▍       | 273/1107 [04:39<08:41,  1.60it/s][A[A[A


 25%|██▍       | 274/1107 [04:40<08:47,  1.58it/s][A[A[A


 25%|██▍       | 275/1107 [04:40<08:53,  1.56it/s][A[A[A


 25%|██▍       | 276/1107 [04:41<08:45,  1.58it/s][A[A[A


 25%|██▌       | 277/1107 [04:42<08:39,  1.60it/s][A[A[A


 25%|██▌       | 278/1107 [04:42<08:38,  1.60it/s][A[A[A


 25%|██▌       | 279/1107 [04:43<08:39,  1.59it/s][A[A[A


 25%|██▌       | 280/1107 [04:43<08:38,  1.59it/s][A[A[A


 25%|██▌       | 281/1107 [04:44<08:39,  1.59it/s][A[A[A


 25%|██▌

 48%|████▊     | 529/1107 [10:18<06:02,  1.59it/s][A[A[A


 48%|████▊     | 530/1107 [10:18<06:04,  1.59it/s][A[A[A


 48%|████▊     | 531/1107 [10:19<06:02,  1.59it/s][A[A[A


 48%|████▊     | 532/1107 [10:20<06:00,  1.60it/s][A[A[A


 48%|████▊     | 533/1107 [10:20<05:59,  1.59it/s][A[A[A


 48%|████▊     | 534/1107 [10:21<05:59,  1.59it/s][A[A[A


 48%|████▊     | 535/1107 [10:22<05:56,  1.61it/s][A[A[A


 48%|████▊     | 536/1107 [10:22<05:55,  1.60it/s][A[A[A


 49%|████▊     | 537/1107 [10:23<05:56,  1.60it/s][A[A[A


 49%|████▊     | 538/1107 [10:23<05:56,  1.60it/s][A[A[A


 49%|████▊     | 539/1107 [10:24<05:56,  1.59it/s][A[A[A


 49%|████▉     | 540/1107 [10:25<05:54,  1.60it/s][A[A[A


 49%|████▉     | 541/1107 [10:25<05:54,  1.60it/s][A[A[A


 49%|████▉     | 542/1107 [10:26<05:54,  1.60it/s][A[A[A


 49%|████▉     | 543/1107 [10:27<05:53,  1.60it/s][A[A[A


 49%|████▉     | 544/1107 [10:27<05:51,  1.60it/s][A[A[A


 49%|███

 72%|███████▏  | 793/1107 [14:07<03:21,  1.56it/s][A[A[A


 72%|███████▏  | 794/1107 [14:07<03:19,  1.57it/s][A[A[A


 72%|███████▏  | 795/1107 [14:08<03:16,  1.59it/s][A[A[A


 72%|███████▏  | 796/1107 [14:08<03:15,  1.59it/s][A[A[A


 72%|███████▏  | 797/1107 [14:09<03:16,  1.58it/s][A[A[A


 72%|███████▏  | 798/1107 [14:10<03:17,  1.56it/s][A[A[A


 72%|███████▏  | 799/1107 [14:10<03:16,  1.57it/s][A[A[A


 72%|███████▏  | 800/1107 [14:11<03:15,  1.57it/s][A[A[A


 72%|███████▏  | 801/1107 [14:12<03:13,  1.59it/s][A[A[A


 72%|███████▏  | 802/1107 [14:12<03:10,  1.60it/s][A[A[A


 73%|███████▎  | 803/1107 [14:13<03:11,  1.58it/s][A[A[A


 73%|███████▎  | 804/1107 [14:13<03:11,  1.59it/s][A[A[A


 73%|███████▎  | 805/1107 [14:14<03:11,  1.58it/s][A[A[A


 73%|███████▎  | 806/1107 [14:15<03:09,  1.59it/s][A[A[A


 73%|███████▎  | 807/1107 [14:15<03:09,  1.59it/s][A[A[A


 73%|███████▎  | 808/1107 [14:16<03:07,  1.59it/s][A[A[A


 73%|███

 95%|█████████▌| 1056/1107 [16:52<00:32,  1.58it/s][A[A[A


 95%|█████████▌| 1057/1107 [16:53<00:31,  1.58it/s][A[A[A


 96%|█████████▌| 1058/1107 [16:53<00:30,  1.59it/s][A[A[A


 96%|█████████▌| 1059/1107 [16:54<00:30,  1.59it/s][A[A[A


 96%|█████████▌| 1060/1107 [16:54<00:29,  1.60it/s][A[A[A


 96%|█████████▌| 1061/1107 [16:55<00:28,  1.61it/s][A[A[A


 96%|█████████▌| 1062/1107 [16:56<00:28,  1.61it/s][A[A[A


 96%|█████████▌| 1063/1107 [16:56<00:27,  1.60it/s][A[A[A


 96%|█████████▌| 1064/1107 [16:57<00:26,  1.60it/s][A[A[A


 96%|█████████▌| 1065/1107 [16:58<00:26,  1.59it/s][A[A[A


 96%|█████████▋| 1066/1107 [16:58<00:25,  1.59it/s][A[A[A


 96%|█████████▋| 1067/1107 [16:59<00:25,  1.59it/s][A[A[A


 96%|█████████▋| 1068/1107 [16:59<00:24,  1.59it/s][A[A[A


 97%|█████████▋| 1069/1107 [17:00<00:23,  1.60it/s][A[A[A


 97%|█████████▋| 1070/1107 [17:01<00:23,  1.60it/s][A[A[A


 97%|█████████▋| 1071/1107 [17:01<00:24,  1.49it/s][A

Unnamed: 0,segment_id,EventId,latitude,longitude,latitudeX,longitudeX,address,point,road,suburb,town,county,state,postcode,country,country_code
0,X4UA382,60564,-33.885498,18.638471,-33.885,18.638,<geopy.extra.rate_limiter.RateLimiter object a...,"{'place_id': 93711760, 'licence': 'Data © Open...",Springfield Road,Cape Town Ward 21,Bellville,City of Cape Town,Western Cape,7530,South Africa,za
1,SPPGKO2,60578,-33.855022,18.531405,-33.855,18.531,<geopy.extra.rate_limiter.RateLimiter object a...,"{'place_id': 186346427, 'licence': 'Data © Ope...",N7,Milnerton,,City of Cape Town,Western Cape,7441,South Africa,za


### From 1107 to last (2215) 

In [30]:
GeoData_1107tolast['address'] = GeoData_1107tolast[['latitudeX','longitudeX']].progress_apply(get_suburb,axis=1)

# extract dict "address" from  function
GeoData_1107tolast['point'] = GeoData_1107tolast['address'].apply(lambda loc: loc.func.raw if loc else None)
address = GeoData_1107tolast['point'].apply(pd.Series)['address']

# limit the keys of dict to only 8 feat('suburb', 'city', 'county', 'state', 'postcode', 'country','country_code', 'road')
df_address = address.apply(pd.Series).iloc[:, :8]
GeoData_1107tolast =  pd.concat([GeoData_1107tolast, df_address],axis=1)  # Merge new address to the datadrame
GeoData_1107tolast.shape

#Saving the results
GeoData_1107tolast.to_csv('Data/GeoData_1107tolast.csv',index=None)
GeoData_1107tolast[:2]

  1%|          | 6/1108 [00:03<10:18,  1.78it/s]

KeyboardInterrupt: 

### Read the generated Geoordinate

In [11]:
GeoData_001to1107 = pd.read_csv('Data/GeoData_001to1107.csv')
GeoData_1107tolast = pd.read_csv('Data/GeoData_1107tolast.csv')
## Merge the 2 cordinate together
GeoDataX_with_address =  pd.concat([GeoData_001to1107,GeoData_1107tolast], ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [12]:
GeoDataX_with_address.shape

(2215, 17)

In [13]:
GeoDataX_with_address.columns

Index(['EventId', 'address', 'city', 'country', 'country_code', 'county',
       'latitude', 'latitudeX', 'longitude', 'longitudeX', 'point', 'postcode',
       'road', 'segment_id', 'state', 'suburb', 'town'],
      dtype='object')

### Merge GeoDataX_with_address to Geodata File

In [14]:
GeoData = pd.merge(GeoData,GeoDataX_with_address[['suburb','road','town','latitudeX','longitudeX']],how='left',on=['latitudeX','longitudeX'])

In [15]:
suburb_cordinate = GeoData[['suburb','segment_id']]
sub = suburb_cordinate.sort_values(['segment_id','suburb']).drop_duplicates(['segment_id'])

In [16]:
sub.to_csv('Data/segment_id_suburb_v2.csv',index=False)

## Road Featuress

In [17]:
import shapefile
road_segments = shapefile.Reader("Data/road_segments/road_segments.shp")
print(road_segments)

shapefile Reader
    644 shapes (type 'POLYLINE')
    644 records (11 fields)


### Extract field from shape File
And convert to Pandas

In [18]:
fields = [x[0] for x in road_segments.fields][1:]
records = [y[:] for y in road_segments.records()]
shps = [s.points for s in road_segments.shapes()]

In [19]:
road_segments_data = pd.DataFrame(columns = fields, data = records)
road_segments_data.head(3)
road_segments_data.columns = map(str.lower, road_segments_data.columns)

In [20]:
road_segments_data.head()

Unnamed: 0,roadno,class,region,width,lanes,surftype,pavetype,condition,length_1,segment_id
0,R300,Primary,Western Cape,20.2,2,Paved,FLEX,Good,471.207,D1U6OOF
1,R300,Primary,Western Cape,20.2,2,Paved,FLEX,Good,471.207,NG4X2MD
2,R300,Primary,Western Cape,20.2,2,Paved,FLEX,Good,471.207,792705Z
3,R300,Primary,Western Cape,20.2,2,Paved,FLEX,Good,471.207,IK67XHB
4,R300,Primary,Western Cape,20.2,2,Paved,FLEX,Good,471.207,OWCF2MH


#### Get only segment_ids present in the train dataset

In [21]:
segment_ids = data.segment_id.unique()
road_segments_data = road_segments_data[road_segments_data['segment_id'].isin(segment_ids)]

In [22]:
road_segments_data.shape

(544, 10)

### Create Csv for different feat of segment_ids

In [23]:
road_segments_data[['segment_id','roadno']].to_csv('Data/seg_roadtype_v2.csv',index=False)
road_segments_data[['segment_id','surftype']].to_csv('Data/seg_pavement_v2.csv',index=False)
road_segments_data[['segment_id','length_1']].to_csv('Data/seg_Length_v2.csv',index=False)

In [25]:
# road_segments_data[['segment_id','roadno']].to_csv('Data/seg_roadtype.csv',index=False)
# road_segments_data[['segment_id','surftype']].to_csv('Data/seg_pavement.csv',index=False)
# road_segments_data[['segment_id','length_1']].to_csv('Data/seg_Length.csv',index=False)