In [278]:
import glob
import matplotlib
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk

from math import radians, cos, sin, asin, sqrt, pi
from pathlib import Path
from functools import reduce
from multiprocessing.pool import ThreadPool as Pool
# from pathos.multiprocessing import ProcessPool
from sklearn.cluster import KMeans
from toolz.sandbox.parallel import fold

from os.path import expanduser
from sklearn.preprocessing import StandardScaler

In [8]:
# Locations of the two input CSV files, resolved under the current user's
# home directory: <home>/ml_eq/data/.
dir_name = 'ml_eq'
home_path = Path.home()

data_sample_path = home_path.joinpath(dir_name, 'data', 'DataSample.csv')
poi_path = home_path.joinpath(dir_name, 'data', 'POIList.csv')

In [51]:
# Load the request sample. The file's own header row is replaced (header=0)
# by the explicit column names below, the first column (_ID) becomes the
# index, and the literal string \N is read as a missing value.
data_df = pd.read_csv(
    data_sample_path,
    header=0,
    names=['_ID', 'TimeSt', 'Country', 'Province', 'City', 'Latitude', 'Longitude'],
    index_col=0,
    parse_dates=['TimeSt'],
    na_values=["\\N"],
)

In [52]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22025 entries, 4516516 to 5615006
Data columns (total 6 columns):
TimeSt       22025 non-null datetime64[ns]
Country      22025 non-null object
Province     22025 non-null object
City         22025 non-null object
Latitude     22025 non-null float64
Longitude    22025 non-null float64
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 1.2+ MB


In [53]:
data_df.head()

Unnamed: 0_level_0,TimeSt,Country,Province,City,Latitude,Longitude
_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123
4516547,2017-06-21 18:00:00.193,CA,ON,London,42.9399,-81.2709
4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.5776,-80.2201
4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.3716,-80.9773
4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.3716,-80.9773


In [44]:
data_df.nunique()

TimeSt       19972
Country          1
Province        12
City           656
Latitude      5282
Longitude     5296
dtype: int64

#### A city can have multiple longitude and/or latitude values

In [69]:
# Count how many distinct latitude and longitude values each city has.
# The list-valued aggregation spec yields a two-level (column, aggregation)
# header on the result.
group_cols = ['City']
agg_dict = {coord: ['nunique'] for coord in ('Latitude', 'Longitude')}
city_coordinate_count_df = data_df.groupby(by=group_cols).aggregate(agg_dict)

In [70]:
# Flatten the two-level (column, aggregation) header into single names such
# as 'Latitude_nunique'.
# The guard makes this cell safe to re-run: once the header has been
# flattened it is no longer a MultiIndex and get_level_values(1) would raise.
if isinstance(city_coordinate_count_df.columns, pd.MultiIndex):
    level0 = city_coordinate_count_df.columns.get_level_values(0)
    level1 = city_coordinate_count_df.columns.get_level_values(1)
    city_coordinate_count_df.columns = level0 + '_' + level1

In [85]:
city_coordinate_count_df.sort_values(by='Latitude_nunique', ascending=False).head()

Unnamed: 0_level_0,Latitude_nunique,Longitude_nunique
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Calgary,738,702
Edmonton,609,609
Toronto,260,261
Mississauga,158,161
Saskatoon,126,122


#### One particular combination of geographical coordinates belongs to two different cities.

Pointe-Claire is a suburb of greater Montreal. City labels might need to be checked.

In [110]:
# Number of distinct city labels attached to each (Latitude, Longitude) pair.
# 'City' is selected before aggregating so that nunique is not computed for
# every other column only to be thrown away.
group_cols = ['Latitude', 'Longitude']
data_df.groupby(group_cols)['City'].nunique().sort_values(ascending=False).head()

Latitude  Longitude 
60.00000  -95.00000     2
62.82730  -136.53957    1
45.44144  -73.77847     1
45.43550  -75.66560     1
45.43553  -75.78757     1
Name: City, dtype: int64

In [111]:
# Show every request recorded at exactly (60.0, -95.0).
criteria = data_df['Latitude'].eq(60.00000) & data_df['Longitude'].eq(-95.00000)
data_df[criteria]

Unnamed: 0_level_0,TimeSt,Country,Province,City,Latitude,Longitude
_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4867724,2017-06-21 05:24:38.087,CA,QC,Pointe-Claire,60.0,-95.0
5108404,2017-06-21 15:41:35.473,CA,QC,Montreal,60.0,-95.0


#### Number of entries sharing the same timestamp and geographical identifiers

In [96]:
# How many requests share the same timestamp and coordinates.
# 'City' is selected before counting so the count is not computed for every
# column and then discarded.
group_cols = ['TimeSt', 'Latitude', 'Longitude']
data_df.groupby(group_cols)['City'].count().sort_values(ascending=False).head()

TimeSt                   Latitude  Longitude
2017-06-21 16:59:47.400  43.3756   -79.8145     2
2017-06-21 04:44:00.283  48.4084   -89.2550     2
2017-06-21 17:41:23.493  43.3320   -79.8070     2
2017-06-21 17:41:16.563  44.2377   -76.5602     2
2017-06-21 04:44:19.453  53.0957   -113.4680    2
Name: City, dtype: int64

In [106]:
# How many requests share the same timestamp and city label.
# 'Latitude' is selected before counting so the count is not computed for
# every column and then discarded.
group_cols = ['TimeSt', 'City']
data_df.groupby(group_cols)['Latitude'].count().sort_values(ascending=False).head()

TimeSt                   City      
2017-06-21 16:59:47.400  Burlington    2
2017-06-21 17:41:23.493  Burlington    2
2017-06-21 04:44:19.453  Millet        2
2017-06-21 17:41:12.660  Edmonton      2
2017-06-21 17:41:11.987  Bewdley       2
Name: Latitude, dtype: int64

#### An example of such an entry

In [104]:
# Pull out the rows for one timestamp/coordinate combination that occurs
# more than once.
criteria = (
    (data_df['TimeSt'] == '2017-06-21 16:59:47.400')
    & (data_df['Latitude'] == 43.3756)
    & (data_df['Longitude'] == -79.8145)
)
data_df[criteria]

Unnamed: 0_level_0,TimeSt,Country,Province,City,Latitude,Longitude
_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5365152,2017-06-21 16:59:47.400,CA,ON,Burlington,43.3756,-79.8145
5365376,2017-06-21 16:59:47.400,CA,ON,Burlington,43.3756,-79.8145


#### Dropping duplicates in two ways to check consistency of both methods

Both ways give the same result (17,973 rows each). No duplicated entries are left.

In [244]:
# Remove every row whose column values all match another row.
# keep=False flags all members of a duplicate group, so no copy survives.
data_df_filtered = data_df[~data_df.duplicated(keep=False)]

In [245]:
data_df_filtered.head()

Unnamed: 0_level_0,TimeSt,Country,Province,City,Latitude,Longitude
_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123
4516547,2017-06-21 18:00:00.193,CA,ON,London,42.9399,-81.2709
4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.5776,-80.2201
4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.3716,-80.9773
4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.3716,-80.9773


In [114]:
data_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17973 entries, 4516516 to 5615006
Data columns (total 6 columns):
TimeSt       17973 non-null datetime64[ns]
Country      17973 non-null object
Province     17973 non-null object
City         17973 non-null object
Latitude     17973 non-null float64
Longitude    17973 non-null float64
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 982.9+ KB


In [115]:
# Same removal, but rows count as duplicates when only the timestamp and the
# coordinates match. keep=False again discards every member of a group.
data_df_filtered_2 = data_df[
    ~data_df.duplicated(subset=['TimeSt', 'Latitude', 'Longitude'], keep=False)
]

In [116]:
data_df_filtered_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17973 entries, 4516516 to 5615006
Data columns (total 6 columns):
TimeSt       17973 non-null datetime64[ns]
Country      17973 non-null object
Province     17973 non-null object
City         17973 non-null object
Latitude     17973 non-null float64
Longitude    17973 non-null float64
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 982.9+ KB


In [117]:
# Re-check the filtered frame: the largest group size per timestamp and
# coordinate combination should now be 1.
# 'City' is selected before counting so the count is not computed for every
# column and then discarded.
group_cols = ['TimeSt', 'Latitude', 'Longitude']
data_df_filtered.groupby(group_cols)['City'].count().sort_values(ascending=False).head()

TimeSt                   Latitude  Longitude
2017-06-21 22:59:56.167  49.7828   -94.4411     1
2017-06-21 07:41:51.570  45.4873   -73.6043     1
2017-06-21 07:41:17.497  51.0104   -114.0660    1
2017-06-21 07:41:17.583  49.6994   -112.8220    1
2017-06-21 07:41:27.723  51.0630   -113.8890    1
Name: City, dtype: int64

#### Labeling

In [152]:
# Load the POI list. The file's own header row is replaced (header=0) by the
# explicit column names below, and the literal string \N is read as missing.
poi_df = pd.read_csv(
    poi_path,
    header=0,
    names=['POIID', 'Latitude', 'Longitude'],
    na_values=["\\N"],
)

In [153]:
poi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
POIID        4 non-null object
Latitude     4 non-null float64
Longitude    4 non-null float64
dtypes: float64(2), object(1)
memory usage: 224.0+ bytes


In [204]:
poi_df.head()

Unnamed: 0,POIID,Latitude,Longitude
0,POI1,53.546167,-113.485734
1,POI2,53.546167,-113.485734
2,POI3,45.521629,-73.566024
3,POI4,45.22483,-63.232729


#### There are duplicate entries within the POI list: POI1 and POI2 have identical coordinates.

In [214]:
# Keep one row per distinct coordinate pair (the first occurrence) and
# renumber the remaining rows from 0.
poi_df_filtered = (
    poi_df
    .drop_duplicates(subset=['Latitude', 'Longitude'], keep='first')
    .reset_index(drop=True)
)

In [215]:
poi_df_filtered.head()

Unnamed: 0,POIID,Latitude,Longitude
0,POI1,53.546167,-113.485734
1,POI3,45.521629,-73.566024
2,POI4,45.22483,-63.232729


#### Calculating the nearest POI for each request via the Haversine distance formula

In [216]:
def haversine_dist(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Haversine distance on a sphere of radius 6371 km. NaN inputs
        propagate to a NaN result.
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make asin raise ValueError.
    # A plain comparison (rather than min/max) leaves NaN untouched.
    if a > 1.0:
        a = 1.0
    c = 2 * asin(sqrt(a))
    R = 6371  # Radius of earth in kilometers
    return c * R

In [251]:
def find_poi_id_and_distance(lat, long):
    """Find the POI in ``poi_df_filtered`` closest to a point.

    Parameters
    ----------
    lat, long : float
        Latitude and longitude of the query point, in decimal degrees.

    Returns
    -------
    pandas.Series
        Two entries indexed ``'POIID'`` and ``'Distance'``: the identifier of
        the nearest POI and the value ``haversine_dist`` returns for it.
    """
    # Built without an explicit index, so labels equal row positions and the
    # label returned by idxmin() can be used with .iloc below regardless of
    # how poi_df_filtered itself is indexed.
    distances = pd.Series([
        haversine_dist(lat, long, poi_lat, poi_lon)
        for poi_lat, poi_lon in zip(poi_df_filtered['Latitude'], poi_df_filtered['Longitude'])
    ])

    nearest_pos = distances.idxmin()

    return pd.Series(
        [poi_df_filtered['POIID'].iloc[nearest_pos], distances.iloc[nearest_pos]],
        index=['POIID', 'Distance']
    )

In [252]:
%%time

# Label every de-duplicated request with its nearest POI; the Series returned
# for each row expands into the 'POIID' and 'Distance' columns.
def _label_request(row):
    return find_poi_id_and_distance(row['Latitude'], row['Longitude'])

result_df = data_df_filtered.apply(_label_request, axis=1, result_type='expand')

CPU times: user 22.4 s, sys: 53.1 ms, total: 22.5 s
Wall time: 22.5 s


In [257]:
result_df

Unnamed: 0_level_0,POIID,Distance
_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
4516516,POI3,593.413441
4516547,POI3,677.309655
4516550,POI3,569.647737
4516600,POI3,634.733996
4516613,POI3,634.733996
...,...,...
5614689,POI3,674.211228
5614801,POI3,522.019302
5614909,POI3,461.995129
5614912,POI3,536.874099


In [254]:
result_df['POIID'].value_counts()

POI3    8802
POI1    8749
POI4     422
Name: POIID, dtype: int64

In [261]:
# Mean and standard deviation of the request-to-POI distance for each POI.
group_cols = ['POIID']
poi_distance_stat_df = result_df.groupby(by=group_cols).aggregate(['mean', 'std'])

In [262]:
# Flatten the two-level (column, aggregation) header into single names such
# as 'Distance_mean'.
# The guard makes this cell safe to re-run: once the header has been
# flattened it is no longer a MultiIndex and get_level_values(1) would raise.
if isinstance(poi_distance_stat_df.columns, pd.MultiIndex):
    level0 = poi_distance_stat_df.columns.get_level_values(0)
    level1 = poi_distance_stat_df.columns.get_level_values(1)
    poi_distance_stat_df.columns = level0 + '_' + level1

In [263]:
poi_distance_stat_df

Unnamed: 0_level_0,Distance_mean,Distance_std
POIID,Unnamed: 1_level_1,Unnamed: 2_level_1
POI1,300.714748,388.273385
POI3,451.651149,223.631742
POI4,514.997172,1506.889971


In [270]:
# For each POI: the largest request-to-POI distance and the number of
# requests assigned to it, in that column order.
group_cols = ['POIID']
poi_distance_radius_df = result_df.groupby(by=group_cols).aggregate(['max', 'count'])

In [274]:
# Replace the header with flat names. The assignment is positional, so it
# relies on the aggregation order used to build the frame: 'max' first
# (radius), then 'count' (req_count).
poi_distance_radius_df.columns = ['radius', 'req_count']

In [283]:
# Requests per unit area of the circle around each POI: req_count / (pi * radius^2).
poi_distance_radius_df['density'] = (
    poi_distance_radius_df['req_count'] / (pi * poi_distance_radius_df['radius'] ** 2)
)

In [284]:
poi_distance_radius_df

Unnamed: 0_level_0,radius,req_count,density
POIID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
POI1,11531.820832,8749,2.1e-05
POI3,1474.580962,8802,0.001289
POI4,9349.57277,422,2e-06
