In [88]:
import glob
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import multiprocessing as mp
import pandas as pd
import seaborn as sns
import sklearn as sk

from functools import reduce
from math import radians, cos, sin, asin, sqrt, pi
from pathlib import Path
from functools import reduce
from pathos.multiprocessing import ProcessPool as Pool
from sklearn.cluster import KMeans
from toolz.sandbox.parallel import fold

from sklearn.base import TransformerMixin, BaseEstimator

In [49]:
home_path = Path.home()
dir_name = 'ml_eq'

data_sample_path = home_path / dir_name / 'data' / 'DataSample.csv'
poi_path = home_path / dir_name / 'data' / 'POIList.csv'

In [50]:
data_df = pd.read_csv(
    data_sample_path,
    names=['ID', 'TimeSt', 'Country', 'Province', 'City', 'Latitude', 'Longitude'],
    parse_dates=['TimeSt'],
    header=0,
    na_values=["\\N"]
)

In [51]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22025 entries, 0 to 22024
Data columns (total 7 columns):
ID           22025 non-null int64
TimeSt       22025 non-null datetime64[ns]
Country      22025 non-null object
Province     22025 non-null object
City         22025 non-null object
Latitude     22025 non-null float64
Longitude    22025 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 1.2+ MB


In [52]:
data_df.head()

Unnamed: 0,ID,TimeSt,Country,Province,City,Latitude,Longitude
0,4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123
1,4516547,2017-06-21 18:00:00.193,CA,ON,London,42.9399,-81.2709
2,4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.5776,-80.2201
3,4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.3716,-80.9773
4,4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.3716,-80.9773


In [53]:
data_df.nunique()

ID           22025
TimeSt       19972
Country          1
Province        12
City           656
Latitude      5282
Longitude     5296
dtype: int64

In [54]:
data_df_filtered = data_df.drop_duplicates(subset=['TimeSt', 'Latitude', 'Longitude'], keep=False)

In [55]:
data_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17973 entries, 0 to 22024
Data columns (total 7 columns):
ID           17973 non-null int64
TimeSt       17973 non-null datetime64[ns]
Country      17973 non-null object
Province     17973 non-null object
City         17973 non-null object
Latitude     17973 non-null float64
Longitude    17973 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 1.1+ MB


In [56]:
group_cols = ['TimeSt', 'Latitude', 'Longitude']
data_df_filtered.groupby(group_cols).agg('count')['City'].sort_values(ascending=False).head()

TimeSt                   Latitude  Longitude
2017-06-21 22:59:56.167  49.7828   -94.4411     1
2017-06-21 07:41:51.570  45.4873   -73.6043     1
2017-06-21 07:41:17.497  51.0104   -114.0660    1
2017-06-21 07:41:17.583  49.6994   -112.8220    1
2017-06-21 07:41:27.723  51.0630   -113.8890    1
Name: City, dtype: int64

#### Labeling

In [57]:
poi_df = pd.read_csv(
    poi_path,
    names=['POIID', 'Latitude', 'Longitude'],
    header=0,
    na_values=["\\N"]
)

In [58]:
poi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
POIID        4 non-null object
Latitude     4 non-null float64
Longitude    4 non-null float64
dtypes: float64(2), object(1)
memory usage: 224.0+ bytes


In [59]:
poi_df.head()

Unnamed: 0,POIID,Latitude,Longitude
0,POI1,53.546167,-113.485734
1,POI2,53.546167,-113.485734
2,POI3,45.521629,-73.566024
3,POI4,45.22483,-63.232729


#### There are duplicate entries within the POIID list. POI1 and POI2 are the same.

In [60]:
poi_df_filtered = poi_df \
    .drop_duplicates(subset=['Latitude', 'Longitude'], keep='first') \
    .reset_index(drop=True)

In [61]:
poi_df_filtered.head()

Unnamed: 0,POIID,Latitude,Longitude
0,POI1,53.546167,-113.485734
1,POI3,45.521629,-73.566024
2,POI4,45.22483,-63.232729


#### Calculating the nearest POI for each request via the Haversine distance formula

In [62]:
def haversine_dist(lat1, lon1, lat2, lon2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    R = 6371 # Radius of earth in kilometers
    return c * R

In [63]:
def find_poi_id_and_distance(lat, long):
    distances = poi_df_filtered.apply(
        lambda row: haversine_dist(lat, long, row['Latitude'], row['Longitude']), 
        axis=1
    )
    
    poiid_col_index = poi_df_filtered.columns.get_loc('POIID')
    
    return pd.Series([poi_df_filtered.iloc[distances.idxmin(), poiid_col_index], distances.iloc[distances.idxmin()]], index=['POIID', 'Distance'])

In [64]:
def find_poi_id_and_distance_2(lat, long):
    distances = poi_df_filtered.apply(
        lambda row: haversine_dist(lat, long, row['Latitude'], row['Longitude']), 
        axis=1
    )
    
    poiid_col_index = poi_df_filtered.columns.get_loc('POIID')
    
    return pd.Series([poi_df_filtered.iloc[distances.idxmin(), poiid_col_index], distances.iloc[distances.idxmin()]], index=['POIID', 'Distance'])

In [65]:
def ex_work(row):
    latitude = row.iat[0, lat_loc]
    longitude = row.iat[0, long_loc]
    return find_poi_id_and_distance_2(latitude, longitude)

In [66]:
def concat_df(df1, df2):
    return pd.concat((df1, df2), axis="rows")

In [97]:
class MapReducer(BaseEstimator, TransformerMixin):
    
    def __init__(self, n_jobs=1):
        self.n_jobs = n_jobs
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        
        MAX_WORKERS = mp.cpu_count()
        
        if self.n_jobs <= -1:
            self.num_partitions = MAX_WORKERS
        elif self.n_jobs == 0:
            self.num_partitions = 1
        else:
            self.num_partitions = min(self.n_jobs, MAX_WORKERS)
        
        # splitting data into batches
        data_split_list = np.array_split(X_copy, self.num_partitions)
        
        with Pool(processes=self.num_partitions) as P:
            result_df = fold(concat_df, P.imap(self._transform_part, data_split_list), map=P.imap, combine=concat_df)
            
        return result_df
    
    def _transform_part(self, df_part):
        df_list = []
        
        for ID, lat, long in zip(df_part['ID'], df_part['Latitude'], df_part['Longitude']):
            df_list.append(self._find_poi_id_and_distance(ID, lat, long))
            
        return pd.DataFrame(df_list, columns =['ID', 'POIID', 'Distance'])
    
    def _find_poi_id_and_distance(self, ID, lat, long):    
        distances = poi_df_filtered.apply(
            lambda row: haversine_dist(lat, long, row['Latitude'], row['Longitude']),
            axis=1
        )
        
        poiid_col_index = poi_df_filtered.columns.get_loc('POIID')
    
        return (ID, poi_df_filtered.iloc[distances.idxmin(), poiid_col_index], distances.iloc[distances.idxmin()])

In [103]:
map_reducer = MapReducer(n_jobs=3)

In [101]:
%%time

result = map_reducer.transform(data_df_filtered)

CPU times: user 189 ms, sys: 17.8 ms, total: 207 ms
Wall time: 5.66 s


In [102]:
class MapReducer(BaseEstimator, TransformerMixin):
    
    def __init__(self, n_jobs=1):
        self.n_jobs = n_jobs
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        
        MAX_WORKERS = mp.cpu_count()
        num_partitions = 1
        
        if self.n_jobs <= -1:
            num_partitions = MAX_WORKERS
        elif self.n_jobs == 0:
            num_partitions = 1
        else:
            num_partitions = min(self.n_jobs, MAX_WORKERS)
        
        if num_partitions == 1:
            # transform sequentially
            return X_copy.apply(self._transform_one)
        
        # splitting data into batches
        data_split_list = np.array_split(X_copy, num_partitions)
        
        with Pool(processes=num_partitions) as P:
            result_df = fold(concat_df, P.imap(self._transform_part, data_split_list), map=P.imap, combine=concat_df)
            
        return result_df
    
    def _transform_part(self, df_part):
        return df_part.apply(
            lambda row: self._find_poi_id_and_distance(row['Latitude'], row['Longitude']),
            result_type='expand',
            axis=1
        )
    
    
    def _find_poi_id_and_distance(self, lat, long):
        distances = poi_df_filtered.apply(
            lambda row: haversine_dist(lat, long, row['Latitude'], row['Longitude']),
            axis=1
        )
        
        poiid_col_index = poi_df_filtered.columns.get_loc('POIID')
    
        return pd.Series([poi_df_filtered.iloc[distances.idxmin(), poiid_col_index], distances.iloc[distances.idxmin()]], index=['POIID', 'Distance'])

In [104]:
%%time

result = map_reducer.transform(data_df_filtered)

CPU times: user 146 ms, sys: 32.5 ms, total: 178 ms
Wall time: 7.8 s


In [114]:
result

Unnamed: 0,ID,POIID,Distance
0,4516516,POI3,593.413441
1,4516547,POI3,677.309655
2,4516550,POI3,569.647737
3,4516600,POI3,634.733996
4,4516613,POI3,634.733996
...,...,...,...
4488,5614689,POI3,674.211228
4489,5614801,POI3,522.019302
4490,5614909,POI3,461.995129
4491,5614912,POI3,536.874099


In [105]:
%%time

result_df = data_df_filtered.apply(
    lambda row: find_poi_id_and_distance(row['Latitude'], row['Longitude']),
    result_type='expand',
    axis=1
)

CPU times: user 22.8 s, sys: 913 ms, total: 23.7 s
Wall time: 23 s


In [257]:
result_df

Unnamed: 0_level_0,POIID,Distance
_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
4516516,POI3,593.413441
4516547,POI3,677.309655
4516550,POI3,569.647737
4516600,POI3,634.733996
4516613,POI3,634.733996
...,...,...
5614689,POI3,674.211228
5614801,POI3,522.019302
5614909,POI3,461.995129
5614912,POI3,536.874099


In [254]:
result_df['POIID'].value_counts()

POI3    8802
POI1    8749
POI4     422
Name: POIID, dtype: int64

In [261]:
group_cols = ['POIID']
poi_distance_stat_df = result_df.groupby(group_cols).agg(['mean', 'std'])

In [262]:
level0 = poi_distance_stat_df.columns.get_level_values(0)
level1 = poi_distance_stat_df.columns.get_level_values(1)
poi_distance_stat_df.columns = level0 + '_' + level1

In [263]:
poi_distance_stat_df

Unnamed: 0_level_0,Distance_mean,Distance_std
POIID,Unnamed: 1_level_1,Unnamed: 2_level_1
POI1,300.714748,388.273385
POI3,451.651149,223.631742
POI4,514.997172,1506.889971


In [270]:
group_cols = ['POIID']
poi_distance_radius_df = result_df.groupby(group_cols).agg(['max', 'count'])

In [274]:
poi_distance_radius_df.columns = ['radius', 'req_count']

In [283]:
poi_distance_radius_df['density'] = poi_distance_radius_df['req_count']/(pi*pow(poi_distance_radius_df['radius'], 2))

In [284]:
poi_distance_radius_df

Unnamed: 0_level_0,radius,req_count,density
POIID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
POI1,11531.820832,8749,2.1e-05
POI3,1474.580962,8802,0.001289
POI4,9349.57277,422,2e-06
