In [101]:
import pandas as pd

In [102]:
data_df = pd.read_csv("DataSample.csv")
data_df.rename(columns=lambda x: x.strip(), inplace=True)

## Part 1

data_df.drop_duplicates(["TimeSt", "Latitude", "Longitude"], keep="last", inplace=True)

data_df
# data.count
# data.count_distinct

Unnamed: 0,_ID,TimeSt,Country,Province,City,Latitude,Longitude
0,4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123
1,4516547,2017-06-21 18:00:00.193,CA,ON,London,42.93990,-81.27090
2,4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.57760,-80.22010
3,4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.37160,-80.97730
4,4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.37160,-80.97730
5,4516693,2017-06-21 14:00:00.597,CA,ON,Kitchener,43.43810,-80.50990
6,4516771,2017-06-21 10:00:00.873,CA,ON,Sarnia,42.96100,-82.37300
8,4516915,2017-06-21 15:00:01.310,CA,ON,London,43.00910,-81.17650
9,4516953,2017-06-21 16:00:01.700,CA,ON,Kitchener,43.42780,-80.51350
10,4516966,2017-06-21 01:00:01.787,CA,ON,Kitchener,43.43810,-80.50990


In [103]:
## Part 2

poi_df = pd.read_csv("POIList.csv")
poi_df.rename(columns=lambda x: x.strip(), inplace=True)
poi_map = {}

for _, row in poi_df.iterrows():
    poi_map[(row["Latitude"], row["Longitude"])] = row["POIID"]
    
def calc_dist(p1, p2):
    return ((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)**0.5
    
def closest_poi(row, mapping):
    distances = {k: calc_dist([row["Latitude"], row["Longitude"]], k) for k in mapping.keys()}
    place = min(distances, key=distances.get)
    return mapping[place]

data_df['ClosestPOI'] = data_df.apply(lambda row: closest_poi(row, poi_map), axis=1)

data_df.groupby(['ClosestPOI']).agg(['count'])

Unnamed: 0_level_0,_ID,TimeSt,Country,Province,City,Latitude,Longitude
Unnamed: 0_level_1,count,count,count,count,count,count,count
ClosestPOI,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
POI2,9698,9698,9698,9698,9698,9698,9698
POI3,9817,9817,9817,9817,9817,9817,9817
POI4,484,484,484,484,484,484,484


In [111]:
## Part 3

def poi_dist(row, mapping):
    return min([calc_dist([row["Latitude"], row["Longitude"]], k) for k in mapping.keys()])

data_df['ClosestDist'] = data_df.apply(lambda row: poi_dist(row, poi_map), axis=1)

In [123]:
# 1)

summary = data_df[['ClosestPOI', 'ClosestDist']].groupby(['ClosestPOI']).agg(['mean', 'std'])

# Since POI1 and POI2 are the same geographical location
poi_1 = summary.loc["POI2"]
poi_1.rename("POI1", inplace=True)

summary = summary.append(poi_1)
summary

Unnamed: 0_level_0,ClosestDist,ClosestDist
Unnamed: 0_level_1,mean,std
ClosestPOI,Unnamed: 1_level_2,Unnamed: 2_level_2
POI2,3.348183,3.85849
POI3,5.537951,2.85869
POI4,8.810411,28.67513
POI1,3.348183,3.85849


In [163]:
# 2)

poi_range = data_df[['ClosestPOI', 'ClosestDist']].groupby(['ClosestPOI']).agg(['max', 'count'])
poi_range.columns = poi_range.columns.droplevel()
poi_range['density'] = poi_range.apply(lambda row: row['count']/row['max'], axis=1)
poi_range.drop(['count'], axis=1, inplace=True)
poi_range.rename(columns={'max': 'radius'}, inplace=True)

poi_1_range = poi_range.loc["POI2"]
poi_1_range.rename("POI1", inplace=True)

poi_range = poi_range.append(poi_1_range)
poi_range

Unnamed: 0_level_0,radius,density
ClosestPOI,Unnamed: 1_level_1,Unnamed: 2_level_1
POI2,24.851937,390.231148
POI3,20.155378,487.066038
POI4,192.704991,2.511611
POI1,24.851937,390.231148


Unnamed: 0_level_0,_ID,TimeSt,Country,Province,City,Latitude,Longitude,ClosestDist
Unnamed: 0_level_1,count,count,count,count,count,count,count,count
ClosestPOI,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
POI2,9698,9698,9698,9698,9698,9698,9698,9698
POI3,9817,9817,9817,9817,9817,9817,9817,9817
POI4,484,484,484,484,484,484,484,484
