In [1]:
import pandas as pd

In [2]:
# Read the request sample and strip surrounding whitespace from every column name.
data_df = pd.read_csv("DataSample.csv").rename(columns=str.strip)
print(f'Original length: {len(data_df)}')

## Part 1

# Rows that agree on timestamp and coordinates are treated as duplicates of
# each other; only the last row of each such group is retained.
dedupe_keys = ["TimeSt", "Latitude", "Longitude"]
data_df = data_df.drop_duplicates(subset=dedupe_keys, keep="last")

print(f'Deduped length: {len(data_df)}')

Original length: 22025
Deduped length: 19999


In [3]:
## Part 2

poi_df = pd.read_csv("POIList.csv").rename(columns=str.strip)

# Map each (latitude, longitude) pair to the list of POI ids located there,
# so POIs that share identical coordinates end up under a single key.
poi_map = {}
for lat, lon, poi_id in zip(poi_df["Latitude"], poi_df["Longitude"], poi_df["POIID"]):
    poi_map.setdefault((lat, lon), []).append(poi_id)
    
def calc_dist(p1, p2):
    """Return the straight-line (Euclidean) distance between two 2-D points.

    Both arguments are indexable with ``[0]`` and ``[1]``. The coordinates are
    used as plain numbers; no geodesic correction is applied, so the result is
    in the same unit as the inputs.
    """
    delta_first = p1[0] - p2[0]
    delta_second = p1[1] - p2[1]
    return (delta_first**2 + delta_second**2)**0.5
    
def closest_poi(row, mapping):
    """Return the value of ``mapping`` whose coordinate key is nearest to ``row``.

    ``row`` must expose ``"Latitude"`` and ``"Longitude"``; the keys of
    ``mapping`` are coordinate pairs compared with ``calc_dist``. When several
    keys are equally near, the one that comes first in ``mapping`` wins.
    """
    origin = [row["Latitude"], row["Longitude"]]
    nearest = min(mapping, key=lambda coords: calc_dist(origin, coords))
    return mapping[nearest]

# closest_poi returns a list of ids per row; the list is stored as its string
# representation, which is the value the groupby below uses as its key.
closest_ids = data_df.apply(closest_poi, axis=1, mapping=poi_map)
data_df['ClosestPOI'] = closest_ids.astype(str)

data_df.groupby('ClosestPOI').agg(['count'])

Unnamed: 0_level_0,_ID,TimeSt,Country,Province,City,Latitude,Longitude
Unnamed: 0_level_1,count,count,count,count,count,count,count
ClosestPOI,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
"['POI1', 'POI2']",9698,9698,9698,9698,9698,9698,9698
['POI3'],9817,9817,9817,9817,9817,9817,9817
['POI4'],484,484,484,484,484,484,484


In [4]:
## Part 3

def poi_dist(row, mapping):
    """Return the smallest ``calc_dist`` from ``row`` to any key of ``mapping``.

    ``row`` must expose ``"Latitude"`` and ``"Longitude"``; the keys of
    ``mapping`` are coordinate pairs.
    """
    origin = [row["Latitude"], row["Longitude"]]
    return min(calc_dist(origin, coords) for coords in mapping)

# Distance from every request to its nearest POI coordinate.
data_df['ClosestDist'] = data_df.apply(poi_dist, axis=1, mapping=poi_map)

In [5]:
# 1)

# Mean and standard deviation of the nearest-POI distance, per POI group.
dist_by_poi = data_df[['ClosestPOI', 'ClosestDist']].groupby('ClosestPOI')
summary = dist_by_poi.agg(['mean', 'std'])
summary

Unnamed: 0_level_0,ClosestDist,ClosestDist
Unnamed: 0_level_1,mean,std
ClosestPOI,Unnamed: 1_level_2,Unnamed: 2_level_2
"['POI1', 'POI2']",3.348183,3.85849
['POI3'],5.537951,2.85869
['POI4'],8.810411,28.67513


In [6]:
# 2)

def poi_area(row):
    """Return the request density for one POI group.

    Despite the name, the result is a density, not an area: ``row['count']``
    divided by the area of a circle whose radius is ``row['max']``.
    """
    PI = 3.141592653589793
    request_count = row['count']
    circle_area = PI*(row['max']**2)
    return request_count/circle_area

# Per POI group: the largest nearest-POI distance and the number of requests.
poi_stats = data_df[['ClosestPOI', 'ClosestDist']].groupby('ClosestPOI').agg(['max', 'count'])
poi_stats.columns = poi_stats.columns.droplevel()

# The largest distance is reported as the circle radius; the count is only
# needed to derive the density and is dropped afterwards.
poi_range = (
    poi_stats
    .assign(density=poi_stats.apply(poi_area, axis=1))
    .drop(columns='count')
    .rename(columns={'max': 'radius'})
)

poi_range

Unnamed: 0_level_0,radius,density
ClosestPOI,Unnamed: 1_level_1,Unnamed: 2_level_1
"['POI1', 'POI2']",24.851937,4.998179
['POI3'],20.155378,7.692137
['POI4'],192.704991,0.004149


In [7]:
## Part 4 b)

# Implementation
## shortest_path(starts, goal, graph) produces the tasks that still have to
##     be carried out to accomplish goal. graph maps each task to the set of
##     tasks it depends on. Every task in starts is treated as available, so
##     nothing it depends on is explored, and everything the start tasks
##     depend on (directly or transitively) counts as already finished and
##     is left out. The result lists each remaining prerequisite before the
##     tasks that need it, with goal as the last entry; ties are broken by
##     sorting task names so the output is reproducible. graph is not
##     modified.
## shortest_path: (setof Str) Str (dictof Str (setof Str)) -> (listof Str)
def shortest_path(starts, goal, graph):
    starts = set(starts)

    def prerequisites(task):
        # A missing key and a None/empty value both mean "no prerequisites".
        return graph.get(task) or ()

    # Everything the start tasks depend on, transitively, is already done.
    finished = set()
    pending = [dep for start in starts for dep in prerequisites(start)]
    while pending:
        task = pending.pop()
        if task in finished:
            # Already expanded: avoids repeated work and endless loops on cycles.
            continue
        finished.add(task)
        pending.extend(prerequisites(task))

    def remaining(task):
        # A start task is available as-is, so its prerequisites are not walked.
        # This replaces the former in-place `graph[start] = set()`, which
        # corrupted the caller's graph for any later call.
        if task in starts:
            return iter(())
        # sorted() keeps the output independent of set iteration order.
        return iter(sorted(dep for dep in prerequisites(task) if dep not in finished))

    # Iterative depth-first walk from goal. A task is emitted only once all of
    # its remaining prerequisites have been emitted (post-order), so it can
    # never appear before something it depends on.
    path = []
    seen = {goal}
    stack = [(goal, remaining(goal))]
    while stack:
        task, deps = stack[-1]
        for dep in deps:
            if dep not in seen:
                seen.add(dep)
                stack.append((dep, remaining(dep)))
                break
        else:
            # Iterator exhausted: every prerequisite of `task` is in `path`.
            path.append(task)
            stack.pop()
    return path

# Data Cleaning
# question.txt: the first line carries the comma-separated start tasks after
# a colon, the second line carries the goal task after a colon.
# `with` closes the file handles, which were previously left open.
with open("question.txt", "r") as question:
    starts = question.readline().strip().split(':')[1]
    starts = {x.strip() for x in starts.split(',')}

    goal = question.readline().strip().split(':')[1].strip()

# relations.txt: one "prerequisite->dependent" pair per line. graph maps each
# dependent task to the set of tasks it depends on.
graph = {}

with open("relations.txt", "r") as relations:
    for line in relations:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) used to raise IndexError.
            continue
        # Strip each side so "a -> b" yields the same task names as "a->b",
        # matching how the start tasks and goal are stripped above.
        vals = [part.strip() for part in line.split('->')]
        graph.setdefault(vals[1], set()).add(vals[0])

# Example Result
shortest_path(starts, goal, graph)

['112', '100', '73', '21', '20', '94', '56', '97', '102', '36']

In [8]:
# Test from repo
# Each key depends on the tasks in its value set: F needs E, E needs C, and
# C needs both A and B.
graph = {'C': {'A', 'B'}, 'E': {'C'}, 'F': {'E'}}

for test_starts in ({'A'}, {'A', 'C'}):
    print(list(shortest_path(test_starts, 'F', graph)))

['A', 'B', 'C', 'E', 'F']
['C', 'E', 'F']
