In [28]:
import pandas as pd
import math
df = pd.read_csv(r'D:\Python Study\EQ Works data engineer\DataSample.csv')
#They consider the columns that contain the same geoinformation and timeset as suspicious  so I need to filter out those record
df.head()

Unnamed: 0,_ID,TimeSt,Country,Province,City,Latitude,Longitude
0,4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123
1,4516547,2017-06-21 18:00:00.193,CA,ON,London,42.9399,-81.2709
2,4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.5776,-80.2201
3,4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.3716,-80.9773
4,4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.3716,-80.9773


In [29]:
#Construct a new column in the dataframe called duplicates to mark all the requests with identical geo info
df['duplicates'] = df[[' TimeSt','Latitude','Longitude']].duplicated(keep=False)
#drop those requests from last operation
df = df.drop(df[df['duplicates']==True].index)
#drop the duplicates column after its purpose served
df = df.drop('duplicates',axis=1)
df.head()

Unnamed: 0,_ID,TimeSt,Country,Province,City,Latitude,Longitude
0,4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123
1,4516547,2017-06-21 18:00:00.193,CA,ON,London,42.9399,-81.2709
2,4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.5776,-80.2201
3,4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.3716,-80.9773
4,4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.3716,-80.9773


In [76]:
poi = pd.read_csv(r'D:\Python Study\EQ Works data engineer\POIlist.csv')
poi.head()

Unnamed: 0,POIID,Latitude,Longitude
0,POI1,53.546167,-113.485734
1,POI2,53.546167,-113.485734
2,POI3,45.521629,-73.566024
3,POI4,45.22483,-63.232729


In [77]:
#construct a dummy list to store the name of closest POIID for every request
dummy_list = []
#for every request, iterate through every POI to find the closest distance
for row in df.itertuples():
    dic = {}
    for row2 in poi.itertuples():
        dic[row2.POIID] = abs(row.Latitude-row2.Latitude)+abs(row.Longitude-row2.Longitude)
    dummy_list.append(min(dic.items(),key=lambda x:x[1])[0])
#construct a new column in the dataframe with closest POIID, for merging purpose later 
df['POIID'] = dummy_list
#merge the main dataframe with POI data on the column of POIID
df = df.merge(poi, on='POIID', how='inner')
df.head()
    

Unnamed: 0,_ID,TimeSt,Country,Province,City,Latitude_x,Longitude_x,POIID,Latitude_y,Longitude_y
0,4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123,POI3,45.521629,-73.566024
1,4516547,2017-06-21 18:00:00.193,CA,ON,London,42.9399,-81.2709,POI3,45.521629,-73.566024
2,4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.5776,-80.2201,POI3,45.521629,-73.566024
3,4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.3716,-80.9773,POI3,45.521629,-73.566024
4,4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.3716,-80.9773,POI3,45.521629,-73.566024


In [93]:
import numpy as np
#construct a dictionary to store the distance of every request for every POI
distance = {'POI1':[],'POI2':[],'POI3':[],'POI4':[]}
#Fill the dictionary with distances
for row in df.itertuples():
    distance[row.POIID].append(math.sqrt((row.Latitude_x-row.Latitude_y)**2+(row.Longitude_x-row.Longitude_y)**2))

#using iteration to calculate the radius of request for every POI and the density within the circle
poi_distance = {}
for key, value in distance.items():
    if value:
        radius = max(value)
        n = len(value)
    else:
        radius = 0
        n = 1
    poi_distance[key] = [np.std(value),np.mean(value),radius,radius**2*3.14159/n]
#transform the results into dataframe
poi_distance = pd.DataFrame(poi_distance)
#add index for the results dataframe
poi_distance.index = ['stdev', 'average','radius','density']
poi_distance.head()

Unnamed: 0,POI1,POI2,POI3,POI4
stdev,3.853624,,2.85447,27.302557
average,3.355088,,5.536262,8.52239
radius,24.851937,0.0,20.123844,192.704991
density,0.222359,0.0,0.144246,273.216899


In [31]:
relation = open(r'D:\Python Study\EQ Works data engineer\relations.txt','r')
#read the text file of dependency and transform them to list
dependency = [i.split('->') for i in relation.read().splitlines()] 
list_tasks = [97,75,100,102,16,39,41,62,112,20,21,73,56,55,36,37,94,31]
#transform the text file of relations into Python dictionary 
dependency_dic = {}
for k,v in dependency:
    dependency_dic[int(v)] = dependency_dic.get(int(v), []) + [int(k)]

#The class 'Path' is my solution for question 4b pipeline dependency. This solution employes depth fist search algorithm to 
#solve the problem
class Path:
    def __init__(self, dependency_dic, List, startings):
        self.List = List
        self.dependency_dic = dependency_dic
        self.path = []
        self.visited = {i: False for i in self.List}
        self.startings = startings
    
    def DFS(self, goal):
        if not self.visited[goal]:
            self.visited[goal] = True
            if goal in self.dependency_dic and goal not in self.startings:
                for i in self.dependency_dic[goal]:
                    self.DFS(i)
            self.path.append(goal)
p = Path(dependency_dic, list_tasks, [73])
p.DFS(36)
print(p.path)

[41, 112, 39, 100, 21, 73, 20, 97, 94, 56, 102, 36]
