In [7]:
import ast
import os
import time

import openrouteservice as ors
import pandas as pd

# OpenRouteService API key.
# Generate your own key on the openrouteservice website and export it as the
# ORS_API_KEY environment variable rather than pasting it into the notebook,
# so the key is never saved with the file.
# When the variable is unset, the empty placeholder key is used.
client1 = ors.Client(key=os.environ.get('ORS_API_KEY', ''))

In [8]:
# Get the longitude and latitude stored in a Cloest_shops entry,
# and check whether the location is missing.
# Input: one shop location from the Cloest_shops column (closest shops)
# Output: the shop's coordinates, or (None, None) when they cannot be read

def extract_coordinates(coor):
    """Parse one ``Cloest_shops`` cell and return the shop's coordinate pair.

    Args:
        coor: A string holding a Python literal sequence. Elements 2 and 1 are
            read as the coordinate pair (presumably longitude at index 2 and
            latitude at index 1 -- TODO confirm against the data).

    Returns:
        A one-element list ``[(value_at_index_2, value_at_index_1)]``, or
        ``[(None, None)]`` when the cell cannot be parsed or is too short.
    """
    try:
        shops_list = ast.literal_eval(coor)
        return [(shops_list[2], shops_list[1])]
    except Exception:
        # Missing or malformed cells (NaN, invalid literal, too few elements)
        # are expected and map to the "no location" marker. Catching Exception
        # instead of using a bare except lets KeyboardInterrupt and SystemExit
        # propagate, so a long run can still be interrupted.
        return [(None, None)]
    

In [9]:
# Extracts and preprocesses the data from the DataFrame.
# Input: the dataframe holding all the information
# Output: a list of property coordinates and a pandas Series of shop coordinates.

def preprocess_data(df):
    """Pull the property and shop coordinates out of the DataFrame.

    Args:
        df: DataFrame with 'longitude', 'Latitude' and 'Cloest_shops' columns.

    Returns:
        A pair ``(property_points, shop_points)``: a list with one
        ``(longitude, Latitude)`` tuple per row, and the Series obtained by
        applying ``extract_coordinates`` to the 'Cloest_shops' column.
    """
    # One (longitude, Latitude) tuple per row, in row order.
    property_points = [
        (lon, lat) for lon, lat in zip(df['longitude'], df['Latitude'])
    ]

    # Each cell becomes a one-element list holding the shop's coordinate pair.
    shop_points = df['Cloest_shops'].apply(extract_coordinates)

    return property_points, shop_points


In [10]:
# Calls the API and returns the distance and duration between two coordinates.
# Input: the coordinates of the property and of the shop
# The client is the Open Route Service client object (created with the API key)
# The profile parameter determines the mode of movement (e.g., walking, driving).
# Output: the distance and the duration (in seconds) from the Open Route Service route summary
# If the Open Route Service call fails, (None, None) is returned

def get_distance_time(coord1, coord2, client, profile='driving-car'):
    """Ask the routing service for the distance and duration between two points.

    Args:
        coord1: Start coordinate pair (any iterable; converted to a list).
        coord2: End coordinate pair (any iterable; converted to a list).
        client: Object exposing ``directions(coordinates=, profile=, format=)``.
        profile: Mode of movement passed through to the service.

    Returns:
        ``(distance, duration)`` taken from the first feature's summary
        (presumably metres and seconds -- confirm against the service docs),
        or ``(None, None)`` when the request or the lookup fails.
    """
    coordinates = [list(point) for point in (coord1, coord2)]

    try:
        route = client.directions(
            coordinates=coordinates,
            profile=profile,
            format='geojson',
        )

        # The summary of the first feature carries both values we need.
        summary = route['features'][0]['properties']['summary']
        return summary['distance'], summary['duration']

    except Exception as err:
        # Best effort: report the failing request and let the caller carry on.
        print("Error calling the API:", err)
        print("Coordinates:", coordinates)
        return None, None


In [11]:
# This function processes the DataFrame and populates both the 'Drive distance/time' and
# 'Walk distance/time' columns.
# Once the API call budget (2000 calls) is used up, the remaining entries are set to 0 so that
# the run can be resumed later (e.g. on the next day).
# Input: dataframe (all information), client (Open Route Service client), start_index (the row to start from),
# output_csv (the path the result is saved to)
# Output: the dataframe, which may be incomplete because of the API limit
# (it is saved to output_csv only when every row has been processed),
# and the index of the row where the function stopped;
# None means that every row was processed

def main(df, client, start_index=0, output_csv="processed_data.csv",
         api_limit=2000, calls_per_minute=40):
    """Fill 'Drive distance/time' and 'Walk distance/time' for each row of ``df``.

    For every row from ``start_index`` onwards, the route from the property to
    its shop is requested twice (driving and walking) and stored as a
    "distance/duration" string. Rows without shop coordinates get ``None``.

    Args:
        df: DataFrame with the columns read by ``preprocess_data``. When
            resuming (``start_index > 0``) it must already contain the two
            result columns from the earlier run.
        client: Routing client passed through to ``get_distance_time``.
        start_index: Row position to start from. ``None`` (the value returned
            by a run that finished) means there is nothing left to process.
        output_csv: Path the DataFrame is written to once every row is done.
        api_limit: Maximum number of API calls made by this invocation.
        calls_per_minute: Number of calls after which the function pauses for
            60 seconds.

    Returns:
        ``(df, stop_index)``. ``stop_index`` is the row position at which the
        call budget ran out (rows from there on hold the placeholder 0 and the
        CSV is not written), or ``None`` when all rows were processed and the
        CSV was saved.
    """
    # Get the location in each row
    coordinates_list, shops_coordinates = preprocess_data(df)
    total_rows = len(coordinates_list)

    # A finished run hands back None; treat it as "resume after the last row"
    # instead of failing on the comparison below.
    if start_index is None:
        start_index = total_rows

    if start_index > 0:
        # Rows before start_index were filled by an earlier run; everything
        # after them is placeholder padding, so keep only the processed prefix.
        drive_distance_time = df['Drive distance/time'].tolist()[:start_index]
        walk_distance_time = df['Walk distance/time'].tolist()[:start_index]
    else:
        drive_distance_time = []
        walk_distance_time = []

    num_api_calls = 0    # calls made by this invocation, checked against api_limit
    calls_in_window = 0  # calls made since the last rate-limit pause

    for i in range(start_index, total_rows):
        # Check if we've used up the API call budget
        if num_api_calls >= api_limit:
            # Fill the rest of the lists with 0 so a later run can resume here
            remaining = total_rows - i
            drive_distance_time.extend([0] * remaining)
            walk_distance_time.extend([0] * remaining)

            df['Drive distance/time'] = drive_distance_time
            df['Walk distance/time'] = walk_distance_time
            return df, i  # Return the index where we stopped

        current_coords = coordinates_list[i]
        # Positional lookup, so a DataFrame with a non-default index also works.
        shop_coords = shops_coordinates.iloc[i][0]

        # If the shop coordinates are missing, skip that row
        if shop_coords == (None, None):
            drive_distance_time.append(None)
            walk_distance_time.append(None)
            # No request was made, so neither the quota nor the rate limiter
            # is charged for this row.
            continue

        # Use openroute to get the information needed
        drive_distance, drive_duration = get_distance_time(
            current_coords, shop_coords, client, 'driving-car')
        walk_distance, walk_duration = get_distance_time(
            current_coords, shop_coords, client, 'foot-walking')

        drive_distance_time.append(f"{drive_distance}/{drive_duration}")
        walk_distance_time.append(f"{walk_distance}/{walk_duration}")

        # Two API calls were made (drive and walk)
        num_api_calls += 2
        calls_in_window += 2

        # The API only accepts a limited number of calls per minute
        if calls_in_window >= calls_per_minute:
            time.sleep(60)  # Pause for 60 seconds
            calls_in_window = 0  # Reset the counter

    df['Drive distance/time'] = drive_distance_time
    df['Walk distance/time'] = walk_distance_time

    # Save to CSV file
    df.to_csv(output_csv, index=False)

    # None indicates that all rows were processed
    return df, None


In [None]:
# process of the data

# NOTE(review): the original cell read from '../data/cruated/tenth_clean/' while
# writing to '../data/curated/eleventh_clean/'. 'cruated' is assumed to be a typo
# and is spelled 'curated' here -- confirm against the actual directory name.
INPUT_DIR = '../data/curated/tenth_clean'
OUTPUT_DIR = '../data/curated/eleventh_clean'


def process_file(file_name, clients):
    """Run ``main`` on one CSV, moving to the next client when a quota runs out.

    Args:
        file_name: Name of the CSV inside INPUT_DIR; the result is written to
            the same name inside OUTPUT_DIR.
        clients: Routing clients to use in order. Each one continues from the
            row at which the previous one stopped.

    Returns:
        The DataFrame returned by the last ``main`` call. It is only saved to
        OUTPUT_DIR when every row was processed.
    """
    frame = pd.read_csv(f"{INPUT_DIR}/{file_name}")
    output_path = f"{OUTPUT_DIR}/{file_name}"

    next_row = 0
    for api_client in clients:
        frame, next_row = main(frame, api_client, start_index=next_row,
                               output_csv=output_path)
        # None means every row is done and the CSV has been written, so the
        # remaining clients are not needed.
        if next_row is None:
            return frame

    print(f"{file_name}: stopped at row {next_row}; not saved, more API quota is needed")
    return frame


# NOTE(review): only client1 is created in the cells visible above; client2 to
# client6 must be created the same way (each with its own API key) before this
# cell is run.

# house_1
house1 = process_file('house_1.csv', [client1, client3])

# house_2
house2 = process_file('house_2.csv', [client1, client2, client3, client4])

# house_3
house3 = process_file('house_3.csv', [client1])

# house_4
house4 = process_file('house_4.csv', [client1])

# apartment_1
apartment1 = process_file('apartment_1.csv', [client1])

# apartment_2
apartment2 = process_file('apartment_2.csv', [client1])

# apartment_3
apartment3 = process_file('apartment_3.csv', [client1])

# apartment_4
apartment4 = process_file('apartment_4.csv', [client1])

# apartment_5
apartment5 = process_file('apartment_5.csv', [client1])

# town_house
town_house = process_file('town_house.csv', [client5, client6])