**Notebook to compare the different cities**

**Libraries**

In [2]:
import os
import pandas as pd
import time
import re
import io
import requests
import json
from dotenv import load_dotenv
import geopandas as gpd
from cartoframes.viz import Map, Layer, popup_element
import math

# Weighted distances tables

**Importing everything for the comparison table**

In [3]:
# Activities/categories datasets:
sanfran = pd.read_csv("data/sanfrancisco_activities.csv")
south_sanfran = pd.read_csv("data/south_sanfrancisco_activities.csv")
newyork = pd.read_csv("data/newyork_activities.csv")
london = pd.read_csv("data/london_activities.csv")

In [4]:
# Companies datasets:
sanfran_companies = pd.read_csv('./data/companies_sanfrancisco.csv')
newyork_companies = pd.read_csv('./data/companies_newyork.csv')
london_companies = pd.read_csv('./data/companies_london.csv')
south_sanfran_companies = pd.read_csv('./data/companies_south_sanfrancisco.csv')

**Coordinates of my company in each city (adapted for distance function)**

In [5]:
# Original
sanfran_coord = [37.781929, -122.404176]
south_sanfran_coord = [37.656246, -122.399735]
newyork_coord = [40.739930, -73.993049]
london_coord = [51.514165, -0.109017]

In [6]:
# Transformed
sanfran_hav = sanfran_coord[::-1]
south_sanfran_hav = south_sanfran_coord[::-1]
newyork_hav = newyork_coord[::-1]
london_hav = london_coord[::-1]

## Calculate distance between gaming/tech hub and my company

**Function to calculate distance between two points**

In [8]:
def haversine(coord1, coord2):
 
    # Coordinates in decimal degrees (e.g. 2.89078, 12.79797)
    lon1, lat1 = coord1
    lon2, lat2 = coord2

    R = 6371000  # radius of Earth in meters
    phi_1 = math.radians(lat1)
    phi_2 = math.radians(lat2)

    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2.0) ** 2 + math.cos(phi_1) * math.cos(phi_2) * math.sin(delta_lambda / 2.0) ** 2
    
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    meters = R * c  # output distance in meters
    meters = round(meters)
    
    return meters

**Function to add distance column**

In [9]:
def addDistanceHaversine(df, coord_my_company):
    '''This function receives a dataframe and adds a new column called 'distance'
    which is the distance between my company and the rest of the companies
    '''
    # Reorganizing latitude and longitude
    coord_list = []
    for index, row in df.iterrows():
        coord_list.append([row['longitude'], row['latitude']])
    
    df['coordinates'] = coord_list
    
    # Applying the distance function
    df['distance'] = df['coordinates'].apply(lambda x: haversine(x, coord_my_company))
    
    # Sorting by distance
    df = df.sort_values(by=["distance"])
    
    # Removes rows with distance == 0 (my company) and distance > 5000m:
    df = df[(df['distance'] != 0) & (df['distance'] <= 5000)]
    
    # Add column for later processing
    df['query'] = 'companies nearby' 
    
    return df

**Applying the function to each df**

In [10]:
sanfran_companies_dist = addDistanceHaversine(sanfran_companies, sanfran_hav)
south_sanfran_companies_dist = addDistanceHaversine(south_sanfran_companies, south_sanfran_hav)
newyork_companies_dist = addDistanceHaversine(newyork_companies, newyork_hav)
london_companies_dist = addDistanceHaversine(london_companies, london_hav)

## Summary weighted distance tables

**Function to build a summary Table**

In [11]:
def addCountDistanceCategory(df):
    
    series = [df.groupby("query").size(), df.groupby("query")["distance"].sum()]
    new_df = pd.concat(series, axis = 1).reset_index().rename(columns={0: 'counts', 'distance': 'sum_distance'})
    
    return new_df

In [12]:
def addWeightRadius(df):
    '''This function receives a df and adds weight and radius columns.
    '''
    weight = pd.Series([0.2])
    radius = pd.Series([2000])
    df_1 = pd.concat([weight, radius], axis = 1).rename(columns={0:"weight",1:"radius"})
    new_df = pd.concat([df_1, df], axis = 1)
    new_df = new_df[['query', 'weight', 'radius', 'counts', 'sum_distance']]
    
    return new_df

In [13]:
def createTableCity(df_categories, df_companies):
    '''This function receives two df and returns a new df where the number of categories are grouped
    and the weighted distance of each category is calculated.
    :df_categories: the df containing the categories retrieved from foursquare (clean).
    :df_companies: the df containing the gaming-tech-design company hub.
    '''
    # Create categories, weight and radius columns
    category = pd.Series(["airport", "basketball", "club", "dog hairdresser", "school", "starbucks", "vegan restaurant"])
    weight = pd.Series([0.01, 0.05, 0.1, 0.05, 0.3, 0.2, 0.1])
    radius = pd.Series([50000, 10000, 2000, 10000, 2000, 2000, 2000])
    df_1 = pd.concat([category, weight, radius], axis = 1).rename(columns={0: 'query', 1: 'weight', 2: 'radius'})
    
    # Add columns 'count' and 'sum of distance'
    df_2 = addCountDistanceCategory(df_categories)
    df_2_companies = addCountDistanceCategory(df_companies)
    
    # Transform df_companies
    df_2_companies = addWeightRadius(df_2_companies)
    
    # Merge
    df_3 = df_1.merge(df_2, how = "outer")
    df_4 = pd.concat([df_3, df_2_companies], axis = 0, ignore_index=True)
    
    # Add columns 'average_distance' and penalizing empty cells based on radius:
    df_4["avg_distance"] = df_4["sum_distance"] / df_4["counts"]
    df_4["avg_distance"] = df_4["avg_distance"].fillna(df_4["radius"])
    
    # Calculating 'weighted average distance'
    df_4["weighted_avg_distance"] = df_4["avg_distance"] * df_4["weight"]

    return df_4

**Applying the function to each dataset**

In [15]:
sanfran_t = createTableCity(sanfran, sanfran_companies_dist)
south_sanfran_t = createTableCity(south_sanfran, south_sanfran_companies_dist)
newyork_t = createTableCity(newyork, newyork_companies_dist)
london_t = createTableCity(london, london_companies_dist)

In [16]:
sanfran_t

Unnamed: 0,query,weight,radius,counts,sum_distance,avg_distance,weighted_avg_distance
0,airport,0.01,50000,2,36893,18446.5,184.465
1,basketball,0.05,10000,4,11052,2763.0,138.15
2,club,0.1,2000,5,1869,373.8,37.38
3,dog hairdresser,0.05,10000,2,4118,2059.0,102.95
4,school,0.3,2000,5,2279,455.8,136.74
5,starbucks,0.2,2000,5,1916,383.2,76.64
6,vegan restaurant,0.1,2000,5,2780,556.0,55.6
7,companies nearby,0.2,2000,24,28282,1178.416667,235.683333


In [17]:
south_sanfran_t

Unnamed: 0,query,weight,radius,counts,sum_distance,avg_distance,weighted_avg_distance
0,airport,0.01,50000,4.0,44084.0,11021.0,110.21
1,basketball,0.05,10000,,,10000.0,500.0
2,club,0.1,2000,2.0,2060.0,1030.0,103.0
3,dog hairdresser,0.05,10000,,,10000.0,500.0
4,school,0.3,2000,5.0,6862.0,1372.4,411.72
5,starbucks,0.2,2000,3.0,2265.0,755.0,151.0
6,vegan restaurant,0.1,2000,2.0,1709.0,854.5,85.45
7,companies nearby,0.2,2000,7.0,10959.0,1565.571429,313.114286


In [18]:
newyork_t

Unnamed: 0,query,weight,radius,counts,sum_distance,avg_distance,weighted_avg_distance
0,airport,0.01,50000,3,47988,15996.0,159.96
1,basketball,0.05,10000,5,20903,4180.6,209.03
2,club,0.1,2000,5,1339,267.8,26.78
3,dog hairdresser,0.05,10000,5,22480,4496.0,224.8
4,school,0.3,2000,5,906,181.2,54.36
5,starbucks,0.2,2000,5,1799,359.8,71.96
6,vegan restaurant,0.1,2000,5,1777,355.4,35.54
7,companies nearby,0.2,2000,13,25866,1989.692308,397.938462


In [19]:
london_t

Unnamed: 0,query,weight,radius,counts,sum_distance,avg_distance,weighted_avg_distance
0,airport,0.01,50000,3.0,75537.0,25179.0,251.79
1,basketball,0.05,10000,,,10000.0,500.0
2,club,0.1,2000,5.0,2466.0,493.2,49.32
3,dog hairdresser,0.05,10000,,,10000.0,500.0
4,school,0.3,2000,5.0,4578.0,915.6,274.68
5,starbucks,0.2,2000,5.0,2145.0,429.0,85.8
6,vegan restaurant,0.1,2000,5.0,2772.0,554.4,55.44
7,companies nearby,0.2,2000,1.0,4168.0,4168.0,833.6


**Calculating total score for each Country: the lower the better as it is proportional to distance**

In [21]:
def calculateTotal(list_of_df):
    '''This function receives a dataframe and calculates the total score
    as the sum of the weighted distance.
    '''
    total_list = []
    for i in list_of_df:
        total = sum(i["weighted_avg_distance"])
        total_list.append(total)

    total_df = pd.concat([pd.Series([i for i in list_of_df]), 
                          pd.Series(total_list)], axis = 1).rename(columns={0: 'city', 1: 'total'})
    
    return total_df

In [22]:
calculateTotal([sanfran_t, south_sanfran_t, newyork_t, london_t])

Unnamed: 0,city,total
0,query weight radius counts s...,967.608333
1,query weight radius counts s...,2174.494286
2,query weight radius counts s...,1180.368462
3,query weight radius counts s...,2550.63


**Exportin tables for map visualization**

In [24]:
def renameColumns(df):
    '''Function that renames columns latitude and longitude
    '''
    df = df.rename(columns = {"latitude" : "lat", "longitude" : "lon"})
    
    return df

In [25]:
sanfran_companies_dist = renameColumns(sanfran_companies_dist)
newyork_companies_dist = renameColumns(newyork_companies_dist)
london_companies_dist = renameColumns(london_companies_dist)
south_sanfran_companies_dist = renameColumns(south_sanfran_companies_dist)

In [26]:
# Companies datasets
sanfran_companies_dist.to_csv('./data/companies_sanfrancisco_map.csv', index = False)
newyork_companies_dist.to_csv('./data/companies_newyork_map.csv', index = False)
london_companies_dist.to_csv('./data/companies_london_map.csv', index = False)
south_sanfran_companies_dist.to_csv('./data/companies_south_sanfrancisco_map.csv', index = False)