## Libraries

In [1]:
import pandas as pd
import geopandas as gdp
import folium
from folium import Choropleth, Circle, Marker, Icon, Map
from folium.plugins import HeatMap, MarkerCluster
import re

import sys
sys.path.append('../')
import src.compare_cities_functions as ccf


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the San Francisco venues CSV from the Near_geoqueries data folder into a DataFrame.
df_san_francisco_usa = pd.read_csv("../data/Near_geoqueries/near_san_francisco.csv")

In [3]:
# Read the New York venues CSV from the Near_geoqueries data folder into a DataFrame.
df_new_york_usa = pd.read_csv("../data/Near_geoqueries/near_new_york.csv")

## Checking categories Foursquare found

In [4]:
# List the distinct raw `category` values present in the San Francisco data.
df_san_francisco_usa.category.unique()

array(['Daycare', 'Club House', 'Vegetarian / Vegan Restaurant', 'Hotel',
       'Airport Tram', 'Plane', 'Nightclub', 'Comedy Club', 'Gym',
       'Preschool', 'Strip Club', 'Government Building', 'Café',
       'Business Service', 'Rental Car Location', 'Cocktail Bar',
       'Italian Restaurant', 'Taxi', 'Gym / Fitness Center', 'Resort',
       'Tennis Court', 'Airport Service', 'Pet Service', 'Event Space',
       'School', 'Grocery Store', 'Bar', 'Park', 'Airport Gate',
       'Latin American Restaurant'], dtype=object)

In [5]:
# List the distinct raw `category` values present in the New York data.
df_new_york_usa.category.unique()

array(['Baggage Claim', 'Preschool', 'Airport Tram', 'Pet Store',
       'Vegetarian / Vegan Restaurant', 'Nightclub', 'Club House',
       'Transportation Service', 'Cocktail Bar', 'Music Venue',
       'Pet Service', 'Taxi', 'Airport Lounge', 'Rental Car Location',
       'Airport', 'General Travel', 'Government Building', 'Plane',
       'Rock Club', 'Bank', 'Pub', 'Airport Terminal', 'Bar',
       'Taxi Stand', 'Swiss Restaurant', 'Café', 'Scenic Lookout',
       'Travel Agency', 'American Restaurant', 'Airport Service',
       'Bus Station', 'Art Gallery', 'Boat or Ferry'], dtype=object)

## Categories consolidation

#### We want to consolidate the categories found into our main categories

- NEED 1: The CEO is vegan = 1 --> Vegan Restaurants

- NEED 2: The office dog, "Dobby", needs a hairdresser every month. Ensure there's one not too far away = 87 --> Pet Services (vet or pet hairdresser)

- NEED 3: Everyone in the company is between 25 and 40, give them some place to go party = 87  --> Clubbing (pubs, bars and clubs)

- NEED 4: 30% of the company staff have at least 1 child = 26 --> Preschool (nurseries or schools)


- NEED 6: Account managers need to travel a lot = 20 --> Transportation (airports or train stations)

In [6]:
# Give both city frames a `category_simplify` column that starts out as a copy of the
# raw `category` values, leaving the original column untouched.
for venues in (df_san_francisco_usa, df_new_york_usa):
    venues["category_simplify"] = venues["category"]

In [7]:
# Consolidate the San Francisco categories with the project helper.
# NOTE(review): the return value is discarded, so categoryAggr presumably rewrites
# `category_simplify` in place — confirm in src/compare_cities_functions.py.
ccf.categoryAggr(df_san_francisco_usa, "category_simplify")

In [8]:
# Consolidate the New York categories with the project helper.
# NOTE(review): the return value is discarded, so categoryAggr presumably rewrites
# `category_simplify` in place — confirm in src/compare_cities_functions.py.
ccf.categoryAggr(df_new_york_usa, "category_simplify")

#### Some of the resulting categories are not relevant to our requirements, so we drop those rows

In [9]:
# Consolidated categories that correspond to the company needs listed above;
# rows with any other `category_simplify` value are filtered out afterwards.
main_requirements_list = [
    'Preschool',         # NEED 4: staff with children
    'Going Out',         # NEED 3: places to go party
    'Vegan Restaurant',  # NEED 1: vegan CEO
    'Transportation',    # NEED 6: travelling account managers
    'Pet Services',      # NEED 2: the office dog
]

In [10]:
# For San Francisco: flag every venue whose consolidated category is not one of the
# main requirements, then remove those rows from the frame in place.
sf_off_topic = ~df_san_francisco_usa["category_simplify"].isin(main_requirements_list)
df_san_francisco_usa.drop(index=df_san_francisco_usa[sf_off_topic].index, inplace=True)

In [11]:
# Check which consolidated categories remain in the San Francisco data after filtering.
df_san_francisco_usa.category_simplify.unique()

array(['Preschool', 'Going Out', 'Vegan Restaurant', 'Transportation',
       'Pet Services'], dtype=object)

In [12]:
# For New York: flag every venue whose consolidated category is not one of the
# main requirements, then remove those rows from the frame in place.
ny_off_topic = ~df_new_york_usa["category_simplify"].isin(main_requirements_list)
df_new_york_usa.drop(index=df_new_york_usa[ny_off_topic].index, inplace=True)

In [13]:
# Check which consolidated categories remain in the New York data after filtering.
df_new_york_usa.category_simplify.unique()

array(['Preschool', 'Transportation', 'Pet Services', 'Vegan Restaurant',
       'Going Out'], dtype=object)

#### We can now put the two dataframes together

In [14]:
# Tag every row with the city it belongs to, so the two frames can still be told
# apart once they are stacked into a single frame.
for venues, city_name in (
    (df_san_francisco_usa, "San Francisco"),
    (df_new_york_usa, "New York"),
):
    venues["city"] = city_name

In [15]:
# (rows, columns) of the filtered San Francisco frame.
df_san_francisco_usa.shape

(128, 9)

In [16]:
# (rows, columns) of the filtered New York frame.
df_new_york_usa.shape

(191, 9)

In [17]:
# Stack the two city frames row-wise; the original row labels of each frame are kept.
city_frames = [df_san_francisco_usa, df_new_york_usa]
df_cities = pd.concat(city_frames, axis="index")

In [18]:
# Sanity check: the row count should equal the sum of the two per-city row counts.
df_cities.shape

(319, 9)

In [19]:
# Preview the first two rows of the combined frame.
df_cities.head(2)

Unnamed: 0,_id,name,lat,long,category,geometry,distance,category_simplify,city
0,6135fecc0c5109129a29ad63,Bright Horizons at Spear Street,37.790787,-122.392536,Daycare,"{'type': 'Point', 'coordinates': [-122.3925358...",0.122825,Preschool,San Francisco
1,6135fecc0c5109129a29ad81,Millennium Tower Club Level,37.790882,-122.396038,Club House,"{'type': 'Point', 'coordinates': [-122.3960379...",0.275584,Going Out,San Francisco


## Calculate City Ranking

#### Normalize "distance"

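Data normalization is a common practice that consists of transforming numeric columns to a common scale. Here we first average the distance per category and city, and then apply min-max scaling to those averages: $x' = \dfrac{x - x_{min}}{x_{max} - x_{min}}$, so the smallest mean distance maps to 0 and the largest to 1.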

In [20]:
# Average the numeric columns (lat, long, distance) per consolidated category and city.
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer silently drops the
# text columns (_id, name, category, geometry) and raises a TypeError instead; on older
# pandas it yields the same result as the previous implicit behaviour.
df_cities_mean = df_cities.groupby(["category_simplify", "city"]).mean(numeric_only=True)

In [21]:
# Display the per-category / per-city means with the group keys shown as ordinary columns.
# NOTE(review): the reset_index() result is only displayed, not assigned, so
# df_cities_mean itself keeps its (category_simplify, city) index after this cell.
df_cities_mean.reset_index(drop = False)

Unnamed: 0,category_simplify,city,lat,long,distance
0,Going Out,New York,40.70805,-73.973732,3.084052
1,Going Out,San Francisco,37.779652,-122.410273,2.226409
2,Pet Services,New York,40.705295,-73.992689,2.582285
3,Pet Services,San Francisco,37.798542,-122.424608,2.866785
4,Preschool,New York,40.699645,-73.980175,2.596659
5,Preschool,San Francisco,37.785115,-122.411324,2.376722
6,Transportation,New York,40.708783,-73.98691,3.147536
7,Transportation,San Francisco,37.783843,-122.409873,2.170966
8,Vegan Restaurant,New York,40.711171,-73.980949,2.834848
9,Vegan Restaurant,San Francisco,37.781632,-122.415584,2.63502


In [22]:
# Rescale the averaged `distance` column with the project's min-max helper.
# NOTE(review): later cells read `category_simplify` as a column of the result, so the
# helper presumably returns a frame with the group keys moved out of the index — confirm
# in src/compare_cities_functions.py.
df_cities_normalized = ccf.min_max_scaling(df_cities_mean, "distance")

In [23]:
# Seed the `importance` column with each row's category label; it is handed to
# ccf.importance in the following cell.
df_cities_normalized["importance"] = df_cities_normalized["category_simplify"]

In [24]:
# Turn the category labels in `importance` into weights via the project helper.
# NOTE(review): the return value is discarded, so ccf.importance presumably replaces the
# column in place with the per-category weights — confirm in src/compare_cities_functions.py.
ccf.importance(df_cities_normalized, "importance")

In [25]:
# Preview the normalized distances together with their importance values.
df_cities_normalized.head()

Unnamed: 0,category_simplify,city,lat,long,distance,importance
0,Going Out,New York,40.70805,-73.973732,0.934992,0.2
1,Going Out,San Francisco,37.779652,-122.410273,0.056773,0.2
2,Pet Services,New York,40.705295,-73.992689,0.421187,0.25
3,Pet Services,San Francisco,37.798542,-122.424608,0.712513,0.25
4,Preschool,New York,40.699645,-73.980175,0.435905,0.15


In [27]:
# df_cities_normalized.info()

### Weights

- Vegan Restaurant = 0.25
- Pet Services = 0.25
- Going Out = 0.2
- Preschool = 0.15
- Transportation = 0.15 

In [28]:
# Weighted score per (category, city) row: normalized mean distance times the category's
# importance. Distances are "how far away", so smaller scores are better.
normalized_distance = df_cities_normalized["distance"]
category_weight = df_cities_normalized["importance"]
df_cities_normalized["ranking"] = normalized_distance.mul(category_weight)

In [29]:
# Total the numeric columns per city; `ranking` then holds each city's overall weighted
# score (the summed lat/long columns carry no meaning and are only a by-product).
# numeric_only=True keeps the text column `category_simplify` out of the aggregation, which
# pandas >= 2.0 would otherwise try to sum as well; older pandas already behaved this way.
df_cities_normalized2 = df_cities_normalized.groupby(["city"]).sum(numeric_only=True)

In [30]:
# Sort ascending by `ranking`, so the city with the lowest total weighted distance
# score is listed first.
df_cities_normalized2.sort_values("ranking")

Unnamed: 0_level_0,lat,long,distance,importance,ranking
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
San Francisco,188.928784,-612.071661,1.455164,1.0,0.339883
New York,203.532943,-369.914454,3.471895,1.0,0.677634


### We have a winner: [Fake Tales Of San Francisco](https://www.youtube.com/watch?v=ePg1tbia9Bg)

In [32]:
# Export the filtered San Francisco venues (without the row index) as a CSV file in the
# San_Francisco_map_ready data folder.
san_francisco_export_path = "../data/San_Francisco_map_ready/san_francisco_final.csv"
df_san_francisco_usa.to_csv(san_francisco_export_path, index=False)