# City Similarity Scorer based on Venues 
- Recommend Similar / Dissimilar cities to given cities
- Can be used to determine if a city is good for moving
- Can be used to explore cities to visit based on cities already visited.

## Setups & Prepare Environment
- Install necessary libraries
- Import required libraries
- Store necessary API credentials

### Install Necessary Libraries

In [2]:
!pip install geopy
!pip install folium
!pip install requests
!pip install fuzzywuzzy



### Import Required Libraries

In [3]:
# Pandas and numpy
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import folium

# Getting data from internet
import requests
import geopy

# machine learning
import sklearn

# utils
from fuzzywuzzy import fuzz

### Setup Credentials

In [4]:
# @hidden_cell

# Foursquare Credentials
fs_client_id = "WZGPEJLUAAPXNKFJBDCRZFB1Y3HWNGJWRHTYLEZV4552SABS"
fs_client_secret = "TA3F5MXGNV3KA5RCVGPU1QX2MY0E0YY2FVNUIHGVAKGGE33H"
fs_v = "20201205"
base_url = "https://api.foursquare.com/v2/{}"
querystr_args = {
    "client_id": fs_client_id,
    "client_secret": fs_client_secret,
    "v": fs_v
} 

## Get Requisite Data

### World Cities Data from : https://simplemaps.com/data/world-cities contains
- City
- Latitude
- Longitude
- Country
- Population

In [5]:
world_cities = pd.read_csv("data/worldcities.csv")
print("{} rows read".format(world_cities.shape[0]))
world_cities.head()

26569 rows read


Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6897,139.6922,Japan,JP,JPN,Tōkyō,primary,37977000.0,1392685764
1,Jakarta,Jakarta,-6.2146,106.8451,Indonesia,ID,IDN,Jakarta,primary,34540000.0,1360771077
2,Delhi,Delhi,28.66,77.23,India,IN,IND,Delhi,admin,29617000.0,1356872604
3,Mumbai,Mumbai,18.9667,72.8333,India,IN,IND,Mahārāshtra,admin,23355000.0,1356226629
4,Manila,Manila,14.5958,120.9772,Philippines,PH,PHL,Manila,primary,23088000.0,1608618140


### Function to get latitude and longitude information for a city

In [6]:
def get_lat_long(city: str, country: str) -> tuple:

    """
    Function to return a tuple containing latitude and longitude of a given city in a country

    Parameters:
    ------------
    city : str
        City for which latitude and longitude is required. Provide as much information as possible to avoid
        confusion due to multiple cities of the same name. For e.g. Aberdeen, North Carolina rather than
        just Aberdeen
    country : str
        Two letter country code OR complete string of country in which city is present

    Returns:
    --------
    A tuple containing the (latitude, longitude) if found. If not found, empty tuple is returned

    """

    latitude = None
    longitude = None

    geolocator = geopy.geocoders.Nominatim(user_agent="city_similarity")
    location = geolocator.geocode(city + ", " + country)
    if location:
        print("Returning for: " + location.address)
        return location.latitude, location.longitude
    else:
        return None, None


### Function to get venues for a given city (using foursquare)

In [7]:
def get_nearby_venues(latitude, longitude, city_name, radius=10000, limit=None, categories=None):

    """
    Function takes latitude and longitude of a place, (optional radius and limit params)
    and returns a list of venues within given radius of latitude and longitude 

    Parameters:
    ------------
    latitude : float
        latitude of point around which venues are needed
    longitude : float
        longitude of point around which venues are needed
    name : str
        name of the city. This is to compare city results with what we recieve from Foursquare
    radius: int. Optional. Default 10,000
        radius in metres around which to limit. 
        By default 10,000m (10km) radius. Note that if 10km radius may beyond city limits.
    limit: int. Optional. Default None.
        Limit # of venues returned. If not specified, all results will be returned
    categories: list. Optional. Default None.
        List of categories for which venues are returned. Important if you want similarity scores
        based on only certain criteria of venues. Otherwise, we stay category agnostic

    Returns:
    --------
        List of venues found, with each venue in a dictionary, containing
        venue_name, venue_latitude, venue_longitude and venue_category

    """

    per_query_limit = 50

    # Update URL
    results = None
    venue_list = []
    url = base_url.format("venues/explore")
    i = 0

    # Update request parameters
    querystr_args["categoryId"] = ",".join(categories) if categories else None
    querystr_args["radius"] = radius
    # if we are specifying less than 50, we have to do multiple runs of query, otherwise once only
    querystr_args["limit"] = min(limit, per_query_limit) if limit else per_query_limit
    querystr_args["ll"] = "{},{}".format(latitude, longitude)
    querystr_args["time"] = "any"
    querystr_args["day"] = "any"
    querystr_args["locale"] = "en"          # avoids giving addresses in local languages
    querystr_args["offset"] = None

    city_name_match_ratio = 0
    
    # max iterations will change once you get information about total venues if limit is 
    # not specified
    max_iterations = (max(limit, per_query_limit) if limit else per_query_limit)/ per_query_limit

    # also specify how many venues read. This is because foursquare documentation and
    # API behaviour doesn't match. So we have to cut off venue listing at limit
    venues_read = 0

    while i < max_iterations:

        total_venue_count = 0
        resp = requests.get(url, params=querystr_args)
        # print("Getting venues from: ", resp.url)
        if resp.ok:                                 
            results = resp.json()
            # print(results)
            if results and results["meta"]["code"] == 200:
                # Get total # of results
                total_venue_count = results["response"]["totalResults"]
                
                # Get venues in response
                results = results["response"]["groups"][0]["items"]
                # print("Page {} has {} items".format(i, len(results)))

                # get individual venue details
                for item in results:
                    # print (item)

                    # 10km radius may overshoot the city, This is ok. We are not restricting
                    # Fuzzy matching code is written for extensibility.
                    # Fuzzy matching because english address and local city name may be different
                    if fuzz.ratio(city_name, item["venue"]["location"].get("city")) >= city_name_match_ratio:
                        venue = {}
                        venue["venue_name"] = item["venue"]["name"]
                        venue["venue_latitude"] = item["venue"]["location"]["lat"]
                        venue["venue_longitude"] = item["venue"]["location"]["lng"]
                        if item["venue"]["categories"]:
                            venue["venue_category"] = item["venue"]["categories"][0]["name"]
                        else:
                            venue["venue_category"] = None
                        venue_list.append(venue)
                    else:
                        print("Discarding {}. Venue City: {} instead of {}. Distance: {}"
                            .format(
                                item["venue"]["name"], 
                                item["venue"]["location"].get("city"), 
                                city_name,
                                fuzz.ratio(city_name, item["venue"]["location"].get("city"))))
                    venues_read += 1


            # something came back from fsq
            else:
                print("Error in getting results Iteration: {}. Json returned: {}"
                    .format(i, results))
                    
        # valid response from request
        else:
            print("Error in getting results Iteration: {}. Response status: {}, {}"
                .format(i, resp.status_code, resp.text))

        # Update max iterations if limit is None, otherwise use limit to calculate iterations
        venues_to_read = min(limit, total_venue_count) if limit else total_venue_count
        max_iterations = venues_to_read/per_query_limit

        # get ready for next iteration
        i += 1
        querystr_args["offset"] = i*per_query_limit
        # are we on last page? Limit venues to what is needed
        querystr_args["limit"] = min(venues_to_read - venues_read, per_query_limit)

    # return venues found
    print("{} venues found within {}. {} considered valid"
        .format(total_venue_count, city_name, len(venue_list)))
    return venue_list


# City Similarity Evaluation

1. Get cities for which we need similar cities
    - This could be one or more cities provided by the user. Let us assume two cities
2. Get venues for each of the cities, since this is what is used for similarity.
    - We may want to use specific foursquare categories for venues based on use-case (e.g. Tourism -> focus on Food, art, culture etc.
3. Pick 50 random cities from the list of cities
    - This is being done to stay under the Foursquare API limits of 950 calls per day.
4. Get venues for these random cities
5. Perform one-hot encoding of venues
6. Cluster random cities together based on venues
7. Measure similarities between given cities and randomly selected cities
8. Correlate clusters with similarity scores to find which groups are similar to cities provided and which ones are dissimilar

## Get User-input Cities and venues

In [8]:
user_input_cities = ["New York, NY USA", "Amsterdam, NL"]

# Only select Arts and Entertainment, Food, Nightlife, Outdoors / Recreation, Shop & Service, Travel venues
# venue_categories = [
#     "4d4b7104d754a06370d81259", "4d4b7105d754a06374d81259", 
#     "4d4b7105d754a06376d81259", "4d4b7105d754a06377d81259", 
#     "4d4b7105d754a06378d81259", "4d4b7105d754a06379d81259"
# ]

# Get latitude and longitude for user cities
user_city_list = []
for city in user_input_cities:
    lat, lng = get_lat_long(*city.split(", "))
    if lat and lng:
        user_city_list.append((city.split(", ")[0], lat, lng))

# Get venues for the city
user_city_venues_df = pd.DataFrame()
for city, lat, lng in user_city_list:
    city_venues = get_nearby_venues(lat, lng, city, radius=10000)
    if city_venues:
        per_city_venue_df = pd.DataFrame(city_venues)
        per_city_venue_df["city_name"] = city
        per_city_venue_df["city_latitude"] = lat
        per_city_venue_df["city_longitude"] = lng

        per_city_venue_df = per_city_venue_df[
            ["city_name", "city_latitude", "city_longitude", 
            "venue_name", "venue_latitude", "venue_longitude", "venue_category"]]
        # print(per_city_venue_df.shape[0], " venues found for ", city)
        user_city_venues_df = user_city_venues_df.append(per_city_venue_df, ignore_index=True)

print(user_city_venues_df.shape[0], " rows in total")
user_city_venues_df.head()

Returning for: New York, Stone Street Historic District, New York, United States of America
Returning for: Amsterdam, Noord-Holland, Nederland
236 venues found within New York. 236 considered valid
244 venues found within Amsterdam. 244 considered valid
480  rows in total


Unnamed: 0,city_name,city_latitude,city_longitude,venue_name,venue_latitude,venue_longitude,venue_category
0,New York,40.712728,-74.006015,Los Tacos No. 1,40.714267,-74.008756,Taco Place
1,New York,40.712728,-74.006015,Aire Ancient Baths,40.718141,-74.004941,Spa
2,New York,40.712728,-74.006015,9/11 Memorial North Pool,40.712077,-74.013187,Memorial Site
3,New York,40.712728,-74.006015,One World Trade Center,40.713069,-74.013133,Building
4,New York,40.712728,-74.006015,Crown Shy,40.706187,-74.00749,Restaurant


In [9]:
print(user_city_venues_df["venue_category"].nunique(), " unique venue categories!")
user_city_venues_df.groupby(["venue_category", "city_name"]).count()

146  unique venue categories!


Unnamed: 0_level_0,Unnamed: 1_level_0,city_latitude,city_longitude,venue_name,venue_latitude,venue_longitude
venue_category,city_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
American Restaurant,New York,2,2,2,2,2
Arepa Restaurant,New York,1,1,1,1,1
Art Gallery,New York,3,3,3,3,3
Art Museum,Amsterdam,3,3,3,3,3
Art Museum,New York,3,3,3,3,3
Arts & Crafts Store,New York,1,1,1,1,1
Athletics & Sports,Amsterdam,1,1,1,1,1
Athletics & Sports,New York,1,1,1,1,1
BBQ Joint,Amsterdam,1,1,1,1,1
BBQ Joint,New York,1,1,1,1,1


## Get Random Cities and Venues from the list of cities

In [10]:
# Sample 100 random cities from world_cities list
random_sample_count = 100
sample_cities = world_cities.sample(random_sample_count)
sample_cities.head()

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
13289,Ra’s al ‘Ayn,Ra's al `Ayn,36.8503,40.0706,Syria,SY,SYR,Al Ḩasakah,minor,29347.0,1760030804
26457,Caluula,Caluula,11.967,50.75,Somalia,SO,SOM,Bari,,513.0,1706796061
15080,Owosso,Owosso,42.9955,-84.176,United States,US,USA,Michigan,,21315.0,1840003068
22213,Bad Liebenstein,Bad Liebenstein,50.8144,10.3542,Germany,DE,DEU,Thuringia,,7786.0,1276199299
14069,Dedham,Dedham,42.2466,-71.1777,United States,US,USA,Massachusetts,,25377.0,1840053547


In [18]:
# Get venues from the cities and create a dataframe

sample_city_venues_df = pd.DataFrame()
for city, lat, lng in zip(sample_cities["city"], sample_cities["lat"], sample_cities["lng"]):
    # print("Going venue hunting for {} at {}/{}".format(city, lat, lng))
    city_venues = get_nearby_venues(lat, lng, city, radius=10000)
    
    # for tiny cities, if we are checking within city bounds, we may get empty venue list 
    if city_venues:
        per_city_venue_df = pd.DataFrame(city_venues)
        per_city_venue_df["city_name"] = city
        per_city_venue_df["city_latitude"] = lat
        per_city_venue_df["city_longitude"] = lng
        per_city_venue_df = per_city_venue_df[
            ["city_name", "city_latitude", "city_longitude", 
            "venue_name", "venue_latitude", "venue_longitude", "venue_category"]]
        sample_city_venues_df = sample_city_venues_df.append(per_city_venue_df, ignore_index=True)

sample_city_venues_df

39 venues found within Ra’s al ‘Ayn. 39 considered valid
0 venues found within Caluula. 0 considered valid
54 venues found within Owosso. 54 considered valid
17 venues found within Bad Liebenstein. 17 considered valid
237 venues found within Dedham. 237 considered valid
37 venues found within Grodzisk Mazowiecki. 37 considered valid
72 venues found within Bay St. Louis. 72 considered valid
97 venues found within Bearsted. 97 considered valid
17 venues found within Námestovo. 17 considered valid
27 venues found within Balingen. 27 considered valid
30 venues found within Tutong. 30 considered valid
235 venues found within Barton upon Irwell. 235 considered valid
63 venues found within Hampton. 63 considered valid
14 venues found within Mecca. 14 considered valid
240 venues found within Bagneux. 240 considered valid
238 venues found within Hyattsville. 238 considered valid
138 venues found within Bow. 138 considered valid
6 venues found within Kampong Cham. 6 considered valid
6 venues fou

Unnamed: 0,city_name,city_latitude,city_longitude,venue_name,venue_latitude,venue_longitude,venue_category
0,Ra’s al ‘Ayn,36.8503,40.0706,Azizbey Coffee Shop,36.862475,40.048868,Coffee Shop
1,Ra’s al ‘Ayn,36.8503,40.0706,Hasbihâl Cafe,36.847610,40.053702,Café
2,Ra’s al ‘Ayn,36.8503,40.0706,İncir6 Cafe,36.841199,40.042841,Café
3,Ra’s al ‘Ayn,36.8503,40.0706,Şafak Cafe,36.841551,40.042599,Bar
4,Ra’s al ‘Ayn,36.8503,40.0706,Teras Cafe&Otel,36.849471,40.052688,Café
5,Ra’s al ‘Ayn,36.8503,40.0706,ceylanpınar öğretmenevi restaurant,36.846499,40.054827,Turkish Restaurant
6,Ra’s al ‘Ayn,36.8503,40.0706,eco ciger,36.843972,40.051398,Steakhouse
7,Ra’s al ‘Ayn,36.8503,40.0706,PaşaBey Kebap,36.845106,40.052418,Steakhouse
8,Ra’s al ‘Ayn,36.8503,40.0706,PAŞABEY KEBAP,36.845031,40.052430,Diner
9,Ra’s al ‘Ayn,36.8503,40.0706,Ceylanpınar Tigem Çiftlik,36.840457,40.032132,Farm


In [23]:
# It really makes little sense to compare cities if they come up with very few venues.
# that is not enough information. So dropping venues < 20

scv_copy = sample_city_venues_df.copy()
scv_group_df = sample_city_venues_df.groupby("city_name").count()
sample_city_venues_df = sample_city_venues_df[
    sample_city_venues_df["city_name"].isin(scv_group_df[scv_group_df["venue_name"]>20].index)
]

sample_city_venues_df
sample_city_venues_df.groupby("city_name").count()

Unnamed: 0_level_0,city_latitude,city_longitude,venue_name,venue_latitude,venue_longitude,venue_category
city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Altenberg,36,36,36,36,36,36
Bagneux,240,240,240,240,240,240
Balingen,27,27,27,27,27,27
Barton upon Irwell,235,235,235,235,235,235
Bay St. Louis,72,72,72,72,72,72
Bearsted,97,97,97,97,97,97
Benevento,22,22,22,22,22,22
Borlänge,27,27,27,27,27,27
Bow,138,138,138,138,138,138
Bratislava,230,230,230,230,230,230


## One Hot Encoding of Venues in user cities and random cities
### Followed by grouping to get proportion of venue_categories for cities

In [24]:
# For user city venues
user_city_venues_onehot = pd.get_dummies(user_city_venues_df[["venue_category"]], prefix="", prefix_sep="")
user_city_venues_onehot.insert(0, "city_name", user_city_venues_df.city_name)
user_city_venues_grouped = user_city_venues_onehot.groupby("city_name").mean().reset_index()
user_city_venues_grouped

Unnamed: 0,city_name,American Restaurant,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Athletics & Sports,BBQ Joint,Bagel Shop,Bakery,...,Turkish Restaurant,Udon Restaurant,Vegetarian / Vegan Restaurant,Volleyball Court,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio,Zoo
0,Amsterdam,0.0,0.0,0.0,0.012295,0.0,0.004098,0.004098,0.0,0.020492,...,0.008197,0.0,0.008197,0.0,0.0,0.004098,0.012295,0.0,0.02459,0.004098
1,New York,0.008475,0.004237,0.012712,0.012712,0.004237,0.004237,0.004237,0.004237,0.033898,...,0.0,0.004237,0.0,0.004237,0.004237,0.0,0.0,0.016949,0.012712,0.0


In [25]:
# For sample city venues
sample_city_venues_onehot = pd.get_dummies(sample_city_venues_df[["venue_category"]], prefix="", prefix_sep="")
sample_city_venues_onehot.insert(0, "city_name", sample_city_venues_df.city_name)
sample_city_venues_grouped = sample_city_venues_onehot.groupby("city_name").mean().reset_index()
sample_city_venues_grouped

Unnamed: 0,city_name,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,Airport Service,Airport Terminal,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Altenberg,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,Bagneux,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.037500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,Balingen,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,Barton upon Irwell,0.000000,0.000000,0.004255,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.008511,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,Bay St. Louis,0.000000,0.000000,0.000000,0.000000,0.0,0.013889,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,Bearsted,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,Benevento,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,Borlänge,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,Bow,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.007246,0.000000,0.000000,0.000000,0.000000
9,Bratislava,0.000000,0.000000,0.000000,0.000000,0.0,0.004348,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.030435,0.004348,0.000000,0.000000,0.000000,0.008696,0.000000,0.000000


## Find Cosine Similarity between user cities and sample cities

In [26]:
# You need to have only common venue categories between both arrays to use cosine similarity
# Since we are finding similarities with user provided cities, let's use columns from that array
user_city_venues_grouped_ndarray = user_city_venues_grouped.drop("city_name", 1)
sample_city_venues_grouped_ndarray = sample_city_venues_grouped.drop("city_name", 1)
sample_city_venues_grouped_ndarray = \
    sample_city_venues_grouped_ndarray.loc[:, user_city_venues_grouped_ndarray.columns].fillna(0.0)
print(user_city_venues_grouped_ndarray.shape, sample_city_venues_grouped_ndarray.shape)

# Convert dataframes to numpy arrays
user_city_venues_grouped_ndarray = user_city_venues_grouped_ndarray.to_numpy()
sample_city_venues_grouped_ndarray = sample_city_venues_grouped_ndarray.to_numpy()

# Now find cosine similarities
import sklearn.metrics
cosine_similarities = sklearn.metrics.pairwise.cosine_similarity(
    sample_city_venues_grouped_ndarray, user_city_venues_grouped_ndarray
)

# convert cosine similarities to dataframe
cosine_similarities_df = pd.DataFrame(
    data=cosine_similarities, 
    columns=user_city_venues_grouped.city_name.tolist())

cosine_similarities_df.insert(0, "city_name", sample_city_venues_grouped.city_name)

cosine_similarities_df


(2, 146) (70, 146)


Unnamed: 0,city_name,Amsterdam,New York
0,Altenberg,0.540714,0.067875
1,Bagneux,0.612902,0.441608
2,Balingen,0.165268,0.127305
3,Barton upon Irwell,0.695582,0.433480
4,Bay St. Louis,0.479543,0.273900
5,Bearsted,0.387512,0.230056
6,Benevento,0.565277,0.244929
7,Borlänge,0.405543,0.093008
8,Bow,0.574423,0.472873
9,Bratislava,0.701239,0.416556


## Cluster based on similarity to input cities
- This is so that we can find the right group of cities to choose from depending on input cities
- We will also try clustering based on venues to see how that impacts things
    - Differences are expected because venue-categories are different for cosine similarity and clustering

In [27]:
## FIRST FIND THE OPTIMAL K
from sklearn.cluster import KMeans
import sklearn.metrics

def optimal_k(max_k: int, data) -> int:
    
    """
    Finds optimal K for clustering based on multiple algorithms for given data

    Parameters
    -----------
    max_k: Maximum value of K to test clustering for
    data: Data to be used for clustering model fit. Dataframe likely.

    Returns
    ---------
    Best value of K depending on scores
    """
    perf_scores = {}

    def update_scorelist(eval_method, score):
        score_list = perf_scores.get(eval_method) if perf_scores.get(eval_method) else []
        score_list.append(score)
        perf_scores[eval_method] = score_list

    for k in range(2, max_k + 1):
        k_means = KMeans(n_clusters=k, random_state=0).fit(data)

        # silhouette
        score = sklearn.metrics.silhouette_score(data, k_means.labels_, metric="euclidean")
        update_scorelist("silhouette", score)
    
        # Calinski-Harabasz Index
        score = sklearn.metrics.calinski_harabaz_score(data, k_means.labels_)
        update_scorelist("chindex", score)

        # Davies-Boudin Index (Lower is better)
        score = sklearn.metrics.davies_bouldin_score(data, k_means.labels_)
        update_scorelist("dbindex", score * -1)

    k_eval_scores_df = pd.DataFrame(perf_scores, index=range(2, max_k+1))
    print("Scores for different evaluation methods \n", k_eval_scores_df)

    # Now get the most common "K" corresponding to best scores
    return k_eval_scores_df.idxmax().mode().iat[0]


In [28]:
# Best K for cosine_similarity based clustering
cosine_similarity_clustering = cosine_similarities_df.drop("city_name", axis=1)
best_k_cosine = optimal_k(min(10, random_sample_count-1), cosine_similarity_clustering)
print("Best value of K for K-Means Clustering for cosine is: ", best_k_cosine)

# Best K for venue based city clustering
sample_city_venues_grouped_clustering = sample_city_venues_grouped.drop("city_name", axis=1)
best_k_venues = optimal_k(min(10, random_sample_count-1), sample_city_venues_grouped_clustering)
print("Best value of K for K-means Clustering for venue based is: ", best_k_venues)

Scores for different evaluation methods 
     silhouette    chindex   dbindex
2     0.419522  67.491895 -0.914515
3     0.423943  72.750195 -0.823838
4     0.390056  74.291245 -0.805947
5     0.401783  74.232144 -0.775319
6     0.411694  74.328725 -0.752050
7     0.405748  77.126973 -0.708961
8     0.400940  81.727222 -0.686822
9     0.390991  80.380440 -0.760795
10    0.426492  84.916755 -0.702293
Best value of K for K-Means Clustering for cosine is:  10
Scores for different evaluation methods 
     silhouette    chindex   dbindex
2     0.130194  10.155475 -2.418381
3     0.141524   7.584114 -2.413104
4     0.190934   8.470256 -1.902081
5     0.182718   8.102018 -1.792140
6     0.117264   7.161830 -1.986046
7     0.129951   7.339737 -1.646481
8     0.086059   6.870032 -1.841283
9     0.086312   6.196892 -1.793630
10    0.137296   6.322522 -1.610861
Best value of K for K-means Clustering for venue based is:  2


### K-Means with Cosine Similarity Dataframe

In [29]:
kmeans = KMeans(n_clusters=best_k_cosine, random_state=0).fit(cosine_similarity_clustering)
cluster_df = pd.DataFrame(kmeans.labels_, columns=["cluster_label"])
cluster_df.insert(0, "city_name", cosine_similarities_df["city_name"])
cluster_df

Unnamed: 0,city_name,cluster_label
0,Altenberg,2
1,Bagneux,0
2,Balingen,1
3,Barton upon Irwell,0
4,Bay St. Louis,4
5,Bearsted,4
6,Benevento,9
7,Borlänge,8
8,Bow,3
9,Bratislava,0


# Visualization

## Prepare for visualization

In [30]:
# Create a dataframe with clusters, and city latitude/longitude
visualization_df = pd.merge(
    sample_cities[["city", "country", "lat", "lng"]], 
    cluster_df.rename(columns={"city_name": "city"}), 
    on="city", how="inner"
)
visualization_df

Unnamed: 0,city,country,lat,lng,cluster_label
0,Ra’s al ‘Ayn,Syria,36.8503,40.0706,8
1,Owosso,United States,42.9955,-84.1760,6
2,Dedham,United States,42.2466,-71.1777,7
3,Grodzisk Mazowiecki,Poland,52.1039,20.6337,9
4,Bay St. Louis,United States,30.3281,-89.3774,4
5,Bearsted,United Kingdom,51.2738,0.5789,4
6,Balingen,Germany,48.2731,8.8506,1
7,Tutong,Brunei,4.8028,114.6492,1
8,Barton upon Irwell,United Kingdom,53.4760,-2.3600,0
9,Hampton,United States,33.3835,-84.2855,1


In [31]:
# Now visualize dataframe using folium
map_sample_cities = folium.Map([40.866667, 34.566667], zoom_start=1.5)

import matplotlib.cm
import matplotlib.colors

# set color scheme for the clusters
x = np.arange(best_k_cosine)
ys = [i + x + (i*x)**2 for i in range(best_k_cosine)]
colors_array = matplotlib.cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [matplotlib.colors.rgb2hex(i) for i in colors_array]
print(ys)

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(visualization_df['lat'], visualization_df['lng'], 
    visualization_df['city'], visualization_df['cluster_label']
):
    
    label = folium.Popup("{}: Cluster {}".format(poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_sample_cities)
    

[array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64), array([ 1,  3,  7, 13, 21, 31, 43, 57, 73, 91], dtype=int64), array([  2,   7,  20,  41,  70, 107, 152, 205, 266, 335], dtype=int64), array([  3,  13,  41,  87, 151, 233, 333, 451, 587, 741], dtype=int64), array([   4,   21,   70,  151,  264,  409,  586,  795, 1036, 1309],
      dtype=int64), array([   5,   31,  107,  233,  409,  635,  911, 1237, 1613, 2039],
      dtype=int64), array([   6,   43,  152,  333,  586,  911, 1308, 1777, 2318, 2931],
      dtype=int64), array([   7,   57,  205,  451,  795, 1237, 1777, 2415, 3151, 3985],
      dtype=int64), array([   8,   73,  266,  587, 1036, 1613, 2318, 3151, 4112, 5201],
      dtype=int64), array([   9,   91,  335,  741, 1309, 2039, 2931, 3985, 5201, 6579],
      dtype=int64)]


In [32]:
map_sample_cities

# Analysis

In [33]:
# First Create the right dataframe
analysis_df = pd.merge(
    visualization_df, 
    cosine_similarities_df.rename(columns={"city_name": "city"}), 
    how="inner", on="city"
)

analysis_df

Unnamed: 0,city,country,lat,lng,cluster_label,Amsterdam,New York
0,Ra’s al ‘Ayn,Syria,36.8503,40.0706,8,0.385948,0.146914
1,Owosso,United States,42.9955,-84.1760,6,0.436798,0.394528
2,Dedham,United States,42.2466,-71.1777,7,0.478020,0.636009
3,Grodzisk Mazowiecki,Poland,52.1039,20.6337,9,0.566953,0.323496
4,Bay St. Louis,United States,30.3281,-89.3774,4,0.479543,0.273900
5,Bearsted,United Kingdom,51.2738,0.5789,4,0.387512,0.230056
6,Balingen,Germany,48.2731,8.8506,1,0.165268,0.127305
7,Tutong,Brunei,4.8028,114.6492,1,0.220841,0.154647
8,Barton upon Irwell,United Kingdom,53.4760,-2.3600,0,0.695582,0.433480
9,Hampton,United States,33.3835,-84.2855,1,0.254649,0.209968


In [34]:
analysis_df.groupby("cluster_label").mean()

Unnamed: 0_level_0,lat,lng,Amsterdam,New York
cluster_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,35.351929,2.059457,0.683625,0.449606
1,38.7766,-23.262825,0.233234,0.182027
2,47.574417,10.837367,0.624771,0.125088
3,37.055473,-57.876145,0.528852,0.493423
4,44.65777,-32.98891,0.422974,0.289303
5,41.613633,-110.9495,0.645432,0.62936
6,34.444575,-72.618462,0.467783,0.402549
7,38.969125,-81.85315,0.511913,0.615651
8,40.22082,33.97146,0.411384,0.11134
9,32.635525,45.820612,0.602428,0.277915


The following things are evident:

1. Cluster 2 is similar to Amsterdam and dissimilar to New York
2. Cluster 7 is most similar to New York
3. Cluster 5 is similar to both Amsterdam and New York (high similarity)
4. Cluster 1 is not similar to Amsterdam and New York!

## Cluster 2 - Cities Similar to Amsterdam and dissimilar to New York

In [36]:
cluster2_df = analysis_df[analysis_df["cluster_label"] == 2]
print ("{} cities similar to Amsterdam but not to New York".format(cluster2_df.shape[0]))
cluster2_df

6 cities similar to Amsterdam but not to New York


Unnamed: 0,city,country,lat,lng,cluster_label,Amsterdam,New York
24,Altenberg,Germany,50.7644,13.7578,2,0.540714,0.067875
48,Glücksburg,Germany,54.8336,9.55,2,0.676273,0.13835
49,Hatay,Turkey,36.2,36.15,2,0.573339,0.175051
51,Warminster,United Kingdom,51.205,-2.181,2,0.618225,0.119054
55,Vianden,Luxembourg,49.935,6.2089,2,0.695468,0.112245
56,Escaldes-Engordany,Andorra,42.5085,1.5385,2,0.644605,0.137951


## Cluster 7 - Cities Similar to New York (and sometimes to Amsterdam)

In [37]:
cluster7_df = analysis_df[analysis_df["cluster_label"] == 7]
print ("{} cities similar to New York".format(cluster7_df.shape[0]))
cluster7_df

4 cities similar to New York


Unnamed: 0,city,country,lat,lng,cluster_label,Amsterdam,New York
2,Dedham,United States,42.2466,-71.1777,7,0.47802,0.636009
13,Macedonia,United States,41.3147,-81.4989,7,0.526374,0.597723
15,Sunset Hills,United States,38.531,-90.4087,7,0.532987,0.617291
20,Druid Hills,United States,33.7842,-84.3273,7,0.510272,0.61158


## Cluster 5 - Cities Similar to both Amsterdam and New York Similarly

In [38]:
cluster5_df = analysis_df[analysis_df["cluster_label"] == 5]
print ("{} cities similar to both Amsterdam and New York".format(cluster5_df.shape[0]))
cluster5_df

3 cities similar to both Amsterdam and New York


Unnamed: 0,city,country,lat,lng,cluster_label,Amsterdam,New York
54,West Slope,United States,45.4962,-122.7731,5,0.701042,0.59211
65,Woodside,United States,37.4222,-122.2591,5,0.615244,0.610199
69,Elmwood Park,United States,41.9225,-87.8163,5,0.620011,0.68577


## Cluster 1 - Cities very dissimilar from Amsterdam as well as New York

In [39]:
cluster1_df = analysis_df[analysis_df["cluster_label"] == 1]
print ("{} cities dissimilar to Amsterdam & New York".format(cluster1_df.shape[0]))
cluster1_df

8 cities dissimilar to Amsterdam & New York


Unnamed: 0,city,country,lat,lng,cluster_label,Amsterdam,New York
6,Balingen,Germany,48.2731,8.8506,1,0.165268,0.127305
7,Tutong,Brunei,4.8028,114.6492,1,0.220841,0.154647
9,Hampton,United States,33.3835,-84.2855,1,0.254649,0.209968
16,Fort Drum,United States,44.0451,-75.7847,1,0.257285,0.256359
17,Horndean,United Kingdom,50.9136,-0.9961,1,0.251864,0.144737
29,Claremont,United States,43.379,-72.3368,1,0.245885,0.218866
37,Monroe,United States,33.799,-83.716,1,0.303484,0.239766
42,Lünen,Germany,51.6167,7.5167,1,0.166598,0.104564
