# Comparing cities: Toronto, New York and London

In [1]:
import os

import pandas as pd
import numpy as np
import requests
#!pip install geocoder
import geocoder

## Import data:

### 1- Toronto
First, we take the list of Toronto postal codes from Wikipedia.

In [2]:
# Download the Wikipedia page "List of postal codes of Canada: M" and save it as the local file `toronto`.
!wget -O toronto 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

--2019-09-17 20:04:26--  https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
Resolvendo en.wikipedia.org (en.wikipedia.org)... 208.80.154.224, 2620:0:861:ed1a::1
Conectando-se a en.wikipedia.org (en.wikipedia.org)|208.80.154.224|:443... conectado.
A requisição HTTP foi enviada, aguardando resposta... 200 OK
Tamanho: 79904 (78K) [text/html]
Salvando em: “toronto”


2019-09-17 20:04:32 (191 KB/s) - “toronto” salvo [79904/79904]



In [3]:
# Parse every HTML table in the saved page; the first one (index 0) is the table kept here.
toronto_tables = pd.read_html('toronto', header=0)
data = toronto_tables[0]
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


As we can see, some rows have a "Not assigned" neighbourhood. Because we will geocode each row by its neighbourhood name, those rows have to be removed first.

In [4]:
# Keep only rows whose neighbourhood has a real name, then renumber the rows from 0.
has_neighbourhood = data['Neighbourhood'] != 'Not assigned'
data = data.loc[has_neighbourhood].reset_index(drop=True)
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [5]:
# Take an explicit copy so that columns added later are written to an
# independent frame rather than to a slice of `data` (this is what triggers
# pandas' SettingWithCopyWarning).
toronto = data[['Neighbourhood']].copy()

In [6]:
# Geocode every Toronto neighbourhood with ArcGIS.
# The number of attempts is bounded so that a name ArcGIS cannot resolve
# fails loudly instead of hanging the kernel in an endless retry loop.
MAX_GEOCODE_ATTEMPTS = 5

toronto_latitudes = []
toronto_longitudes = []
for neighb in toronto['Neighbourhood']:
    lat_lng_coords = None
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(neighb))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break
    if not lat_lng_coords:
        raise RuntimeError(
            'Could not geocode "{}, Toronto, Ontario" after {} attempts'.format(
                neighb, MAX_GEOCODE_ATTEMPTS))
    toronto_latitudes.append(lat_lng_coords[0])
    toronto_longitudes.append(lat_lng_coords[1])

# Attach both columns in one step; `assign` returns a new frame, so nothing
# is written into a slice of another DataFrame.
toronto = toronto.assign(Latitude=toronto_latitudes, Longitude=toronto_longitudes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [7]:
# Preview the first Toronto neighbourhoods together with their coordinates.
toronto.head()

Unnamed: 0,Neighbourhood,Latitude,Longitude
0,Parkwoods,43.686575,-79.409993
1,Victoria Village,43.73154,-79.31428
2,Harbourfront,43.63951,-79.38316
3,Regent Park,43.66069,-79.36031
4,Lawrence Heights,43.72357,-79.43711


### 2- New York
We take the New York neighborhoods from Wikipedia.

In [8]:
# Download the Wikipedia page "Neighborhoods in New York City" and save it as the local file `newyork`.
!wget -O newyork 'https://en.wikipedia.org/wiki/Neighborhoods_in_New_York_City'

--2019-09-17 20:08:20--  https://en.wikipedia.org/wiki/Neighborhoods_in_New_York_City
Resolvendo en.wikipedia.org (en.wikipedia.org)... 208.80.154.224, 2620:0:861:ed1a::1
Conectando-se a en.wikipedia.org (en.wikipedia.org)|208.80.154.224|:443... conectado.
A requisição HTTP foi enviada, aguardando resposta... 200 OK
Tamanho: 160781 (157K) [text/html]
Salvando em: “newyork”


2019-09-17 20:08:27 (215 KB/s) - “newyork” salvo [160781/160781]



In [10]:
# Parse every HTML table in the saved page; the first one (index 0) is the table kept here.
newyork_tables = pd.read_html('newyork', header=0)
data = newyork_tables[0]
data.head()

Unnamed: 0,Community Board(CB),Areakm2,Pop.Census2010,Pop./km2,Neighborhoods
0,Bronx CB 1,7.17,91497,12761,"Melrose, Mott Haven, Port Morris"
1,Bronx CB 2,5.54,52246,9792,"Hunts Point, Longwood"
2,Bronx CB 3,4.07,79762,19598,"Claremont, Concourse Village, Crotona Park, Mo..."
3,Bronx CB 4,5.28,146441,27735,"Concourse, Highbridge"
4,Bronx CB 5,3.55,128200,36145,"Fordham, Morris Heights, Mount Hope, Universit..."


In [11]:
# Keep only the neighborhood lists. Selecting and converting in one chained
# expression builds a new frame, so no value is assigned into a slice of the
# original table. Rows with a missing list are dropped rather than being
# converted to the literal string 'nan'.
data = data[['Neighborhoods']].dropna().astype(str)
data.head()

Unnamed: 0,Neighborhoods
0,"Melrose, Mott Haven, Port Morris"
1,"Hunts Point, Longwood"
2,"Claremont, Concourse Village, Crotona Park, Mo..."
3,"Concourse, Highbridge"
4,"Fordham, Morris Heights, Mount Hope, Universit..."


As we can see, each row lists several neighborhoods, so we need to split the rows and handle each neighborhood separately.

In [12]:
def explode(series):
    """Split comma-separated neighborhood lists into one row per neighborhood.

    Parameters
    ----------
    series : iterable of str
        Each element is a comma-separated list of names, e.g.
        ``"Melrose, Mott Haven, Port Morris"``.

    Returns
    -------
    pandas.DataFrame
        A single-column frame named ``Neighborhoods`` holding one name per
        row, in the order the names appear in ``series``.
    """
    names = []
    for cell in series:
        for part in cell.split(','):
            # Strip so that "A,B" and "A, B " yield the same clean names.
            name = part.strip()
            # Skip empty pieces produced by blank cells or stray commas.
            if name:
                names.append(name)
    return pd.DataFrame(names, columns=['Neighborhoods'])

# One row per New York neighborhood; `data` is kept pointing at the same frame.
new_york = explode(data['Neighborhoods'])
data = new_york
new_york.head()

Unnamed: 0,Neighborhoods
0,Melrose
1,Mott Haven
2,Port Morris
3,Hunts Point
4,Longwood


In [13]:
# Geocode every New York neighborhood with ArcGIS.
# The number of attempts is bounded so that a name ArcGIS cannot resolve
# fails loudly instead of hanging the kernel in an endless retry loop.
MAX_GEOCODE_ATTEMPTS = 5

new_york_latitudes = []
new_york_longitudes = []
for neighb in new_york['Neighborhoods']:
    lat_lng_coords = None
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, New York, New York'.format(neighb))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break
    if not lat_lng_coords:
        raise RuntimeError(
            'Could not geocode "{}, New York, New York" after {} attempts'.format(
                neighb, MAX_GEOCODE_ATTEMPTS))
    new_york_latitudes.append(lat_lng_coords[0])
    new_york_longitudes.append(lat_lng_coords[1])

# Attach both columns in one step; `assign` returns a new frame instead of
# growing the existing one cell by cell.
new_york = new_york.assign(Latitude=new_york_latitudes, Longitude=new_york_longitudes)

In [14]:
# Preview the first New York neighborhoods together with their coordinates.
new_york.head()

Unnamed: 0,Neighborhoods,Latitude,Longitude
0,Melrose,40.82443,-73.9107
1,Mott Haven,40.80899,-73.92295
2,Port Morris,40.80134,-73.90996
3,Hunts Point,40.81242,-73.8845
4,Longwood,40.81746,-73.89815


### 3- London
We get the London locations from Wikipedia.

In [15]:
# Download the Wikipedia page "List of areas of London" and save it as the local file `london`.
!wget -O london 'https://en.wikipedia.org/wiki/List_of_areas_of_London'

--2019-09-17 20:15:03--  https://en.wikipedia.org/wiki/List_of_areas_of_London
Resolvendo en.wikipedia.org (en.wikipedia.org)... 208.80.154.224, 2620:0:861:ed1a::1
Conectando-se a en.wikipedia.org (en.wikipedia.org)|208.80.154.224|:443... conectado.
A requisição HTTP foi enviada, aguardando resposta... 200 OK
Tamanho: 311259 (304K) [text/html]
Salvando em: “london”


2019-09-17 20:15:10 (294 KB/s) - “london” salvo [311259/311259]



In [16]:
london_tables = pd.read_html('london', header=0)
# The second table on the page (index 1) is used; only its `Location` column is kept.
data = london_tables[1][['Location']]
data.head()

Unnamed: 0,Location
0,Abbey Wood
1,Acton
2,Addington
3,Addiscombe
4,Albany Park


In [17]:
# Build the London frame with the same column name as the New York data.
# `rename` returns a new frame, so `data` is left untouched and this cell
# gives the same result every time it is run.
london = data.rename(columns={'Location': 'Neighborhoods'})

In [18]:
# Geocode every London area with ArcGIS.
# The number of attempts is bounded so that a name ArcGIS cannot resolve
# fails loudly instead of hanging the kernel in an endless retry loop.
MAX_GEOCODE_ATTEMPTS = 5

london_latitudes = []
london_longitudes = []
for neighb in london['Neighborhoods']:
    lat_lng_coords = None
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, London, England'.format(neighb))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break
    if not lat_lng_coords:
        raise RuntimeError(
            'Could not geocode "{}, London, England" after {} attempts'.format(
                neighb, MAX_GEOCODE_ATTEMPTS))
    london_latitudes.append(lat_lng_coords[0])
    london_longitudes.append(lat_lng_coords[1])

# Attach both columns in one step; `assign` returns a new frame, so nothing
# is written into a slice of another DataFrame.
london = london.assign(Latitude=london_latitudes, Longitude=london_longitudes)

In [19]:
# Preview the first London areas together with their coordinates.
london.head()

Unnamed: 0,Neighborhoods,Latitude,Longitude
0,Abbey Wood,51.49245,0.12127
1,Acton,51.51324,-0.26746
2,Addington,51.428124,-0.044685
3,Addiscombe,51.472745,-0.203324
4,Albany Park,51.48511,-0.08241


## Foursquare
Foursquare is used to get the venues near each neighborhood.

In [20]:
# Foursquare credentials are read from the environment so that they are never
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError here.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']  # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']  # your Foursquare Secret
# Foursquare API version, sent as the `v` request parameter (a YYYYMMDD date).
# Override it with FOURSQUARE_API_VERSION if a different version is needed.
VERSION = os.environ.get('FOURSQUARE_API_VERSION', '20180605')

In [22]:
LIMIT = 50  # maximum number of venues requested per neighborhood


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Fetch the venues Foursquare recommends around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighborhood names and their coordinates; they are walked in parallel.
    radius : int, default 500
        Value sent as the ``radius`` parameter of the ``venues/explore`` request.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status code.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string; the timeout keeps a
        # stalled connection from blocking the notebook indefinitely.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Surface HTTP errors directly instead of failing later with a
        # KeyError on the missing payload.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # Keep only the relevant information for each nearby venue.
        for v in results:
            venue = v['venue']
            # Some venues carry no category; record None instead of failing.
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing `columns` to the constructor also works when `venue_rows` is empty.
    return pd.DataFrame(venue_rows, columns=columns)

In [36]:
# Fetch the venues around every Toronto neighbourhood.
toronto_venues = getNearbyVenues(
    names=toronto['Neighbourhood'],
    latitudes=toronto['Latitude'],
    longitudes=toronto['Longitude'],
)
# Count venues per category and list the most frequent categories first.
# NOTE: this rebinds `toronto` from the neighbourhood table to the category
# counts, so the cell cannot be re-run without re-running the geocoding first.
toronto_category_sizes = toronto_venues.groupby(['Venue Category']).size()
toronto_category_counts = toronto_category_sizes.reset_index(name='Counts')
toronto = toronto_category_counts.sort_values('Counts', ascending=False)

In [37]:
# Fetch the venues around every New York neighborhood.
new_york_venues = getNearbyVenues(
    names=new_york['Neighborhoods'],
    latitudes=new_york['Latitude'],
    longitudes=new_york['Longitude'],
)
# Count venues per category and list the most frequent categories first.
# NOTE: this rebinds `new_york` from the neighborhood table to the category
# counts, so the cell cannot be re-run without re-running the geocoding first.
new_york_category_sizes = new_york_venues.groupby(['Venue Category']).size()
new_york_category_counts = new_york_category_sizes.reset_index(name='Counts')
new_york = new_york_category_counts.sort_values('Counts', ascending=False)

In [40]:
# Fetch the venues around every London area.
london_venues = getNearbyVenues(
    names=london['Neighborhoods'],
    latitudes=london['Latitude'],
    longitudes=london['Longitude'],
)
# Count venues per category and list the most frequent categories first.
# NOTE: this rebinds `london` from the area table to the category counts,
# so the cell cannot be re-run without re-running the geocoding first.
london_category_sizes = london_venues.groupby(['Venue Category']).size()
london_category_counts = london_category_sizes.reset_index(name='Counts')
london = london_category_counts.sort_values('Counts', ascending=False)

In [41]:
# Show the most frequent venue categories in Toronto (`toronto` now holds category counts).
toronto.head()

Unnamed: 0,Venue Category,Counts
70,Coffee Shop,257
56,Café,161
218,Park,134
226,Pizza Place,115
242,Restaurant,88


In [42]:
# Show the most frequent venue categories in New York (`new_york` now holds category counts).
new_york.head()

Unnamed: 0,Venue Category,Counts
276,Pizza Place,431
194,Italian Restaurant,253
104,Deli / Bodega,238
79,Coffee Shop,230
73,Chinese Restaurant,213


In [43]:
# Show the most frequent venue categories in London (`london` now holds category counts).
london.head()

Unnamed: 0,Venue Category,Counts
290,Pub,884
84,Coffee Shop,779
64,Café,685
165,Grocery Store,456
182,Hotel,384


In [44]:
# Report how many distinct venue categories were found in each city.
cities_by_name = [('Toronto', toronto), ('New York', new_york), ('London', london)]
for city_name, city_counts in cities_by_name:
    n_categories = len(city_counts['Venue Category'].unique())
    print('There are {} uniques categories in {}.'.format(n_categories, city_name))

There are 309 uniques categories in Toronto.
There are 404 uniques categories in New York.
There are 399 uniques categories in London.


## Comparing data
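The following function compares two cities based on their venue categories. Each category count is converted to a percentage of the city's venues, and the score is the sum, over the categories both cities share, of the smaller of the two percentages: 0 means the cities have no category in common and 100 means their category distributions are identical.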

In [158]:
def jaccard_weighted(city1, city2):
    """Score how similar two cities' venue-category distributions are.

    Each city's counts are converted to percentages of that city's total.
    The score is the sum, over the categories present in both cities, of the
    smaller of the two percentages.

    Parameters
    ----------
    city1, city2 : pandas.DataFrame
        Frames with a ``Venue Category`` column and a numeric ``Counts``
        column. The row index is irrelevant; a category that appears on
        several rows has its counts added together.

    Returns
    -------
    float
        A value from 0 (no shared category) to 100 (identical
        distributions). Returns 0.0 when either city has no venues.
    """
    total1 = city1['Counts'].sum()
    total2 = city2['Counts'].sum()
    # Without venues there is nothing to compare (and nothing to divide by).
    if total1 == 0 or total2 == 0:
        return 0.0

    # Percentage of each city's venues per category, keyed by category name
    # so the lookup never depends on the frames' index labels.
    share1 = city1.groupby('Venue Category')['Counts'].sum() * 100 / total1
    share2 = city2.groupby('Venue Category')['Counts'].sum() * 100 / total2

    # The inner join keeps only the categories present in both cities.
    shared = pd.concat([share1, share2], axis=1, join='inner')
    return float(shared.min(axis=1).sum())

In [159]:
# Print the similarity score for each pair of cities.
city_pairs = [
    ('NY and Toronto', new_york, toronto),
    ('NY and London', new_york, london),
    ('London and Toronto', london, toronto),
]
for pair_label, first_city, second_city in city_pairs:
    print('jw for {} = '.format(pair_label), jaccard_weighted(first_city, second_city))

jw for NY and Toronto =  66.59696845303091
jw for NY and London =  57.397012450538696
jw for London and Toronto =  67.07940505333352


As we can see, New York is more similar to Toronto than to London, Toronto is more similar to London than to New York, and London is more similar to Toronto than to New York.