# A comparison between Manchester and London neighbourhoods for relocation.

***

## Part 1 - Creating the dataframe 

In [1]:
# import required libraries

import pandas as pd # library for data analsysis

import requests # library to handle requests
import numpy as np

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0           conda-forge
    geopy:          

__The following code retrieves the data for Manchester and performs the data preperation__

In [2]:
# retrieve all Manchester tables from the wiki page 
tables_M = pd.read_html("https://en.wikipedia.org/wiki/List_of_places_in_Greater_Manchester")

# convert the required table (first table in the list) to a dataframe
dfM = pd.DataFrame(tables_M[0])

# split the "other components" column into seperate rows
oc = dfM['Other components'].str.split(', ').apply(pd.Series, 1).stack() # split the column
oc.index = oc.index.droplevel(-1) # drop the new index so it lines up with original df index
oc.name = 'Neighbourhood' # give the column a name

del dfM['Other components'] # delete the original column
dfM = dfM.join(oc).reset_index(drop=True) # join the neighbourhoods to the df and reset the index

dfM = dfM.drop(['Metropolitan borough.1','Centre of administration'], axis=1) # delete unneeded columns
dfM.rename(columns={"Metropolitan county": "City", "Metropolitan borough": "Borough"}, inplace=True) # rename columns

dfM


Unnamed: 0,City,Borough,Neighbourhood
0,Greater Manchester,Bury,Prestwich
1,Greater Manchester,Bury,Radcliffe
2,Greater Manchester,Bury,Ramsbottom
3,Greater Manchester,Bury,Tottington
4,Greater Manchester,Bury,Whitefield
5,Greater Manchester,Bolton,Blackrod
6,Greater Manchester,Bolton,Farnworth
7,Greater Manchester,Bolton,Horwich
8,Greater Manchester,Bolton,Kearsley
9,Greater Manchester,Bolton,Little Lever


In [3]:
# print the number of rows of the dataframe
dfM.shape

(95, 3)

__The following code retrieves the data for London and performs the data preperation__

In [4]:
# retrieve all London tables from the wiki page 
tables_L = pd.read_html("https://en.wikipedia.org/wiki/List_of_areas_of_London")

# convert the required table (first table in the list) to a dataframe
dfL = pd.DataFrame(tables_L[1])

# rename the columns
dfL.columns = ['Neighbourhood','Borough','City','pd','dc','os']

# drop the unneeded columns
dfL.drop(['pd','dc','os'], axis=1, inplace=True)

# update City to be London for all rows
dfL['City'] = 'London'

# reorder columns
dfL = dfL[['City','Borough','Neighbourhood']]

dfL



Unnamed: 0,City,Borough,Neighbourhood
0,London,"Bexley, Greenwich [7]",Abbey Wood
1,London,"Ealing, Hammersmith and Fulham[8]",Acton
2,London,Croydon[8],Addington
3,London,Croydon[8],Addiscombe
4,London,Bexley,Albany Park
5,London,Redbridge[9],Aldborough Hatch
6,London,City[10],Aldgate
7,London,Westminster[10],Aldwych
8,London,Brent[11],Alperton
9,London,Bromley[11],Anerley


In [5]:
# print the number of rows of the dataframe
dfL.shape

(533, 3)

__Join the two dataframes together__

In [6]:
df = pd.concat([dfM, dfL]).reset_index(drop=True)
print(df.shape)
df

(628, 3)


Unnamed: 0,City,Borough,Neighbourhood
0,Greater Manchester,Bury,Prestwich
1,Greater Manchester,Bury,Radcliffe
2,Greater Manchester,Bury,Ramsbottom
3,Greater Manchester,Bury,Tottington
4,Greater Manchester,Bury,Whitefield
5,Greater Manchester,Bolton,Blackrod
6,Greater Manchester,Bolton,Farnworth
7,Greater Manchester,Bolton,Horwich
8,Greater Manchester,Bolton,Kearsley
9,Greater Manchester,Bolton,Little Lever


## Part 2 - Add Latitude and Longitude to the dataframe

In [7]:
# install the geocoder library

! pip install geocoder
import geocoder


Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 7.6MB/s ta 0:00:011
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


### Retrieve the latitude and longitude from ArcGis

In [8]:
# create two new columns for latitude and lonitude
df['Latitude'] = ""
df['Longitude'] = ""

# set row counter to 0
i = 0

# loop through all rows of the dataframe to fetch latitude and longitude and add to dataframe.

for N in df['Neighbourhood']:
    
    lat_lng_coords = None
    neighbourhood = df.iloc[i]['Neighbourhood'] #retrieve the first neighbourhood in the datafame
    borough = df.iloc[i]['Borough'] #retrieve the first Borough in the datafame
    lookup = neighbourhood + ","  + borough + ", England"
    
    while(lat_lng_coords is None):
        g = geocoder.arcgis(lookup)
        lat_lng_coords = g.latlng
   
    df['Latitude'][i] = lat_lng_coords[0] #update latitude column
    df['Longitude'][i] = lat_lng_coords[1] #update longitude column
   
    print(df.iloc[[i]])
     
    i = i+1 #increment the row counter
    
# check the dataframe has updated correctly
df.head()
    

                 City Borough Neighbourhood Latitude Longitude
0  Greater Manchester    Bury     Prestwich  51.6198  -1.23799
                 City Borough Neighbourhood Latitude Longitude
1  Greater Manchester    Bury     Radcliffe   53.581  -2.30298
                 City Borough Neighbourhood Latitude Longitude
2  Greater Manchester    Bury    Ramsbottom  53.6487  -2.31852
                 City Borough Neighbourhood Latitude Longitude
3  Greater Manchester    Bury    Tottington  53.6105  -2.33735
                 City Borough Neighbourhood Latitude Longitude
4  Greater Manchester    Bury    Whitefield  53.5772  -2.30153
                 City Borough Neighbourhood Latitude Longitude
5  Greater Manchester  Bolton      Blackrod  53.5924  -2.58018
                 City Borough Neighbourhood Latitude Longitude
6  Greater Manchester  Bolton     Farnworth  53.5489  -2.39362
                 City Borough Neighbourhood Latitude Longitude
7  Greater Manchester  Bolton       Horwich  53.6007  -

Unnamed: 0,City,Borough,Neighbourhood,Latitude,Longitude
0,Greater Manchester,Bury,Prestwich,51.6198,-1.23799
1,Greater Manchester,Bury,Radcliffe,53.581,-2.30298
2,Greater Manchester,Bury,Ramsbottom,53.6487,-2.31852
3,Greater Manchester,Bury,Tottington,53.6105,-2.33735
4,Greater Manchester,Bury,Whitefield,53.5772,-2.30153


In [9]:
# print the number of rows of the dataframe
df.shape

(628, 5)

### Visualise the nieghbourhoods on a map to ensure we have retrieved the correct locations

In [10]:
# get the geographical coordinates of England
address = 'England, UK'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of England are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of England are 52.7954791, -0.5402402866174321.


In [11]:
# create map of England using latitude and longitude values
map_england = folium.Map(location=[latitude, longitude], zoom_start=7)

# add markers to map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_england)  
    
map_england

__We can see that some of the neigbourhoods retrieved from ArcGis are outside of our target areas (Greater Manchetser and London). \
On manual investigation of a sample of these neighbourhoods, it appears that they aren't found on ArcGis, hence ArcGis has provided the nearest match. \
These neigbourhoods will be removed and excluded from our dataset by removing anything outside of our target areas.__

In [14]:
# select the parameters for the Greater Manchester and London areas.
# Greater Manchester between 53.5319 and 53.7667 latitude, -2.2413 and -0.95678 longitude
# London between 51.2059 and 51.5416 latitude, -0.57802 and 0.04302 longitude

df_clean = df[( (df['Latitude']>=53.28) & (df['Latitude']<=53.7) & (df['Longitude']>=-2.8) & (df['Longitude']<=-1) ) | ( (df['Latitude']>=51.2) & (df['Latitude']<=51.7) & (df['Longitude']>=-0.6) & (df['Longitude']<=0.2) )]

print(df_clean.shape)
print('There are {} neighbourhoods.'.format(len(df_clean['Neighbourhood'].unique())))

# we've lost 69 neighbourhoods across the two Cities.


(559, 5)
There are 554 neighbourhoods.


In [15]:
# recreate the map and check that the datapoints are now all within the target areas.

# create map of England using latitude and longitude values
map_england = folium.Map(location=[latitude, longitude], zoom_start=7)

# add markers to map
for lat, lng, label in zip(df_clean['Latitude'], df_clean['Longitude'], df_clean['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_england)  
    
map_england

__The data now looks good so we will continue with the clustering.__

In [18]:
# update the df with the new data set
df = df_clean

## Part 3 - Cluster the neighborhoods

I have decided to cluster the neighbourhoods based on the nearby venues within a 2000m radius of the centre (lat & long coords). \
The limit per call is set to 150 as this is the maximum I will be able to retreve based on my accoutn limitis with Foursquare.  \
I will then take the top ten venue types and use those to cluster as a representation of the neighborhood.

### Create a function to get nearby venues for each neighborhood

In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=2000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Run the above function of each neighbourhod
Neighbourhoods for London are split into three groups. \
This is because the foursquare API was failing to retrieve all neighbourhood data when I tried to run the function with all data.

In [20]:
ManNeigh = df.loc[df['City'] == "Greater Manchester",['Neighbourhood']]

# split the London neighbourhoods into groups
LonNeigh1 = df.loc[(df['City'] == "London") & (df.index <= 300),['Neighbourhood']]
LonNeigh2 = df.loc[(df['City'] == "London") & (df.index > 300) & (df.index <= 500),['Neighbourhood']]
LonNeigh3 = df.loc[(df['City'] == "London") & (df.index > 500),['Neighbourhood']]

In [21]:
print(ManNeigh.head())
print(LonNeigh1.head())
print(LonNeigh2.head())
print(LonNeigh3.head())

  Neighbourhood
1     Radcliffe
2    Ramsbottom
3    Tottington
4    Whitefield
5      Blackrod
    Neighbourhood
96          Acton
97      Addington
98     Addiscombe
99    Albany Park
101       Aldgate
    Neighbourhood
301      Hainault
302      The Hale
303           Ham
304   Hammersmith
305     Hampstead
    Neighbourhood
501    Silvertown
502        Sipson
503   Slade Green
504   Snaresbrook
505          Soho


In [29]:
# set parameters
CLIENT_ID = 'LOIRKUCAYWNT2A2ND3VDENVM2REMKMONJQFXPTHW05KJWTXK' 
CLIENT_SECRET = 'IMVGBIVA5JQQ25LDPXV4GWMWOJFNGXLZOWILXKFHHK0CE50Y' 
VERSION = '20180605' 

radius = 2000 #keeping radius to walking distance from the centre 
LIMIT = 300 #limiting number of venues returned

#venuesM = getNearbyVenues(names=ManNeigh['Neighbourhood'],
#                        latitudes=df['Latitude'],
#                       longitudes=df['Longitude']
#                        )

#venuesL1 = getNearbyVenues(names=LonNeigh1['Neighbourhood'],
#                        latitudes=df['Latitude'],
#                        longitudes=df['Longitude']
#                        )

#venuesL2 = getNearbyVenues(names=LonNeigh2['Neighbourhood'],
#                        latitudes=df['Latitude'],
#                        longitudes=df['Longitude']
#                        )
venuesL3 = getNearbyVenues(names=LonNeigh3['Neighbourhood'],
                        latitudes=df['Latitude'],
                        longitudes=df['Longitude']
                        )


Silvertown
Sipson
Slade Green
Snaresbrook
Soho
Somerstown
South Croydon
South Hackney
South Harrow
South Kensington
South Norwood
South Ruislip
South Wimbledon
South Woodford
South Tottenham
Southall
Southborough
Southfields
Southgate
Spitalfields
St Helier
St James's
St Margarets
St Johns
St John's Wood
St Mary Cray
St Pancras
St Paul's Cray
Stamford Hill
Stanmore
Stepney
Stockwell
Stoke Newington
Strawberry Hill
Streatham
Sudbury
Sundridge
Surbiton
Surrey Quays
Sutton
Swiss Cottage
Sydenham (also Lower Sydenham, Upper Sydenham)
Sydenham Hill
Teddington
Temple
Temple Fortune
Thamesmead
Thornton Heath
Tokyngton
Tolworth
Tooting
Tooting Bec
Tottenham
Tottenham Green
Tottenham Hale
Totteridge
Tower Hill
Tufnell Park
Tulse Hill
Turnpike Lane
Twickenham
Upper Clapton
Upper Holloway
Upper Norwood
Upper Ruxley
Upper Walthamstow
Upton
Upton Park
Uxbridge
Vauxhall
Waddon
Wallington
Walthamstow
Walthamstow Village
Walworth
Wandsworth
Wapping
Wealdstone
Well Hall
Welling
Wembley
Wembley Park
Wes

In [33]:
# union the dataframes together
venues = pd.concat([venuesM, venuesL1, venuesL2, venuesL3]).reset_index(drop=True)

# check data looks correct - there should be 554 neighbourhoods
print('There are {} neighbourhoods.'.format(len(venues['Neighbourhood'].unique())))
print(venues.shape)


There are 554 neighbourhoods.
(27094, 7)


### How many unique venue categories are there?

In [35]:
print('There are {} uniques categories.'.format(len(venues['Venue Category'].unique())))

There are 348 uniques categories.


### Analyze each neighborhood

In [36]:
# one hot encoding
onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
onehot['Neighbourhood'] = venues['Neighbourhood'] 

# move neighborhood column to the first column
onehot = onehot[['Neighbourhood'] + [col for col in onehot.columns if col != 'Neighbourhood']]

onehot.head()



Unnamed: 0,Neighbourhood,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,Airport Service,American Restaurant,Animal Shelter,Antique Shop,Argentinian Restaurant,...,Whisky Bar,Windmill,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Radcliffe,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Radcliffe,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Radcliffe,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Radcliffe,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Radcliffe,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
onehot.shape

(27094, 349)

### Group the rows by neighborhood

In [38]:
# group using the mean
grouped = onehot.groupby('Neighbourhood').mean().reset_index()
grouped

Unnamed: 0,Neighbourhood,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,Airport Service,American Restaurant,Animal Shelter,Antique Shop,Argentinian Restaurant,...,Whisky Bar,Windmill,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Abram,0.00,0.00,0.000000,0.000000,0.0,0.000000,0.00,0.00,0.00,...,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.00,0.0,0.0
1,Acton,0.00,0.00,0.000000,0.000000,0.0,0.016129,0.00,0.00,0.00,...,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.00,0.0,0.0
2,Addington,0.00,0.00,0.000000,0.000000,0.0,0.000000,0.00,0.00,0.00,...,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.00,0.0,0.0
3,Addiscombe,0.00,0.00,0.000000,0.000000,0.0,0.000000,0.00,0.00,0.00,...,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.00,0.0,0.0
4,Albany Park,0.00,0.00,0.000000,0.000000,0.0,0.022727,0.00,0.00,0.00,...,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.00,0.0,0.0
5,Aldgate,0.00,0.00,0.000000,0.000000,0.0,0.000000,0.00,0.00,0.00,...,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.00,0.0,0.0
6,Altrincham,0.00,0.00,0.000000,0.000000,0.0,0.013514,0.00,0.00,0.00,...,0.00,0.000000,0.013514,0.00,0.0,0.0,0.0,0.00,0.0,0.0
7,Angel,0.00,0.00,0.000000,0.000000,0.0,0.000000,0.00,0.00,0.00,...,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.00,0.0,0.0
8,Arkley,0.00,0.00,0.000000,0.000000,0.0,0.000000,0.00,0.00,0.00,...,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.00,0.0,0.0
9,Arnos Grove,0.00,0.00,0.000000,0.000000,0.0,0.000000,0.00,0.00,0.00,...,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.00,0.0,0.0


In [39]:
# confirm size of new dataframe
grouped.shape

(554, 349)

### Sort the venues and create a dataframe of the top 10 venues for each neighborhood

In [40]:
# create a function to sort the venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [41]:
# create the new dataframe
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = grouped['Neighbourhood']

for ind in np.arange(grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Abram,Pub,Grocery Store,Zoo Exhibit,Food,Farmers Market,Fast Food Restaurant,Film Studio,Fish & Chips Shop,Fish Market,Flea Market
1,Acton,Clothing Store,Pub,Discount Store,Coffee Shop,Hotel,Pharmacy,Bar,Grocery Store,Fast Food Restaurant,Pizza Place
2,Addington,Gastropub,Pub,Indian Restaurant,Tapas Restaurant,Platform,Brewery,Chinese Restaurant,Tea Room,Park,Scenic Lookout
3,Addiscombe,Grocery Store,Pub,Italian Restaurant,Chinese Restaurant,Bus Stop,French Restaurant,Fountain,Farmers Market,Fast Food Restaurant,Film Studio
4,Albany Park,Pub,Discount Store,Hotel,Indian Restaurant,Pizza Place,Train Station,Coffee Shop,Clothing Store,Supermarket,Bar


### Cluster the neighborhoods
I have decided to use 4 clusters through a process of trial and error.  The majority of neighbourhoods are very similar and 5 clusters appears to give me the most differentiation.

In [42]:
# set number of clusters
kclusters = 4

grouped_clustering = grouped.drop('Neighbourhood', 1) #drop the neighborhood column

# run k-means clustering
kmeans = KMeans(init = "k-means++", n_clusters = kclusters, n_init = 12)
kmeans.fit(grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[:]



array([2, 0, 0, 3, 0, 0, 0, 1, 0, 1, 3, 0, 3, 0, 0, 3, 2, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 2, 0, 0, 3, 0, 3, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 1, 3, 3, 0, 3, 0,
       3, 0, 0, 2, 1, 3, 1, 0, 1, 3, 0, 0, 1, 0, 3, 0, 2, 3, 2, 3, 0, 0,
       0, 3, 3, 0, 3, 0, 3, 2, 3, 0, 0, 3, 0, 3, 0, 3, 0, 0, 0, 3, 0, 3,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3,
       0, 0, 0, 0, 3, 3, 0, 0, 1, 0, 1, 3, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 3, 3, 0, 3, 0, 0, 0, 3, 0, 0, 1, 3, 0, 2, 0, 0, 0,
       3, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       3, 3, 0, 3, 0, 0, 0, 2, 3, 1, 0, 1, 3, 0, 0, 1, 0, 3, 1, 0, 2, 3,
       2, 3, 0, 3, 3, 3, 0, 0, 3, 2, 0, 0, 3, 2, 3,

In [43]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

df_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
df_merged = df_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

df_merged#.head() # check the last columns!

Unnamed: 0,City,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Greater Manchester,Bury,Radcliffe,53.581,-2.30298,0,Clothing Store,Pub,Discount Store,Coffee Shop,Hotel,Pharmacy,Bar,Grocery Store,Fast Food Restaurant,Pizza Place
2,Greater Manchester,Bury,Ramsbottom,53.6487,-2.31852,0,Gastropub,Pub,Indian Restaurant,Tapas Restaurant,Platform,Brewery,Chinese Restaurant,Tea Room,Park,Scenic Lookout
3,Greater Manchester,Bury,Tottington,53.6105,-2.33735,3,Grocery Store,Pub,Italian Restaurant,Chinese Restaurant,Bus Stop,French Restaurant,Fountain,Farmers Market,Fast Food Restaurant,Film Studio
4,Greater Manchester,Bury,Whitefield,53.5772,-2.30153,0,Pub,Discount Store,Hotel,Indian Restaurant,Pizza Place,Train Station,Coffee Shop,Clothing Store,Supermarket,Bar
5,Greater Manchester,Bolton,Blackrod,53.5924,-2.58018,0,Sandwich Place,Coffee Shop,Rest Area,Pub,Gas Station,Gastropub,Fast Food Restaurant,Bakery,Italian Restaurant,Convenience Store
6,Greater Manchester,Bolton,Farnworth,53.5489,-2.39362,1,Train Station,Supermarket,Theater,Gym,Park,Indian Restaurant,Discount Store,Zoo Exhibit,Flea Market,Fast Food Restaurant
7,Greater Manchester,Bolton,Horwich,53.6007,-2.54706,0,Coffee Shop,Pub,Italian Restaurant,Supermarket,Furniture / Home Store,Rest Area,Grocery Store,Indian Restaurant,Fast Food Restaurant,Tapas Restaurant
8,Greater Manchester,Bolton,Kearsley,53.5399,-2.37542,1,Train Station,Supermarket,Theater,Italian Restaurant,Park,Pub,Grocery Store,Discount Store,Trail,Fish & Chips Shop
9,Greater Manchester,Bolton,Little Lever,53.5613,-2.36431,3,Italian Restaurant,Train Station,Chinese Restaurant,Park,Soccer Stadium,Garden Center,Grocery Store,Pub,Indian Restaurant,Hunan Restaurant
10,Greater Manchester,Bolton,South Turton,53.6379,-2.39857,2,Pub,Indian Restaurant,Zoo Exhibit,Food,Farm,Farmers Market,Fast Food Restaurant,Film Studio,Fish & Chips Shop,Fish Market


### Visualise the clusters

In [44]:
# how many neighbourhoods are in each cluster?

df_merged[['City','Cluster Labels','Neighbourhood']].groupby(['City','Cluster Labels'], as_index=False).count()

Unnamed: 0,City,Cluster Labels,Neighbourhood
0,Greater Manchester,0,54
1,Greater Manchester,1,8
2,Greater Manchester,2,7
3,Greater Manchester,3,25
4,London,0,334
5,London,1,26
6,London,2,23
7,London,3,82


In [45]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=7)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighbourhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Decsribe the clusters
What does each cluster look like and can we produce a high level description to help people understand the neighbourhoods? /
This will be done based on the Manchester neighbourhoods as that will be the area most familiar with the user and for which we are looking to find similarities in London.

__Cluster 1__

In [46]:
df_merged.loc[(df_merged['Cluster Labels'] == 0) & (df_merged['City'] == "Greater Manchester"), df_merged.columns[[2] + list(range(6, df_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Radcliffe,Clothing Store,Pub,Discount Store,Coffee Shop,Hotel,Pharmacy,Bar,Grocery Store,Fast Food Restaurant,Pizza Place
2,Ramsbottom,Gastropub,Pub,Indian Restaurant,Tapas Restaurant,Platform,Brewery,Chinese Restaurant,Tea Room,Park,Scenic Lookout
4,Whitefield,Pub,Discount Store,Hotel,Indian Restaurant,Pizza Place,Train Station,Coffee Shop,Clothing Store,Supermarket,Bar
5,Blackrod,Sandwich Place,Coffee Shop,Rest Area,Pub,Gas Station,Gastropub,Fast Food Restaurant,Bakery,Italian Restaurant,Convenience Store
7,Horwich,Coffee Shop,Pub,Italian Restaurant,Supermarket,Furniture / Home Store,Rest Area,Grocery Store,Indian Restaurant,Fast Food Restaurant,Tapas Restaurant
12,Blackley,Supermarket,Sandwich Place,Light Rail Station,Gym / Fitness Center,Soccer Stadium,Park,Pharmacy,Hotel,Coffee Shop,Tram Station
13,Cheetham Hill,Coffee Shop,Pub,Tea Room,Bar,Café,Park,Record Shop,Fast Food Restaurant,Museum,Supermarket
14,Chorlton-cum-Hardy,Pub,Bar,Café,Grocery Store,Cricket Ground,Park,Pizza Place,Deli / Bodega,Coffee Shop,Restaurant
15,Didsbury,Pub,Italian Restaurant,Bar,Park,Grocery Store,Coffee Shop,Indian Restaurant,Hotel,Café,Sandwich Place
16,Fallowfield,Middle Eastern Restaurant,Grocery Store,Café,Park,Indian Restaurant,Hookah Bar,Pub,Asian Restaurant,Supermarket,Gym / Fitness Center


__Cluster 2__

In [47]:
df_merged.loc[(df_merged['Cluster Labels'] == 1) & (df_merged['City'] == "Greater Manchester"), df_merged.columns[[2] + list(range(6, df_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Farnworth,Train Station,Supermarket,Theater,Gym,Park,Indian Restaurant,Discount Store,Zoo Exhibit,Flea Market,Fast Food Restaurant
8,Kearsley,Train Station,Supermarket,Theater,Italian Restaurant,Park,Pub,Grocery Store,Discount Store,Trail,Fish & Chips Shop
29,Lees,Grocery Store,Fast Food Restaurant,Gas Station,Gastropub,Supermarket,Sandwich Place,Bakery,Discount Store,Pizza Place,Zoo Exhibit
45,Cadishead,Supermarket,Grocery Store,Train Station,Pub,Café,Gas Station,Construction & Landscaping,Hotel,Farmers Market,Fast Food Restaurant
59,Romiley,Train Station,Restaurant,Plaza,Grocery Store,Auto Garage,Convenience Store,Chinese Restaurant,Café,Football Stadium,Farmers Market
61,Broadbottom,Train Station,Fast Food Restaurant,Bar,Gas Station,Pharmacy,Photography Studio,Tea Room,Laundry Service,Zoo Exhibit,Flea Market
65,Hattersley,Train Station,Bar,Fast Food Restaurant,Gas Station,Pool,Trail,Soccer Stadium,Flower Shop,Farm,Farmers Market
94,Winstanley,Supermarket,Grocery Store,Fast Food Restaurant,Chinese Restaurant,Fruit & Vegetable Store,Sandwich Place,Pub,Discount Store,Farmers Market,Gastropub


__Cluster 3__

In [48]:
df_merged.loc[(df_merged['Cluster Labels'] == 2) & (df_merged['City'] == "Greater Manchester"), df_merged.columns[[2] + list(range(6, df_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,South Turton,Pub,Indian Restaurant,Zoo Exhibit,Food,Farm,Farmers Market,Fast Food Restaurant,Film Studio,Fish & Chips Shop,Fish Market
31,Saddleworth,Pub,Café,Coffee Shop,Gastropub,Park,Golf Course,Supermarket,Trail,Train Station,Gym
37,Wardle,Pub,Gas Station,Reservoir,Asian Restaurant,Zoo Exhibit,Food & Drink Shop,Farmers Market,Fast Food Restaurant,Film Studio,Fish & Chips Shop
57,Mellor,Pub,Golf Course,American Restaurant,Gastropub,Campground,Zoo Exhibit,Fast Food Restaurant,Film Studio,Fish & Chips Shop,Fish Market
68,Longdendale,Pub,Pharmacy,Gas Station,Reservoir,Zoo Exhibit,Flower Shop,Farm,Farmers Market,Fast Food Restaurant,Film Studio
70,Mottram-in-Longdendale,Pub,Bar,Gas Station,Pharmacy,Fast Food Restaurant,Zoo Exhibit,Food,Farmers Market,Film Studio,Fish & Chips Shop
79,Abram,Pub,Grocery Store,Zoo Exhibit,Food,Farmers Market,Fast Food Restaurant,Film Studio,Fish & Chips Shop,Fish Market,Flea Market


__Cluster 4__

In [49]:
df_merged.loc[(df_merged['Cluster Labels'] == 3) & (df_merged['City'] == "Greater Manchester"), df_merged.columns[[2] + list(range(6, df_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Tottington,Grocery Store,Pub,Italian Restaurant,Chinese Restaurant,Bus Stop,French Restaurant,Fountain,Farmers Market,Fast Food Restaurant,Film Studio
9,Little Lever,Italian Restaurant,Train Station,Chinese Restaurant,Park,Soccer Stadium,Garden Center,Grocery Store,Pub,Indian Restaurant,Hunan Restaurant
11,Westhoughton,Pub,Supermarket,Deli / Bodega,Italian Restaurant,Sandwich Place,Park,Discount Store,Hotel,English Restaurant,Grocery Store
27,Shaw and Crompton,Grocery Store,Convenience Store,Pharmacy,Fish & Chips Shop,Supermarket,Gym,Gastropub,Gas Station,Fried Chicken Joint,Post Office
34,Middleton,Grocery Store,Pub,Supermarket,Locksmith,Stadium,Warehouse Store,Flower Shop,Farmers Market,Fast Food Restaurant,Film Studio
36,Newhey,Pub,Tram Station,Hotel,Sandwich Place,Park,Toy / Game Store,Supermarket,Restaurant,Grocery Store,Italian Restaurant
50,Hazel Grove,Pub,Fast Food Restaurant,Supermarket,Grocery Store,Convenience Store,Tea Room,Golf Course,Train Station,Shoe Store,Gym / Fitness Center
51,Heaton Chapel,Pub,Grocery Store,Indian Restaurant,Fast Food Restaurant,Coffee Shop,Restaurant,Chinese Restaurant,Supermarket,Bar,Gym
53,Heaton Moor,Pub,Grocery Store,Coffee Shop,Fast Food Restaurant,Café,Supermarket,Pharmacy,Restaurant,Bar,Pizza Place
58,Reddish,Grocery Store,Pub,Fast Food Restaurant,Café,Convenience Store,Farm,Bar,Bakery,Supermarket,English Restaurant


__Cluster Descriptions__

__Cluster 1__ \
Towns and villages with a good range amenities including of pubs, restaurants shops and transportation links.\
\
__Cluster 2__\
Residential suburbs with public transport links, grocery shops and fast food.\
\
__Cluster 3__\
Residential suburbs with local pubs and outdoor recreation.\
\
__Cluster 4__\
Residential suburbs comprising of a variety pub and restaurants, with food and drink lifestyle.


### Summary of data
The following tables provide a summary of the data and mapping between the target locations.

In [50]:
# create seperate dataframes for Manchester and London
# might need to add .loc before the first [

df_Man = df_merged.loc[df_merged['City'] == "Greater Manchester",['Borough', 'Neighbourhood', 'Cluster Labels']]
df_Lon = df_merged.loc[df_merged['City'] == "London", ['Borough', 'Neighbourhood', 'Cluster Labels']]

# rename the borough and neighburhood columns
df_Man.rename(columns={'Borough':'Man. Borough', 'Neighbourhood':'Man. Neighbourhood'}, inplace=True)
df_Lon.rename(columns={'Borough':'Lon. Borough', 'Neighbourhood':'Lon. Neighbourhood'}, inplace=True)

# join the dataframes
df_lookup = df_Man.set_index('Cluster Labels').join(df_Lon.set_index('Cluster Labels'), how='left', sort=True)


In [51]:
# how many London neighbourhoods do we have for each Manchester neighbourhood?

print(df_lookup.groupby(['Man. Borough', 'Man. Neighbourhood','Cluster Labels'], as_index=False).count())

   Man. Borough      Man. Neighbourhood  Lon. Borough  Lon. Neighbourhood
0        Bolton                Blackrod           334                 334
1        Bolton               Farnworth            26                  26
2        Bolton                 Horwich           334                 334
3        Bolton                Kearsley            26                  26
4        Bolton            Little Lever            82                  82
5        Bolton            South Turton            23                  23
6        Bolton            Westhoughton            82                  82
7          Bury               Radcliffe           334                 334
8          Bury              Ramsbottom           334                 334
9          Bury              Tottington            82                  82
10         Bury              Whitefield           334                 334
11   Manchester                Blackley           334                 334
12   Manchester           Cheetham Hil

In [52]:
# full lookup table 
df_lookup

Unnamed: 0_level_0,Man. Borough,Man. Neighbourhood,Lon. Borough,Lon. Neighbourhood
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Bury,Radcliffe,"Ealing, Hammersmith and Fulham[8]",Acton
0,Bury,Radcliffe,Croydon[8],Addington
0,Bury,Radcliffe,Bexley,Albany Park
0,Bury,Radcliffe,City[10],Aldgate
0,Bury,Radcliffe,Barnet[12],Arkley
0,Bury,Radcliffe,Redbridge[15],Barkingside
0,Bury,Radcliffe,Bexley[15],Barnehurst
0,Bury,Radcliffe,Richmond upon Thames[15],Barnes
0,Bury,Radcliffe,Bexley[16],Barnes Cray
0,Bury,Radcliffe,Barnet,Barnet Gate
