__Import necessary modules and libraries__

__Read the 'population by ethnicity' data into a pandas DF__
NB: Ethnic populations of only 4 Asian Countries (Pakistani, Bangladeshi, Chinese and Indian) who are living in 33 Boroughs of London

In [7]:
import pandas as pd
import numpy as np

In [8]:
file_name='https://data.london.gov.uk/download/detailed-country-birth-2011-census-borough/2c75bd5c-9d70-47e1-a1bb-dc5d2fdf5187/detailed-country-birth-2011-borough.csv'
df=pd.read_csv(file_name)
df.head()

Unnamed: 0,Country.of.Birth,Borough,Count
0,Total : Country of Birth,LONDON,8173941
1,England,LONDON,4997072
2,Northern Ireland,LONDON,32774
3,Scotland,LONDON,89527
4,Wales,LONDON,53828


__Select only countries of birth required (Bangladesh, China, India, Pakistan) and group by Borough__

In [9]:
ethnic = df.loc[df['Country.of.Birth'].isin(['Bangladesh', 'China', 'India', 'Pakistan'])]
ethnic.groupby('Borough')
ethnic.head()

Unnamed: 0,Country.of.Birth,Borough,Count
25,Bangladesh,LONDON,109948
55,China,LONDON,39452
113,India,LONDON,262247
182,Pakistan,LONDON,112457
300,Bangladesh,Inner.London,81291


__Drop totals for London and Inner/Outer London, then combine counts for each ethnicity__

In [10]:
e = ethnic.drop(ethnic[ethnic['Borough']=='LONDON'].index)
ee = e.drop(e[e['Borough']=='Outer.London'].index)
eee = ee.drop(ee[ee['Borough']=='Inner.London'].index)


In [11]:
e1 = eee.drop(['Country.of.Birth'], axis=1)
aggregation_functions={'Count': 'sum'}
e2 = e1.groupby(e1['Borough']).aggregate(aggregation_functions)
e2.head()

Unnamed: 0_level_0,Count
Borough,Unnamed: 1_level_1
Barking.and.Dagenham,12685
Barnet,17348
Bexley,4676
Brent,37247
Bromley,5369


__Find the borough with the highest ethnic population__

In [12]:
e2.loc[e2['Count'].idxmax()]

Count    65621
Name: Newham, dtype: int64

__As the London borough with the highest asian population is Newham, we will consider this borough as our preferred location for our asian restaurant.__

__Read the 'Weekly Earnings data of the asian Ethnicity in each borough' into a DF__

In [23]:
file = 'https://data.london.gov.uk/download/indices-of-deprivation/a7f09ca8-fdf3-4c0b-9df9-d5a22e209e3a/gla-deprivation-indices-ward-2007.csv'
earnings = pd.read_csv(file)
earnings.head()

Unnamed: 0,Ward code,Ward name,Borough,Extent,Rank of extent (within London),Income Scale,Rank of income scale (within London),Employment Scale,Rank of employment scale (within London),Average Score,...,Highest SOA rank (London),Number of SOAs in ward,In worst 5percent,In worst 1percent,In worst 20percent,Above average (in worst 50percent),IDACI,Rank of IDACI (within London),IDAOPI,Rank of IDAOPI (within London)
0,00AAFA,Aldersgate,City of London,0.0,449,22.326669,628,20.464482,628,5.828698,...,628,1,0,0,0,0,0.040078,624,0.021785,628
1,00AAFQ,Cripplegate,City of London,0.0,449,169.551637,625,109.128061,624,9.281658,...,591,2,0,0,0,0,0.080712,596,0.107448,568
2,00AAFT,Farringdon Without,City of London,0.0,449,122.533641,626,104.69603,625,14.311454,...,592,1,0,0,0,0,0.068756,607,0.078038,616
3,00AAFX,Portsoken,City of London,0.2,278,375.505313,620,92.702508,626,30.054483,...,408,1,0,0,0,1,0.480267,119,0.409238,61
4,00ABFX,Abbey,Barking & Dagenham,0.553065,128,3143.444499,182,827.757005,276,35.942473,...,169,7,0,1,4,7,0.447752,161,0.313961,177


In [38]:
earnings[['Borough', 'Income Scale']]

new = earnings.filter(['Borough', 'Income Scale'], axis=1)

a = new.drop(new[new['Borough']=='LONDON'].index)
aa = a.drop(a[a['Borough']=='Outer.London'].index)
aaa = aa.drop(aa[aa['Borough']=='Inner.London'].index)

aaa.head()


Unnamed: 0,Borough,Income Scale
0,City of London,22.326669
1,City of London,169.551637
2,City of London,122.533641
3,City of London,375.505313
4,Barking & Dagenham,3143.444499


In [40]:
a1 = aaa.loc[aaa['Borough'] == 'Newham']
a1

Unnamed: 0,Borough,Income Scale
453,Newham,4369.396576
454,Newham,4706.304773
455,Newham,5201.173766
456,Newham,4812.175854
457,Newham,4096.096372
458,Newham,3938.792665
459,Newham,4445.536849
460,Newham,4343.397006
461,Newham,4121.010753
462,Newham,4496.441208


__Segment and cluster neighbourhoods in London City__

In [9]:
import requests 
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# import folium # map rendering library
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

In [10]:
# install and import folium library 
!pip -q install folium
import folium

__Download and explore dataset__

__Determine the latitude and longitude coordinates of all Boroughs in London from a Wiki link__

In [12]:
URL = "https://en.wikipedia.org/wiki/List_of_London_boroughs"
res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')

df_list = []
# print(soup)
for items in soup.find('table', class_= 'wikitable sortable').find_all('tr')[1::]:
    data = items.find_all(['td'])
    data1 = data[0]
    data8 = data[8]    
    try:
        borough_name = data1.get_text()
        borough_name = borough_name.split('[')
        borough_name = borough_name[0]
        borough_name = borough_name.strip()
        
        ll = data8.get_text()
        ll = ll.split('/')
        lat_long = ll[2]
        lat_long = lat_long.split('(')
        lat_long = lat_long[0]
        lat_long = lat_long.split(';')
        latitude = lat_long[0]
        latitude = latitude.strip()
        longitude = lat_long[1]
        longitude = longitude.strip()
        longitude = longitude.replace(u'\ufeff', '')
        latitude = float(latitude)
        longitude = float(longitude)

#       Append the borough name, latitude and logitude in a list
        df_list.append((borough_name, latitude, longitude))
    except IndexError:pass

In [13]:
df_boroughs = pd.DataFrame(df_list, columns=['Borough', 'Latitude' , 'Longitude'])

In [14]:
df_boroughs.shape


(32, 3)

In [15]:
df_boroughs.head()

Unnamed: 0,Borough,Latitude,Longitude
0,Barking and Dagenham,51.5607,0.1557
1,Barnet,51.6252,-0.1517
2,Bexley,51.4549,0.1505
3,Brent,51.5588,-0.2817
4,Bromley,51.4039,0.0198


In [18]:
df_boroughs.loc[df_boroughs['Borough']=='Newham']

Unnamed: 0,Borough,Latitude,Longitude
23,Newham,51.5077,0.0469


__Get the latitude and longitude of London City using geopy library__

In [19]:
address = 'London, UK'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of London City are {}, {}.'.format(latitude, longitude))

  from ipykernel import kernelapp as app


The geograpical coordinate of London City are 51.5073219, -0.1276474.


__Create a map of London with Boroughs superimposed over__

In [20]:
# create map of London using latitude and longitude values
map_london = folium.Map(location=[latitude, longitude], zoom_start=10)

In [21]:
# add markers to map
for lat, lng, borough in zip(df_boroughs['Latitude'], df_boroughs['Longitude'], df_boroughs['Borough']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_london)  

map_london

__As the borough Newham has the highest Asian population, we will segment and cluster only the neighbourhoods within this borough.__

__Read latitude and longitudes of coordinates of all coordinates of the neighbourhoods in Newham Borough__

In [22]:
from urllib.request import urlopen
import re
URL = "https://en.wikipedia.org/wiki/List_of_areas_of_London"
res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')

codes = []
areas_list = []
href_links_list = []
for items in soup.find('table', class_= 'wikitable sortable').find_all('tr')[1::]:
    data = items.find_all(['td'])
    data0 = data[0]
    area_name = data0.text

    data1 = data[1]
    data1 = data1.text
    borough = data1.split('[')
    borough_name = borough[0]
    data5 = data[5]
    code = data5.text
    code = code.strip()
    
    if borough_name == 'Newham':
        codes.append(code)
        areas_list.append((borough_name,area_name,code))

                
for link in soup.findAll('a', attrs={'href': re.compile("^https://tools.wmflabs.org")}):
            htext = link.text
            if htext in codes:
                hlink = link.get('href')
                href_links_list.append((htext, hlink))

__Create a df from the areas list__

In [23]:
df_areas = pd.DataFrame(areas_list, columns=['Borough', 'Area', 'Code'])
df_areas

Unnamed: 0,Borough,Area,Code
0,Newham,Beckton,TQ435815
1,Newham,Canning Town,TQ405815
2,Newham,Custom House,TQ408807
3,Newham,East Ham,TQ425835
4,Newham,Forest Gate,TQ405855
5,Newham,Little Ilford,TQ435855
6,Newham,Manor Park,TQ425855
7,Newham,Maryland,TQ391849
8,Newham,North Woolwich,TQ435795
9,Newham,Plaistow,TQ405825


__Create a DF from the list of href links__

In [24]:
df_links = pd.DataFrame(href_links_list, columns=['Code','href'])
df_links

Unnamed: 0,Code,href
0,TQ435815,https://tools.wmflabs.org/geohack/en/51.514205...
1,TQ405815,https://tools.wmflabs.org/geohack/en/51.514959...
2,TQ408807,https://tools.wmflabs.org/geohack/en/51.507695...
3,TQ425835,https://tools.wmflabs.org/geohack/en/51.532429...
4,TQ405855,https://tools.wmflabs.org/geohack/en/51.550902...
5,TQ435855,https://tools.wmflabs.org/geohack/en/51.550147...
6,TQ425855,https://tools.wmflabs.org/geohack/en/51.550401...
7,TQ391849,https://tools.wmflabs.org/geohack/en/51.545857...
8,TQ435795,https://tools.wmflabs.org/geohack/en/51.496234...
9,TQ405825,https://tools.wmflabs.org/geohack/en/51.523944...


__Merge the areas and href links DFs__

In [25]:
cols = df_links.columns.difference(df_areas.columns)
df_areas_links = pd.concat([df_areas, df_links[cols]], axis=1)
df_areas_links


Unnamed: 0,Borough,Area,Code,href
0,Newham,Beckton,TQ435815,https://tools.wmflabs.org/geohack/en/51.514205...
1,Newham,Canning Town,TQ405815,https://tools.wmflabs.org/geohack/en/51.514959...
2,Newham,Custom House,TQ408807,https://tools.wmflabs.org/geohack/en/51.507695...
3,Newham,East Ham,TQ425835,https://tools.wmflabs.org/geohack/en/51.532429...
4,Newham,Forest Gate,TQ405855,https://tools.wmflabs.org/geohack/en/51.550902...
5,Newham,Little Ilford,TQ435855,https://tools.wmflabs.org/geohack/en/51.550147...
6,Newham,Manor Park,TQ425855,https://tools.wmflabs.org/geohack/en/51.550401...
7,Newham,Maryland,TQ391849,https://tools.wmflabs.org/geohack/en/51.545857...
8,Newham,North Woolwich,TQ435795,https://tools.wmflabs.org/geohack/en/51.496234...
9,Newham,Plaistow,TQ405825,https://tools.wmflabs.org/geohack/en/51.523944...


__Remove row with no data__

In [26]:
df_areas_links = df_areas_links.dropna(how='any')
df_areas_links

Unnamed: 0,Borough,Area,Code,href
0,Newham,Beckton,TQ435815,https://tools.wmflabs.org/geohack/en/51.514205...
1,Newham,Canning Town,TQ405815,https://tools.wmflabs.org/geohack/en/51.514959...
2,Newham,Custom House,TQ408807,https://tools.wmflabs.org/geohack/en/51.507695...
3,Newham,East Ham,TQ425835,https://tools.wmflabs.org/geohack/en/51.532429...
4,Newham,Forest Gate,TQ405855,https://tools.wmflabs.org/geohack/en/51.550902...
5,Newham,Little Ilford,TQ435855,https://tools.wmflabs.org/geohack/en/51.550147...
6,Newham,Manor Park,TQ425855,https://tools.wmflabs.org/geohack/en/51.550401...
7,Newham,Maryland,TQ391849,https://tools.wmflabs.org/geohack/en/51.545857...
8,Newham,North Woolwich,TQ435795,https://tools.wmflabs.org/geohack/en/51.496234...
9,Newham,Plaistow,TQ405825,https://tools.wmflabs.org/geohack/en/51.523944...


__Get geo coordinates for all areas in the Newham borough__

In [27]:
geo_codes = []
for row in df_areas_links.itertuples():
    url = row.href
    code = row.Code
    res = requests.get(url).text
    soup1 = BeautifulSoup(res,'lxml')
    
    for lat in soup1.find('span',{'class':'latitude'}):
        latitude = lat
        latitude = float(latitude)
            
    for long in soup1.find('span',{'class':'longitude'}):    
        longitude = long
        longitude = float(longitude)
        
    geo_codes.append((code, latitude, longitude))

print(geo_codes)

[('TQ435815', 51.514206, 0.066634), ('TQ405815', 51.514959, 0.023429), ('TQ408807', 51.507696, 0.027431), ('TQ425835', 51.53243, 0.053041), ('TQ405855', 51.550902, 0.025024), ('TQ435855', 51.550148, 0.068263), ('TQ425855', 51.550401, 0.05385), ('TQ391849', 51.545857, 0.004608), ('TQ435795', 51.496234, 0.065821), ('TQ405825', 51.523945, 0.023828), ('TQ415795', 51.496738, 0.037029), ('TQ385845', 51.54241, -0.004196), ('TQ405837', 51.534728, 0.024306), ('TQ405837', 51.534728, 0.024306)]


__Create a DF from list above__

In [29]:
df_geo_codes = pd.DataFrame(geo_codes, columns=['Code','Latitude','Longitude'])
df_geo_codes

Unnamed: 0,Code,Latitude,Longitude
0,TQ435815,51.514206,0.066634
1,TQ405815,51.514959,0.023429
2,TQ408807,51.507696,0.027431
3,TQ425835,51.53243,0.053041
4,TQ405855,51.550902,0.025024
5,TQ435855,51.550148,0.068263
6,TQ425855,51.550401,0.05385
7,TQ391849,51.545857,0.004608
8,TQ435795,51.496234,0.065821
9,TQ405825,51.523945,0.023828


__Merge areas and geocodes DFs__

In [33]:
cols = df_geo_codes.columns.difference(df_areas.columns)
Newham_borough = pd.concat([df_areas, df_geo_codes[cols]], axis=1)
Newham_borough = Newham_borough.rename(columns={'Area' :'Neighborhood'})
Newham_borough

Unnamed: 0,Borough,Neighborhood,Code,Latitude,Longitude
0,Newham,Beckton,TQ435815,51.514206,0.066634
1,Newham,Canning Town,TQ405815,51.514959,0.023429
2,Newham,Custom House,TQ408807,51.507696,0.027431
3,Newham,East Ham,TQ425835,51.53243,0.053041
4,Newham,Forest Gate,TQ405855,51.550902,0.025024
5,Newham,Little Ilford,TQ435855,51.550148,0.068263
6,Newham,Manor Park,TQ425855,51.550401,0.05385
7,Newham,Maryland,TQ391849,51.545857,0.004608
8,Newham,North Woolwich,TQ435795,51.496234,0.065821
9,Newham,Plaistow,TQ405825,51.523945,0.023828


__Get coordinates of Newham Borough__

In [41]:
address = 'Newham, London'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Newham are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Newham are 51.52999955, 0.02931796029382208.


__Visualise the areas of Newham Borough__

In [42]:
# create map of Newham using latitude and longitude values
map_Newham = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(Newham_borough['Latitude'], Newham_borough['Longitude'], Newham_borough['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Newham)  
    
map_Newham

__Explore the first Neighboroughood area. in Newham__

In [45]:
Newham_borough.loc[0, 'Neighborhood']

'Beckton'

In [46]:
Newham_borough.loc[0]

Borough           Newham
Neighborhood     Beckton
Latitude         51.5142
Longitude       0.066634
Name: 0, dtype: object

__Get Neighbourhood's latitude and longitude values__

In [47]:
neighborhood_latitude = Newham_borough.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = Newham_borough.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = Newham_borough.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Beckton are 51.514206, 0.066634.


__Now get he top 100 places of Newham Neighbourhood within 500KM radius.__

__Define foursquare credentials__

In [49]:
CLIENT_ID = '25NULFPYBI2ZOPL1SVEVVI3OTDJMPH0ASX3GS2AEDETVQL0A'
CLIENT_SECRET = '1DKEEM12K1KJ1LIGC4OZSIF4H5VS33AD3HH3PHB2BSCCCF3W'
VERSION = '20180323'

In [50]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret=\
       {}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=25NULFPYBI2ZOPL1SVEVVI3OTDJMPH0ASX3GS2AEDETVQL0A&client_secret=       1DKEEM12K1KJ1LIGC4OZSIF4H5VS33AD3HH3PHB2BSCCCF3W&v=20180323&ll=51.514206,0.066634&radius=500&limit=100'

In [51]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e5792eb69babe001b9a9145'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Beckton',
  'headerFullLocation': 'Beckton, London',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 9,
  'suggestedBounds': {'ne': {'lat': 51.518706004500004,
    'lng': 0.07385150597103936},
   'sw': {'lat': 51.5097059955, 'lng': 0.05941649402896063}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e0c524018a89ad010688d2b',
       'name': 'East london Gymnastics Club',
       'contact': {},
       'location': {'lat': 51.514106774737556,
        'lng': 0.060155068624099396,
        'labeledLatLngs': [{'label': 'display',
          'lat': 51.5141067747

__Extract the vategories of. the venues__

In [52]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

__Clean the json and structure into a pandas df__

In [53]:
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,East london Gymnastics Club,Gym / Fitness Center,51.514107,0.060155
1,Home Bargains,Discount Store,51.516805,0.062804
2,Lituanica,Grocery Store,51.516442,0.062927
3,Premier Inn London Beckton,Hotel,51.515115,0.061016
4,Matalan,Clothing Store,51.516004,0.062635


In [55]:
nearby_venues['name'].value_counts()

East london Gymnastics Club    1
Lituanica                      1
Premier Inn London Beckton     1
Beckton Retail Park            1
Beckton DLR Station            1
Dreams                         1
Home Bargains                  1
Brewers Fayre                  1
Matalan                        1
Name: name, dtype: int64

__How many venus were returned by foursquare?__

In [56]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

9 venues were returned by Foursquare.


__Explore neighbourhoods in newham borough__

In [58]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                             'Neighborhood Latitude', 
                             'Neighborhood Longitude', 
                             'Venue', 
                             'Venue Latitude', 
                             'Venue Longitude', 
                             'Venue Category']
    
    return(nearby_venues)

__Now write the code to run the above function on each neighbourhood and create a new DF called Newham venues__

In [59]:
Newham_venues = getNearbyVenues(names=Newham_borough['Neighborhood'],
                                   latitudes=Newham_borough['Latitude'],
                                   longitudes=Newham_borough['Longitude']
                                  )

Beckton
Canning Town
Custom House
East Ham
Forest Gate
Little Ilford
Manor Park
Maryland
North Woolwich
Plaistow
Silvertown
Stratford
Upton Park
West Ham


In [60]:
Newham_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Beckton,51.514206,0.066634,East london Gymnastics Club,51.514107,0.060155,Gym / Fitness Center
1,Beckton,51.514206,0.066634,Home Bargains,51.516805,0.062804,Discount Store
2,Beckton,51.514206,0.066634,Lituanica,51.516442,0.062927,Grocery Store
3,Beckton,51.514206,0.066634,Premier Inn London Beckton,51.515115,0.061016,Hotel
4,Beckton,51.514206,0.066634,Matalan,51.516004,0.062635,Clothing Store


__Look at how many venues were returned for each neighbourhood__

In [61]:
Newham_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Beckton,9,9,9,9,9,9
Canning Town,4,4,4,4,4,4
Custom House,26,26,26,26,26,26
East Ham,12,12,12,12,12,12
Forest Gate,12,12,12,12,12,12
Little Ilford,4,4,4,4,4,4
Manor Park,5,5,5,5,5,5
Maryland,28,28,28,28,28,28
North Woolwich,22,22,22,22,22,22
Plaistow,10,10,10,10,10,10


__How many unique categories__

In [63]:
print('There are {} uniques categories.'.format(len(Newham_venues['Venue Category'].unique())))

There are 95 uniques categories.


__Analyse each neighbourhood in Newham borough__

In [64]:
# one hot encoding
Newham_onehot = pd.get_dummies(Newham_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Newham_onehot['Neighborhood'] = Newham_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Newham_onehot.columns[-1]] + list(Newham_onehot.columns[:-1])
Newham_onehot = Newham_onehot[fixed_columns]

Newham_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Art Gallery,Asian Restaurant,Bagel Shop,Bakery,Bar,Boat or Ferry,Bookstore,Boutique,...,Thai Restaurant,Theater,Toy / Game Store,Train Station,Tunnel,Turkish Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar
0,Beckton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Beckton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Beckton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Beckton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Beckton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


__Group. rowsby neighbourhood and by taking themean of the frerquency of occcurrence of each category__

In [65]:
Newham_grouped = Newham_onehot.groupby('Neighborhood').mean().reset_index()
Newham_grouped

Unnamed: 0,Neighborhood,Accessories Store,Art Gallery,Asian Restaurant,Bagel Shop,Bakery,Bar,Boat or Ferry,Bookstore,Boutique,...,Thai Restaurant,Theater,Toy / Game Store,Train Station,Tunnel,Turkish Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar
0,Beckton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Canning Town,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Custom House,0.0,0.0,0.0,0.038462,0.038462,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462
3,East Ham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0
4,Forest Gate,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0
5,Little Ilford,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Manor Park,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0
7,Maryland,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.035714,0.0,0.0,0.0
8,North Woolwich,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,...,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.045455,0.0
9,Plaistow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0


__Print each neighbourhood along with top 5 most common venues__

In [66]:
num_top_venues = 5
for hood in Newham_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Newham_grouped[Newham_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Beckton----
                  venue  freq
0                   Pub  0.11
1    Light Rail Station  0.11
2                 Hotel  0.11
3  Gym / Fitness Center  0.11
4         Grocery Store  0.11


----Canning Town----
                        venue  freq
0           Convenience Store  0.25
1  Construction & Landscaping  0.25
2                        Park  0.25
3                 Gas Station  0.25
4           Accessories Store  0.00


----Custom House----
                venue  freq
0               Hotel  0.19
1                 Pub  0.08
2      Sandwich Place  0.08
3         Coffee Shop  0.08
4  Chinese Restaurant  0.08


----East Ham----
                  venue  freq
0   Sporting Goods Shop  0.08
1         Grocery Store  0.08
2     Electronics Store  0.08
3  Fast Food Restaurant  0.08
4                   Pub  0.08


----Forest Gate----
                venue  freq
0       Grocery Store  0.25
1  Miscellaneous Shop  0.08
2                Café  0.08
3                 Pub  0.08
4            

__Put into pandas df__

__Firstt put all venues in descending order, and then creatte the new DF and display the top 10 venues for each neighbourhood__

In [68]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Newham_grouped['Neighborhood']

for ind in np.arange(Newham_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Newham_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Beckton,Gym / Fitness Center,Grocery Store,Hotel,Discount Store,Furniture / Home Store,Pub,Clothing Store,Light Rail Station,Shopping Plaza,Wine Bar
1,Canning Town,Park,Convenience Store,Construction & Landscaping,Gas Station,Wine Bar,Fish & Chips Shop,Dessert Shop,Discount Store,Doner Restaurant,Donut Shop
2,Custom House,Hotel,Coffee Shop,Chinese Restaurant,Pub,Sandwich Place,Wine Bar,Steakhouse,Pizza Place,Café,Restaurant
3,East Ham,Gym Pool,Coffee Shop,Fast Food Restaurant,Sporting Goods Shop,Pub,Thai Restaurant,Sandwich Place,Clothing Store,Warehouse Store,Park
4,Forest Gate,Grocery Store,Fish & Chips Shop,Bakery,Train Station,Pub,Miscellaneous Shop,Café,Flower Shop,Fast Food Restaurant,Market
5,Little Ilford,Fried Chicken Joint,Ice Cream Shop,Asian Restaurant,Indian Restaurant,Flower Shop,Dessert Shop,Discount Store,Doner Restaurant,Donut Shop,Eastern European Restaurant
6,Manor Park,Gym / Fitness Center,Asian Restaurant,Turkish Restaurant,Gas Station,Restaurant,Wine Bar,Fast Food Restaurant,Department Store,Dessert Shop,Discount Store
7,Maryland,Hotel,Pub,Grocery Store,Café,Hungarian Restaurant,Shopping Mall,Moving Target,General Entertainment,Liquor Store,Chinese Restaurant
8,North Woolwich,Pier,Hotel,Clothing Store,Sculpture Garden,Steakhouse,Pub,River,Sandwich Place,Italian Restaurant,Pharmacy
9,Plaistow,Grocery Store,Park,Gym / Fitness Center,Vietnamese Restaurant,Indian Restaurant,Pub,Café,Bus Stop,Electronics Store,Department Store


__Cluster the neighbourhoods__

__Run K-means algorithm to cluster the neighbourhood into 5 clusters__

In [70]:
# set number of clusters
kclusters = 5

Newham_grouped_clustering = Newham_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Newham_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 0, 1, 1, 0, 3, 4, 1, 1, 0], dtype=int32)

__Let's create a new DF that includes the cluster as well as the top 10 venues for each neighbourhood__

In [71]:
Newham_merged = Newham_borough

# add clustering labels
Newham_merged['Cluster Labels'] = kmeans.labels_

# merge Neighborhoods dataframe with Newham borough dataframe to add latitude/longitude for each neighborhood
Newham_merged = Newham_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Newham_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Newham,Beckton,51.514206,0.066634,1,Gym / Fitness Center,Grocery Store,Hotel,Discount Store,Furniture / Home Store,Pub,Clothing Store,Light Rail Station,Shopping Plaza,Wine Bar
1,Newham,Canning Town,51.514959,0.023429,0,Park,Convenience Store,Construction & Landscaping,Gas Station,Wine Bar,Fish & Chips Shop,Dessert Shop,Discount Store,Doner Restaurant,Donut Shop
2,Newham,Custom House,51.507696,0.027431,1,Hotel,Coffee Shop,Chinese Restaurant,Pub,Sandwich Place,Wine Bar,Steakhouse,Pizza Place,Café,Restaurant
3,Newham,East Ham,51.53243,0.053041,1,Gym Pool,Coffee Shop,Fast Food Restaurant,Sporting Goods Shop,Pub,Thai Restaurant,Sandwich Place,Clothing Store,Warehouse Store,Park
4,Newham,Forest Gate,51.550902,0.025024,0,Grocery Store,Fish & Chips Shop,Bakery,Train Station,Pub,Miscellaneous Shop,Café,Flower Shop,Fast Food Restaurant,Market


__Visualise the cluster__

In [72]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Newham_merged['Latitude'], Newham_merged['Longitude'], Newham_merged['Neighborhood'], Newham_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

__Examine clusters__

In [74]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 0, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Canning Town,Park,Convenience Store,Construction & Landscaping,Gas Station,Wine Bar,Fish & Chips Shop,Dessert Shop,Discount Store,Doner Restaurant,Donut Shop
4,Forest Gate,Grocery Store,Fish & Chips Shop,Bakery,Train Station,Pub,Miscellaneous Shop,Café,Flower Shop,Fast Food Restaurant,Market
9,Plaistow,Grocery Store,Park,Gym / Fitness Center,Vietnamese Restaurant,Indian Restaurant,Pub,Café,Bus Stop,Electronics Store,Department Store
10,Silvertown,Gym / Fitness Center,Theater,Paintball Field,Museum,Construction & Landscaping,Café,Park,Go Kart Track,Frozen Yogurt Shop,Doner Restaurant


In [75]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 1, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Beckton,Gym / Fitness Center,Grocery Store,Hotel,Discount Store,Furniture / Home Store,Pub,Clothing Store,Light Rail Station,Shopping Plaza,Wine Bar
2,Custom House,Hotel,Coffee Shop,Chinese Restaurant,Pub,Sandwich Place,Wine Bar,Steakhouse,Pizza Place,Café,Restaurant
3,East Ham,Gym Pool,Coffee Shop,Fast Food Restaurant,Sporting Goods Shop,Pub,Thai Restaurant,Sandwich Place,Clothing Store,Warehouse Store,Park
7,Maryland,Hotel,Pub,Grocery Store,Café,Hungarian Restaurant,Shopping Mall,Moving Target,General Entertainment,Liquor Store,Chinese Restaurant
8,North Woolwich,Pier,Hotel,Clothing Store,Sculpture Garden,Steakhouse,Pub,River,Sandwich Place,Italian Restaurant,Pharmacy
11,Stratford,Pub,Clothing Store,Shopping Mall,Italian Restaurant,Juice Bar,Coffee Shop,Fast Food Restaurant,Cosmetics Shop,General Entertainment,Toy / Game Store


In [76]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 2, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Upton Park,Pub,Convenience Store,Boutique,Bus Stop,Bus Line,Wine Bar,Fish & Chips Shop,Discount Store,Doner Restaurant,Donut Shop
13,West Ham,Pub,Convenience Store,Boutique,Bus Stop,Bus Line,Wine Bar,Fish & Chips Shop,Discount Store,Doner Restaurant,Donut Shop


In [77]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 3, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Little Ilford,Fried Chicken Joint,Ice Cream Shop,Asian Restaurant,Indian Restaurant,Flower Shop,Dessert Shop,Discount Store,Doner Restaurant,Donut Shop,Eastern European Restaurant


In [78]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 4, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Manor Park,Gym / Fitness Center,Asian Restaurant,Turkish Restaurant,Gas Station,Restaurant,Wine Bar,Fast Food Restaurant,Department Store,Dessert Shop,Discount Store


__Conclusion: after examining the above 5 clusters, we would recommend to stakeholers tthat Beckton, Custom House, Eastham and Manor park are the best neighbourhoods in Newham boroguh to open their Asian restaurant. This is because in these areasm the most common venue visited by the public is a hotel and as these areas are hgihly populated with asians, openiing an asian restaurant would be a good idea.__