## Importing requests library and getting data from the List of postal codes of Canada wiki page

In [2]:
import requests
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

## Importing BeautifulSoup and using it to extract data out of html file and Prettify() function in BeautifulSoup will enable us to view how the tags are nested in the document

In [3]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":876823784,"wgRevisionId":876823784,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

## Extracting data from various tags

In [4]:
html = list(soup.children)[2]
body = list(html.children)[3]
test = soup.select("body td")
w = [pt.get_text() for pt in test]

## Creating the column data for the dataframe

In [5]:
postcode = [w[n] for n in range(0,(len(w)-34),3)]
borough = [w[n] for n in range(1,(len(w)-34),3)]
neighbourhood = [w[n] for n in range(2,(len(w)-34),3)]

## Importing pandas and creating the dataframe

In [6]:
import pandas as pd
data = pd.DataFrame({"Postcode":postcode, "Borough":borough, "Neighbourhood":neighbourhood})

## Removing the EOL character present in Neighbourhood 

In [7]:
data["Neighbourhood"] = data["Neighbourhood"].replace('\n','', regex=True)

## Removing the rows which has Borough = Not assigned

In [8]:
data = data[data["Borough"]!="Not assigned"]

## Grouping the different Neighbourhoods within the same Postal Codes using comma delimiter

In [9]:
data = data.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

In [10]:
data

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Replacing the 'Not Assigned' value corresponding to Neighbourhood column with value from Borough column

In [11]:
data.loc[data['Neighbourhood'] == "Not assigned", 'Neighbourhood'] = data['Borough']

## Shape of the created dataframe

In [12]:
data.shape

(103, 3)

## Downloading the Geospatial data since geocoder is not working

In [14]:
!wget -q -O 'Geospatial_data.csv' http://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


## Creating dataframe from the downloaded geospatial data

In [13]:
Geospatial_data = pd.read_csv("Geospatial_data.csv")

In [14]:
Geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merging data and geospatial dataframes and removing the redundant column(Postal Code)

In [15]:
gsdata = pd.merge(data, Geospatial_data, left_on = "Postcode", right_on = "Postal Code")
gsdata = gsdata.drop("Postal Code", axis =1)
gsdata.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Filtering out the Toronto data

In [16]:
torontodata = gsdata[(gsdata["Borough"].str.contains("Toronto"))].reset_index(drop=True)
torontodata

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


## Importing folium to convert an address into latitude and longitude values

In [17]:
import folium
from geopy.geocoders import Nominatim 

In [18]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
tlatitude = location.latitude
tlongitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(tlatitude, tlongitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


## Creating a map of Toronto with neighborhoods superimposed on top

In [19]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[tlatitude, tlongitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(torontodata['Latitude'], torontodata['Longitude'], torontodata['Borough'], torontodata['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [20]:
# Matplotlib and associated plotting modules
import numpy as np
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

## Foursquare ID and Secret 

In [21]:
CLIENT_ID = 'B21RWMG44NNR0PXVDDGJJHHIB42PSDJBNLVHXXBRCDWSX3XP' # your Foursquare ID
CLIENT_SECRET = 'O20XQGZPUV4RVY0WJZWU1I0NJVSWYMD3SBJDIQR11FKIYU2F' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: B21RWMG44NNR0PXVDDGJJHHIB42PSDJBNLVHXXBRCDWSX3XP
CLIENT_SECRET:O20XQGZPUV4RVY0WJZWU1I0NJVSWYMD3SBJDIQR11FKIYU2F


## Extracting data of all Indian restuarants in the neighbourhoods of Toronto

In [22]:
n = torontodata["Postcode"].count()
torontoindianrest = pd.DataFrame()
for i in range(0,n):
    lat = torontodata.loc[i, "Latitude"]
    lng = torontodata.loc[i, "Longitude"]
    radius = 10000000000 
    LIMIT = 10000
    search_query = "Indian Restaurant"
    url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, lat, lng, VERSION, search_query, radius, LIMIT)
    results = requests.get(url).json()
    sq = results['response']['venues']
    nearby_sq = json_normalize(sq)
    indianrest = pd.DataFrame(nearby_sq)
    torontoindianrest = torontoindianrest.append(indianrest)

torontoindianrest.count()

categories                   1900
hasPerk                      1900
id                           1900
location.address             1648
location.cc                  1900
location.city                1845
location.country             1900
location.crossStreet         1212
location.distance            1900
location.formattedAddress    1900
location.labeledLatLngs      1900
location.lat                 1900
location.lng                 1900
location.neighborhood         157
location.postalCode          1326
location.state               1845
name                         1900
referralId                   1900
venuePage.id                  259
dtype: int64

## Data cleansing, munging, merging and filtering to get the required format data 

In [39]:
torontoindianrest["Postalcode"] = torontoindianrest["location.postalCode"].astype(str).str[0:3]
torontoindianrestfulldata = pd.merge(torontoindianrest, torontodata, left_on = ["Postalcode"], right_on = ["Postcode"])
torontoindianrestfulldatarequired = torontoindianrestfulldata.filter(['name','Borough','Neighbourhood', 'Latitude', 'Longitude'], axis=1)
torontoindianrestfulldatarequired = torontoindianrestfulldatarequired[torontoindianrestfulldatarequired['name'].str.contains("ndia")]
torontoindianrestfulldatarequired = torontoindianrestfulldatarequired.drop_duplicates()

In [40]:
torontoindiarestgrouped = torontoindianrestfulldatarequired.groupby(['Neighbourhood']).mean()
torontoindiarestgrouped.reset_index()

Unnamed: 0,Neighbourhood,Latitude,Longitude
0,"Adelaide, King, Richmond",43.650571,-79.384568
1,Berczy Park,43.644771,-79.373306
2,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
3,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442
4,Central Bay Street,43.657952,-79.387383
5,Christie,43.669542,-79.422564
6,Church and Wellesley,43.66586,-79.38316
7,"Commerce Court, Victoria Hotel",43.648198,-79.379817
8,Davisville,43.704324,-79.38879
9,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


## Clustering of the data

In [41]:
# set number of clusters
kclusters = 4

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(torontoindiarestgrouped)

## Inserting the cluster labels to the dataframe

In [42]:
torontoindiarestgrouped.insert(0, 'Cluster Labels', kmeans.labels_)
torontoindiarestgrouped = torontoindiarestgrouped.reset_index()
torontoindiarestgrouped

Unnamed: 0,Neighbourhood,Cluster Labels,Latitude,Longitude
0,"Adelaide, King, Richmond",0,43.650571,-79.384568
1,Berczy Park,0,43.644771,-79.373306
2,"Brockton, Exhibition Place, Parkdale Village",1,43.636847,-79.428191
3,"CN Tower, Bathurst Quay, Island airport, Harbo...",0,43.628947,-79.39442
4,Central Bay Street,0,43.657952,-79.387383
5,Christie,2,43.669542,-79.422564
6,Church and Wellesley,0,43.66586,-79.38316
7,"Commerce Court, Victoria Hotel",0,43.648198,-79.379817
8,Davisville,2,43.704324,-79.38879
9,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",2,43.686412,-79.400049


## Merging the restuarant data with grouped data to get all attributes 

In [43]:
torontoindiarestmerged = torontoindianrestfulldatarequired
torontofinaldata = pd.merge(torontoindiarestgrouped, torontoindiarestmerged, how = 'inner', on =['Neighbourhood','Latitude','Longitude'] )
torontofinaldata

Unnamed: 0,Neighbourhood,Cluster Labels,Latitude,Longitude,name,Borough
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,Indian Biriyani House,Downtown Toronto
1,Berczy Park,0,43.644771,-79.373306,Chadani Indian Cuisine,Downtown Toronto
2,"Brockton, Exhibition Place, Parkdale Village",1,43.636847,-79.428191,Ali's West Indian Roti Shop,West Toronto
3,"Brockton, Exhibition Place, Parkdale Village",1,43.636847,-79.428191,A&N Canadian & West Indian Cuisine,West Toronto
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0,43.628947,-79.39442,Aroma Fine Indian Restaurant,Downtown Toronto
5,"CN Tower, Bathurst Quay, Island airport, Harbo...",0,43.628947,-79.39442,309 Dhaba Indian Excellence,Downtown Toronto
6,Central Bay Street,0,43.657952,-79.387383,Indian Biriyani House,Downtown Toronto
7,Central Bay Street,0,43.657952,-79.387383,Mami's Indian Cuisine,Downtown Toronto
8,Christie,2,43.669542,-79.422564,630 Maroli Indian Kerala Restaurant,Downtown Toronto
9,Christie,2,43.669542,-79.422564,Banjara Indian Cuisine,Downtown Toronto


## Creating the cluster map showing the Indian Restuarants in the neighbourhoods of Toronto

In [69]:
# create map
map_clusters = folium.Map(location=[tlatitude, tlongitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors = [
    'red',
    'blue',
    'green',
    'purple']

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(torontofinaldata['Latitude'], torontofinaldata['Longitude'], torontofinaldata['Borough'], torontofinaldata['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=colors[cluster-1],
        fill=True,
        fill_color=colors[cluster-1],
        fill_opacity=1).add_to(map_clusters)
       
map_clusters

## Examining the 1st cluster among the 4 clusters

In [59]:
torontofinaldata.loc[torontofinaldata['Cluster Labels'] == 0]

Unnamed: 0,Neighbourhood,Cluster Labels,Latitude,Longitude,name,Borough
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,Indian Biriyani House,Downtown Toronto
1,Berczy Park,0,43.644771,-79.373306,Chadani Indian Cuisine,Downtown Toronto
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0,43.628947,-79.39442,Aroma Fine Indian Restaurant,Downtown Toronto
5,"CN Tower, Bathurst Quay, Island airport, Harbo...",0,43.628947,-79.39442,309 Dhaba Indian Excellence,Downtown Toronto
6,Central Bay Street,0,43.657952,-79.387383,Indian Biriyani House,Downtown Toronto
7,Central Bay Street,0,43.657952,-79.387383,Mami's Indian Cuisine,Downtown Toronto
10,Church and Wellesley,0,43.66586,-79.38316,Kothur Indian Cuisine,Downtown Toronto
11,"Commerce Court, Victoria Hotel",0,43.648198,-79.379817,Ram's Indian kitchen,Downtown Toronto
14,"Harbourfront East, Toronto Islands, Union Station",0,43.640816,-79.381752,Indian Roti House,Downtown Toronto
15,"Harbourfront East, Toronto Islands, Union Station",0,43.640816,-79.381752,Tamarind: The Indian Kitchen,Downtown Toronto


## Examing the second cluster

In [60]:
torontofinaldata.loc[torontofinaldata['Cluster Labels'] == 1]

Unnamed: 0,Neighbourhood,Cluster Labels,Latitude,Longitude,name,Borough
2,"Brockton, Exhibition Place, Parkdale Village",1,43.636847,-79.428191,Ali's West Indian Roti Shop,West Toronto
3,"Brockton, Exhibition Place, Parkdale Village",1,43.636847,-79.428191,A&N Canadian & West Indian Cuisine,West Toronto
18,"High Park, The Junction South",1,43.661608,-79.464763,Indian Road Crescent Public School,West Toronto
20,"Runnymede, Swansea",1,43.651571,-79.48445,Durbar Indian Cuisine,West Toronto
21,"Runnymede, Swansea",1,43.651571,-79.48445,Bukhara indian cuisine,West Toronto


## Examining the thrid cluster

In [61]:
torontofinaldata.loc[torontofinaldata['Cluster Labels'] == 2]

Unnamed: 0,Neighbourhood,Cluster Labels,Latitude,Longitude,name,Borough
8,Christie,2,43.669542,-79.422564,630 Maroli Indian Kerala Restaurant,Downtown Toronto
9,Christie,2,43.669542,-79.422564,Banjara Indian Cuisine,Downtown Toronto
12,Davisville,2,43.704324,-79.38879,Marigold Indian Bistro | Indian Restaurants in...,Central Toronto
13,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",2,43.686412,-79.400049,Indian Affairs,Central Toronto
19,"Moore Park, Summerhill East",2,43.689574,-79.38316,Chef of India,Central Toronto
23,"The Annex, North Midtown, Yorkville",2,43.67271,-79.405678,Bhoj Indian Cuisine,Central Toronto


## Examining the fourth cluster

In [63]:
torontofinaldata.loc[torontofinaldata['Cluster Labels'] == 3]

Unnamed: 0,Neighbourhood,Cluster Labels,Latitude,Longitude,name,Borough
22,Studio District,3,43.659526,-79.340923,Siddhartha Indian,East Toronto
24,"The Beaches West, India Bazaar",3,43.668999,-79.315572,The Famous Indian Restaurant,East Toronto
25,"The Beaches West, India Bazaar",3,43.668999,-79.315572,Indian Rasoi,East Toronto
