## Part1 Data scraping through wiki pages

In [1]:
import requests
from bs4 import BeautifulSoup
import html5lib
import pandas as pd
from geopy.geocoders import Nominatim
#import geocoder
import folium
import json
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.cm as  cm
import matplotlib.colors as colors
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
data=requests.get(url).text
soup=BeautifulSoup(data,'html5lib')
#print(soup)
tb_data=soup.find('table',attrs={"class":"wikitable sortable"})
#print(tb_data)
columns=[]
for i in tb_data.find_all('th'):
    columns.append(i.text.strip())
#print(columns)
df=pd.DataFrame(columns=columns)
for i in tb_data.find_all('tr')[1:]:
    l=i.text.replace('\n\n','\n').strip().split('\n')
    postal_code=l[0]
    borough=l[1]
    neighborhood=l[2]
    if borough!='Not assigned' and neighborhood=='Not assigned':
        neighborhood='borough'
    if borough !='Not assigned':
        df=df.append({'Postal Code':postal_code,
                   'Borough':borough,
                   'Neighbourhood':neighborhood},ignore_index=True)

In [2]:
print(df.shape)
print("rows are =", df.shape[0])
print("cols are =" ,df.shape[1])

(103, 3)
rows are = 103
cols are = 3


In [3]:
df.columns

Index(['Postal Code', 'Borough', 'Neighbourhood'], dtype='object')

In [4]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Part2 Add Latitude and Longitude 

In [5]:
df1=pd.read_csv('c:\\users\\nagab\\downloads\\Geospatial_Coordinates.csv')
df_final=pd.concat([df,df1[['Latitude','Longitude']]],axis=1)
df_final=df.join(df1.set_index('Postal Code'),on='Postal Code')

In [6]:
df_final.shape

(103, 5)

In [7]:
df_final.columns

Index(['Postal Code', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude'], dtype='object')

In [8]:
df_final.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part3 Clustring

In [9]:
address='Toronto ,Canada'
geoloc=Nominatim(user_agent='tr_explorer')
location=geoloc.geocode(address)
tc_latitude=location.latitude
tc_longitude=location.longitude
t_map=folium.Map(location=[tc_latitude,tc_longitude],zoom_start=10)
# add all neighbors to map
for lat,lng,poscode,neigh in zip(df_final['Latitude'],df_final['Longitude'],df_final['Postal Code'],df_final['Neighbourhood']):
    label='{},{}'.format(poscode,neigh)
    label=folium.Popup(label,parse_html=True)
    folium.CircleMarker(
                        [lat,lng],
                        radius=5,
                        color='red',
                        popup=label,
                        fill=True,
                        fill_color='blue',
                        fill_opacity=0.7
                        ).add_to(t_map)
t_map
#visualized all neighourhoods in toronto

In [10]:
CLIENT_ID='IGJVY1YBQWNNNNIGMH0BKYJ214DOA5UV5EKQQZWXDNHTEZI0'
CLIENT_SECRET='ECPW3ELSOVQVHHOVKLCJFPTLXS2JSWNX13E1O4Q4XWH5MT0Y'
VERSION='20180604'
radius=500
limit=100

# analysis North York ,Toronto ,Canada data
north_york_toronto=df_final[df_final['Borough']==df_final.loc[0,'Borough']]
address='North York ,Toronto,Canada'
geoloc=Nominatim(user_agent='nt_explorer')
location=geoloc.geocode(address)
latitude=location.latitude
longitude=location.longitude
#create map.
n_map=folium.Map(location=[latitude,longitude],zoom_start=10)
for lat,lng,poscode,neigh in zip(north_york_toronto['Latitude'],north_york_toronto['Longitude'],north_york_toronto['Postal Code'],north_york_toronto['Neighbourhood']):
    label='{},{}'.format(poscode,neigh)
    label=folium.Popup(label,parse_html=True)
    folium.CircleMarker(
                        [lat,lng],
                        radius=5,
                        color='orange',
                        popup=label,
                        fill=True,
                        fill_color='red',
                        fill_opacity=0.7
                        ).add_to(n_map)

#now analyze  the first row data in North York,Toronto Canada.
#get category for only one neighborhood
north_york_neighbor=north_york_toronto.loc[0,'Neighbourhood']
north_york_latitude=north_york_toronto.loc[0,'Latitude']
north_york_longitude=north_york_toronto.loc[0,'Longitude']
print("Neighborhood={}\nLatitude={}\nLongitude={}".format(north_york_neighbor,north_york_latitude,north_york_longitude,VERSION,radius,limit))

url='https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID,CLIENT_SECRET,north_york_latitude,north_york_longitude,VERSION,radius,limit)
results=requests.get(url).json()
def get_category(row):
    try:
        cat_list=row['categories']
    except:
        cat_list=row['venue.categories']
    if len(cat_list)==None:
        return None
    else:
        return  cat_list[0]['name']

query=results['response']['groups'][0]['items']
nearby_venues=pd.json_normalize(query)
filtered_columns=['venue.name','venue.location.lat','venue.location.lng','venue.categories']
nearby_venues=nearby_venues.loc[:,filtered_columns]
nearby_venues['venue.categories']=nearby_venues.apply(get_category,axis=1)
#print(nearby_venues['venue.categories'].head())
nearby_venues.columns=[i.split('.')[-1] for i in nearby_venues.columns]
n_map

Neighborhood=Parkwoods
Latitude=43.7532586
Longitude=-79.3296565


In [11]:
#now get the venues names and category for all neighborhoods
def get_near_by_venues(names,latitude,longitude,radius=500):
    venues_list=[]
    for name,lat,lng in zip(names,latitude,longitude):
        url='https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID,CLIENT_SECRET,lat,lng,VERSION,radius,limit)
        try:
            results=requests.get(url).json()['response']['groups'][0]['items']
            venues_list.append([(name,
                                lat,
                                lng,
                                i['venue']['name'],
                                i['venue']['location']['lat'],
                                i['venue']['location']['lng'],
                                i['venue']['categories'][0]['name']) for i in results])
    
        except:
            print("no data found")
    nearby_venues=pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    #print([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns=['Neighbourhood','Neigh_Latitude','Neigh_Longitude','Venue','Venue_Latitude','Venue_Longitude','Venue_Categories']
    return nearby_venues
   
nearby_venues_list=get_near_by_venues(names=north_york_toronto['Neighbourhood'],latitude=north_york_toronto['Latitude'],longitude=north_york_toronto['Longitude'])
nearby_venues_list

Unnamed: 0,Neighbourhood,Neigh_Latitude,Neigh_Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Categories
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.332140,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
...,...,...,...,...,...,...,...
241,"Willowdale, Willowdale West",43.782736,-79.442259,Tov-Li,43.784214,-79.446098,Pizza Place
242,"Willowdale, Willowdale West",43.782736,-79.442259,Shoppers Drug Mart,43.784847,-79.446028,Pharmacy
243,"Willowdale, Willowdale West",43.782736,-79.442259,Tim Hortons,43.780940,-79.444231,Coffee Shop
244,"Willowdale, Willowdale West",43.782736,-79.442259,Price Chopper,43.783237,-79.446339,Grocery Store


In [12]:
#analyze the each neighbourhood

nearby_onehot=pd.get_dummies(nearby_venues_list['Venue_Categories'],prefix="",prefix_sep="")
nearby_onehot['Neighbourhood']=nearby_venues_list['Neighbourhood']
filtered_columns=[nearby_onehot.columns[-1]]+list(nearby_onehot.columns[0:-1])
nearby_onehot=nearby_onehot[filtered_columns]

nearby_neigh_mean=nearby_onehot.groupby('Neighbourhood').mean().reset_index()
for i in nearby_neigh_mean['Neighbourhood']:
    temp=nearby_neigh_mean[nearby_neigh_mean['Neighbourhood']==i].T.reset_index()
    temp=temp.iloc[1:]
    temp.columns=['venue','freq']
    temp['freq']=temp['freq'].astype('float')
    temp=temp.round({'freq':2})
    temp=temp.sort_values('freq',ascending=False).reset_index(drop=True).head(5)
    #print(temp)

def return_most_common_venues(row,num_top_venues):
    row_categories=row[1:]
    row_categories_sorted=row_categories.sort_values(ascending=False)
    return row_categories.index.values[0:num_top_venues]

num_top_venues=10
indicators=['st','nd','rd']
#create a column accordinng to the number of top venues
columns=['Neighbourhood']
for i in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most common venue'.format(i+1,indicator[i]))
    except:
        columns.append('{} Most common venue'.format(i+1))
#create a new dataframe
neighbourhood_venues_sorted=pd.DataFrame(columns=columns)
neighbourhood_venues_sorted['Neighbourhood']=nearby_neigh_mean['Neighbourhood']
for i in np.arange(nearby_neigh_mean.shape[0]):
    neighbourhood_venues_sorted.iloc[i,1:]=return_most_common_venues(nearby_neigh_mean.iloc[i,:],num_top_venues)
    print(neighbourhood_venues_sorted)
        


                                      Neighbourhood 1 Most common venue  \
0   Bathurst Manor, Wilson Heights, Downsview North   Accessories Store   
1                                   Bayview Village                 NaN   
2                 Bedford Park, Lawrence Manor East                 NaN   
3                                         Don Mills                 NaN   
4                                         Downsview                 NaN   
5                      Fairview, Henry Farm, Oriole                 NaN   
6                                         Glencairn                 NaN   
7                                 Hillcrest Village                 NaN   
8                                     Humber Summit                 NaN   
9                                  Humberlea, Emery                 NaN   
10                 Lawrence Manor, Lawrence Heights                 NaN   
11         North Park, Maple Leaf Park, Upwood Park                 NaN   
12                  North

In [13]:
#cluster creation

#use nearby_neigh_mean
n=5
clu=nearby_neigh_mean.drop('Neighbourhood',axis=1)
km=KMeans(n_clusters=n,random_state=0).fit(clu)
print(km.labels_)

#add a cluster labels

neighbourhood_venues_sorted.insert(0,'Cluster Labels',km.labels_)
neighbour_merged=north_york_toronto
neighbour_merged=neighbour_merged.join(neighbourhood_venues_sorted.set_index('Neighbourhood'),on='Neighbourhood')

#create a  cluster visualization

map_clu=folium.Map(location=[latitude,longitude],zoom_start=10)
x=np.arange(5)
ys=[i+x+(i*x)**2 for i in range(n)]
colors_array=cm.rainbow(np.linspace(0,1,len(ys)))
rainbow=[colors.rgb2hex(i) for i in colors_array]

#add marker to map

markers_color=[]
for lat,lng,poi,cluster in zip(neighbour_merged['Latitude'],neighbour_merged['Longitude'],neighbour_merged['Neighbourhood'],neighbour_merged['Cluster Labels']):
    label=folium.Popup(str(poi)+'Cluster'+str(cluster),parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7
        ).add_to(map_clu)

map_clu

[4 4 4 4 4 4 4 4 3 2 4 0 4 0 4 4 4 0 1]


In [14]:
for i in range(5):
	print(neighbour_merged.loc[neighbour_merged['Cluster Labels']==i,neighbour_merged.columns[[1] +list(range(5,neighbour_merged.shape[1]))]])


       Borough  Cluster Labels 1 Most common venue 2 Most common venue  \
0   North York             0.0   Accessories Store             Airport   
49  North York             0.0   Accessories Store             Airport   
66  North York             0.0   Accessories Store             Airport   

    3 Most common venue 4 Most common venue  5 Most common venue  \
0   American Restaurant         Art Gallery  Arts & Crafts Store   
49  American Restaurant         Art Gallery  Arts & Crafts Store   
66  American Restaurant         Art Gallery  Arts & Crafts Store   

   6 Most common venue 7 Most common venue 8 Most common venue  \
0     Asian Restaurant  Athletics & Sports          Bagel Shop   
49    Asian Restaurant  Athletics & Sports          Bagel Shop   
66    Asian Restaurant  Athletics & Sports          Bagel Shop   

   9 Most common venue 10 Most common venue  
0               Bakery                 Bank  
49              Bakery                 Bank  
66              Bakery     

In [15]:
rainbow

['#8000ff', '#00b5eb', '#80ffb4', '#ffb360', '#ff0000']