In [136]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source ,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})
#print(table.prettify()[:100])

In [3]:
# getting the header elements
header = table.tr.text
headers = header.split('\n')
headers = headers[1:len(headers)-1]
headers

['Postcode', 'Borough', 'Neighbourhood']

In [4]:
# completing the raw data in list format ( exact copy of the wiki table with no edits)
rows_list = []
rows = table.findAll('tr')
#sample = rows[0:7]
for row in rows:
    rows_list.append([cell.text.rstrip() for cell in row.findAll('td')])
rows_list = rows_list[1:]
rows_list[0:10]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned']]

In [5]:
# cleaning the list, assigning neigh as the borough if neigh is not assigned
#changes_made = 0
#making a copy of the original list so as to avoid any propagation of edits
import copy
#copy_list = copy.deepcopy(original_list)
row_list_copy = copy.deepcopy(rows_list)
#print(sample_row)
for sample_row in row_list_copy:
    if sample_row[-1] == 'Not assigned':
        if sample_row[1] != 'Not assigned':
            sample_row[-1] = sample_row[1]
            #changes_made+=1
final_list = row_list_copy
row_list_copy[0:10]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", "Queen's Park"],
 ['M8A', 'Not assigned', 'Not assigned']]

In [6]:
#making sure the original list is intact
rows_list[8]

['M7A', "Queen's Park", 'Not assigned']

In [7]:
# a temp df with all rows // no edits
df_temp = pd.DataFrame(final_list,columns = headers)
df_temp.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
9,M8A,Not assigned,Not assigned


In [8]:
Borough_filter = df_temp.Borough == 'Not assigned'
Borough_filter[:10]

0     True
1     True
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9     True
Name: Borough, dtype: bool

In [72]:
# filtering the Not assignnd borough out from the temp_df
df = df_temp[~Borough_filter]
df.sort_values(by='Postcode').head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern
29,M1C,Scarborough,Port Union
28,M1C,Scarborough,Rouge Hill
27,M1C,Scarborough,Highland Creek
42,M1E,Scarborough,Guildwood
43,M1E,Scarborough,Morningside
44,M1E,Scarborough,West Hill
53,M1G,Scarborough,Woburn
62,M1H,Scarborough,Cedarbrae


In [10]:
#checking unique postal codes in df
len(df.Postcode.unique())

103

In [11]:
# merging rows with same Postcode
df2 = df.groupby('Postcode').agg({'Borough':'first',
                               'Neighbourhood': ', '.join}).reset_index()

## Part(1) Results
### *Merged Rows with common postal code*

In [12]:
df2.sort_values(by='Postcode').head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [13]:
#checking unique postal codes in df
#sample_data1 =df2.sort_values(by='Postcode').head(10)
len(df2.Postcode.unique())

103

In [14]:
#reading the coordinate data from the csv provided // geocoder didn't work properly
coordinate_data = pd.read_csv('Geospatial_Coordinates.csv')

In [15]:
type(coordinate_data)

pandas.core.frame.DataFrame

In [16]:
coordinate_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
coordinate_data.columns = ['Postcode', 'Latitude', 'Longitude']
coordinate_data.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
len(coordinate_data.Postcode.unique())

103

In [19]:
coordinate_data.sort_values(by='Postcode').head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Part(2) Results
### *Merged dataframe with common postal code*

In [85]:
#merging two dataframes on the postcode column
merged_data = df2.merge(coordinate_data,on = 'Postcode')

#merged_data with ungrouped Neighbourhood
merged_data2 = df.merge(coordinate_data,on = 'Postcode')

In [21]:
merged_data = merged_data.sort_values(by='Postcode')

In [140]:
merged_data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [24]:
merged_data.Borough.unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

In [89]:
merged_data2 = merged_data2.sort_values(by='Postcode')
merged_data2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
8,M1B,Scarborough,Rouge,43.806686,-79.194353
9,M1B,Scarborough,Malvern,43.806686,-79.194353
23,M1C,Scarborough,Port Union,43.784535,-79.160497
22,M1C,Scarborough,Rouge Hill,43.784535,-79.160497
21,M1C,Scarborough,Highland Creek,43.784535,-79.160497


## Replicating the Neighborhood analysis done in the lab session

### filtering only those borough which contain the word toronto

In [91]:
filter_ = merged_data2.Borough.str.contains('Toronto')
toronto_df = merged_data2[filter_]
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
36,M4E,East Toronto,The Beaches,43.676357,-79.293031
72,M4K,East Toronto,The Danforth West,43.679557,-79.352188
73,M4K,East Toronto,Riverdale,43.679557,-79.352188
85,M4L,East Toronto,The Beaches West,43.668999,-79.315572
86,M4L,East Toronto,India Bazaar,43.668999,-79.315572


In [92]:
CLIENT_ID = '25RMTM0VWX2LL2ZYVXQBAH0P2SGUZ2QUO51X5LUFKFSPIHHF' # your Foursquare ID
CLIENT_SECRET = 'XU4V5TPK32TLZ2IKBX5BC2L5GJL1MBAQY5JWE2GHT01JWHX5' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 100
radius = 500
print('Your credentails:')
print('CLIENT_ID: ' , CLIENT_ID)
print('CLIENT_SECRET:' , CLIENT_SECRET)

Your credentails:
CLIENT_ID:  25RMTM0VWX2LL2ZYVXQBAH0P2SGUZ2QUO51X5LUFKFSPIHHF
CLIENT_SECRET: XU4V5TPK32TLZ2IKBX5BC2L5GJL1MBAQY5JWE2GHT01JWHX5


In [95]:
#borowwing the function
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    #count=0
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        #count+=1
        print('Neighborhood:  ', name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue// appending list of [neigh + venues] to the main list
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        #print('venue cat: ', v['venue']['categories'][0]['name'])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    #print(count)
    return(nearby_venues)

In [96]:
toronto_venues = getNearbyVenues(names=toronto_df['Neighbourhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )

Neighborhood:   The Beaches
Neighborhood:   The Danforth West
Neighborhood:   Riverdale
Neighborhood:   The Beaches West
Neighborhood:   India Bazaar
Neighborhood:   Studio District
Neighborhood:   Lawrence Park
Neighborhood:   Davisville North
Neighborhood:   North Toronto West
Neighborhood:   Davisville
Neighborhood:   Moore Park
Neighborhood:   Summerhill East
Neighborhood:   South Hill
Neighborhood:   Rathnelly
Neighborhood:   Forest Hill SE
Neighborhood:   Deer Park
Neighborhood:   Summerhill West
Neighborhood:   Rosedale
Neighborhood:   Cabbagetown
Neighborhood:   St. James Town
Neighborhood:   Church and Wellesley
Neighborhood:   Regent Park
Neighborhood:   Harbourfront
Neighborhood:   Ryerson
Neighborhood:   Garden District
Neighborhood:   St. James Town
Neighborhood:   Berczy Park
Neighborhood:   Central Bay Street
Neighborhood:   Adelaide
Neighborhood:   Richmond
Neighborhood:   King
Neighborhood:   Harbourfront East
Neighborhood:   Union Station
Neighborhood:   Toronto Islan

In [97]:
print(toronto_venues.shape)
toronto_venues.head()

(3298, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
1,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
2,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
3,The Beaches,43.676357,-79.293031,Fearless Meat,43.680337,-79.290289,Burger Joint
4,The Danforth West,43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [103]:
#top 20 venues by frequency
toronto_venues['Venue'].value_counts()[:20]

Starbucks                      108
Tim Hortons                     35
Subway                          19
Pilot Coffee Roasters           19
LCBO                            18
Pizzaiolo                       15
GoodLife Fitness                15
DAVIDsTEA                       15
Buster's Sea Cove               13
Shoppers Drug Mart              13
John & Sons Oyster House        13
Pizzeria Libretto               13
iQ Food Co                      12
Five Guys                       12
Second Cup                      12
TD Canada Trust                 12
Sam James Coffee Bar (SJCB)     11
Brick Street Bakery             11
The Gabardine                   10
deKEFIR                         10
Name: Venue, dtype: int64

In [106]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
#print(len(toronto_onehot.columns))
# add neighborhood column back to dataframe
toronto_onehot.drop(['Neighborhood'],axis=1,inplace=True)
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
print(len(toronto_onehot.columns))
toronto_onehot.head()

242


Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Danforth West,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
toronto_onehot.shape

(3298, 242)

In [108]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0
1,Bathurst Quay,0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Brockton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556


In [109]:
toronto_grouped.shape

(73, 242)

In [None]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [111]:
#takes a row, sorts the values and then returns the indices of the num_top_venues elements
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [132]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Steakhouse,American Restaurant,Thai Restaurant,Clothing Store,Gym,Hotel,Bakery,Bar
1,Bathurst Quay,Airport Lounge,Airport Service,Airport Terminal,Sculpture Garden,Boutique,Airport,Airport Food Court,Airport Gate,Harbor / Marina,Plane
2,Berczy Park,Coffee Shop,Restaurant,Cocktail Bar,Farmers Market,Pub,Cheese Shop,Seafood Restaurant,Café,Bakery,Steakhouse
3,Brockton,Coffee Shop,Café,Breakfast Spot,Bar,Furniture / Home Store,Burrito Place,Climbing Gym,Italian Restaurant,Stadium,Caribbean Restaurant
4,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Garden Center,Farmers Market,Fast Food Restaurant,Skate Park,Spa,Brewery,Burrito Place,Butcher


# Part(3) : Cluster Neighborhoods

In [133]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 2, 0, 0, 0, 2, 0, 0, 0, 0], dtype=int32)

In [134]:
np.unique(kmeans.labels_,return_counts=True)

(array([0, 1, 2, 3, 4], dtype=int32), array([60,  1,  7,  2,  3]))

In [135]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_df.copy()
toronto_merged.rename(columns={'Neighbourhood': 'Neighborhood'}, inplace=True)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
36,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Coffee Shop,Pub,Burger Joint,Yoga Studio,Dog Run,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space
72,M4K,East Toronto,The Danforth West,43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Bookstore,Italian Restaurant,Yoga Studio,Dessert Shop,Brewery,Bubble Tea Shop,Restaurant
73,M4K,East Toronto,Riverdale,43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Bookstore,Italian Restaurant,Yoga Studio,Dessert Shop,Brewery,Bubble Tea Shop,Restaurant
85,M4L,East Toronto,The Beaches West,43.668999,-79.315572,0,Park,Pet Store,Board Shop,Brewery,Burger Joint,Burrito Place,Sandwich Place,Pub,Pizza Place,Sushi Restaurant
86,M4L,East Toronto,India Bazaar,43.668999,-79.315572,0,Park,Pet Store,Board Shop,Brewery,Burger Joint,Burrito Place,Sandwich Place,Pub,Pizza Place,Sushi Restaurant


In [139]:
# create map
latitude = 43.6532 
longitude = -79.3832
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters