In [23]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import numpy as np
import requests
from pathlib import Path
from geopy.geocoders import Nominatim
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
#from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


Run the beautiful soup method on the url to get the data in a JSON format

In [2]:
broth = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = bs(broth)

In [3]:
 table = soup.find('table',{'class':'wikitable sortable'})#Pull data from just the table
cols = table.findAll('td')

Strip out the html formatting so that we end up with just strings

In [4]:
dataRaw = []
df = pd.DataFrame()
for i in range(len(cols)):
    dataRaw.append(cols[i].text.strip())

We pulled the data into a single list, but now we nead to split it up into appropriate columns. I'm using a nested iterator to cycle from 0 to 2 to fill in the columns. So we'll do that, then assign the columns to a new dataframe with Pandas. 
<br><br>If you're smarter than me though, you either found a better way to do this, or didn't have this issue to start with

In [5]:
col1 = []
col2 = []
col3 = []
j = 0
for i in range(len(cols)):
    if j == 3:
        j = 0
        col1.append(dataRaw[i])
    elif j == 0:
        col1.append(dataRaw[i])
    elif j == 1: 
        col2.append(dataRaw[i])
    elif j == 2:
        col3.append(dataRaw[i])
    else:
        raise Exception('You done messed up the looping') 
    j += 1
df = pd.DataFrame()
df['Post Code'] = col1
df['Borough'] = col2
df['Neighborhood'] = col3
df.head()

Unnamed: 0,Post Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Whew boi, looks like there's some post codes that aren't assigned. Let's count those up and strip them out to clean up our dataframe

In [6]:
oldCount = df.shape
notAssignedMask = df['Borough'] == 'Not assigned'
keepMask = ~notAssignedMask
df = df[keepMask]
df.shape, oldCount

((212, 3), (289, 3))

Cool, so we stripped out 80 or so entries in the dataframe. 
<br><br>With that, I think it's time to get down to business. Let's check duplicates just to be sure

In [7]:
df.groupby('Post Code').nunique()

Unnamed: 0_level_0,Post Code,Borough,Neighborhood
Post Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M1B,1,1,2
M1C,1,1,3
M1E,1,1,3
M1G,1,1,1
M1H,1,1,1
M1J,1,1,1
M1K,1,1,3
M1L,1,1,3
M1M,1,1,3
M1N,1,1,2


Ah dang it, there's multiple neighborhoods per post-code. 
<br><br>Use groupby.agg to aggregate the neighborhoods based on the Post Code, separate them all using a , punctuation

In [8]:
test = df.copy()
new = test.groupby('Post Code').agg({'Borough' :'first',
                                     'Neighborhood': ', '.join,
                                     }).reset_index()
df = new.copy()
df.shape

(103, 3)

Pull the latitude and longitude data in from the .csv file provided in the lab, then combine the dataframes using the Post Code as the key and drop out any  duplicates

In [9]:
latLong = pd.read_csv('/Users/chrismay/Desktop/ibmDSCapstone/Geospatial_Coordinates.csv')
latLong = latLong.astype(object)
new = pd.concat([df, latLong], axis = 1, join = 'inner')
df = new.copy()

In [10]:
df.drop(['Postal Code'], axis = 1, inplace = True) #Drop postal code as it's a duplicate column

In [11]:
df = df[df['Neighborhood'].str.contains('Toronto')] #Use a mask to filter out only neighborhoods that include Toronto
df

Unnamed: 0,Post Code,Borough,Neighborhood,Latitude,Longitude
30,M3K,North York,"CFB Toronto, Downsview East",43.7375,-79.4648
40,M4J,East York,East Toronto,43.6853,-79.3381
46,M4R,Central Toronto,North Toronto West,43.7154,-79.4057
59,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.6408,-79.3818
60,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",43.6472,-79.3816
66,M5S,Downtown Toronto,"Harbord, University of Toronto",43.6627,-79.4
88,M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto",43.6056,-79.5013


In [12]:
# create map of Toronto using latitude and longitude values
latitude = 43.6532
longitude = -79.3832

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [13]:
CLIENT_ID = '*******' # your Foursquare ID
CLIENT_SECRET = '******' # No peeking at my secret key
VERSION = '20180605' # Foursquare API version

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500): #This function was built by the Coursera lab
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
#Use the function to pull venues near each neighborhood
LIMIT = 500
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

CFB Toronto, Downsview East
East Toronto
North Toronto West
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Harbord, University of Toronto
Humber Bay Shores, Mimico South, New Toronto


In [16]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 103 uniques categories.


In [17]:
#Start the clustering analysis with one-hot encoding for pre-processing
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.shape

(278, 103)

In [18]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Airport,American Restaurant,Aquarium,Art Gallery,Asian Restaurant,Bakery,Bank,Bar,...,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Whisky Bar,Wine Bar
0,"CFB Toronto, Downsview East",0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Design Exchange, Toronto Dominion Centre",0.0,0.0,0.04,0.0,0.01,0.01,0.01,0.0,0.02,...,0.0,0.0,0.01,0.02,0.01,0.01,0.0,0.0,0.0,0.01
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Harbord, University of Toronto",0.0,0.0,0.0,0.0,0.0,0.0,0.057143,0.0,0.057143,...,0.0,0.028571,0.0,0.0,0.028571,0.0,0.0,0.028571,0.0,0.0
4,"Harbourfront East, Toronto Islands, Union Station",0.0,0.0,0.0,0.05,0.01,0.0,0.03,0.01,0.02,...,0.01,0.01,0.01,0.0,0.01,0.02,0.01,0.0,0.01,0.01
5,"Humber Bay Shores, Mimico South, New Toronto",0.0,0.0,0.071429,0.0,0.0,0.0,0.071429,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North Toronto West,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
#Show the top 5 most frequent venues in our neighborhood analysis
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----CFB Toronto, Downsview East----
                 venue  freq
0              Airport  0.25
1    Electronics Store  0.25
2             Bus Stop  0.25
3                 Park  0.25
4  Rental Car Location  0.00


----Design Exchange, Toronto Dominion Centre----
                 venue  freq
0          Coffee Shop  0.14
1                 Café  0.09
2                Hotel  0.07
3           Restaurant  0.05
4  American Restaurant  0.04


----East Toronto----
               venue  freq
0       Intersection  0.25
1  Convenience Store  0.25
2        Coffee Shop  0.25
3               Park  0.25
4                Pub  0.00


----Harbord, University of Toronto----
         venue  freq
0         Café  0.11
1       Bakery  0.06
2  Coffee Shop  0.06
3    Bookstore  0.06
4          Bar  0.06


----Harbourfront East, Toronto Islands, Union Station----
         venue  freq
0  Coffee Shop  0.14
1        Hotel  0.05
2     Aquarium  0.05
3         Café  0.04
4  Pizza Place  0.04


----Humber Bay Shores, Mi

In [20]:
#Create a function that sorts the venues by frequency
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [21]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.shape

(7, 11)

In [24]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:103]

array([1, 0, 2, 0, 0, 4, 3], dtype=int32)

In [25]:
toronto_merged = df

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Post Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
30,M3K,North York,"CFB Toronto, Downsview East",43.7375,-79.4648,1,Airport,Park,Bus Stop,Electronics Store,Wine Bar,Food Court,Dance Studio,Deli / Bodega,Dessert Shop,Diner
40,M4J,East York,East Toronto,43.6853,-79.3381,0,Convenience Store,Coffee Shop,Intersection,Park,Food Court,Dance Studio,Deli / Bodega,Dessert Shop,Diner,Electronics Store
46,M4R,Central Toronto,North Toronto West,43.7154,-79.4057,2,Clothing Store,Coffee Shop,Sporting Goods Shop,Sandwich Place,Grocery Store,Gift Shop,Italian Restaurant,Fast Food Restaurant,Mexican Restaurant,Diner
59,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.6408,-79.3818,0,Coffee Shop,Hotel,Aquarium,Pizza Place,Café,Brewery,Restaurant,Bakery,Scenic Lookout,Italian Restaurant
60,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",43.6472,-79.3816,0,Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Gastropub,Gym,Deli / Bodega,Italian Restaurant,Concert Hall


In [26]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters