# PART 1

Import libraries needed for project

In [1]:
import pandas as pd
import numpy as np
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup


Scrape postal code table from wikipedia page

In [2]:
url= "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, 'lxml')
post_table= soup.find('table', class_='wikitable sortable')
post_table


<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

Isolate rows and cells from table and create 3 columns

In [3]:
A=[]
B=[]
C=[]

for row in post_table.find_all('tr'):
    cells=row.find_all('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))
        
    

Create dataframe with 3 columns

In [5]:
df=pd.DataFrame(A,columns=['PostalCode'])
df['Borough']=B
df['Neighborhood']=C
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


Remove Not assigned rows in the Borough column

In [28]:
df2=df[df.Borough !='Not assigned']
df2.head(10)


Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Combine Neighborhood names into one row separated by commas

In [7]:
df3=df2.groupby(['PostalCode', 'Borough'],sort=False).agg(lambda x:','.join(x))
df3.reset_index(['PostalCode', 'Borough'], inplace=True)
df3.head(15)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned\n
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North\n
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson\n,Garden District\n"


Demonstrate the number of rows in dataframe (103)

In [37]:
df3.shape[0]


103

# Part 2

In [9]:
import json
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    branca-0.3.1               |             py_0          25 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.3 MB

The following NEW packages will be 

Download postal code, lat, long data from csv file

In [10]:
url_ll = ("http://cocl.us/Geospatial_data")
df_ll=pd.read_csv(url_ll)
df_ll.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge postalcode/neighborhood dataframe with lat/long dataframe

In [36]:
df_combo=pd.merge(df3, df_ll, left_on= 'PostalCode', right_on='Postal Code')
df_combo

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",M5A,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Not assigned\n,M7A,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,M9A,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
7,M3B,North York,Don Mills North\n,M3B,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",M4B,43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson\n,Garden District\n",M5B,43.657162,-79.378937


# Part 3

Create dataframe for mapping

In [13]:
df_B= df_combo[['Borough', 'Latitude', 'Longitude']]
df_B

Unnamed: 0,Borough,Latitude,Longitude
0,North York,43.753259,-79.329656
1,North York,43.725882,-79.315572
2,Downtown Toronto,43.654260,-79.360636
3,North York,43.718518,-79.464763
4,Queen's Park,43.662301,-79.389494
5,Etobicoke,43.667856,-79.532242
6,Scarborough,43.806686,-79.194353
7,North York,43.745906,-79.352188
8,East York,43.706397,-79.309937
9,Downtown Toronto,43.657162,-79.378937


Map Toronto boroughs on map

In [35]:
map_toronto=folium.Map(location=[43.653206, -79.394420], zoom_start=11)
map_toronto

for lat, lng, label in zip(df_B['Latitude'], df_B['Longitude'], df_B['Borough']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto

Connect to Foursquare API

In [15]:
CLIENT_ID = 'F5HUSRWUDQMSKYPPLP3RI5FRTWWF5N0CICTY5P54PGAJJANB' 
CLIENT_SECRET = 'AFD1KYH2P2K0Q3MGK1RBIHK5XMJOWYHOC5DLSVL0UCFWIEXA' 
VERSION = '20180605' 

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: F5HUSRWUDQMSKYPPLP3RI5FRTWWF5N0CICTY5P54PGAJJANB
CLIENT_SECRET:AFD1KYH2P2K0Q3MGK1RBIHK5XMJOWYHOC5DLSVL0UCFWIEXA


Explore venues in Scarborough borough

In [16]:
LIMIT = 30
radius = 2000
borough_latitude = 43.815
borough_longitude = -79.284
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    borough_latitude, 
    borough_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=F5HUSRWUDQMSKYPPLP3RI5FRTWWF5N0CICTY5P54PGAJJANB&client_secret=AFD1KYH2P2K0Q3MGK1RBIHK5XMJOWYHOC5DLSVL0UCFWIEXA&v=20180605&ll=43.815,-79.284&radius=2000&limit=30'

In [17]:
import requests
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5da75593342adf0038f67b44'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Scarborough',
  'headerFullLocation': 'Scarborough',
  'headerLocationGranularity': 'city',
  'totalResults': 92,
  'suggestedBounds': {'ne': {'lat': 43.833000018000014,
    'lng': -79.25910125725785},
   'sw': {'lat': 43.79699998199998, 'lng': -79.30889874274216}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4baf981cf964a5203d0e3ce3',
       'name': 'Fahmee Bakery & Jamaican Foods',
       'location': {'address': '119 Montezuma Trail',
        'lat': 43.81016961981095,
        'lng': -79.28011274320716,
        'labeledLatLngs': [{'label': 'display',
          'lat'

Identify venues in Scarborough borough

In [19]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [20]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) 

filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Fahmee Bakery & Jamaican Foods,Caribbean Restaurant,43.81017,-79.280113
1,Jim Chai Kee Wonton Noodle 沾仔記,Noodle House,43.814783,-79.293138
2,DaanGo Cake Lab,Bakery,43.809334,-79.290442
3,Lotus Pond Vegetarian Restaurant 蓮花素食,Vegetarian / Vegan Restaurant,43.819421,-79.294682
4,Fishman Lobster Clubhouse Restaurant 魚樂軒,Chinese Restaurant,43.801909,-79.295409


In [21]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

30 venues were returned by Foursquare.


Cluster Toronto neighborhoods using Kmeans

In [22]:
# set number of clusters
kclusters = 7

toronto_grouped_clustering = df_B.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 2, 6, 1, 6, 0, 4, 3, 2, 6], dtype=int32)