# Segmenting and Clustering Neighborhoods in Toronto (p3)

### Import the necessary librarys

In [20]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Get the html from the wikipedia page with the postal codes of Canada

In [21]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [3]:
table = soup.find('table')

In [22]:
source.status_code

200

### Scraping the table from the website and put it into a dataframe

In [23]:
soup = BeautifulSoup(source.content,'lxml')
type(soup)

bs4.BeautifulSoup

In [24]:
table = soup.find_all('table')[0]

In [25]:
df = pd.read_html(str(table))

In [26]:
dataframe = df[0]

In [27]:
dataframe.columns.values

array([0, 1, 2])

In [28]:
dataframe.columns = dataframe.iloc[0]
dataframe.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [29]:
dataframe = dataframe.reindex(dataframe.index.drop(0))

### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned

In [30]:
dataframe = dataframe[dataframe.Borough != 'Not assigned']

In [31]:
print(dataframe.shape)
dataframe.head()

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [32]:
dataframe = dataframe.reset_index(drop=True)
print(dataframe.shape)
dataframe.head()

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### Combine the rows that has the same postal code, and put the neighbours in the same cell.

In [33]:
#a = dataframe['Neighbourhood'].groupby(dataframe['Postcode']).apply(lambda x: "{%s}" %', '.join(x))
a = dataframe['Neighbourhood'].groupby([dataframe['Postcode'],dataframe['Borough']]).apply(lambda x: ', '.join(x))
a.head()

Postcode  Borough    
M1B       Scarborough                            Rouge, Malvern
M1C       Scarborough    Highland Creek, Rouge Hill, Port Union
M1E       Scarborough         Guildwood, Morningside, West Hill
M1G       Scarborough                                    Woburn
M1H       Scarborough                                 Cedarbrae
Name: Neighbourhood, dtype: object

In [34]:
df = pd.DataFrame(a).reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### If a cell has borough but a Not Assigned neighborhood, then the neighborhood will be the same as the borough

In [35]:
a = df[df.Neighbourhood == 'Not assigned'].index
a

Int64Index([85], dtype='int64')

In [36]:
df.loc[a,'Neighbourhood'] = df.loc[a,'Borough']

### Print the number of rows of your dataframe.

In [37]:
df.shape

(103, 3)

### Part 2 Add Latitude and longitude to the dataframe

In [39]:
df_ = pd.read_csv('http://cocl.us/Geospatial_data')

In [40]:
df_.shape

(103, 3)

In [41]:
df_.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Explore and cluster the neighborhoods in Toronto.

In [45]:
import numpy as np # library to handle data in a vectorized manner
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.19.0-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  23.94 MB/s
geopy-1.19.0-p 100% |################################| Time: 0:00:00  34.72 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  50.18 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  36.30 MB/s
vincent-0.4.4- 100% |###################

### Work with only boroughs that contain the word Toronto

In [46]:
df = df[df['Borough'].str.contains('Toronto')]
df.reset_index()

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,37,M4E,East Toronto,The Beaches
1,41,M4K,East Toronto,"The Danforth West, Riverdale"
2,42,M4L,East Toronto,"The Beaches West, India Bazaar"
3,43,M4M,East Toronto,Studio District
4,44,M4N,Central Toronto,Lawrence Park
5,45,M4P,Central Toronto,Davisville North
6,46,M4R,Central Toronto,North Toronto West
7,47,M4S,Central Toronto,Davisville
8,48,M4T,Central Toronto,"Moore Park, Summerhill East"
9,49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi..."


### Create a map of Toronto with neighborhoods superimposed on top

In [1]:
latitude = 43.657952
longitude = -79.387383

### Define Foursquare Credentials and Version

In [4]:
CLIENT_ID = 'P1E2EGPIORWJ2JCIUBTJ42T41UGN1YSGGJGQONGBJMPZREBX' # your Foursquare ID
CLIENT_SECRET = 'D1PT5043NRXZFD2MYHDWLAAQOBF2IWCL5UQ0AKWEVFNTAFI4' # your Foursquare Secret
VERSION = '20181124' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: P1E2EGPIORWJ2JCIUBTJ42T41UGN1YSGGJGQONGBJMPZREBX
CLIENT_SECRET:D1PT5043NRXZFD2MYHDWLAAQOBF2IWCL5UQ0AKWEVFNTAFI4


In [5]:
LIMIT = 100
radius = 500

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [16]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]