# Problem-1 (Question-1) How To Create Dataframes From Wikipedia Tables

In [98]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Use SHIFT+TAB keys to popup inplace code help
%config IPCompleter.greedy = True

# Output multiple statements from one input cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Customize Notebook

**table_from_top.** If the Wikipedia page has one table then use `table_from_top = 1` value. Otherwise count table number from top and replace value to get specific table.

**wikipedia_page.** Specify the wikipedia page name from where to source dataset. The CSV file will be saved with the same name.

**trace.** Set `trace = True` to trace how feature values are extracted. Does not save extracted dataset. Prefixes applied parsing/extraction rules to extracted values.

In [99]:
table_from_top = 1
wikipedia_page = 'List_of_postal_codes_of_Canada:_M'
trace = False

## Load and Parse

This section loads the Wikipedia page and parses the table data we are interested in converting to a dataset.

In [100]:
wikipedia_url = 'https://en.wikipedia.org/wiki/{}'.format(wikipedia_page)
page = requests.get(wikipedia_url)
soup = BeautifulSoup(page.content, 'lxml')
tables = soup.find_all('table', {'class': 'wikitable'})
table = tables[table_from_top - 1]

## Quick Preview

This section extracts the table header with feature or column names.

Use this section to quick preview if you have the right table in processing.

In [101]:
feature_names = []

header_row = table.find('tr')
for header in header_row.find_all('th'):
    feature_name = ' '.join(header.find_all(text=True))    
    feature_names.append(feature_name.replace('\n', ''))
print(feature_names)

['Postcode', 'Borough', 'Neighbourhood']


## Data Wrangling

This section applies data wrangling rules based on exceptions found when parsing Wikipedia tables.

- If a feature value contains a link then extract text from the link.
- Ignore text which starts with `[` square brackets.
- Ignore image links (...flags) prefix link text.
- Ignore hidden text used for IDs.

In [102]:
samples = []
sample_rows = table.find_all('tr')[1:]
for sample_row in sample_rows:
    features = []
    for feature_col in sample_row.find_all('td'):
        feature_value = ''
        text = feature_col.text.replace('\n','')
        features.append(text)
        href_tags = soup.find_all(href=True)
    features.append(feature_value)
    samples.append(dict(zip(feature_names, features)))
#sample_rows

## Preview Dataset

This section enables you to preview the parsed dataset.

In [107]:
df = pd.DataFrame(samples)
#col_list = list(df)
# use this handy way to swap the elements
#col_list[0], col_list[1],  col_list[2] = col_list[2], col_list[0], col_list[1]
# assign back, the order will now be swapped
#df.columns = col_list
final_df = df[["Postcode", "Borough", "Neighbourhood"]]
#final_df
final_df.head()
final_df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Unnamed: 0,Postcode,Borough,Neighbourhood
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor
287,M9Z,Not assigned,Not assigned


# Output Data Frame Creation
### Group the Neighbourhood based on the Postal Code

In [131]:
#Group the Neighbourhood based on the Postal Code
output_df = final_df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()
ordered_df = output_df[['Postcode','Borough', 'Neighbourhood']]

# Assign the Borough value to Neighbourhood value if the Borough has a value and the Neighbourhood has no assigned value

In [134]:
for i, row in ordered_df.iterrows():
    if(row['Borough'] != 'Not assigned' and row['Neighbourhood'] == 'Not assigned'):
        print(row['Borough'])
        row['Neighbourhood'] = row['Borough']

Queen's Park


# Display the final Output

In [136]:
#for i, row in ordered_df.iterrows():
#    print(row['Postcode'], row['Borough'], row['Neighbourhood'])
ordered_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
8,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
9,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"


# Display the shape of the Output

In [137]:
ordered_df.shape

(180, 3)

# Problem-2(Question-2)Read the Co-Ordinates of Canadian Geo Spatial Data into a Data Frame

In [150]:
coordinates_df = pd.read_csv("https://cocl.us/Geospatial_data")
coordinates_df = coordinates_df.rename(columns={'Postal Code': 'Postcode'})
#coordinates_df.head()
coordinates_df.shape

(103, 3)

# Merge the Neighbourhood Data with the Geo Spatial Data using Postal Code

In [233]:
#combined_df = pd.concat([ordered_df,coordinates_df], axis=0, ignore_index=True)
combined_df = pd.merge(ordered_df, coordinates_df, on='Postcode')
combined_df
#for i, row in combined_df.iterrows():
#    print(row['Postcode'], row['Borough'], row['Neighbourhood'],row['Latitude'],row['Longitude'])
#for i, row in combined_df.iterrows():
#    if("Toronto" in row['Borough']):
#        print(i,row['Borough'])
#print(row['Postcode'], row['Borough'], row['Neighbourhood'],row['Latitude'],row['Longitude'])

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [165]:
print('The dataframe has {} Boroughs and {} Neighborhoods.'.format(
        len(combined_df['Borough'].unique()),
        combined_df.shape[0]
    )
)

The dataframe has 11 Boroughs and 103 Neighborhoods.


# Problem-3 (Question-3) Analyze the Neighbourhood of Toronto City Using Map and Forsquare API

In [163]:
import numpy as np # library to handle data in a vectorized manner

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.19.0-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  21.64 MB/s
geopy-1.19.0-p 100% |################################| Time: 0:00:00  10.74 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  10.23 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  31.49 MB/s
vincent-0.4.4- 100% |###################

In [181]:
# create map of New York using latitude and longitude values
latitude = 43.6532
longitude = -79.3832
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

#toronto_data = combined_df[(combined_df['Borough'] == 'East Toronto') or (combined_df['Borough'] == 'Central Toronto') or (combined_df['Borough'] == 'Downtown Toronto') or (combined_df['Borough'] == 'West Toronto')].reset_index(drop=True)
#toronto_data.head()

# add markers to map
for lat, lng, borough, neighborhood in zip(combined_df['Latitude'], combined_df['Longitude'], combined_df['Borough'], combined_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

<folium.features.CircleMarker at 0x7f633a763e80>

<folium.features.CircleMarker at 0x7f633a763d68>

<folium.features.CircleMarker at 0x7f633a76ff60>

<folium.features.CircleMarker at 0x7f633a763be0>

<folium.features.CircleMarker at 0x7f633a76f0f0>

<folium.features.CircleMarker at 0x7f633a76fbe0>

<folium.features.CircleMarker at 0x7f633a7638d0>

<folium.features.CircleMarker at 0x7f633a76fe10>

<folium.features.CircleMarker at 0x7f633a76fa90>

<folium.features.CircleMarker at 0x7f633a76f860>

<folium.features.CircleMarker at 0x7f633a718240>

<folium.features.CircleMarker at 0x7f633a718668>

<folium.features.CircleMarker at 0x7f633a76fc88>

<folium.features.CircleMarker at 0x7f633a7185c0>

<folium.features.CircleMarker at 0x7f633a7185f8>

<folium.features.CircleMarker at 0x7f633a718978>

<folium.features.CircleMarker at 0x7f633a718dd8>

<folium.features.CircleMarker at 0x7f633a70f4a8>

<folium.features.CircleMarker at 0x7f633a70f748>

<folium.features.CircleMarker at 0x7f633a7186a0>

<folium.features.CircleMarker at 0x7f633a70f898>

<folium.features.CircleMarker at 0x7f633a7182e8>

<folium.features.CircleMarker at 0x7f633a70fba8>

<folium.features.CircleMarker at 0x7f633a70f2e8>

<folium.features.CircleMarker at 0x7f633a70f3c8>

<folium.features.CircleMarker at 0x7f633a728c18>

<folium.features.CircleMarker at 0x7f633a7180b8>

<folium.features.CircleMarker at 0x7f633a728588>

<folium.features.CircleMarker at 0x7f633a70f668>

<folium.features.CircleMarker at 0x7f633a728a20>

<folium.features.CircleMarker at 0x7f633a7287f0>

<folium.features.CircleMarker at 0x7f633a728160>

<folium.features.CircleMarker at 0x7f633a7289e8>

<folium.features.CircleMarker at 0x7f633a72cdd8>

<folium.features.CircleMarker at 0x7f633a728208>

<folium.features.CircleMarker at 0x7f633a72c438>

<folium.features.CircleMarker at 0x7f633a72c208>

<folium.features.CircleMarker at 0x7f633a72c668>

<folium.features.CircleMarker at 0x7f633a72cc88>

<folium.features.CircleMarker at 0x7f633a70feb8>

<folium.features.CircleMarker at 0x7f633a72c048>

<folium.features.CircleMarker at 0x7f633a72c8d0>

<folium.features.CircleMarker at 0x7f633a6d7320>

<folium.features.CircleMarker at 0x7f633a6d7668>

<folium.features.CircleMarker at 0x7f633a6d7eb8>

<folium.features.CircleMarker at 0x7f633a6d7f60>

<folium.features.CircleMarker at 0x7f633a72cba8>

<folium.features.CircleMarker at 0x7f633a72c9e8>

<folium.features.CircleMarker at 0x7f633a6d7d68>

<folium.features.CircleMarker at 0x7f633a6d7278>

<folium.features.CircleMarker at 0x7f633a6eb358>

<folium.features.CircleMarker at 0x7f633a6eb6a0>

<folium.features.CircleMarker at 0x7f633a6eb7f0>

<folium.features.CircleMarker at 0x7f633a87dd68>

<folium.features.CircleMarker at 0x7f633a72c358>

<folium.features.CircleMarker at 0x7f633a7abbe0>

<folium.features.CircleMarker at 0x7f633a6d7e80>

<folium.features.CircleMarker at 0x7f633a6eba58>

<folium.features.CircleMarker at 0x7f633a6eb5f8>

<folium.features.CircleMarker at 0x7f633a6c0438>

<folium.features.CircleMarker at 0x7f633a711470>

<folium.features.CircleMarker at 0x7f633a6d7710>

<folium.features.CircleMarker at 0x7f633a6c05c0>

<folium.features.CircleMarker at 0x7f633a6c0e48>

<folium.features.CircleMarker at 0x7f633a6c0320>

<folium.features.CircleMarker at 0x7f633a6c0588>

<folium.features.CircleMarker at 0x7f633a6c0f28>

<folium.features.CircleMarker at 0x7f633a68cb38>

<folium.features.CircleMarker at 0x7f633a7abcf8>

<folium.features.CircleMarker at 0x7f633a68c828>

<folium.features.CircleMarker at 0x7f633a6c0668>

<folium.features.CircleMarker at 0x7f633a68cbe0>

<folium.features.CircleMarker at 0x7f633a68c198>

<folium.features.CircleMarker at 0x7f633a68c978>

<folium.features.CircleMarker at 0x7f633a68cf28>

<folium.features.CircleMarker at 0x7f633a6c0b00>

<folium.features.CircleMarker at 0x7f633a68ccc0>

<folium.features.CircleMarker at 0x7f633a68c780>

<folium.features.CircleMarker at 0x7f633a685e80>

<folium.features.CircleMarker at 0x7f633a6856a0>

<folium.features.CircleMarker at 0x7f633a685da0>

<folium.features.CircleMarker at 0x7f633a68c048>

<folium.features.CircleMarker at 0x7f633a685940>

<folium.features.CircleMarker at 0x7f633a6858d0>

<folium.features.CircleMarker at 0x7f633a686400>

<folium.features.CircleMarker at 0x7f633a6855c0>

<folium.features.CircleMarker at 0x7f633a686f28>

<folium.features.CircleMarker at 0x7f633a686b70>

<folium.features.CircleMarker at 0x7f633a685b38>

<folium.features.CircleMarker at 0x7f633a685518>

<folium.features.CircleMarker at 0x7f633a6867f0>

<folium.features.CircleMarker at 0x7f633a686e48>

<folium.features.CircleMarker at 0x7f633a6a2470>

<folium.features.CircleMarker at 0x7f633a6a28d0>

<folium.features.CircleMarker at 0x7f633a6a2400>

<folium.features.CircleMarker at 0x7f633a6867b8>

<folium.features.CircleMarker at 0x7f633a6a2ef0>

<folium.features.CircleMarker at 0x7f633a6a2908>

<folium.features.CircleMarker at 0x7f633a6a2780>

<folium.features.CircleMarker at 0x7f633a6a1f60>

<folium.features.CircleMarker at 0x7f633a6a2710>

<folium.features.CircleMarker at 0x7f633a6a15f8>

<folium.features.CircleMarker at 0x7f633a685978>

In [182]:
CLIENT_ID = 'FRGY1S2AIEV00MCMMLAD4IA4E0MZN1LWXA1IG0WFMVLD0HJY' # your Foursquare ID
CLIENT_SECRET = 'B34HXESKIDXT3AT5R1KKZZJ3WOWT43VU0QUPUAAB1GGREPRQ' # your Foursquare Secret
VERSION = '20190405' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: FRGY1S2AIEV00MCMMLAD4IA4E0MZN1LWXA1IG0WFMVLD0HJY
CLIENT_SECRET:B34HXESKIDXT3AT5R1KKZZJ3WOWT43VU0QUPUAAB1GGREPRQ


# Let's explore the first neighborhood in our dataframe.

### Get the neighborhood's name.

In [186]:
combined_df['Neighbourhood'][0]

'Rouge, Malvern'

### Get the neighborhood's latitude and longitude values.

In [190]:
neighbourhood_latitude = combined_df.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = combined_df.loc[0, 'Longitude'] # neighborhood longitude value

neighbourhood_name = combined_df.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Rouge, Malvern are 43.806686299999996, -79.19435340000001.


#### Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters.

### First, let's create the GET request URL. Name your URL **url**.

In [192]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=FRGY1S2AIEV00MCMMLAD4IA4E0MZN1LWXA1IG0WFMVLD0HJY&client_secret=B34HXESKIDXT3AT5R1KKZZJ3WOWT43VU0QUPUAAB1GGREPRQ&v=20190405&ll=43.806686299999996,-79.19435340000001&radius=500&limit=100'

In [193]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ca73f8c351e3d25e00cb6c9'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4bb6b9446edc76b0d771311c-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/fastfood_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d16e941735',
         'name': 'Fast Food Restaurant',
         'pluralName': 'Fast Food Restaurants',
         'primary': True,
         'shortName': 'Fast Food'}],
       'id': '4bb6b9446edc76b0d771311c',
       'location': {'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'Morningside & Sheppard',
        'distance': 387,
        'formattedAddress': ['Toronto ON', 'Canada'],
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.80744841934756,
          'ln

In [194]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### From the Foursquare lab in the previous module, we know that all the information is in the *items* key. Before we proceed, let's borrow the **get_category_type** function from the Foursquare lab.

### Now we are ready to clean the json and structure it into a *pandas* dataframe.

In [195]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy's,Fast Food Restaurant,43.807448,-79.199056
1,Interprovincial Group,Print Shop,43.80563,-79.200378


In [196]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

2 venues were returned by Foursquare.


## 2. Explore Neighborhoods in Toronto

#### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [197]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *manhattan_venues*.

In [198]:
# type your answer here

toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Business Reply Mail Processing Centre 969 Eastern


#### Let's check the size of the resulting dataframe

In [199]:
print(toronto_venues.shape)
toronto_venues.head()

(121, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [200]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Business Reply Mail Processing Centre 969 Eastern,19,19,19,19,19,19
Studio District,38,38,38,38,38,38
The Beaches,4,4,4,4,4,4
"The Beaches West, India Bazaar",17,17,17,17,17,17
"The Danforth West, Riverdale",43,43,43,43,43,43


In [206]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 68 uniques categories.


## 3. Analyze Each Neighborhood

In [207]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,American Restaurant,Auto Workshop,Bakery,Bank,Bar,Board Shop,Bookstore,Brewery,Bubble Tea Shop,...,Sandwich Place,Seafood Restaurant,Skate Park,Smoke Shop,Spa,Sports Bar,Stationery Store,Steakhouse,Sushi Restaurant,Trail
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [208]:
toronto_onehot.shape

(121, 68)

In [209]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,American Restaurant,Auto Workshop,Bakery,Bank,Bar,Board Shop,Bookstore,Brewery,...,Sandwich Place,Seafood Restaurant,Skate Park,Smoke Shop,Spa,Sports Bar,Stationery Store,Steakhouse,Sushi Restaurant,Trail
0,Business Reply Mail Processing Centre 969 Eastern,0.052632,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.052632,...,0.0,0.0,0.052632,0.052632,0.052632,0.0,0.0,0.0,0.0,0.0
1,Studio District,0.026316,0.052632,0.0,0.052632,0.026316,0.026316,0.0,0.026316,0.026316,...,0.026316,0.026316,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0
2,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"The Beaches West, India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.058824,...,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0
4,"The Danforth West, Riverdale",0.023256,0.023256,0.0,0.023256,0.0,0.0,0.0,0.023256,0.023256,...,0.0,0.0,0.0,0.0,0.023256,0.023256,0.0,0.0,0.0,0.023256


In [210]:
toronto_grouped.shape

(5, 68)

In [211]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.05
2       Burrito Place  0.05
3       Garden Center  0.05
4              Garden  0.05


----Studio District----
                 venue  freq
0                 Café  0.11
1          Coffee Shop  0.08
2            Gastropub  0.05
3               Bakery  0.05
4  American Restaurant  0.05


----The Beaches----
               venue  freq
0        Coffee Shop  0.25
1  Health Food Store  0.25
2                Pub  0.25
3          Juice Bar  0.00
4      Movie Theater  0.00


----The Beaches West, India Bazaar----
          venue  freq
0           Gym  0.06
1  Burger Joint  0.06
2  Liquor Store  0.06
3          Park  0.06
4     Pet Store  0.06


----The Danforth West, Riverdale----
                venue  freq
0    Greek Restaurant  0.21
1         Coffee Shop  0.09
2  Italian Restaurant  0.07
3      Ice Cream Shop  0.07
4   Indian Restaurant  0.02




# Analyze Common Venues

In [212]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

# Analyze Top Venues

In [214]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Recording Studio,Farmers Market,Garden,Garden Center,Gym / Fitness Center,Comic Shop,Park,Brewery
1,Studio District,Café,Coffee Shop,Gastropub,Italian Restaurant,Bakery,American Restaurant,Bank,Bar,Bookstore,Brewery
2,The Beaches,Coffee Shop,Pub,Health Food Store,Dessert Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Trail
3,"The Beaches West, India Bazaar",Fast Food Restaurant,Pub,Ice Cream Shop,Gym,Liquor Store,Fish & Chips Shop,Movie Theater,Burrito Place,Park,Pet Store
4,"The Danforth West, Riverdale",Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Trail,Dessert Shop,Indian Restaurant,Grocery Store,Furniture / Home Store,Fruit & Vegetable Store


## 4. Cluster Neighborhoods

### Run *k*-means to cluster the neighborhood into 5 clusters.

In [215]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 4, 1, 2, 0], dtype=int32)

In [224]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Coffee Shop,Pub,Health Food Store,Dessert Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Trail
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Trail,Dessert Shop,Indian Restaurant,Grocery Store,Furniture / Home Store,Fruit & Vegetable Store
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,2,Fast Food Restaurant,Pub,Ice Cream Shop,Gym,Liquor Store,Fish & Chips Shop,Movie Theater,Burrito Place,Park,Pet Store
3,M4M,East Toronto,Studio District,43.659526,-79.340923,4,Café,Coffee Shop,Gastropub,Italian Restaurant,Bakery,American Restaurant,Bank,Bar,Bookstore,Brewery
4,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,3,Light Rail Station,Yoga Studio,Recording Studio,Farmers Market,Garden,Garden Center,Gym / Fitness Center,Comic Shop,Park,Brewery


# Resulting Clusters

In [226]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<folium.features.CircleMarker at 0x7f633c3bd6a0>

<folium.features.CircleMarker at 0x7f633c38e1d0>

<folium.features.CircleMarker at 0x7f633c3bd390>

<folium.features.CircleMarker at 0x7f633c38e390>

<folium.features.CircleMarker at 0x7f633c3bdb00>

## 5. Examine Clusters

# Cluster 1

In [227]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,East Toronto,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Trail,Dessert Shop,Indian Restaurant,Grocery Store,Furniture / Home Store,Fruit & Vegetable Store


# Cluster 2

In [228]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,1,Coffee Shop,Pub,Health Food Store,Dessert Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Trail


# Cluster 3

In [229]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,East Toronto,2,Fast Food Restaurant,Pub,Ice Cream Shop,Gym,Liquor Store,Fish & Chips Shop,Movie Theater,Burrito Place,Park,Pet Store


# Cluster 4

In [230]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,East Toronto,3,Light Rail Station,Yoga Studio,Recording Studio,Farmers Market,Garden,Garden Center,Gym / Fitness Center,Comic Shop,Park,Brewery


# Cluster 5

In [231]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,East Toronto,4,Café,Coffee Shop,Gastropub,Italian Restaurant,Bakery,American Restaurant,Bank,Bar,Bookstore,Brewery
