# Applied Data Science Capstone

This notebook hosts code for Applied Data Science Capstone project

### Week 1 - Capstone Project Notebook

In [2]:
import pandas as pd
import numpy as np

In [3]:
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


### Week 3 - Segmenting and Clustering Neighborhoods in Toronto

**Step 1**: prepare environment

In [4]:
import requests

**Step 2:** get wiki page as text

In [5]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_page = requests.get(wiki_url)

**Step 3:** extract table by class to dataframe

In [6]:
codes_df_list = pd.read_html(io = wiki_page.text, attrs = {'class': 'wikitable'})
codes_df_0 = codes_df_list[0]

In [7]:
codes_df_0.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


**Step 4:** rename column

In [8]:
codes_df_1 = codes_df_0.rename(columns = {'Postcode': 'PostalCode'})

In [9]:
codes_df_1.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


**Step 5:** remove Borough 'Not Assigned' values

In [10]:
codes_df_2 = codes_df_1[codes_df_1['Borough'] != 'Not assigned']

In [11]:
codes_df_2.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


**Step 6:** fill 'Not assigned' neighborhood with borough

In [12]:
codes_df_2['Neighbourhood'] = np.where(codes_df_2['Neighbourhood'] == 'Not assigned', codes_df_2['Borough'], codes_df_2['Neighbourhood'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [13]:
codes_df_2.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


**Step 7:** combine neighborhoods with same postal code area

In [14]:
aggr_df = codes_df_2.groupby(['PostalCode', 'Borough'])['Neighbourhood'].apply(lambda v: ', '.join(v)).reset_index()

**Step 8:** check result dataframe

In [15]:
aggr_df.shape

(103, 3)

**Step 9:** enhance environment

In [16]:
import sys
!{sys.executable} -m pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 6.6MB/s ta 0:00:011
[?25hCollecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [17]:
import geocoder

**Step 10:** try to get data from geocoder

In [18]:
def get_lat_lon(postal_code):
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    return [lat_lng_coords[0], lat_lng_coords[1]]
#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

In [19]:
lat_list = []
lon_list = []
pc_list = aggr_df['PostalCode'].tolist()
for pc in pc_list:
    lat_lon = get_lat_lon(pc)
    lat.apend(lat_lon[0])
    lon.append(lat_lon[1])

KeyboardInterrupt: 

**Step 11:** geocoder does not work, use csv data

In [20]:
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')

**Step 12:** prepare geo data to merge into neigborhoods data

In [21]:
geo_df_merge = geo_df.rename(columns = {'Postal Code': 'PostalCode'}, )

In [22]:
geo_df_merge.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


**Step 13:** merge geo data

In [23]:
data_with_geo = pd.merge(aggr_df, geo_df_merge[['PostalCode', 'Latitude', 'Longitude']], on = 'PostalCode')

**Step 14:** check data

In [24]:
data_with_geo.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [25]:
data_with_geo['Borough'].unique().tolist()

['Scarborough',
 'North York',
 'East York',
 'East Toronto',
 'Central Toronto',
 'Downtown Toronto',
 'York',
 'West Toronto',
 'Mississauga',
 'Etobicoke',
 "Queen's Park"]

In [26]:
# The code was removed by Watson Studio for sharing.

**Step 15:** envance environment vol.2

In [27]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.21.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

In [28]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         673 KB

The following NEW packages will be INSTALLED:

    altair:  4.0.1-py_0 conda-forge
    branca:  0.3.1-py_0 conda-forge
    folium:  0.5.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge


Downloading and Extracting Packages
vincent-0.4.4        | 28 KB     | #####

**Step 16:** get Toronto coordinates

In [29]:
geolocator = Nominatim(user_agent='foursquare_agent')
location = geolocator.geocode('Toronto, Canada')
tor_lat = location.latitude
tor_lon = location.longitude
print(tor_lat, tor_lon)

43.653963 -79.387207


**Step 17:** define color map

In [30]:
bor_col = {
     'East Toronto': 'red',
     'Central Toronto': 'green',
     'Downtown Toronto': 'blue',
     'West Toronto': 'yellow'
}

**Step 18:** select only Toronto's boroughs

In [31]:
tor_bor = data_with_geo[data_with_geo['Borough'].str.contains('Toronto')]

In [32]:
tor_bor.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [114]:
tor_bor.shape

(39, 5)

In [37]:
tor_map = folium.Map(location=[tor_lat, tor_lon], zoom_start=11)

for lat, lng, borough in zip(tor_bor['Latitude'], tor_bor['Longitude'], tor_bor['Borough']):
    label = borough
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=10,
        popup=label,
        color=bor_col.get(borough),
#         fill=True,
#         fill_color=bor_col.get(borough),
#         fill_opacity=0.5,
        parse_html=False).add_to(tor_map)
    
tor_map

**Step 19:** additional data

In [38]:
# 1 degree of latitude ~ 111 km
lat_dist = 111000
# 1 degree of longitude ~ 81 km at Toronto latitude
lon_dist = 81000

**Step 20:** cluster neighborhoods

cluster and visualize neighborhoods of Toronto  
as it seen from map, we have 4 boroughs

In [40]:
k_tor_cl = 4

try to get borough's coordinates to ease clustering

In [54]:
bor_lat_lon = []
for bor in bor_col:
    location = geolocator.geocode(bor + ', Toronto, Canada')
    lat = location.latitude
    lon = location.longitude
    bor_lat_lon.append({'Borough': bor, 'Latitude': lat, 'Longitude': lon})
df_tor_bor = pd.DataFrame(bor_lat_lon)
df_tor_bor.head()

Unnamed: 0,Borough,Latitude,Longitude
0,East Toronto,43.62479,-79.393492
1,Central Toronto,43.653963,-79.387207
2,Downtown Toronto,43.656322,-79.380916
3,West Toronto,43.653963,-79.387207


In [57]:
tor_bor_map = folium.Map(location=[tor_lat, tor_lon], zoom_start=11)

for lat, lng, borough in zip(df_tor_bor['Latitude'], df_tor_bor['Longitude'], df_tor_bor['Borough']):
    label = borough
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=50,
        popup=label,
        color=bor_col.get(borough),
#         fill=True,
#         fill_color=bor_col.get(borough),
#         fill_opacity=0.5,
        parse_html=False).add_to(tor_bor_map)
    
tor_bor_map

after visualization we can see that geo data is misplacing borougs, so we use clustering and take these as initial points

**Step 21:** additional functions

In [109]:
import math

def get_distance(lat1, lon1, lat2, lon2):
    lat_delta = lat1 - lat2
    lon_delta = lon1 - lon2
    y = lat_delta * lat_dist
    x = lon_delta * lon_dist
    dist = math.sqrt(x * x + y * y)
    return(dist)

def assign_members(lat, lon, nbh, test_df):
    dist_min = 1000000
    bor = ''
    for index, row in df_tor_bor.iterrows():
        dist = get_distance(row['Latitude'], row['Longitude'], lat, lon)
        if dist < dist_min:
            dist_min = dist
            bor = row['Borough']
    test_df.loc[len(test_df)] = [bor, nbh, lat, lon]

**Step 23:** assign beigborhoods to boroughs using geo k-centers

In [112]:
knn_df = pd.DataFrame(columns = ['Borough', 'Neighbourhood', 'Latitude', 'Longitude'])
for index, row in tor_bor.reset_index().iterrows():
    assign_members(row['Latitude'], row['Longitude'], row['Neighbourhood'], knn_df)
knn_df.head()
# knn_df.shape

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Downtown Toronto,The Beaches,43.676357,-79.293031
1,Downtown Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,Downtown Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,Downtown Toronto,Studio District,43.659526,-79.340923
4,Downtown Toronto,Lawrence Park,43.72802,-79.38879


**Step 24:** visualize

In [113]:
k_temp_map = folium.Map(location=[tor_lat, tor_lon], zoom_start=11)

for lat, lng, borough in zip(knn_df['Latitude'], knn_df['Longitude'], knn_df['Borough']):
    label = borough
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=10,
        popup=label,
        color=bor_col.get(borough),
#         fill=True,
#         fill_color=bor_col.get(borough),
#         fill_opacity=0.5,
        parse_html=False).add_to(k_temp_map)
    
k_temp_map

we will use mean coordinates to determine borough cluster centers

In [123]:
tor_bor_lat_lon = tor_bor.drop(['PostalCode', 'Neighbourhood'], axis = 1)
tor_bor_lat_lon_mean = tor_bor_lat_lon.groupby('Borough').mean().reset_index()
tor_bor_lat_lon_mean.head()

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.654597,-79.383972
2,East Toronto,43.669436,-79.324654
3,West Toronto,43.652653,-79.44929


put new data on map

In [124]:
tor_bor_lat_lon = folium.Map(location=[tor_lat, tor_lon], zoom_start=11)

for lat, lng, borough in zip(tor_bor_lat_lon_mean['Latitude'], tor_bor_lat_lon_mean['Longitude'], tor_bor_lat_lon_mean['Borough']):
    label = borough
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=10,
        popup=label,
        color=bor_col.get(borough),
#         fill=True,
#         fill_color=bor_col.get(borough),
#         fill_opacity=0.5,
        parse_html=False).add_to(tor_bor_lat_lon)
    
tor_bor_lat_lon

put new centers to work

In [127]:
def assign_members_2(lat, lon, nbh, test_df):
    dist_min = 1000000
    bor = ''
    for index, row in tor_bor_lat_lon_mean.iterrows():
        dist = get_distance(row['Latitude'], row['Longitude'], lat, lon)
        if dist < dist_min:
            dist_min = dist
            bor = row['Borough']
    test_df.loc[len(test_df)] = [bor, nbh, lat, lon]

test_df_mean = pd.DataFrame(columns = ['Borough', 'Neighbourhood', 'Latitude', 'Longitude'])
for index, row in tor_bor.iterrows():
    assign_members_2(row['Latitude'], row['Longitude'], row['Neighbourhood'], test_df_mean)
test_df_mean.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,East Toronto,The Beaches,43.676357,-79.293031
1,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,East Toronto,Studio District,43.659526,-79.340923
4,Central Toronto,Lawrence Park,43.72802,-79.38879


try on map

In [128]:
temp_map_2 = folium.Map(location=[tor_lat, tor_lon], zoom_start=11)

for lat, lng, borough in zip(test_df_mean['Latitude'], test_df_mean['Longitude'], test_df_mean['Borough']):
    label = borough
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=10,
        popup=label,
        color=bor_col.get(borough),
#         fill=True,
#         fill_color=bor_col.get(borough),
#         fill_opacity=0.5,
        parse_html=False).add_to(temp_map_2)
    
temp_map_2

as we can see - we can use geographical centering to get best k-center  
basicaly it conclusion derived from real world - neigbourhoods are usually grouped in normal way  
for more accurate data i would prefer to use coordinates of each neigborhood and aggregate them to boroughs data

let'see what we can get from foursquare

In [129]:
dt_df = tor_bor[data_with_geo['Borough'] == 'Downtown Toronto']
dt_df.head()

  if __name__ == '__main__':


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
50,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
51,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
52,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
53,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
54,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


try to find some hospitals

In [144]:
url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&query=hospital'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    43.679563, 
    -79.377529, 
    5000, 
    1000)
            
results = requests.get(url).json()['response']['venues']
results

[{'id': '4ad4c064f964a5206ef820e3',
  'name': 'The Hospital for Sick Children (SickKids)',
  'location': {'address': '555 University Ave.',
   'crossStreet': 'at Gerrard St.',
   'lat': 43.657498668962646,
   'lng': -79.3865121609307,
   'labeledLatLngs': [{'label': 'display',
     'lat': 43.657498668962646,
     'lng': -79.3865121609307}],
   'distance': 2560,
   'postalCode': 'M5G 1X8',
   'cc': 'CA',
   'city': 'Toronto',
   'state': 'ON',
   'country': 'Canada',
   'formattedAddress': ['555 University Ave. (at Gerrard St.)',
    'Toronto ON M5G 1X8',
    'Canada']},
  'categories': [{'id': '4bf58dd8d48988d196941735',
    'name': 'Hospital',
    'pluralName': 'Hospitals',
    'shortName': 'Hospital',
    'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/building/medical_',
     'suffix': '.png'},
    'primary': True}],
  'referralId': 'v-1581337679',
  'hasPerk': False},
 {'id': '4af0615cf964a5208cdb21e3',
  'name': "Women's College Hospital",
  'location': {'address': '76 

In [145]:
hosp_map = folium.Map(location=[tor_lat, tor_lon], zoom_start=11)

for venue in results:
    label = venue['name']
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [venue['location']['lat'], venue['location']['lng']],
        radius=2,
        popup=label,
        color='red',
#         fill=True,
#         fill_color=bor_col.get(borough),
#         fill_opacity=0.5,
        parse_html=False).add_to(hosp_map)
    
hosp_map

wrap call into function

In [146]:
def call_api(lat, lon, radius = 1000, limit = 100):
    url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        lon, 
        radius, 
        limit)
            
    return(requests.get(url).json()['response']['venues'])

get data for Downtown Toronto

In [176]:
dt_venues_df = pd.DataFrame(columns = ['Name', 'Category', 'Neighbourhood'])
for index, row in dt_df.iterrows():
    responces = call_api(row['Latitude'], row['Longitude'])
    for resp in responces:
        cat = resp['categories']
        cat_name = 'Unknown'
        if len(cat) == 0:
            cat_name = 'Unknown'
        else:
            cat_name = cat[0]['name']
        dt_venues_df.loc[len(dt_venues_df)] = [resp['name'], cat_name, row['Neighbourhood']]
dt_venues_df.head()

Unnamed: 0,Name,Category,Neighbourhood
0,Mooredale House,Building,Rosedale
1,Park Drive Reservation Lands,Park,Rosedale
2,10 Lamport Ave,Residential Building (Apartment / Condo),Rosedale
3,Milkman's Lane,Trail,Rosedale
4,Le Germain Hotel Toronto Mercer,Hotel,Rosedale


In [177]:
dt_venues_df.shape

(1770, 3)

peek into receieved data

In [175]:
dt_venues_df[dt_venues_df['Category'] != 'Unknown'].groupby(['Category','Neighbourhood']).size().reset_index().rename(columns = {0: 'count'}).sort_values(by='count', ascending=False).head(20)

Unnamed: 0,Category,Neighbourhood,count
501,Hospital,Central Bay Street,24
702,Office,St. James Town,21
693,Office,"Commerce Court, Victoria Hotel",21
697,Office,Harbourfront,17
694,Office,"Design Exchange, Toronto Dominion Centre",17
687,Office,Berczy Park,17
802,Residential Building (Apartment / Condo),Church and Wellesley,15
703,Office,Stn A PO Boxes 25 The Esplanade,12
7,Airport Gate,"CN Tower, Bathurst Quay, Island airport, Harbo...",12
444,Government Building,Queen's Park,12
