In [1]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import numpy as np

In [81]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
[K     |████████████████████████████████| 102kB 7.3MB/s ta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


In [70]:
wikipage = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").content

soup = bs(wikipage, 'html')

data = str(soup.find('table', {'class': 'wikitable'}))
data = data.split('<tr>')[1:]
data = [[phrase for phrase in row.replace('<td>', '').replace('<th>', '').split('\n') if (phrase != '' and '<' not in phrase)] for row in data]

colNames = data.pop(0)

In [71]:
df = pd.DataFrame(data)
df.columns = colNames
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [72]:
df['Borough'].replace(to_replace=['Not assigned'], value=[np.nan],inplace=True)
df['Neighborhood'].replace(to_replace=['Not assigned'], value=[np.nan],inplace=True)
df.dropna(inplace=True)

In [73]:
geospatial_data = requests.get('https://raw.githubusercontent.com/max-herman/Coursera_Capstone/master/Geospatial_Coordinates.csv').content.decode("utf-8")
geospatial_data = geospatial_data.split('\r\n')[1:]
geospatial_data = {phrase.split(',')[0]: phrase.split(',')[1] + "," + phrase.split(',')[2] for phrase in geospatial_data}

In [74]:
df['location'] = df['Postal Code'].map(geospatial_data)

df[['latitude', 'longitude']] = df["location"].str.split(",", n = 1, expand = True)
df.drop(['location'], axis=1, inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,latitude,longitude
2,M3A,North York,Parkwoods,43.7532586,-79.3296565
3,M4A,North York,Victoria Village,43.7258823,-79.3155716
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6542599,-79.3606359
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.4647633
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623015,-79.3894938


In [75]:
from sklearn.cluster import KMeans

kmeans_data = df
removed_data = df[['Neighborhood', 'Borough', 'Postal Code']]
kmeans_data = kmeans_data.drop(['Neighborhood', 'Borough', 'Postal Code'], axis=1, inplace=True)
# set number of clusters
k = 5

# run k-means clustering
kmeans = KMeans(n_clusters=k, random_state=0).fit(df)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 4, 2, 3, 2, 1, 0, 4, 4, 2], dtype=int32)

In [76]:
# add clustering labels
df.insert(0, 'Cluster Labels', kmeans.labels_)

df = df.join(removed_data)

Unnamed: 0,Cluster Labels,latitude,longitude,Neighborhood,Borough,Postal Code
2,4,43.7532586,-79.3296565,Parkwoods,North York,M3A
3,4,43.7258823,-79.3155716,Victoria Village,North York,M4A
4,2,43.6542599,-79.3606359,"Regent Park, Harbourfront",Downtown Toronto,M5A
5,3,43.718518,-79.4647633,"Lawrence Manor, Lawrence Heights",North York,M6A
6,2,43.6623015,-79.3894938,"Queen's Park, Ontario Provincial Government",Downtown Toronto,M7A


In [79]:
df.head()

Unnamed: 0,Cluster Labels,latitude,longitude,Neighborhood,Borough,Postal Code
2,4,43.7532586,-79.3296565,Parkwoods,North York,M3A
3,4,43.7258823,-79.3155716,Victoria Village,North York,M4A
4,2,43.6542599,-79.3606359,"Regent Park, Harbourfront",Downtown Toronto,M5A
5,3,43.718518,-79.4647633,"Lawrence Manor, Lawrence Heights",North York,M6A
6,2,43.6623015,-79.3894938,"Queen's Park, Ontario Provincial Government",Downtown Toronto,M7A


In [85]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df['latitude'], df['longitude'], df['Postal Code'], df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

NameError: name 'Nominatim' is not defined