# IBM Ciencia de Datos
### Agrupación y Segmentación de Vecindarios en la Ciudad de Toronto, Canadá

Realizado por Paola Jiménez

In [52]:
# Importando librerías esenciales
import pandas as pd
import numpy as np

### Parte 1 - Creación de marco de datos de vecindarios de Toronto

In [53]:
# Importamos librerías necesarias para obtener información 
import requests
from bs4 import BeautifulSoup

# Scrape the Wikipedia table of Canadian postal codes starting with "M"
# and turn it into a (Postal Code, Borough, Neighborhood) dataframe.
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(URL, timeout=30)  # bounded wait so the cell cannot hang forever
page.raise_for_status()  # stop on HTTP errors instead of parsing an error page
soup = BeautifulSoup(page.content, "html.parser")
contentTable = soup.find('table')
if contentTable is None:
    raise ValueError("No <table> element found at " + URL)

# Text of every <b> tag (postal code) and every <span> tag (borough + neighborhoods).
# Each span text is split on "(": part 0 is the borough, part 1 the neighborhood list.
codes = [tag.get_text() for tag in contentTable.find_all('b')]
boroughs_neighborhoods = [tag.get_text().split("(") for tag in contentTable.find_all('span')]
if len(boroughs_neighborhoods) < len(codes):
    raise ValueError(
        "Found {} postal codes but only {} borough/neighborhood cells".format(
            len(codes), len(boroughs_neighborhoods)))

# Collect one record per assigned postal code
records = []
for code, parts in zip(codes, boroughs_neighborhoods):
    borough = parts[0]
    if borough == "Not assigned":  # only keep valid entries
        continue
    if len(parts) < 2:
        raise ValueError("No neighborhood list found for postal code " + code)
    neighborhood = (
        parts[1]
        .replace(")", "")    # drop the closing parenthesis
        .replace(" /", ",")  # slash-separated names become comma-separated
    )
    records.append((code, borough, neighborhood))

# Build the dataframe in one step instead of growing arrays row by row
df = pd.DataFrame(records, columns=['Postal Code', 'Borough', 'Neighborhood'])

# Array views of the three columns, kept under their original names
codes_array = np.array([rec[0] for rec in records])
boroughs_array = np.array([rec[1] for rec in records])
neighborhoods_array = np.array([rec[2] for rec in records])

df.head()


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [54]:
# Sanity check: list any rows whose neighborhood is still "Not assigned"
unassigned_mask = df["Neighborhood"].eq("Not assigned")
df.loc[unassigned_mask]

Unnamed: 0,Postal Code,Borough,Neighborhood


In [55]:
# Display the (rows, columns) tuple of the dataframe as a quick size check
df.shape

(103, 3)

### Parte 2 - Incorporación de las coordenadas geográficas

In [56]:
# Load the contents of the local .csv file into a dataframe
df_geo = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')

# Preview the first five rows
df_geo.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [57]:
# Join both dataframes on the postal code to attach latitude/longitude.
# Coordinate columns left over from a previous run of this cell are dropped
# first; otherwise a re-run would merge again and produce
# 'Latitude_x'/'Latitude_y' style duplicates.
df = (
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
      .merge(df_geo[['Postal Code', 'Latitude', 'Longitude']],
             on='Postal Code',
             how='inner')  # pandas default, made explicit: unmatched postal codes are dropped
)

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


### Parte 3 - Exploración y agrupación de los vecindarios de Toronto

In [58]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Importando Matplotlib y otros módulos
import matplotlib.cm as cm
import matplotlib.colors as colors

# Importando k-Means 
from sklearn.cluster import KMeans

# Importando folium para gráficos
import folium


In [59]:
# Keep only the rows whose borough name contains "Toronto"
toronto_mask = df['Borough'].str.contains("Toronto")
df_TO = df.loc[toronto_mask].reset_index(drop=True)

df_TO.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [60]:
# Look up the coordinates of the city of Toronto with the Nominatim geocoder
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on the attribute access below.
    raise ValueError("Nominatim returned no result for address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('Las coordenadas geográficas de Toronto son {}, {}.'.format(latitude, longitude))

Las coordenadas geográficas de Toronto son 43.6534817, -79.3839347.


In [61]:
# Create a map of Toronto centred on the geocoded latitude/longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Add one circle marker per postal-code area, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(df_TO['Latitude'], df_TO['Longitude'], df_TO['Borough'], df_TO['Neighborhood']):
    # parse_html belongs to the Popup (it escapes the label text); it is not a
    # CircleMarker option, so it is only passed here.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

In [62]:
# Feature matrix for clustering: latitude and longitude only.
# Columns are selected by name rather than by position so that re-running this
# cell after the 'Cluster' column has been added to df_TO does not feed the
# cluster labels back in as a feature, and so the array is numeric, not object.
X = df_TO[['Latitude', 'Longitude']].to_numpy()
X

array([[43.6542599, -79.3606359],
       [43.6571618, -79.3789371],
       [43.6514939, -79.3754179],
       [43.6763574, -79.2930312],
       [43.6447708, -79.3733064],
       [43.6579524, -79.3873826],
       [43.669542, -79.4225637],
       [43.6505712, -79.3845675],
       [43.6690051, -79.4422593],
       [43.685347, -79.3381065],
       [43.6408157, -79.3817523],
       [43.6479267, -79.4197497],
       [43.6795571, -79.352188],
       [43.6471768, -79.3815764],
       [43.6368472, -79.4281914],
       [43.6689985, -79.3155716],
       [43.6481985, -79.3798169],
       [43.6595255, -79.340923],
       [43.7280205, -79.3887901],
       [43.7116948, -79.4169356],
       [43.7127511, -79.3901975],
       [43.6969476, -79.4113072],
       [43.6616083, -79.4647633],
       [43.7153834, -79.4056784],
       [43.6727097, -79.4056784],
       [43.6489597, -79.456325],
       [43.7043244, -79.3887901],
       [43.6626956, -79.4000493],
       [43.6515706, -79.4844499],
       [43.6895743,

Empezamos el proceso de segmentación con k-Means usando cuatro clústeres.

In [63]:
# Number of clusters
kclusters = 4

# Run k-Means clustering on the coordinate matrix.
# n_init is pinned because its default differs between scikit-learn releases;
# together with random_state this keeps the labels reproducible across versions.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(X)

# Inspect the labels assigned to the first ten rows
kmeans.labels_[:10]

array([1, 1, 1, 3, 1, 1, 2, 1, 2, 3])

In [64]:
# Attach the k-Means label of each row as a new 'Cluster' column
df_TO = df_TO.assign(Cluster=kmeans.labels_)

df_TO.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,3
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,1


In [65]:
# Marker colour for each cluster label (index = label)
rainbow = ['blue','green','yellow','red']

# Draw the clusters on a fresh map. Adding them to map_toronto would stack them
# on top of the existing markers and duplicate them every time this cell is re-run.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# Add one marker per postal-code area, coloured by its cluster
for lat, lon, poi, cluster in zip(df_TO['Latitude'], df_TO['Longitude'], df_TO['Neighborhood'], df_TO['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters