# Segmenting and Clustering Neighborhoods in Toronto

## Question 1: Webscraping

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

#### Import text from wikipedia

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')


#### Convert text into readable format

In [3]:
# Get all the details
tabulka = soup.find("table", {"class" : "wikitable sortable"})
tabulka.prettify()
gdp_table_data = tabulka.tbody.find_all("tr") 

#### Store data in list

In [4]:
table_data = []
data = {}
for item in gdp_table_data:
    # Get all the headings of Lists
    headings = []
    for td in gdp_table_data[0].find_all("th"):
        # Extract the headings and remove whitespaces
        headings.append(td.text.replace('\n', ' ').strip())

    # Get all the rows of table
    t_row = {}
    row_data=[]
    for tr in item.find_all("td"):
        for data in tr:
            text = data.replace('\n', ' ').strip()
            row_data.append(text)
    # Append to table    
    table_data.append(row_data)

#### Convert text into DataFrame

In [5]:
# Convert to table and drop the first row
table= pd.DataFrame(table_data,columns=headings)
table = table.iloc[1:]

#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.


In [6]:
table = table[table['Borough']!='Not assigned']

#### More than one neighborhood can exist in one postal code area. 
 For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: 
Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a 
comma as shown in row 11 in the above table.

In [7]:
table = table.groupby('Postal code').agg(lambda x : ','.join(set(x))).reset_index()
table.Neighborhood = table.Neighborhood.str.replace('/',',')


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [8]:
table.loc[table['Neighborhood']=='Not assigned','Neighborhood'] = table['Borough']

#### Shape of dataframe

In [9]:
table.shape

(103, 3)

## Question 2: Geocoder

#### Obtain the geocoding for all the address

In [10]:
# Read the coordinates
GeoCoding = pd.read_csv('C:\\Users\\Muthumani\\Desktop\\Coursera_Capstone\\Geospatial_Coordinates.csv')

In [11]:
# Merge the table and Geo coding to get the values
table_Geocoded=pd.merge(table,GeoCoding,left_on='Postal code',right_on='Postal Code')


In [12]:
table_Geocoded

Unnamed: 0,Postal code,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,M1J,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",M1K,43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",M1L,43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",M1M,43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",M1N,43.692657,-79.264848


## Question 3: Cluster analysis

In [13]:
import folium

#### Filter the records to contain only those of Toronto

In [14]:
# Filter the data for Toronto
toronto_data = table_Geocoded[table_Geocoded['Borough'].str.contains('Toronto')]
toronto_data

Unnamed: 0,Postal code,Borough,Neighborhood,Postal Code,Latitude,Longitude
37,M4E,East Toronto,The Beaches,M4E,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West , Riverdale",M4K,43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar , The Beaches West",M4L,43.668999,-79.315572
43,M4M,East Toronto,Studio District,M4M,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,M4N,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,M4P,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,M4R,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,M4S,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park , Summerhill East",M4T,43.689574,-79.38316
49,M4V,Central Toronto,"Summerhill West , Rathnelly , South Hill , For...",M4V,43.686412,-79.400049


#### Create map to find the clusters of data

In [15]:
# create map of Manhattan using latitude and longitude values
latitude = 43.6532
longitude = -79.3832

map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_manhattan

### If the map is not clear above, please refer to the image below

![image](Toronto map.png)