# Part I - Segmenting and Clustering Neighborhoods in Toronto

## In this assignment we have to create a pandas DataFrame from the table of Toronto postal codes, boroughs and neighborhoods located on a Wikipedia web page

In [1]:
#first we have to import needed libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

### First we have to scrape the table from the web page with the requests and BeautifulSoup Python libraries

In [2]:
# Download the Wikipedia page listing the "M" postal codes and parse it with BeautifulSoup.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml') #parser is lxml

In [22]:
# Locate the postal-code table inside the parsed page.
# BeautifulSoup's find() returns None when nothing matches, so fail here with a
# clear message instead of an AttributeError in a later cell.
table = soup.find('table', class_='wikitable sortable')
if table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

In [4]:
# Each <tr> element is one row of the HTML table; collect them all and show how many there are.
table_rows = table.find_all(name='tr')
len(table_rows)

289

In [5]:
# Build one inner list per table row, holding the text of each <td> cell.
# strip() removes the surrounding whitespace (including the trailing '\n').
# A row without any <td> cells contributes an empty list.
toronto_list = [
    [cell.text.strip() for cell in table_row.find_all('td')]
    for table_row in table_rows
]
toronto_list[:10]

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned']]

In [6]:
# Sanity check: one entry was appended per <tr> row, so this should equal len(table_rows).
len(toronto_list)

289

In [7]:
# Drop rows that produced no <td> cells (the header row, which shows up as the
# empty list at the start of toronto_list). Filtering on emptiness instead of
# slicing with [1:] keeps this cell idempotent: running it a second time no
# longer discards a real postal-code row.
toronto_list = [row for row in toronto_list if row]
toronto_list

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B', 

### Now we have to do the following: 
* create a pandas DataFrame from the list of lists that was scraped from the web page
* drop all rows where the Borough value is 'Not assigned'
* for rows whose Neighborhood value is 'Not assigned', set the Neighborhood to the corresponding Borough name
* combine all rows that have the same PostalCode and Borough (like M5A and Downtown Toronto) into one row listing 2 or more neighborhoods. For this we have to use the groupby method.

In [8]:
# Build the DataFrame from the scraped rows; the positional columns 0, 1 and 2
# receive their descriptive names in the same expression.
column_names = {0 : 'PostalCode', 1 : 'Borough', 2 : 'Neighborhood'}
df = pd.DataFrame(toronto_list).rename(columns=column_names)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [9]:
# Number of rows in the freshly built DataFrame.
df.shape[0]

288

In [10]:
# Count the rows whose Borough is 'Not assigned' -- these are the rows to drop (77 of them).
int(df['Borough'].eq('Not assigned').sum())

77

In [11]:
# Keep only the rows with an assigned Borough (288 - 77 = 211 rows remain).
# .copy() makes df an independent DataFrame rather than a filtered slice of the
# original, so later in-place assignments to df (e.g. fixing the Neighborhood
# column) do not trigger pandas' SettingWithCopyWarning.
df = df.loc[df['Borough'] != 'Not assigned'].copy()
len(df)

211

In [12]:
# Alternative (not applied): drop every row where the Borough or the Neighborhood is 'Not assigned'. There are 78 such rows.
#len(df.loc[(df['Borough'] == 'Not assigned') | (df['Neighborhood'] == 'Not assigned')])
#df = df.loc[(df['Borough'] != 'Not assigned') & (df['Neighborhood'] != 'Not assigned')]

In [13]:
# Where the Neighborhood is 'Not assigned', use the row's own Borough as the
# Neighborhood. Copying from the Borough column (instead of hard-coding
# "Queen's Park") stays correct if more than one borough has such a row.
unassigned_neighborhood = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned_neighborhood, 'Neighborhood'] = df.loc[unassigned_neighborhood, 'Borough']

In [14]:
# Verify the change: show the Queen's Park row(s) to confirm the Neighborhood was updated.
df[df['Borough'].eq("Queen's Park")]

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Queen's Park


In [15]:
# Rows sharing a PostalCode and Borough must be merged into a single row that
# lists all of their neighborhoods; M5A (Downtown Toronto) is one such case.
df[df['PostalCode'].eq('M5A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park


In [16]:
# Group on (PostalCode, Borough) and concatenate each group's neighborhoods with ','.
# The result is a Series carrying a two-level index.
neighborhoods_per_code = df.groupby(['PostalCode', 'Borough'])['Neighborhood']
df_grouped = neighborhoods_per_code.apply(','.join)

In [17]:
# Preview the first eight grouped entries.
df_grouped.iloc[:8]

PostalCode  Borough    
M1B         Scarborough                                Rouge,Malvern
M1C         Scarborough         Highland Creek,Rouge Hill,Port Union
M1E         Scarborough              Guildwood,Morningside,West Hill
M1G         Scarborough                                       Woburn
M1H         Scarborough                                    Cedarbrae
M1J         Scarborough                          Scarborough Village
M1K         Scarborough    East Birchmount Park,Ionview,Kennedy Park
M1L         Scarborough                Clairlea,Golden Mile,Oakridge
Name: Neighborhood, dtype: object

In [18]:
# The groupby(...).apply(...) call returned a pandas Series (here with a MultiIndex), not a DataFrame.
type(df_grouped)

pandas.core.series.Series

In [19]:
# Turn the grouped Series into a one-column DataFrame; the group keys stay in the index.
df_toronto = df_grouped.to_frame()
df_toronto.head(8)

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood
PostalCode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"


In [20]:
# Move PostalCode and Borough out of the index into ordinary columns.
# Deriving the frame from df_grouped (rather than reassigning df_toronto from
# itself) keeps the cell idempotent: re-running it cannot add a stray 'index' column.
df_toronto = df_grouped.reset_index()
df_toronto.head(8)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"


In [21]:
# Final size check: (rows, columns) of the grouped table -- one row per PostalCode/Borough pair.
df_toronto.shape

(103, 3)