# Segmenting and Clustering Neighborhoods in Toronto - part 1
#### Web scraping to gather data from Wikipedia

# 
### 1. Libraries and accessing data

In [1]:
# Import required libraries for scraping
import urllib.request
from bs4 import BeautifulSoup

# Import required libraries to handle the data as DataFrame
import pandas as pd

print("Libraries imported!")

Libraries imported!


In [2]:
# URL of the Wikipedia page to be scraped
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch and parse inside a `with` block so the HTTP response is closed as soon
# as the document has been read, even if parsing raises.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "lxml")

print("Data parsed.")

Data parsed.


In [3]:
# print(soup.prettify()) ### -> uncomment if needed
# soup.table ### -> uncomment if needed
all_tables = soup.find_all("table")  # every <table> in the page, kept for inspection

# Pick the table carrying the 'wikitable sortable' class; its rows are read
# into the DataFrame further down.
right_table = soup.find('table', class_ = 'wikitable sortable')
# right_table  ### -> uncomment if needed

# soup.find returns None when nothing matches. Stop here with a clear message
# instead of failing later with an AttributeError when the rows are iterated.
if right_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found in the page.")

print("Table selected.")

Table selected.


# 
### 2. Setting DataFrame and storing the data

In [4]:
# Column layout for the scraped table; the frame starts out with no rows
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns=column_names)

# Display the empty frame to confirm the headers
df

Unnamed: 0,PostalCode,Borough,Neighborhood


In [5]:
# Loop through the table and store the data in the DataFrame.
#
# The rows are collected in a plain list and the DataFrame is built once at the
# end. Appending with df.loc[len(df)] re-allocates the frame for every row and,
# because it extends whatever df already holds, duplicates all records when
# this cell is run a second time.
rows = []
for tr_cell in right_table.find_all('tr'):
    # Text of every <td> in this row, with trailing whitespace/newlines removed
    row_data = [td_cell.text.rstrip() for td_cell in tr_cell.find_all('td')]

    # Keep only rows with exactly three data cells (one per column); anything
    # else -- presumably the header row, which has no <td> -- is skipped.
    if len(row_data) == 3:
        rows.append(row_data)

df = pd.DataFrame(rows, columns=column_names)

In [6]:
# Preview the first rows of the scraped table
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# 
### 3. Cleaning data

In [7]:
# Discard every record whose Borough is 'Not assigned', reporting the frame
# shape before and after the filter
print("Shape prior to the first cleaning:", df.shape)
has_borough = df['Borough'].ne('Not assigned')
df = df[has_borough]
print("Shape after the first cleaning:", df.shape)

Shape prior to the first cleaning: (180, 3)
Shape after the first cleaning: (103, 3)


In [8]:
# The filter above left gaps in the row labels; renumber them from 0 and
# discard the old labels instead of keeping them as a column
df = df.reset_index(drop=True)
# Display the frame to check the result
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [9]:
# A Neighborhood of 'Not assigned' is replaced by the Borough name of the same row
missing_name = df['Neighborhood'].eq('Not assigned')
df.loc[missing_name, 'Neighborhood'] = df.loc[missing_name, 'Borough']
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [10]:
# Records sharing a PostalCode (and Borough) are merged into a single row whose
# remaining text columns are joined with ', '. sort=False keeps the groups in
# order of first appearance.
group_keys = ['PostalCode', 'Borough']
grouped = df.groupby(group_keys, sort=False)
df_pc = grouped.agg(', '.join)

# Turn the group keys back into ordinary columns
df_toronto = df_pc.reset_index()
df_toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [11]:
# Save the data to an external file, to use in the next parts of the project.
# The path is relative to the working directory (the notebook's folder when run
# from Jupyter), so the notebook does not depend on one user's home directory.
# The row index is written as the first, unnamed CSV column (to_csv default).
df_toronto.to_csv("toronto.csv")

In [12]:
# Size of the final table as (rows, columns)
df_toronto.shape

(103, 3)