# Machine Learning Capstone - Clustering

### Instaling libraries

In [65]:
!pip install BeautifulSoup4
!pip install requests



### Importing important libraries

In [69]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Fetching data from wikipedia

In [72]:
path ='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
rawpage= requests.get(path).text

#using BeautifulSoup to get xml codes
soup = BeautifulSoup(rawpage,'xml')


### Extracting raw table from wikipedia

In [78]:
data = []
columns = []
table = soup.find(class_='wikitable sortable')
for index, tr in enumerate(table.find_all('tr')):
    section = []
    for td in tr.find_all(['th','td']):
        section.append(td.text.rstrip())
    
    #First row of data is the header
    if (index == 0):
        columns = section
    else:
        data.append(section)

#convert list into Pandas DataFrame
canada_df = pd.DataFrame(data = data,columns = columns)
canada_df.head()



Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Clean the data 
#### remove borough that are not assigned

In [79]:
canada_df = canada_df[canada_df['Borough'] != 'Not assigned']
canada_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### More than one neighborhood can exist in one postal code area. 
#### These two rows combined into one row with the neighborhoods separated with a comma.

In [None]:
canada_df["Neighbourhood"] = canada_df.groupby("Postcode")["Neighbourhood"].transform(lambda neigh: ', '.join(neigh))

#remove duplicates
canada_df = canada_df.drop_duplicates()

#update index to be postcode if it isn't already
if(canada_df.index.name != 'Postcode'):
    canada_df = canada_df.set_index('Postcode')
    
canada_df.head()

### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [76]:
canada_df['Neighbourhood'].replace("Not assigned", canada_df["Borough"],inplace=True)
canada_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Print dataframe shape

In [77]:
canada_df.shape

(103, 3)