# Assignment on Segmentation and Clustering in Toronto - Data Wrangling

#### Data wrangling is the process of converting data from its initial format into a format suitable for analysis.

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.request import urlopen as urfeq
import lxml

In [2]:
## Data acquisition
source = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Bound the wait and fail loudly on HTTP errors, so that an error page is
# never parsed as if it were the postal-code article.
response = requests.get(source, timeout=30)
response.raise_for_status()
wikihtml = response.text
data = BeautifulSoup(wikihtml, 'html.parser')
## Extract table: the first <table> element of the page and all of its <tr> rows
rtable = data.table
results = rtable.find_all('tr')
nrows = len(results)
nrows  ### Count rows

289

In [3]:
# Function transform 1 

def thtml2dataframe(datahtml):
    """Convert the first HTML table of a parsed page into a DataFrame.

    The rows are read from ``datahtml`` itself rather than from notebook
    globals, so the function works on any parsed page passed to it.

    Parameters
    ----------
    datahtml : parsed document (e.g. a BeautifulSoup object)
        Must expose ``.table`` (the first table of the page), whose
        ``find_all('tr')`` returns row elements having a ``.text`` attribute.

    Returns
    -------
    pandas.DataFrame
        One record per table row after the first one, with the columns
        'PostalCode', 'Borough' and 'Neighborhood'.

    Raises
    ------
    ValueError
        If ``datahtml`` contains no table.
    """
    table = datahtml.table
    if table is None:
        raise ValueError('datahtml does not contain a <table> element')

    datareg = []
    # Row 0 is skipped, exactly as the original loop did by starting at n=1.
    for row in table.find_all('tr')[1:]:
        # Split the row text once; fields 1-3 hold the three cell values.
        fields = row.text.split('\n')
        datareg.append((fields[1], fields[2], fields[3]))

    return pd.DataFrame(datareg, columns=['PostalCode', 'Borough', 'Neighborhood'])


# Build the raw postal-code DataFrame from the parsed page
df = thtml2dataframe(data)
# Display the frame size as (rows, columns)
df.shape

(288, 3)

In [4]:
## Preview the first rows of the DataFrame built from the table extracted from
## 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Preprocessing: Data Cleaning

In [5]:
# Count the rows whose Borough is exactly 'Not assigned'
# (count() reports the number of non-null entries per column of the selection)
df.loc[df['Borough'].eq('Not assigned')].count()

PostalCode      77
Borough         77
Neighborhood    77
dtype: int64

In [6]:
# Keep only rows with a real borough: drop missing values and rows whose
# Borough is exactly 'Not assigned'. An exact comparison is used instead of
# str.contains, which performs a regex/substring match and yields NaN (and
# then fails under `~`) when a Borough value is missing.
df1 = df[df['Borough'].notna() & df['Borough'].ne('Not assigned')]
df1 = df1.reset_index(drop=True)
df1.shape

(211, 3)

In [7]:
# Inspect the last rows of the cleaned DataFrame
df1.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
206,M8Z,Etobicoke,Kingsway Park South West
207,M8Z,Etobicoke,Mimico NW
208,M8Z,Etobicoke,The Queensway West
209,M8Z,Etobicoke,Royal York South West
210,M8Z,Etobicoke,South of Bloor


### Unique postal codes, boroughs and neighborhoods

In [8]:
# Number of distinct values in each of the three columns of the cleaned frame
postalcodes, boroughs, neighbourhoods = (
    df1[column].nunique() for column in ('PostalCode', 'Borough', 'Neighborhood')
)
print('Unique Postalcodes : ', postalcodes)
print('Unique Boroughs  : ', boroughs)
print('Unique Neighborhoods  :', neighbourhoods)

Unique Postalcodes :  103
Unique Boroughs  :  11
Unique Neighborhoods  : 209


##### Concatenate the neighborhoods when a postal code area contains more than one

In [9]:
# Function concat 
def neighborhoodgroup(strgroup):
    """Return the 'Neighborhood' values of a group as one string.

    The values of ``strgroup['Neighborhood']`` are sorted and joined
    with ', ' as the separator.
    """
    names = list(strgroup['Neighborhood'])
    names.sort()
    return ', '.join(names)
## Group by 'PostalCode', 'Borough'
group = df1.groupby(['PostalCode', 'Borough'])
# Call neighborhoodgroup on the 'Neighborhood' column only. Selecting the
# column keeps the grouping columns out of the applied frame, which avoids
# the pandas deprecation of apply() operating on grouping columns while
# producing the same one-row-per-(PostalCode, Borough) result.
dft2 = group[['Neighborhood']].apply(neighborhoodgroup).reset_index(name='Neighborhood')

#### Save preprocessed data to a file for next steps

In [10]:
# Write the grouped table to disk, leaving out the index column
dft2.to_csv(path_or_buf='fdatatoronto.csv', index=False)

In [11]:
# Size of the grouped DataFrame as (rows, columns)
dft2.shape

(103, 3)

#### Author: Alfonso Pereda G.