### Import Libraries

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import requests
from bs4 import BeautifulSoup
import re

import folium
import json
import shapefile

from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors

### DATA EXTRACTION

#### Postal Code and Coordinates
The first 3 characters of a postal code are officially called "forward sortation area" in Canada. 

##### Retrieve data from GeoNames

In [144]:
url = 'https://www.geonames.org/postalcode-search.html?q=calgary&country=CA&adminCode1=AB'
req = requests.get(url)
html_data = req.text

In [145]:
soup = BeautifulSoup(html_data,'html.parser')

In [146]:
data = []
for tr in soup.findAll('table',{'class':'restable'}): 
    data = np.append(data,tr.text.split('\n'))
data = list(filter(None,data)) 
data[:5]

['PlaceCodeCountryAdmin1Admin2Admin3',
 '1Calgary (City Centre / Calgary Tower)T2PCanadaAlbertaCalgary\xa0\xa0\xa051.071/-113.693',
 '2Calgary (Cranston)T3MCanadaAlbertaCalgary\xa0\xa0\xa050.88/-113.955',
 '3Calgary NortheastT3NCanadaAlbertaCalgary\xa0\xa0\xa051.163/-113.954',
 '4Calgary NorthwestT3RCanadaAlbertaCalgary\xa0\xa0\xa051.202/-114.245']

In [147]:
neighbourhood = []
postalcode = []
lat = []
long = []

for item in data[1:]: 
    lat = np.append(lat, item.split()[-1].split('/')[0])
    long = np.append(long, item.split()[-1].split('/')[-1])
    postalcode = np.append(postalcode, re.findall(r'T[0-9][A-Z]',item))
    
    if len(re.findall(r'\(.*\)',item))==0: 
        neighbourhood = np.append(neighbourhood, re.findall(r'[A-Z,a-z]+T[0-9][A-Z]',item)[0][:-3])
    else: 
        neighbourhood = np.append(neighbourhood, re.findall(r'\(.*?\)',item))

In [148]:
geonames = pd.DataFrame([postalcode,neighbourhood,lat,long])
geonames = geonames.T
geonames.columns = ['PostalCode','Neighbourhood','Latitude','Longitude']
geonames = geonames.replace(to_replace=['\(','\)',' / '], value=['','',', '], regex=True)
geonames.loc[geonames['Neighbourhood']=='Calgary','Neighbourhood'] = 'Southeast'
geonames.head()

Unnamed: 0,PostalCode,Neighbourhood,Latitude,Longitude
0,T2P,"City Centre, Calgary Tower",51.071,-113.693
1,T3M,Cranston,50.88,-113.955
2,T3N,Northeast,51.163,-113.954
3,T3R,Northwest,51.202,-114.245
4,T1Y,"Rundle, Whitehorn, Monterey Park",51.082,-113.958


##### Retrieve data from Mapawi

In [149]:
url = 'http://zip-code.en.mapawi.com/canada/4/alberta/1/1/ab/'
req = requests.get(url)
html_data = req.text

In [150]:
soup = BeautifulSoup(html_data,'html.parser')

In [151]:
urllist = []
for tr in soup.findAll('a',href=re.compile(r'http://zip-code.en.mapawi.com/canada/4/alberta/1/1/ab/calgary.*')): 
    urllist = np.append(urllist,str(tr).split('"')[1])
urllist[:5]

array(['http://zip-code.en.mapawi.com/canada/4/alberta/1/1/ab/calgary-braeside-woodbine-/t2w/45/',
       'http://zip-code.en.mapawi.com/canada/4/alberta/1/1/ab/calgary-brentwood-collingwood-nose-hill-/t2l/37/',
       'http://zip-code.en.mapawi.com/canada/4/alberta/1/1/ab/calgary-bridgeland-greenview-zoo-yyc-/t2e/32/',
       'http://zip-code.en.mapawi.com/canada/4/alberta/1/1/ab/calgary-city-centre-calgary-tower-/t2p/40/',
       'http://zip-code.en.mapawi.com/canada/4/alberta/1/1/ab/calgary-connaught-west-victoria-park-/t2r/41/'],
      dtype='<U107')

In [152]:
postalcode = []
neighbourhood = []
lat = []
long = []

for url in urllist: 
    req = requests.get(url)
    html_data = req.text
    soup = BeautifulSoup(html_data,'html.parser')
    
    data = soup.findAll('font',{'class':'data'})
    postalcode = np.append(postalcode,data[0].text)
    lat = np.append(lat,data[2].text)
    long = np.append(long,data[3].text)
    
    neighbourhood = np.append(neighbourhood, str(soup.find('title')).split(':')[-1].split('(')[-1].split(')')[0])

In [153]:
mapawi = pd.DataFrame([postalcode,neighbourhood,lat,long])
mapawi = mapawi.T
mapawi.columns = ['PostalCode','Neighbourhood','Latitude','Longitude']
mapawi.head()

Unnamed: 0,PostalCode,Neighbourhood,Latitude,Longitude
0,T2W,Braeside / Woodbine,50.9604,-114.1001
1,T2L,Brentwood / Collingwood / Nose Hill,51.0917,-114.1127
2,T2E,Bridgeland / Greenview / Zoo / YYC,51.0632,-114.0614
3,T2P,City Centre / Calgary Tower,51.0472,-114.0802
4,T2R,Connaught / West Victoria Park,51.0426,-114.0791


##### Merging GeoNames and Mapawi data
Coordinates data from GeoNames are unreliable. 
Neighbourhood data from Mapawi are incomplete. 

In [186]:
calgary = pd.merge(geonames,mapawi,how='left',on='PostalCode')

In [188]:
calgary.Neighbourhood_y.fillna(calgary.Neighbourhood_x,inplace=True)
calgary.Latitude_y.fillna(calgary.Latitude_x,inplace=True)
calgary.Longitude_y.fillna(calgary.Longitude_x,inplace=True)

In [189]:
calgary.drop(['Neighbourhood_y','Latitude_x','Longitude_x'],axis=1,inplace=True)
calgary.columns=['PostalCode','Neighbourhood','Latitude','Longitude']
calgary

Unnamed: 0,PostalCode,Neighbourhood,Latitude,Longitude
0,T2P,"City Centre, Calgary Tower",51.0472,-114.0802
1,T3M,Cranston,50.8902,-113.9892
2,T3N,Northeast,51.1494,-114.0019
3,T3R,Northwest,51.1497,-114.2695
4,T1Y,"Rundle, Whitehorn, Monterey Park",51.0759,-114.0015
5,T2E,"Bridgeland, Greenview, Zoo, YYC",51.0632,-114.0614
6,T2H,"Highfield, Burns Industrial",50.9857,-114.0631
7,T2K,"Thornecliffe, Tuxedo",51.0857,-114.0714
8,T2L,"Brentwood, Collingwood, Nose Hill",51.0917,-114.1127
9,T2N,"Kensington, Westmont, Parkdale, University",51.0591,-114.1146


In [223]:
calgary.to_csv('calgary_postalcode_latlong.csv',index=False)

In [2]:
calgary = pd.read_csv('calgary_postalcode_latlong.csv')

In [3]:
calgary

Unnamed: 0,PostalCode,Neighbourhood,Latitude,Longitude
0,T2P,"City Centre, Calgary Tower",51.0472,-114.0802
1,T3M,Cranston,50.8902,-113.9892
2,T3N,Northeast,51.1494,-114.0019
3,T3R,Northwest,51.1497,-114.2695
4,T1Y,"Rundle, Whitehorn, Monterey Park",51.0759,-114.0015
5,T2E,"Bridgeland, Greenview, Zoo, YYC",51.0632,-114.0614
6,T2H,"Highfield, Burns Industrial",50.9857,-114.0631
7,T2K,"Thornecliffe, Tuxedo",51.0857,-114.0714
8,T2L,"Brentwood, Collingwood, Nose Hill",51.0917,-114.1127
9,T2N,"Kensington, Westmont, Parkdale, University",51.0591,-114.1146


##### Map to quality check the coordinates and geojson boundary data

In [4]:
address = 'Calgary'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

  


In [5]:
print('Calgary latitude:', latitude)
print('Calgary longitude:', longitude)

Calgary latitude: 51.0534234
Calgary longitude: -114.0625892


In [6]:
map_calgary = folium.Map(location=[latitude,longitude],zoom_start=10)
for nei,lat,long in zip(calgary['Neighbourhood'],calgary['Latitude'],calgary['Longitude']): 
    label = nei
    lable = folium.Popup(label,parse_html=True)
    folium.CircleMarker([lat,long], radius=5, popup=label, color='red', 
                        fill=True, fill_color='#FFA500', fill_opacity=0.5, 
                        parse_html=False).add_to(map_calgary)
    
map_calgary

In [7]:
geo_calgary = r'lfsa000a16a_e_simp.json'

In [282]:
map_calgary.choropleth(
    geo_data=geo_calgary,
    data=calgary,
    columns=['PostalCode','Latitude'],
    key_on='feature.properties.Name',
    fill_color='YlOrRd',
    fill_opacity=0.5, 
    line_opacity=0.5,
    legend_name='Latitude',reset=True)

map_calgary

In [249]:
#Everything looks correct

#### House Price, Crime Rate, School Ratings

### CLUSTERING

### SOME OTHER ML TO PREDICT HOUSEPRICE

### RESULT