# Applied Data Science Capstone #
This is the Capstone Project notebook for the IBM Applied Data Science Specialization on Coursera.

In [1]:
import pandas as pd
import numpy as np
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans # import k-means from clustering stage
import folium # map rendering library
from bs4 import BeautifulSoup  # library for web scraping

In [2]:
# sanity check that the kernel is running
greeting = "Hello Capstone Project Course!"
print(greeting)

Hello Capstone Project Course!


In [3]:
# it's a good practice to identify ourselves
headers = {"user-agent": "Webscraper for IBM Data Science Capstone"}
# the timeout (in seconds) keeps the cell from waiting indefinitely on an
# unresponsive server
page = requests.get(
    "https://www.bostonmagazine.com/top-places-to-live-2018-condos/",
    headers=headers,
    timeout=30,
)

# check for valid status; raise instead of calling exit() so the failure is
# reported as a traceback in this cell and a "Run All" stops here
if page.status_code != requests.codes.ok:
    raise RuntimeError(
        "Request was not successful, status code: {}".format(page.status_code)
    )

# Parse page using BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
# show the <title> of the scraped page as a quick check that we fetched the right document
title_tag = soup.title
print(title_tag.text)

Condo Prices in Boston 2018 - Boston Magazine


Inspecting the page shows that the information we need is inside the `<table>` element with id `tablepress-151`.

In [5]:
# locate the price table by its element id
table = soup.find("table", attrs={"id": "tablepress-151"})
# the first <tr> of the table holds the column headers; print it to inspect them
header_row = table.find("tr")
print(header_row)

<tr class="row-1 odd">
<th class="column-1">Boston Neighborhoods</th><th class="column-2">Median Price: 2017</th><th class="column-3">Median Price: 2016</th><th class="column-4">Median Price: 2012</th><th class="column-5">Median Price: 2007</th><th class="column-6">Percent Change in Price: One-Year</th><th class="column-7">Percent Change in Price: Five-Year</th><th class="column-8">Percent Change in Price: Ten-Year</th><th class="column-9">Days on Market: 2017</th>
</tr>


In [74]:
# Build the Boston house price dataframe from the scraped table.
# Only the first two cells of each row are kept: the neighborhood name and the
# 2017 median price (still a string such as "$480,000" at this point).

# get all table rows
trs = table.find_all("tr")

# Collect the rows first and construct the DataFrame once, instead of growing
# it one df.loc[i] assignment at a time.
records = []
for tr in trs[1:]:  # trs[0] is the header row (<th> cells only)
    entry = tr.find_all("td")
    if len(entry) < 2:
        # not a data row (no name/price pair) - nothing to record
        continue
    records.append((entry[0].text.strip(), entry[1].text.strip()))

df = pd.DataFrame(records, columns=["Neighborhood", "2017 Median Price"])

df

Unnamed: 0,Neighborhood,2017 Median Price
0,Allston,"$480,000"
1,Back Bay,"$1,100,000"
2,Bay Village/South End,"$615,000"
3,Beacon Hill,"$952,500"
4,Brighton,"$430,000"
5,Charlestown,"$690,000"
6,Chinatown/Leather Dist.,"$850,000"
7,Dorchester,"$429,950"
8,East Boston,"$454,500"
9,Fenway,"$571,000"


In [75]:
# Normalise the scraped table:
#   * turn price strings such as "$1,100,000" into integers
#   * split combined entries such as "Bay Village/South End" into one row per
#     neighborhood, each carrying the combined entry's price
# The rows are rebuilt into fresh lists rather than calling DataFrame.append
# on df while iterating over it (DataFrame.append was removed in pandas 2.0).
primary_rows = []  # first name of every entry, in the original row order
split_rows = []    # remaining names of "A/B" entries, placed after the originals
for hood, price in zip(df['Neighborhood'], df['2017 Median Price']):
    # str() makes the cell safe to re-run once the prices are already integers
    price = int(str(price).lstrip('$').replace(',', ''))
    first, *others = [name.strip() for name in hood.split('/')]
    primary_rows.append({'Neighborhood': first, '2017 Median Price': price})
    for name in others:
        split_rows.append({'Neighborhood': name, '2017 Median Price': price})

df = pd.DataFrame(primary_rows + split_rows,
                  columns=['Neighborhood', '2017 Median Price'])
df

Unnamed: 0,Neighborhood,2017 Median Price
0,Allston,480000
1,Back Bay,1100000
2,Bay Village,615000
3,Beacon Hill,952500
4,Brighton,430000
5,Charlestown,690000
6,Chinatown,850000
7,Dorchester,429950
8,East Boston,454500
9,Fenway,571000


In [68]:
# Add rows for neighborhoods that are not in the scraped table.
# NOTE(review): 10000000 looks like a placeholder value rather than a real
# median price - confirm how these rows should be treated downstream.
placeholder_price = 10000000
extra_hoods = ['Downtown', 'Mission Hill', 'South Boston Waterfront']

# Skip names that are already present so re-running the cell does not add
# duplicate rows.
known_hoods = set(df['Neighborhood'])
placeholder_rows = [
    {'Neighborhood': hood, '2017 Median Price': placeholder_price}
    for hood in extra_hoods
    if hood not in known_hoods
]
if placeholder_rows:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    df = pd.concat([df, pd.DataFrame(placeholder_rows)], ignore_index=True)
df

Unnamed: 0,Neighborhood,2017 Median Price
0,Allston,480000
1,Back Bay,1100000
2,Bay Village,615000
3,Beacon Hill,952500
4,Brighton,430000
5,Charlestown,690000
6,Chinatown,850000
7,Dorchester,429950
8,East Boston,454500
9,Fenway,571000


In [71]:
# Add rows for two more neighborhoods that are not in the scraped table.
# NOTE(review): 10000000 looks like a placeholder value rather than a real
# median price - confirm how these rows should be treated downstream.
placeholder_price = 10000000
extra_hoods = ['Longwood', 'Harbor Islands']

# Skip names that are already present so re-running the cell does not add
# duplicate rows.
known_hoods = set(df['Neighborhood'])
placeholder_rows = [
    {'Neighborhood': hood, '2017 Median Price': placeholder_price}
    for hood in extra_hoods
    if hood not in known_hoods
]
if placeholder_rows:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    df = pd.concat([df, pd.DataFrame(placeholder_rows)], ignore_index=True)

Let's filter boroughs that contain the word Boston:

Let's clean up data: remove word "Boston" from Neighborhood name, create new column "Median Price" with the latest Median Price available.

In [39]:
# persist the table (without the index column) so it can be reused later
csv_path = 'boston.csv'
df.to_csv(csv_path, index=False)

In [None]:
# Look up the coordinates of the Mattapan neighborhood through Nominatim.
address1 = 'Mattapan Boston, Massachusetts'

geolocator = Nominatim(user_agent="boston_explorer")
location1 = geolocator.geocode(address1)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the next line
if location1 is None:
    raise ValueError('Could not geocode address: {}'.format(address1))
latitude1 = location1.latitude
longitude1 = location1.longitude
# label the output with the place that was actually geocoded
print('The geograpical coordinate of Mattapan are {}, {}.'.format(latitude1, longitude1))


In [89]:
# Look up the coordinates of Boston through Nominatim; latitude and longitude
# are used to centre the map in the next cell.
address = 'Boston, Massachusetts'

geolocator = Nominatim(user_agent="boston_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
message = 'The geograpical coordinate of Boston are {}, {}.'
print(message.format(latitude, longitude))

The geograpical coordinate of Boston are 42.3602534, -71.0582912.


In [90]:
# GeoJSON file passed to the choropleth layer in the next cell
# (presumably the Boston neighborhood boundaries - confirm against the file)
geo_json = r'Boston_Neighborhoods.geojson'

# base map centred on the coordinates geocoded for Boston
map_center = [latitude, longitude]
boston_map = folium.Map(location=map_center, zoom_start=11)

# display the map of Boston
boston_map

In [91]:
# Colour the GeoJSON areas by 2017 median price. Rows of df are matched to
# GeoJSON features by comparing the 'Neighborhood' column with each feature's
# properties.Name value.
price_layer = folium.Choropleth(
    geo_data=geo_json,
    data=df,
    columns=['Neighborhood', '2017 Median Price'],
    key_on='feature.properties.Name',
    fill_color='YlOrRd',
    fill_opacity=0.6,
    line_opacity=0.8,
    legend_name='2017 Median Price',
)
price_layer.add_to(boston_map)

# display map
boston_map