Web Scraping 201: finding the API
http://www.gregreda.com/2015/02/15/web-scraping-finding-the-api/

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import pprint
from bs4 import BeautifulSoup

# Get suburb median property price summary data

In [5]:
# Suburbs whose house/unit buy and rent figures are pulled from realestate.com.au
suburbs = [
    'runcorn',
    'sunnybank',
    'sunnybank hills',
    'eight mile plains',
    'kuraby',
    'upper mount gravatt',
    'wishart',
    'coopers plains',
    'calamvale',
    'algester',
]

In [6]:
# Download the summary JSON for every suburb in `suburbs`.

# The endpoint only differs by suburb name, so the fixed URL pieces are built once.
url_part1='https://investor-api.realestate.com.au/v2/states/qld/suburbs/'
url_part2='.json?embed=suburb_geo'

# Without a timeout, requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS=30

results=[]  # one parsed JSON payload per suburb, in the same order as `suburbs`

for s in suburbs:
    url=url_part1+s+url_part2

    # Request the URL and parse the JSON data
    response=requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()  # raise an exception on an invalid (4xx/5xx) response

    result=response.json()  # parsed JSON body of this suburb's response
    results.append(result)

In [7]:
# Sanity check: take the first key of every downloaded payload so we can confirm
# that all 10 suburbs were collected
suburb_keys = [list(payload)[0] for payload in results]
print(suburb_keys)

['RUNCORN-4113', 'SUNNYBANK-4109', 'SUNNYBANK HILLS-4109', 'EIGHT MILE PLAINS-4113', 'KURABY-4112', 'UPPER MOUNT GRAVATT-4122', 'WISHART-4122', 'COOPERS PLAINS-4108', 'CALAMVALE-4116', 'ALGESTER-4115']


In [8]:
#Preview the JSON data of a sample suburb ("Runcorn" is the first entry, results[0])
#pprint.pprint(results[0]) #print the result

## Create a suburb based house price data frame

In [9]:
# Look at which property types the Runcorn listing offers (2 types - House, Unit)
first_payload = results[0]
property_type = first_payload['RUNCORN-4113']['property_types']
print(property_type.keys())

# For houses we only look at 3 or more bedrooms, plus the 'ALL' aggregate
house_bed_num = ['ALL', '3', '4', '5+']

dict_keys(['HOUSE', 'UNIT'])


In [10]:
# Collect, for each suburb, the HOUSE bedroom breakdown without the entries for
# fewer than 3 bedrooms.
#
# Every one of the '0', '1' and '2' keys is dropped when present (an if/elif chain
# would stop after the first match). A filtered copy is built instead of popping
# from the dicts inside `results`, so the downloaded data stays intact and the
# cell can be re-run safely.
SMALL_HOUSE_BEDROOMS=('0', '1', '2')

house_bedrooms_data=[]

for suburbs_pc, suburb_result in zip(suburb_keys, results):
    # suburbs_pc is the "<SUBURB NAME>-<postcode>" key of this suburb's payload
    house_bedrooms=suburb_result[suburbs_pc]['property_types']['HOUSE']['bedrooms']

    house_bedrooms_data.append(
        {bed: data for bed, data in house_bedrooms.items() if bed not in SMALL_HOUSE_BEDROOMS}
    )

In [11]:
# Build a house buy and rent dataframe: one row per (suburb, bedroom category)

df_houses_list=[]
for suburb, house_data in zip(suburbs, house_bedrooms_data):  # no hard-coded suburb count

    houses_data_list=[]

    for num in house_bed_num:  # bedroom categories of interest (ALL, 3, 4, 5+)
        if num not in house_data:
            continue  # this suburb has no figures for that bedroom category

        # Copy the metrics so the 'bedrooms' label is not written back into the
        # downloaded JSON held in `results`; 'bedrooms' becomes the last column.
        houses_data={**house_data[num]['investor_metrics'], 'bedrooms': num}
        houses_data_list.append(pd.DataFrame(houses_data, index=np.arange(1)))

    if not houses_data_list:
        continue  # pd.concat raises ValueError when given an empty list

    # merge the one-row frames into a single frame for this suburb
    suburb_houses=pd.concat(houses_data_list, axis=0, ignore_index=True)
    suburb_houses['suburb']=str(suburb)  # add a column with the suburb name (now the last column)

    cols=suburb_houses.columns.to_list()
    cols=cols[-1:]+cols[-2:-1]+cols[:-2]  # move the last 2 columns (suburb, bedrooms) to the front
    df_houses_list.append(suburb_houses[cols])


df_houses=pd.concat(df_houses_list, axis=0, ignore_index=True)  # merge all suburb frames into 1
df_houses['property_type']='houses'
df_houses.head()


Unnamed: 0,suburb,bedrooms,median_sold_price,median_sold_price_five_years_ago,median_rental_price,rental_yield,annual_growth,rental_demand,rental_properties,sold_properties,sold_properties_five_years_ago,property_type
0,runcorn,ALL,569000.0,520000,417.5,0.038155,0.018,742.58,222,86,170,houses
1,runcorn,3,550000.0,478800,390.0,0.036873,0.028,821.38,108,35,81,houses
2,runcorn,4,605000.0,558500,470.0,0.040397,0.016,828.09,91,36,58,houses
3,runcorn,5+,777777.0,655000,530.0,0.035434,0.035,2015.98,17,7,14,houses
4,sunnybank,ALL,740000.0,686000,430.0,0.030216,0.015,597.14,257,66,145,houses


In [12]:
# "median_sold_price_five_years_ago" and "annual_growth" come back as object dtype rather than a numeric dtype
df_houses.dtypes

suburb                               object
bedrooms                             object
median_sold_price                   float64
median_sold_price_five_years_ago     object
median_rental_price                 float64
rental_yield                        float64
annual_growth                        object
rental_demand                       float64
rental_properties                     int64
sold_properties                       int64
sold_properties_five_years_ago        int64
property_type                        object
dtype: object

In [13]:
# count the null values in each column
df_houses.isnull().sum()

suburb                              0
bedrooms                            0
median_sold_price                   0
median_sold_price_five_years_ago    1
median_rental_price                 0
rental_yield                        0
annual_growth                       1
rental_demand                       0
rental_properties                   0
sold_properties                     0
sold_properties_five_years_ago      0
property_type                       0
dtype: int64

In [14]:
# Replace every null value with 0; inplace=True modifies df_houses itself.
# NOTE(review): a 0 here stands for "no data", not a real price or growth of zero -
# keep that in mind for any later aggregation.
df_houses.fillna(0, inplace=True)
df_houses.head()

Unnamed: 0,suburb,bedrooms,median_sold_price,median_sold_price_five_years_ago,median_rental_price,rental_yield,annual_growth,rental_demand,rental_properties,sold_properties,sold_properties_five_years_ago,property_type
0,runcorn,ALL,569000.0,520000.0,417.5,0.038155,0.018,742.58,222,86,170,houses
1,runcorn,3,550000.0,478800.0,390.0,0.036873,0.028,821.38,108,35,81,houses
2,runcorn,4,605000.0,558500.0,470.0,0.040397,0.016,828.09,91,36,58,houses
3,runcorn,5+,777777.0,655000.0,530.0,0.035434,0.035,2015.98,17,7,14,houses
4,sunnybank,ALL,740000.0,686000.0,430.0,0.030216,0.015,597.14,257,66,145,houses


In [15]:
# Target type for each column of the houses frame, so that
# "median_sold_price_five_years_ago" and "annual_growth" become floats.
# ("rental_yield" is not listed in this mapping.)
# NOTE: this name shadows the built-in `dict` type for the rest of the session.
dict = {
    "suburb": str,
    "bedrooms": str,
    "median_sold_price": np.float64,
    "median_sold_price_five_years_ago": np.float64,
    "median_rental_price": np.float64,
    "annual_growth": np.float64,
    "rental_demand": np.float64,
    "rental_properties": np.int64,
    "sold_properties": np.int64,
    "sold_properties_five_years_ago": np.int64,
    "property_type": str,
}

In [16]:
# Cast the columns to the types listed in the `dict` mapping, then check the dtypes again.
# astype returns a new frame, so the result has to be assigned back to take effect.
df_houses = df_houses.astype(dict)
df_houses.dtypes

suburb                               object
bedrooms                             object
median_sold_price                   float64
median_sold_price_five_years_ago    float64
median_rental_price                 float64
rental_yield                        float64
annual_growth                       float64
rental_demand                       float64
rental_properties                     int64
sold_properties                       int64
sold_properties_five_years_ago        int64
property_type                        object
dtype: object

## Create a suburb based unit price data frame

In [17]:
# Bedroom categories of interest for units; units with 0 or 5+ bedrooms are not considered
unit_bed_num=['ALL', '1', '2', '3', '4']

In [18]:
# Collect, for each suburb, the UNIT bedroom breakdown without the 0-bedroom and
# 5+ bedroom entries.
#
# Both keys are dropped when present (an if/elif chain would leave '5+' in place
# whenever '0' also exists). A filtered copy is built instead of popping from the
# dicts inside `results`, so the downloaded data stays intact and the cell can be
# re-run safely.
EXCLUDED_UNIT_BEDROOMS=('0', '5+')

unit_bedrooms_data=[]

for suburbs_pc, suburb_result in zip(suburb_keys, results):
    # suburbs_pc is the "<SUBURB NAME>-<postcode>" key of this suburb's payload
    unit_bedrooms=suburb_result[suburbs_pc]['property_types']['UNIT']['bedrooms']

    unit_bedrooms_data.append(
        {bed: data for bed, data in unit_bedrooms.items() if bed not in EXCLUDED_UNIT_BEDROOMS}
    )


In [19]:
# Build a unit buy and rent dataframe: one row per (suburb, bedroom category)

df_units_list=[]

for suburb, unit_data in zip(suburbs, unit_bedrooms_data):  # no hard-coded suburb count

    units_data_list=[]

    for u, bedroom_entry in unit_data.items():  # every bedroom category kept for this suburb
        # Copy the metrics so the 'bedrooms' label is not written back into the
        # downloaded JSON held in `results`; 'bedrooms' becomes the last column.
        units_data={**bedroom_entry['investor_metrics'], 'bedrooms': u}
        units_data_list.append(pd.DataFrame(units_data, index=np.arange(1)))

    if not units_data_list:
        continue  # pd.concat raises ValueError when given an empty list

    # merge the one-row frames into a single frame for this suburb
    suburb_units=pd.concat(units_data_list, axis=0, ignore_index=True)
    suburb_units['suburb']=str(suburb)  # add a column with the suburb name (now the last column)

    cols=suburb_units.columns.to_list()
    cols=cols[-1:]+cols[-2:-1]+cols[:-2]  # move the last 2 columns (suburb, bedrooms) to the front
    df_units_list.append(suburb_units[cols])

df_units=pd.concat(df_units_list, axis=0, ignore_index=True)  # merge all suburb frames into 1
df_units['property_type']='units'
df_units.head()


Unnamed: 0,suburb,bedrooms,median_sold_price,median_sold_price_five_years_ago,median_rental_price,rental_yield,annual_growth,rental_demand,rental_properties,sold_properties,sold_properties_five_years_ago,property_type
0,runcorn,ALL,320000.0,355000.0,400.0,0.065,-0.021,342.26,112,61.0,141.0,units
1,runcorn,2,286000.0,306500.0,345.0,0.0627273,-0.014,573.64,12,7.0,12.0,units
2,runcorn,3,320000.0,350000.0,400.0,0.065,-0.018,331.29,97,52.0,106.0,units
3,runcorn,1,,,280.0,,,1059.62,1,,,units
4,runcorn,4,,,505.0,,,886.13,2,,,units


In [20]:
# "median_sold_price", "median_sold_price_five_years_ago", "rental_yield", "annual_growth",
# "sold_properties" and "sold_properties_five_years_ago" are reported as object dtype rather than numeric
df_units.dtypes

suburb                               object
bedrooms                             object
median_sold_price                    object
median_sold_price_five_years_ago     object
median_rental_price                 float64
rental_yield                         object
annual_growth                        object
rental_demand                       float64
rental_properties                     int64
sold_properties                      object
sold_properties_five_years_ago       object
property_type                        object
dtype: object

In [21]:
# count the null values in each column
df_units.isnull().sum()

suburb                               0
bedrooms                             0
median_sold_price                   20
median_sold_price_five_years_ago    20
median_rental_price                  0
rental_yield                        20
annual_growth                       20
rental_demand                        0
rental_properties                    0
sold_properties                     20
sold_properties_five_years_ago      20
property_type                        0
dtype: int64

In [24]:
# Replace every null value with 0; inplace=True modifies df_units itself.
# NOTE(review): a 0 here stands for "no data", not a real price or count of zero -
# keep that in mind for any later aggregation.
df_units.fillna(0, inplace=True)
df_units.head()

Unnamed: 0,suburb,bedrooms,median_sold_price,median_sold_price_five_years_ago,median_rental_price,rental_yield,annual_growth,rental_demand,rental_properties,sold_properties,sold_properties_five_years_ago,property_type
0,runcorn,ALL,320000.0,355000.0,400.0,0.065,-0.021,342.26,112,61,141,units
1,runcorn,2,286000.0,306500.0,345.0,0.062727,-0.014,573.64,12,7,12,units
2,runcorn,3,320000.0,350000.0,400.0,0.065,-0.018,331.29,97,52,106,units
3,runcorn,1,0.0,0.0,280.0,0.0,0.0,1059.62,1,0,0,units
4,runcorn,4,0.0,0.0,505.0,0.0,0.0,886.13,2,0,0,units


In [25]:
# Target type for each column of the units frame, so that the price, growth and
# sold-property columns become numerical data types.
# ("rental_yield" is not listed in this mapping.)
dict1 = {
    "suburb": str,
    "bedrooms": str,
    "median_sold_price": np.float64,
    "median_sold_price_five_years_ago": np.float64,
    "median_rental_price": np.float64,
    "annual_growth": np.float64,
    "rental_demand": np.float64,
    "rental_properties": np.int64,
    "sold_properties": np.int64,
    "sold_properties_five_years_ago": np.int64,
    "property_type": str,
}

In [26]:
# Cast the columns to the types listed in the `dict1` mapping, then check the dtypes again.
# astype returns a new frame, so the result has to be assigned back to take effect.
df_units = df_units.astype(dict1)
df_units.dtypes

suburb                               object
bedrooms                             object
median_sold_price                   float64
median_sold_price_five_years_ago    float64
median_rental_price                 float64
rental_yield                        float64
annual_growth                       float64
rental_demand                       float64
rental_properties                     int64
sold_properties                       int64
sold_properties_five_years_ago        int64
property_type                        object
dtype: object

In [27]:
# preview the first rows of the units frame once more
df_units.head()

Unnamed: 0,suburb,bedrooms,median_sold_price,median_sold_price_five_years_ago,median_rental_price,rental_yield,annual_growth,rental_demand,rental_properties,sold_properties,sold_properties_five_years_ago,property_type
0,runcorn,ALL,320000.0,355000.0,400.0,0.065,-0.021,342.26,112,61,141,units
1,runcorn,2,286000.0,306500.0,345.0,0.062727,-0.014,573.64,12,7,12,units
2,runcorn,3,320000.0,350000.0,400.0,0.065,-0.018,331.29,97,52,106,units
3,runcorn,1,0.0,0.0,280.0,0.0,0.0,1059.62,1,0,0,units
4,runcorn,4,0.0,0.0,505.0,0.0,0.0,886.13,2,0,0,units


In [29]:
# Stack the houses and units frames into a single table with a fresh row index
df_median_price = pd.concat(objs=[df_houses, df_units], ignore_index=True)
df_median_price.head()

Unnamed: 0,suburb,bedrooms,median_sold_price,median_sold_price_five_years_ago,median_rental_price,rental_yield,annual_growth,rental_demand,rental_properties,sold_properties,sold_properties_five_years_ago,property_type
0,runcorn,ALL,569000.0,520000.0,417.5,0.038155,0.018,742.58,222,86,170,houses
1,runcorn,3,550000.0,478800.0,390.0,0.036873,0.028,821.38,108,35,81,houses
2,runcorn,4,605000.0,558500.0,470.0,0.040397,0.016,828.09,91,36,58,houses
3,runcorn,5+,777777.0,655000.0,530.0,0.035434,0.035,2015.98,17,7,14,houses
4,sunnybank,ALL,740000.0,686000.0,430.0,0.030216,0.015,597.14,257,66,145,houses


# Get supply and demand data

In [None]:
#define API URL
url_demand=r'https://investor-api.realestate.com.au/v2/states/qld/suburbs/runcorn/postcodes/4113/supply_demand.json'

#Request the URL and parse the JSON data
response_demand=requests.get(url_demand, timeout=30)  # timeout so a stalled connection cannot hang the notebook
response_demand.raise_for_status()  # raise an exception on an invalid (4xx/5xx) response
# report the status of THIS request (not of the `response` left over from the suburb loop)
print('Response Code: ',response_demand.status_code)

result_demand=response_demand.json()  # parsed JSON body of the response
pprint.pprint(result_demand)  # pretty print the result for easy viewing

In [None]:
# Optionally drop the start and end dates (currently disabled):
# result_demand.pop('end_date')
# result_demand.pop('start_date')
print(result_demand)

In [None]:
# Store the suburb demand data in a dataframe with a single row index (0)
df_demand = pd.DataFrame(data=result_demand, index=np.arange(1))
df_demand

# Get suburb lifestyle and people data? - DONE

In [None]:
# Download the neighbourhood page for Runcorn and parse its HTML
url_life=r'https://www.realestate.com.au/neighbourhoods/runcorn-4113-qld'
page = requests.get(url_life, timeout=30)  # timeout so a stalled connection cannot hang the notebook
page.raise_for_status()  # fail loudly instead of parsing an error page, like the other fetch cells
soup = BeautifulSoup(page.content, 'html.parser')

TODO: unsure how to extract the data shown in the second tab of the page (the demographic data).

# Age group, occupancy, schools, nearby suburbs? Total population

# Get Trend Dynamic Line chart with Selenium?

In [None]:
#define API URL
url=r'https://investor-api.realestate.com.au/v2/states/qld/suburbs/runcorn.json?embed=suburb_geo'

#Request the URL and parse the JSON data
response=requests.get(url, timeout=30)  # timeout so a stalled connection cannot hang the notebook
response.raise_for_status()  # raise an exception on an invalid (4xx/5xx) response
print('Response Code: ',response.status_code)  # check the request code

result=response.json()  # parsed JSON body of the response
#pprint.pprint(result) #pretty print the result for easy viewing

In [None]:
#get suburb name from the result- runcorn

def suburb_name(result):
    """Return the title-cased suburb name taken from the first key of ``result``.

    The first key is expected to look like ``"RUNCORN-4113"``: the suburb name,
    a hyphen, then the postcode. Only the text after the LAST hyphen is removed,
    so a suburb name that itself contains a hyphen is kept whole.

    Parameters
    ----------
    result : dict
        Parsed suburb payload whose first key is "<SUBURB NAME>-<postcode>".

    Returns
    -------
    str
        The suburb name in title case, e.g. ``"Runcorn"``.

    Raises
    ------
    IndexError
        If ``result`` has no keys.
    """
    suburb_pc=list(result.keys())
    # rsplit with maxsplit=1 strips just the trailing "-<postcode>" part
    suburb=suburb_pc[0].rsplit("-", 1)[0].title()
    return suburb

# display the suburb name extracted from the payload held in `result`
suburb_name(result)