In [1]:
import csv
import json
import os
import re
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup

#### Globals

In [9]:
# Base of every search URL; get_url() appends suburb/postcode/rent/page/street.
__url_base = "https://www.oldlistings.com.au/real-estate/VIC/"
# Directory that out.txt and the JSON results file are written into.
__output_dir = "output"
# Directory holding the cached HTML pages.
__cache_dir = 'cache'
# True: get_soup() reads pages from the cache directory.
# False: get_soup() downloads each page and caches it.
__from_cache_file = True
# Set by get_soup() to the cache filename it last tried to read.
__cached_html_filename = ""
# Name of the JSON results file inside __output_dir.
__data_output_file = 'data.json'
# Semicolon-separated input file read by load_targets().
__targets = 'input/targets.txt'

### Helper Functions - Input Processing

In [3]:
def get_url(address, beds=1, page=1):
    """Build the rental search URL for one address.

    The address is split on whitespace: the second token is taken as the
    street name, the fourth token (letters only) as the suburb and the last
    token as the postcode, e.g. '317/4 Acacia Place, Abbotsford, Vic 3067'.

    Args:
        address: Free-text address in the layout described above.
        beds: Value placed after ':bedmax:' in the URL.
        page: Result page number placed in the URL path.

    Returns:
        The full URL as a string.
    """
    tokens = address.split()
    street = tokens[1]
    suburb = "".join(ch for ch in tokens[3] if ch.isalpha())
    postcode = tokens[-1]

    path_segments = [suburb, postcode, 'rent', str(page), street.upper()]
    return __url_base + '/'.join(path_segments) + ':bedmax:' + str(beds)

def get_soup(source):
    """Return a BeautifulSoup tree for `source`, from the cache or the web.

    When `__from_cache_file` is true the page is read from the HTML cache
    (the cache filename is derived from `source`) and
    `__cached_html_filename` is updated. Otherwise the page is downloaded,
    written to the cache and parsed.

    Args:
        source: URL of the page to load.

    Returns:
        The parsed page, or None when cache mode is on and the cached file
        does not exist (a message is printed in that case).
    """
    global __cached_html_filename

    filename = make_filename_friendly(source)

    if __from_cache_file:  # get soup from cache
        __cached_html_filename = filename
        try:
            with open(make_html_cache_outpath(filename), 'r') as fp:
                # Name the parser explicitly so the tree does not depend on
                # which optional parsers happen to be installed.
                return BeautifulSoup(fp.read(), "html.parser")
        except FileNotFoundError:
            print("error: 'File Not Found' in get_soup() from cached file option\n")
            return None

    # A timeout keeps a stalled connection from hanging the notebook forever.
    html = requests.get(source, timeout=30).text
    cache_html(filename, html)
    return BeautifulSoup(html, "html.parser")

### Helper Functions - Output

In [4]:
def replace_slashs(s):
    """Return `s` with every forward slash turned into an underscore."""
    return s.replace('/', '_')

def replace_commas(s):
    """Return `s` with each comma, and any spaces around it, replaced by ' - '."""
    pieces = re.split(' *, *', s)
    return ' - '.join(pieces)

def rm_all_invalid_file_name_chars(s):
    """Return `s` keeping only word characters, whitespace, '_', '.' and '-'.

    Every other character (':', '?', '*', slashes, ...) is removed.
    """
    # Raw string: '\w' and '\s' in a normal string literal are invalid
    # escape sequences and trigger a warning on recent Python versions.
    return re.sub(r'[^\w\s_.-]', '', s)

def make_filename_friendly(s):
    """Turn `s` into a string that is safe to use as a file name.

    Commas become ' - ', slashes become '_', and finally every remaining
    character that is not allowed in the file name is removed.
    """
    return rm_all_invalid_file_name_chars(replace_slashs(replace_commas(s)))

def make_outpath_string(directory, filename):
    """Join `directory` with the filename-friendly form of `filename`."""
    safe_name = make_filename_friendly(filename)
    return os.path.join(directory, safe_name)

def make_html_cache_outpath(filename):
    """Return the path in the cache directory for `filename` plus '.html'."""
    html_name = filename + '.html'
    return make_outpath_string(__cache_dir, html_name)

def write_line_to_file(s):
    """Append `s` as one line to 'out.txt' in the output directory.

    A trailing newline is added when `s` does not already end with one, so
    successive calls do not run together on a single line. The output
    directory is created if it does not exist yet.
    """
    out = make_outpath_string(__output_dir, 'out.txt')

    out_dir = os.path.dirname(out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    line = s if s.endswith('\n') else s + '\n'
    with open(out, 'a') as f:
        f.write(line)

def cache_html(file_name, content):
    """Write `content` to the cache path derived from `file_name`.

    An existing cache file with the same name is overwritten.
    """
    # A per-file version suffix ('_0', '_1', ...) was tried here and
    # cancelled: the reader had no way of knowing which version was the most
    # recent, so that lookup has to be implemented in the reader first.
    write_file(make_html_cache_outpath(file_name), content)

def write_file(path, content):
    """Write `content` to `path`, creating parent directories as needed.

    Args:
        path: Destination file path; an existing file is overwritten.
        content: Text to write.
    """
    print("\tSaving '" + path +"'")

    parent = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs('') raises, so
    # only create directories when there is one. exist_ok avoids the race
    # between checking for the directory and creating it.
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'w') as fp:
        fp.write(content)
        



### The Search Criteria
#### Get targets from file

In [5]:
def load_targets(path=None):
    """Read the scrape targets from a semicolon-separated text file.

    Each non-blank line has the form ``address;price[;beds[;nras]]``.
    Blank lines are skipped (they do not end the file). `beds` falls back
    to 1 and `nras` to False when the field is missing or unparsable; a line
    without a valid integer price is reported and skipped.

    Args:
        path: File to read. Defaults to the module-level `__targets` path.

    Returns:
        Dict mapping each address to
        ``{"price": int, "beds": int, "nras": bool}``. Empty when the file
        does not exist (a message is printed in that case).
    """
    if path is None:
        path = __targets

    truthy = ('true', 'yes', 'y', 't')
    rtn_dict = {}

    try:
        with open(path) as file:
            for raw_line in file:
                line = raw_line.strip()
                if not line:
                    continue

                fields = line.split(';')

                try:
                    price = int(fields[1])
                except (IndexError, ValueError):
                    print("skipping malformed target line: " + line)
                    continue

                try:
                    beds = int(fields[2].strip())
                except (IndexError, ValueError):
                    beds = 1

                nras = len(fields) > 3 and fields[3].lower().strip() in truthy

                rtn_dict[fields[0].strip()] = {
                    "price": price,
                    "beds": beds,
                    "nras": nras,
                }
    except FileNotFoundError:
        print('file not found')

    return rtn_dict

In [6]:
def find_listing_in_soup(address, beds):
    """Page through the search results until the listing for `address` is found.

    The first space-separated token of `address` (e.g. '317/4') is searched
    for in the text of each result page. At most 10 pages are examined, and
    when pages are downloaded rather than read from cache the loop sleeps
    3 seconds between requests. The matching listing element is also saved
    to the HTML cache.

    Args:
        address: Address in the form accepted by get_url().
        beds: Bedroom count passed through to get_url().

    Returns:
        The element three levels above the matching text node, or None when
        no page could be loaded or no page contains the number.
    """
    print("\tsearching website")

    number = address.split(' ')[0]
    # Escape the number so it is matched literally, and forbid adjacent
    # digits so '317/4' does not match inside '1317/4' or '317/40'.
    number_pattern = re.compile(r'(?<!\d)' + re.escape(number) + r'(?!\d)')
    max_pages = 10

    listing_name = None
    page = 1
    while True:
        url = get_url(address=address, beds=beds, page=page)
        soup = get_soup(url)
        if soup is None:
            # get_soup() has already reported why the page is unavailable.
            break

        heading = soup.find(class_="sub-page-h1")
        if heading is not None and "An Error has occurred" in heading.strings:
            print("Listing wasn't found")
            break

        summary = soup.find(class_="sub-page-h2")
        if summary is not None:
            print('\t\t' + summary.get_text().strip())

        listing_name = soup.find(string=number_pattern)
        if listing_name is not None:
            break

        page += 1
        if page > max_pages:
            print("page is getting pretty high: {0}".format(page))
            break
        if not __from_cache_file:
            time.sleep(3)

    if listing_name is None:
        return None

    old_listing = listing_name.parent.parent.parent
    cache_html("00-listing - " + listing_name.get_text(),
               old_listing.prettify())
    print("\tFound listing")
    return old_listing

In [7]:
def extract_data_from_listing(soup):
    """Extract the historical prices from one listing element.

    Looks for the first element whose class matches 'hist', takes its first
    <ul>, and reads each <li>: the first string is parsed as '<Month> <Year>'
    and the second string is reduced to its digits.

    Args:
        soup: Listing element as returned by find_listing_in_soup(); None
            is tolerated.

    Returns:
        Dict mapping 'YYYY-MM-DD' (first day of the month) to the price as a
        string of digits. Empty when no price history is present. Entries
        that lack a price string or have an unparsable date are skipped.
    """
    price_date_pairs = {}

    history = soup.find(class_=re.compile('hist')) if soup is not None else None
    past_price_list = history.find('ul') if history is not None else None
    if past_price_list is None:
        print("\tno price history found in listing")
        return price_date_pairs

    for child in past_price_list.find_all("li"):
        strings = list(child.strings)
        if len(strings) < 2:
            continue

        date = str(strings[0]).strip()
        try:
            dt = datetime.strptime(date, '%B %Y')
        except ValueError:
            print("\tskipping entry with unrecognised date: " + date)
            continue

        price_date_pairs[dt.strftime('%Y-%m-%d')] = re.sub('[^0-9]', '', strings[1])

    print("\tgot price-date pairs")

    return price_date_pairs

### Main Loop

In [10]:
# Scrape every target and write all results to one JSON file.
targets = load_targets()
target_results_list = []
if __from_cache_file:
    print("Scraping from html cache")
else:
    print("Scraping from web")

# Both out.txt and the JSON results live in the output directory.
os.makedirs(__output_dir, exist_ok=True)

for address, features in targets.items():
    print("Target is " + str((address, features)))

    # Record the first-page URL that is searched for this target.
    url = get_url(address, features["beds"])
    write_line_to_file(url)

    # find_listing_in_soup() loads every result page itself, starting at
    # page 1, so the page is not fetched separately here.
    listing_soup = find_listing_in_soup(address, features["beds"])
    if listing_soup is None:
        print("\tskipping target: no listing found")
        continue

    price_date_pairs = extract_data_from_listing(listing_soup)

    # TODO: Interpolate price peak amount and date
    #       include in output data
    # interpolate_peak_price()

    target_results_list.append((address, price_date_pairs, features))

# Write once, after the loop: opening the file in 'w' mode per target would
# leave only the last target's data in it.
data_output_path = make_outpath_string(__output_dir, __data_output_file)
with open(data_output_path, 'w', encoding='utf-8') as f:
    json.dump({address: (price_date_pairs, features)
               for address, price_date_pairs, features in target_results_list},
              f, ensure_ascii=False, indent=4)

Scraping from html cache
Target is ('317/4 Acacia Place, Abbotsford, Vic 3067', {'price': 290, 'beds': 1, 'nras': True})
	searching website
		Displaying 1 to 50 of 287 Old Real Estate Listings*
		Displaying 51 to 100 of 287 Old Real Estate Listings*
		Displaying 101 to 150 of 287 Old Real Estate Listings*
		Displaying 151 to 200 of 287 Old Real Estate Listings*
	Saving 'cache\00-listing - 317_4 ACACIA PLACE - ABBOTSFORD.html'
	Found listing
	got price-date pairs
