### 1. Extracting Australia VIC postcode `mosque-finder.com.au`

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import pickle
import os

# URL of the webpage you want to scrape
url = "https://www.mosque-finder.com.au/vic/postcode.html"

# Send an HTTP GET request to the URL
response = requests.get(url)
    
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

suburb_list = soup.select("ul.bullets li")
suburb_data = []
for item in suburb_list:
    # Extract the text from the <a> tag
    text = item.find('a').get_text(strip=True)
    
    # Split the text to separate postcode and suburb
    # Expected format: "postcode: 3000 - Suburb: Melbourne"
    if "postcode:" in text and "- Suburb:" in text:
        parts = text.split("- Suburb:")
        postcode = parts[0].split("postcode:")[1].strip()
        suburb = parts[1].strip()
        suburb_data.append((postcode, suburb))


### 2. Using the postcode to geting housing data from `domain.com.au`

In [None]:
# URL of the webpage to scrape
# url = "https://www.domain.com.au/suburb-profile/box-hill-vic-3128"

def get_data(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    }

    try:
        # Send a request with throttling
        response = requests.get(url, headers=headers)
        # Check if the response status code indicates success
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")

            td_elements = soup.find_all("td", class_="css-15k02nu")

            results = []
            for i in range(0, len(td_elements), 2):
                results.append((td_elements[i].get_text(strip=True), td_elements[i + 1].get_text(strip=True)))

            return results
        else:
            return []
    except requests.exceptions.RequestException as e:
        return []

house_data = {}

for i in suburb_data:
    url = f'https://www.domain.com.au/suburb-profile/{i[1].lower().replace(" ", "-")}-vic-{i[0]}'
    # print(url)
    house_data[f"{i[0]}_{i[1]}"] = get_data(url)



### 3. Saving the housing data

In [2]:
file_path = 'real_estate_data.pkl'

if os.path.exists(file_path):
    with open(file_path, 'rb') as file:
        house_data = pickle.load(file)
        print("Loaded existing data from", file_path)
else:
    with open(file_path, 'wb') as file:
        pickle.dump(house_data, file)
        print("Stored house_data in", file_path)

Loaded existing data from real_estate_data.pkl


In [3]:
def transform_real_estate_data(data):
    def convert_price(value):
        """Helper function to convert price strings to numerical values."""
        if value == '-' or not value:  # Handle missing or invalid entries
            return None
        # Parse the price string
        match = re.match(r'\$(\d+(\.\d+)?)([km]?)', value.lower())
        if match:
            base = float(match.group(1))
            multiplier = match.group(3)
            if multiplier == 'k':
                base *= 1000
            elif multiplier == 'm':
                base *= 1_000_000
            return int(base)
        return None  # Handle unexpected formats

    # If the input is an empty list, return an empty list.
    if data == []:
        return []

    # Validate the input is a list.
    if not isinstance(data, list):
        raise ValueError("Input data should be a non-empty list.")
    
    results = {}
    is_house = True  # Default assumption is that the entries are houses.

    for i in range(len(data)):
        if i > 0:  # Compare current and previous entry to check if data remains consistent.
            if data[i][0] <= data[i-1][0]:
                is_house = False

        # Determine type and print the result
        property_type = "House" if is_house else "Unit"
        results[f"{property_type} {data[i][0]} BR"] = convert_price(data[i][1])
    
    return results


In [4]:
test_house_data = house_data.copy()

for keys, values in test_house_data.items():
    test_house_data[keys] = transform_real_estate_data(values)

filtered_data = {key: value for key, value in test_house_data.items() if value != []}
cenverted_data = {key: value for key, value in filtered_data.items() if value != []}


df = pd.DataFrame(cenverted_data)
df = df.transpose()
df.index = df.index.str.replace(r'^\d+_', '', regex=True)
df.to_csv('real_estate_data.csv')
df

Unnamed: 0,House 2 BR,House 3 BR,Unit 1 BR,Unit 2 BR,Unit 3 BR,House 4 BR,House 5 BR,House 1 BR
Melbourne,,,373000.0,588000.0,1044999.0,,,
East Melbourne,,2918000.0,563000.0,750000.0,2900000.0,4480000.0,,
West Melbourne,960000.0,1500000.0,450000.0,650000.0,1140000.0,,,
Melbourne,,,385000.0,680000.0,1240000.0,,,
Southbank,,,395000.0,620000.0,975000.0,,,
...,...,...,...,...,...,...,...,...
South Dudley,,,,,,,,
St Clair,,,,,,,,
Wattle Bank,,,,,,,,
Wonthaggi,410000.0,510000.0,,390000.0,,641000.0,,
