In [13]:
import requests
from bs4 import BeautifulSoup
import re
import json

BASE_LIST_URL = "https://opshop.org/list/ALL/"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://opshop.org/",
}

# Words / phrases we NEVER want as shop names
BLACKLIST_EXACT = {
    "Edit", "Review", "OpShop.org", "List View", "Map View", "Add New", "Login",
    "Opshop locations", "State:", "ALL",
    "ACT", "NSW", "VIC", "SA", "TAS", "QLD", "NT", "WA"
}

DAYS = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

def fetch_list_page():
    session = requests.Session()
    session.headers.update(HEADERS)
    resp = session.get(BASE_LIST_URL, timeout=20)
    resp.raise_for_status()
    return resp.text

def looks_like_address(text: str) -> bool:
    # Very simple heuristic: has a number and a word (e.g., "61 Ryedale Road")
    return bool(re.search(r"\d+\s+\w+", text))

def is_valid_shop_name(text: str) -> bool:
    if text in BLACKLIST_EXACT:
        return False
    if text.startswith(DAYS):
        return False
    if re.match(r"\d{4}\s*-\s*", text):  # postcode headers like "4740 - Glenella"
        return False
    if len(text) < 3:
        return False
    # Must contain letters
    if not re.search(r"[A-Za-z]", text):
        return False
    return True

def parse_shops_to_list_of_lists(html):
    soup = BeautifulSoup(html, "html.parser")

    text = soup.get_text(separator="\n")
    lines = [l.strip() for l in text.split("\n") if l.strip()]

    shops = []
    i = 0

    while i < len(lines):
        line = lines[i]

        if is_valid_shop_name(line):
            name = line

            address = ""
            hours = []

            # Try next line as address (must look like address)
            if i + 1 < len(lines) and looks_like_address(lines[i + 1]):
                address = lines[i + 1]
                j = i + 2
            else:
                i += 1
                continue  # skip if no address â†’ probably not a real shop

            # Collect opening hours lines
            while j < len(lines) and lines[j].startswith(DAYS):
                hours.append(lines[j])
                j += 1

            shop_row = [name, address] + hours
            shops.append(shop_row)

            i = j
        else:
            i += 1

    return shops

def save_json(data, filename="opshops.json"):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def main():
    html = fetch_list_page()
    shops = parse_shops_to_list_of_lists(html)

    print(f"Total shops parsed: {len(shops)}")
    print("First 3 entries:")
    for row in shops[:3]:
        print(row)

    save_json(shops, "opshops.json")
    print("Saved to opshops.json")

if __name__ == "__main__":
    main()

Total shops parsed: 2297
First 3 entries:
['Koo Wee Rup and District Lions Community Opportunity Shop', '290 Rossiter Road, Koo Wee Rup', 'Mon - Fri 10am-4pm', 'Sat 9am-12pm', 'Sun 10am-3pm selected Sundays']
['Rose Lodge Opportunity Shop', '42 McBride Avenue, Wonthaggi, 3996', 'Mon - Fri 9am - 4pm', 'Sat 9am - 12pm']
['The Green Shed Underground', 'Basement level, 34 E Row Canberra ACT 2601 Australia', 'Tue - Fri 10:30am - 6pm', 'Sat 10:30am - 4pm']
Saved to opshops.json
