<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Step-1:-Imports" data-toc-modified-id="Step-1:-Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Step 1: Imports</a></span></li><li><span><a href="#Step-2:-Config-+-helpers" data-toc-modified-id="Step-2:-Config-+-helpers-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Step 2: Config + helpers</a></span></li><li><span><a href="#Step-3-IMPORTANT:-confirm-page-2-is-different-from-page-1" data-toc-modified-id="Step-3-IMPORTANT:-confirm-page-2-is-different-from-page-1-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Step 3 IMPORTANT: confirm page 2 is different from page 1</a></span></li><li><span><a href="#Step-4-Extract-rows-from-HTML" data-toc-modified-id="Step-4-Extract-rows-from-HTML-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Step 4 Extract rows from HTML</a></span></li><li><span><a href="#Step-5--Scrape-pages-1–40" data-toc-modified-id="Step-5--Scrape-pages-1–40-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Step 5  Scrape pages 1–40</a></span></li><li><span><a href="#Step-6:-Save-to-CSV" data-toc-modified-id="Step-6:-Save-to-CSV-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Step 6: Save to CSV</a></span></li></ul></div>

In [3]:
import sys
print(sys.executable)

C:\Users\Tanu\AppData\Local\Programs\Python\Python312\python.exe


In [4]:
import sys
!{sys.executable} -m pip install requests

Collecting requests
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.4-cp312-cp312-win_amd64.whl.metadata (38 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.6.3-py3-none-any.whl.metadata (6.9 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2026.1.4-py3-none-any.whl.metadata (2.5 kB)
Using cached requests-2.32.5-py3-none-any.whl (64 kB)
Downloading charset_normalizer-3.4.4-cp312-cp312-win_amd64.whl (107 kB)
Using cached idna-3.11-py3-none-any.whl (71 kB)
Downloading urllib3-2.6.3-py3-none-any.whl (131 kB)
Downloading certifi-2026.1.4-py3-none-any.whl (152 kB)
Installing collected packages: urllib3, idna, charset_normalizer, certifi, requests

   ---------------------------------------- 0/5 [urllib3]
   --------------------------------

In [5]:
import requests
print("requests version:", requests.__version__)

requests version: 2.32.5


## Step 1: Imports

In [6]:
import re
import time
import random
import hashlib
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Step 2: Config + helpers

In [7]:
BASE_URL = "https://www.rightmove.co.uk/house-prices/sk8.html"
TOTAL_PAGES = 40

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-GB,en;q=0.9",
}

def polite_sleep():
    time.sleep(random.uniform(1.0, 2.0))

def page_url(n: int) -> str:
    return BASE_URL if n == 1 else f"{BASE_URL}?pageNumber={n}"

def fetch_html(session: requests.Session, url: str) -> str:
    r = session.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    return r.text


## Step 3 IMPORTANT: confirm page 2 is different from page 1

In [8]:
session = requests.Session()

html1 = fetch_html(session, page_url(1))
html2 = fetch_html(session, page_url(2))

h1 = hashlib.md5(html1.encode("utf-8", errors="ignore")).hexdigest()
h2 = hashlib.md5(html2.encode("utf-8", errors="ignore")).hexdigest()

print("Different pages?", h1 != h2)


Different pages? True


## Step 4 Extract rows from HTML

In [9]:
POSTCODE_RE = re.compile(r"\b([A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2})\b", re.I)
PRICE_RE = re.compile(r"£\s?[\d,]+")
DATE_RE  = re.compile(r"\b\d{1,2}\s+[A-Za-z]{3}\s+\d{4}\b")

BEDS_TENURE_RE = re.compile(r"\b(\d{1,2})\b\s*(Freehold|Leasehold)\b", re.I)

PROPERTY_TYPES = [
    "Detached", "Semi-Detached", "Terraced", "End Terrace",
    "Flat", "Maisonette", "Bungalow", "Cottage", "Town House"
]

def norm(s: str) -> str:
    return re.sub(r"\s+", " ", s or "").strip()

def split_address_postcode(full: str):
    m = POSTCODE_RE.search(full or "")
    if not m:
        return norm(full), None
    pc = re.sub(r"\s+", " ", m.group(1).upper()).strip()
    addr = (full[:m.start()] + full[m.end():]).strip(" ,")
    return norm(addr), pc

def parse_property_type(text: str):
    for t in PROPERTY_TYPES:
        if re.search(rf"\b{re.escape(t)}\b", text, re.I):
            return t
    return None

def parse_tenure(text: str):
    if re.search(r"\bFreehold\b", text, re.I):
        return "Freehold"
    if re.search(r"\bLeasehold\b", text, re.I):
        return "Leasehold"
    return None

def extract_rows_from_html(html: str):
    soup = BeautifulSoup(html, "lxml")
    rows = []

    blocks = soup.find_all(["a", "article", "li", "div", "section"])

    for b in blocks:
        text = norm(b.get_text(" ", strip=True))
        if not text:
            continue
        if not POSTCODE_RE.search(text):
            continue
        if not PRICE_RE.search(text):
            continue
        if not re.search(r"\bFreehold\b|\bLeasehold\b", text, re.I):
            continue

        # address window around postcode
        mpc = POSTCODE_RE.search(text)
        start = max(0, mpc.start() - 200)
        window = text[start:mpc.end() + 30]

        maddr = re.search(r"\d{1,4}\s*,[^£]{0,260}", window)
        if not maddr:
            continue

        addr_line = norm(maddr.group(0)).strip(" ,")
        address, postcode = split_address_postcode(addr_line)

        beds, tenure = None, None
        mt = BEDS_TENURE_RE.search(text)
        if mt:
            beds = int(mt.group(1))
            tenure = mt.group(2).capitalize()
        else:
            tenure = parse_tenure(text)

        prop_type = parse_property_type(text)

        d = DATE_RE.search(text)
        p = PRICE_RE.search(text)
        if not d or not p:
            continue

        rows.append({
            "address": address,
            "postcode": postcode,
            "property_type": prop_type,
            "bedrooms": beds,
            "tenure": tenure,
            "last_sold_date": d.group(0),
            "last_sold_price": p.group(0),
        })

    # dedupe
    seen = set()
    out = []
    for r in rows:
        key = (r["address"], r["postcode"], r["last_sold_date"], r["last_sold_price"])
        if key in seen:
            continue
        seen.add(key)
        out.append(r)

    return out


## Step 5  Scrape pages 1–40

In [10]:
session = requests.Session()
all_rows = []

prev_hash = None

for n in range(1, TOTAL_PAGES + 1):
    url = page_url(n)
    print(f"Scraping page {n}: {url}")

    html = fetch_html(session, url)
    h = hashlib.md5(html.encode("utf-8", errors="ignore")).hexdigest()

    if prev_hash is not None and h == prev_hash:
        print("  WARNING: Page HTML repeated. Stopping to avoid duplicates.")
        break
    prev_hash = h

    rows = extract_rows_from_html(html)
    print(f"  Rows found: {len(rows)}")
    all_rows.extend(rows)

    polite_sleep()

df_raw = pd.DataFrame(all_rows)
df_raw.shape, df_raw.head()


Scraping page 1: https://www.rightmove.co.uk/house-prices/sk8.html
  Rows found: 26
Scraping page 2: https://www.rightmove.co.uk/house-prices/sk8.html?pageNumber=2
  Rows found: 25
Scraping page 3: https://www.rightmove.co.uk/house-prices/sk8.html?pageNumber=3
  Rows found: 25
Scraping page 4: https://www.rightmove.co.uk/house-prices/sk8.html?pageNumber=4
  Rows found: 26
Scraping page 5: https://www.rightmove.co.uk/house-prices/sk8.html?pageNumber=5
  Rows found: 24
Scraping page 6: https://www.rightmove.co.uk/house-prices/sk8.html?pageNumber=6
  Rows found: 25
Scraping page 7: https://www.rightmove.co.uk/house-prices/sk8.html?pageNumber=7
  Rows found: 25
Scraping page 8: https://www.rightmove.co.uk/house-prices/sk8.html?pageNumber=8
  Rows found: 25
Scraping page 9: https://www.rightmove.co.uk/house-prices/sk8.html?pageNumber=9
  Rows found: 24
Scraping page 10: https://www.rightmove.co.uk/house-prices/sk8.html?pageNumber=10
  Rows found: 24
Scraping page 11: https://www.rightmove.c

((1004, 7),
                                              address postcode property_type  \
 0  19, Easton Drive, Cheadle 13 Semi-Detached 3 F...  SK8 2JD      Detached   
 1  19, Easton Drive, Cheadle 13 Semi-Detached 3 F...  SK8 2JD      Detached   
 2  105, East Avenue, Heald Green, Cheadle 10 Deta...  SK8 3BS      Detached   
 3  3, Massie Street, Cheadle 15 Terraced 4 Freeho...  SK8 1BW      Terraced   
 4  12, Adshall Road, Cheadle 14 Terraced 2 Freeho...  SK8 2JN      Terraced   
 
    bedrooms     tenure last_sold_date last_sold_price  
 0       3.0   Freehold    19 Dec 2025        £384,044  
 1       3.0   Freehold    19 Dec 2025        £315,000  
 2       3.0  Leasehold    17 Dec 2025        £427,450  
 3       4.0   Freehold    12 Dec 2025        £287,500  
 4       2.0   Freehold    12 Dec 2025        £210,000  )

## Step 6: Save to CSV

In [11]:
df_raw.to_csv("rightmove_sk8_df_raw.csv", index=False, encoding="utf-8")
print("Saved: rightmove_sk8_df_raw.csv", df_raw.shape)


Saved: rightmove_sk8_df_raw.csv (1004, 7)
