In [16]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import os 


# os.mkdir('rent_data')

In [17]:
def url_to_html(url):
    # returns html soup
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    return soup 

In [18]:
def get_regions():
    url = "https://www.rightmove.co.uk/london-popular-regions.html"
    soup = url_to_html(url)
    regions =  []
    for li in soup.findAll("div", {"class": "primarycontent"})[0].findAll("li"):
        regions.append(li.find("a").string)
    regions = [i.split(' property')[0] for ind, i in enumerate(regions) if ind%3 == 0]
    regions = ['-'.join(i.split(' ')) for i in regions]
    return regions

In [19]:
def get_property_ids(region, ind):

    ## try except stuff for non existant pages
    index = ind*24
    region_url = f"https://www.rightmove.co.uk/property-for-rent/{region}.html?index={index}&"

    soup = url_to_html(region_url)

    property_ids = []
    cards = soup.find(id = "l-searchResults")
    for card in cards.findAll("div", {"class":"l-searchResult"}):
        property_ids.append(card["id"].split('-')[1])
    property_ids = property_ids[1:] ## ignore the first 
    property_ids = [id for id in property_ids if id != '0'] ## drop non existant ones
    return property_ids

In [20]:
def get_attributes(property_ids, region):
    infos = []
    for property_id in property_ids:
        
        property_url = f"https://www.rightmove.co.uk/properties/{property_id}#/?channel=RES_LET"
        
        soup = url_to_html(property_url)
        info_reel = soup.find("div", {"data-test": "infoReel"})
        ps = info_reel.findAll("p")
        
        # type
        type = ps[0].string

        # square metres 
        if len(ps) == 6:
            try: 
                square_metres = int(ps[4].string.split("(")[1].split(" sq")[0])
            except ValueError:
                try:
                    square_metres = ps[4].string.split("(")[1].split(" sq")[0]
                    square_metres = square_metres = int(''.join(i for i in square_metres if i.isdigit()))
                except ValueError: 
                    square_metres = (
                        (int(ps[4].split("(")[1].split(" sq")[0].split('-')[0])
                        + int(ps[4].split("(")[1].split(" sq")[0].split('-')[1])
                        ) / 2
                    )
        else:
            square_metres = None
        
        # beds
        try: 
            beds = ps[1].string.split('x')[0]
        except IndexError:
            try: 
                beds = ps[1]
            except IndexError:
                beds = None
        
        # baths
        try:
            baths = ps[2].string.split('x')[0]
        except IndexError:
            try: 
                baths = ps[2]
            except IndexError:
                baths = None

        # price
        price = soup.findAll("article")[1].findAll("span")[0].string
        try: 
            price = int(''.join(i for i in price if i.isdigit()))
        except ValueError:
            price = price 

        # all together
        info = {
            "type": type,
            "beds": beds,
            "baths": baths,
            "sqm": square_metres,
            "price": price,
            "url": property_url}

        infos.append(info)

    infos = pd.DataFrame(infos)
    infos['region'] = region
    return infos

### Main Loop 

In [6]:
regions = get_regions()

attributes = []
for region in (pbar := tqdm(regions)):
    id = 0 
    while True:
        pbar.set_description(f'{region} - {id}')
        # try: 
        id += 1
        property_ids = get_property_ids(region, id)
        if len(property_ids) == 0:
            break
        attributes.append(get_attributes(property_ids, region))
        # except AttributeError:
        #     break 
    

Balham - 1:   0%|          | 0/65 [00:04<?, ?it/s]

In [112]:
df = pd.concat(attributes)
df = df.loc[df.price != 'Coming Soon']
df.groupby('region').mean()

  df.groupby('region').mean()


Unnamed: 0_level_0,sqm,price
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Balham,120.782609,640297.47331
Barking,70.88,332273.071429


In [94]:
ids = get_property_ids(region, 9

In [97]:
ids = [i for i in ['0', '0', '0'] if i != '0']

In [98]:
property_url = 
soup = url_to_html(property_url)
info_reel = soup.find("div", {"data-test": "infoReel"})
ps = info_reel.findAll("p")

# type
type = ps[0].string

# square metres 
if len(ps) == 6:
    try: 
        square_metres = int(ps[4].string.split("(")[1].split(" sq")[0])
    except ValueError:
        try:
            square_metres = ps[4].string.split("(")[1].split(" sq")[0]
            square_metres = square_metres = int(''.join(i for i in square_metres if i.isdigit()))
        except ValueError: 
            square_metres = (
                (int(ps[4].split("(")[1].split(" sq")[0].split('-')[0])
                + int(ps[4].split("(")[1].split(" sq")[0].split('-')[1])
                ) / 2
            )
else:
    square_metres = None

# beds
try: 
    beds = ps[1].string.split('x')[0]
except IndexError:
    try: 
        beds = ps[1]
    except IndexError:
        beds = None

# baths
try:
    baths = ps[2].string.split('x')[0]
except IndexError:
    try: 
        baths = ps[2]
    except IndexError:
        baths = None

# price
price = soup.findAll("article")[1].findAll("span")[0].string
try: 
    price = int(''.join(i for i in price if i.isdigit()))
except ValueError:
    price = price 

# all together
info = {
    "type": type,
    "beds": beds,
    "baths": baths,
    "sqm": square_metres,
    "price": price,
    "url": property_url}
info

[]

In [10]:
get_property_ids('Chelsea', 1)

['129963470',
 '112856762',
 '129024068',
 '129960041',
 '129956393',
 '129829466',
 '129954218',
 '129953315',
 '123336371',
 '129951641',
 '129948155',
 '129945446',
 '129944792',
 '129942827',
 '129942767',
 '129942650',
 '129940169',
 '129939344',
 '129938705',
 '129806750',
 '129931502',
 '129931472',
 '129930626',
 '129928619']