# Data extraction

In this section I scrape data about flats for sale across Poland from https://www.otodom.pl.

In [105]:
from bs4 import BeautifulSoup
import requests
import json
import numpy as np
from typing import NamedTuple
import datetime as dt
import time

In [120]:
class PrimaryOfferInfo(NamedTuple):
    """Immutable record holding the attributes scraped for one flat offer.

    The annotations below state the intended type of every field; NamedTuple
    does not enforce them at runtime, so whatever the caller passes is stored
    as-is.
    """

    # PRIMARY
    area: float
    ownership: str
    n_rooms: int
    construction_status: str
    floor: int
    outdoor_amenities: list
    rent_price: float
    car: str
    # ADDITIONAL
    build_year: int
    lift: str

    def __str__(self):
        """Return all fields on a single line, separated by ' | '."""
        # Adjacent f-string literals are concatenated by the parser. A backslash
        # continuation inside one literal would embed the following line's
        # indentation in the printed text.
        return (
            f"| Area={self.area} | Ownership={self.ownership} | No.Rooms={self.n_rooms} "
            f"| Status={self.construction_status} | Floor={self.floor} "
            f"| Amenities={self.outdoor_amenities} | RentPrice={self.rent_price} "
            f"| Car={self.car} | BuildYear={self.build_year} | Lift={self.lift}"
        )

In [91]:
# HTTP headers passed to every requests.get call in this notebook.
# The User-Agent is a desktop-browser string (Opera GX on Windows);
# presumably the site rejects or alters responses for the default
# python-requests agent -- TODO confirm.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 14.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.4970.63 Safari/537.36 OPRGX/105.0.4970.63 OPR/105.0.4970.63"
}

def get_page_info(endpoint, n_page: str) -> list:
    """Download one search-results page and return the offer links found on it.

    Parameters
    ----------
    endpoint : str
        Listing URL that ends right before the page number; ``n_page`` is
        appended to it.
    n_page : str
        Page number, already converted to a string.

    Returns
    -------
    list
        The ``href`` of every anchor that contains "/pl/oferta/" and starts
        with "/pl". An empty list is returned when the request or the parsing
        fails, so the result can always be iterated.
    """
    try:
        # Use the `endpoint` argument (not a notebook-level global) and bound the
        # wait so that a stalled connection cannot hang the whole run.
        response = requests.get(endpoint + n_page, headers=HEADERS, timeout=30)
        response.raise_for_status()
        print(response.status_code, end="")
        soup = BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print("ERROR", e)
        return []

    # href=True skips anchors without an href attribute, which would otherwise
    # raise KeyError on a["href"].
    hrefs = (a["href"] for a in soup.find_all("a", href=True))
    return [h for h in hrefs if "/pl/oferta/" in h and h.startswith("/pl")]
    

# URLs scraper

In [100]:
#%xmode verbose
URL_NUMBER = 3000  # stop once at least this many unique offer links are collected

ENDPOINT = "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/cala-polska?viewType=listing&limit=72&page="
pages = np.arange(1, 1000).astype("str")  # candidate page numbers, as strings

start = dt.datetime.now()

urls = set()

idx = 0
# Also stop when the page numbers run out, so that a result set smaller than
# URL_NUMBER cannot end in an IndexError on pages[idx].
while len(urls) < URL_NUMBER and idx < len(pages):
    page_urls = get_page_info(ENDPOINT, pages[idx])
    idx += 1
    # A failed request yields nothing usable (e.g. None); leave `urls` unchanged
    # instead of crashing in set(...).
    if page_urls:
        urls |= set(page_urls)
    print(f"|{idx}: ", len(urls))

# Write in sorted order so the file content (and therefore which offers a later
# cell picks by index) does not depend on set iteration order.
with open("urls1.txt", "w") as file:
    for url in sorted(urls):
        file.write(f"{url}\n")

print(dt.datetime.now() - start)



200
|1:  75
200
|2:  149
200
|3:  224
200
|4:  289
200
|5:  363
200
|6:  432
200
|7:  503
200
|8:  574
200
|9:  646
200
|10:  711
200
|11:  782
200
|12:  854
200
|13:  925
200
|14:  997
200
|15:  1068
200
|16:  1136
200
|17:  1208
200
|18:  1280
200
|19:  1352
200
|20:  1424
200
|21:  1495
200
|22:  1567
200
|23:  1639
200
|24:  1711
200
|25:  1783
200
|26:  1855
200
|27:  1927
200
|28:  1999
200
|29:  2060
200
|30:  2132
200
|31:  2204
200
|32:  2276
200
|33:  2337
200
|34:  2409
200
|35:  2481
200
|36:  2553
200
|37:  2625
200
|38:  2697
200
|39:  2769
200
|40:  2814
200
|41:  2822
200
|42:  2880
200
|43:  2952
200
|44:  3024
0:01:04.419012


# Scraping data from each offer URL

In [130]:
OTODOM_ROOT = "https://www.otodom.pl"
TEST_LINK = "/pl/oferta/mieszkanie-w-samym-centrum-z-tarasem-ID4q2pi"
N_OFFERS = 5  # how many of the saved URLs to scrape in this trial run

# Relative offer links, one per line, as saved by the URL scraper.
with open("urls1.txt", "r") as file:
    URLs = file.read().splitlines()

data = []

# Never index past the end of the URL list when the file holds fewer entries.
for i in range(min(N_OFFERS, len(URLs))):
    try:
        response = requests.get(OTODOM_ROOT+URLs[i], headers=HEADERS, timeout=30)
        response.raise_for_status()
        print(response.status_code)
        soup = BeautifulSoup(response.text, "html.parser")

        # The offer details are read from the JSON inside the
        # <script id="__NEXT_DATA__"> tag of the page.
        json_text = soup.find("script", {"id": "__NEXT_DATA__"})
        if json_text is None:
            raise ValueError("__NEXT_DATA__ script tag not found")
        js = json.loads(json_text.contents[0])

        ad = js["props"]["pageProps"]["ad"]

        # Fields are read by position. An offer with fewer than eight
        # topInformation entries raises IndexError and is skipped below.
        top = ad["topInformation"]
        (area, ownership, n_rooms, construction_status,
         floor, outdoor_amenities, rent_price, car) = [top[k]["values"] for k in range(8)]

        # additionalInformation may be empty or short; use the same defaults as
        # for a missing section instead of discarding the whole offer.
        extra = ad["additionalInformation"] or []
        build_year = extra[3]["values"] if len(extra) > 3 else 0
        lift = extra[6]["values"] if len(extra) > 6 else "0"

        info = PrimaryOfferInfo(area, ownership, n_rooms, construction_status,
                                floor, outdoor_amenities, rent_price, car,
                                build_year, lift)

        data.append(info)

    except Exception as e:
        # Best effort: report the failing offer index and continue with the next one.
        print("ERROR", e, i)


for info in data:
    print(info)

200
200
200
200
200
| Area=['76.68'] | Ownership=['building_ownership::full_ownership'] | No.Rooms=['4'] | Status=['construction_status::to_completion'] |         Floor=['floor_no::floor_1', '/1'] | Amenities=[] | RentPrice=[] | Car=['extras_types-85::garage'] |          BuildYear=0 | Lift=0
| Area=['52.68'] | Ownership=[] | No.Rooms=['3'] | Status=['construction_status::to_completion'] |         Floor=['floor_no::floor_2', '/3'] | Amenities=['extras_types::balcony'] | RentPrice=[] | Car=['extras_types-85::garage'] |          BuildYear=['2023'] | Lift=['::n']
| Area=['35'] | Ownership=['building_ownership::full_ownership'] | No.Rooms=['1'] | Status=['construction_status-67::ready_to_use'] |         Floor=['floor_no::floor_3', '/3'] | Amenities=['extras_types::balcony'] | RentPrice=['200 zł'] | Car=['extras_types-85::garage'] |          BuildYear=['2008'] | Lift=['::n']
| Area=['41.32'] | Ownership=['building_ownership::full_ownership'] | No.Rooms=['2'] | Status=['construction_status::t