# Data extraction

In this section I scrape data about flats for sale across Poland from https://www.otodom.pl.

In [105]:
from bs4 import BeautifulSoup
import requests
import json
import numpy as np
from typing import NamedTuple
import datetime as dt
import time

In [152]:
class PrimaryOfferInfo(NamedTuple):
    """One scraped flat offer as a flat record.

    The field order below is also the column order emitted by
    ``csv_format``. The annotations are not enforced at runtime; the
    scraper in this notebook passes every field as a string and uses
    "-1" for missing values.
    """
    # PRIMARY
    price: str
    area: str
    ownership: str
    n_rooms: int
    construction_status: str
    floor: int
    balcony: str
    terrace: str
    garden: str
    rent_price: str
    car: str
    # ADDITIONAL
    build_year: str
    lift: str
    city: str

    def csv_format(self):
        """Return the record as a single CSV line without a trailing newline.

        Non-string fields (e.g. an ``int`` ``n_rooms``) are converted to
        text, and fields containing commas, quotes or newlines are quoted,
        so every record always parses back into the same number of columns.
        """
        # Imported locally so the class does not depend on the notebook's
        # import cell.
        import csv
        import io

        buffer = io.StringIO()
        # lineterminator="" because callers append their own "\n".
        csv.writer(buffer, lineterminator="").writerow(self)
        return buffer.getvalue()

    def __str__(self):
        """Return a human-readable, pipe-separated summary of the offer."""
        # NOTE: the backslash continuations sit inside the string literal, so
        # the indentation of the continuation lines is part of the output.
        return f"| Price={self.price} | Area={self.area} | Ownership={self.ownership} | No.Rooms={self.n_rooms} | Status={self.construction_status} |\
         Floor={self.floor} | Balcony={self.balcony} | Terrace={self.terrace} | Garden={self.garden} | RentPrice={self.rent_price} | Car={self.car} | \
         BuildYear={self.build_year} | Lift={self.lift} | City={self.city} |"

In [91]:
# Browser-like User-Agent sent with every request made by this notebook.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 14.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.4970.63 Safari/537.36 OPRGX/105.0.4970.63 OPR/105.0.4970.63"
}

def get_page_info(endpoint, n_page: str, timeout: float = 30.0) -> list:
    """Fetch one listing page and return the offer links found on it.

    Args:
        endpoint: Listing URL ending in ``page=``; the page number is
            appended to it.
        n_page: Page number to request (converted to text when appended).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``href`` values of all anchors that start with ``/pl`` and
        contain ``/pl/oferta/``. An empty list is returned when the request
        fails, so callers can always iterate over the result.
    """
    try:
        response = requests.get(f"{endpoint}{n_page}", headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        print(response.status_code, end="")
    except requests.RequestException as e:
        print("ERROR", e)
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors without an href attribute, which would
    # otherwise raise KeyError on a["href"].
    hrefs = (a["href"] for a in soup.find_all("a", href=True))
    return [href for href in hrefs if "/pl/oferta/" in href and href.startswith("/pl")]
    

# URL scraper

In [163]:
#%xmode verbose
# Collect up to URL_NUMBER distinct offer links from the listing pages and
# write them, one per line, to OUTPUT.
URL_NUMBER = 10000
OUTPUT = "urls2.txt"

ENDPOINT = "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/cala-polska?viewType=listing&limit=72&page="
pages = np.arange(1, 1000).astype("str")

start = dt.datetime.now()

urls = set()

idx = 0
# Iterate over the available page numbers instead of indexing without a
# bound: the loop ends either when enough URLs are collected or when the
# pages run out, rather than raising IndexError.
for page in pages:
    if len(urls) >= URL_NUMBER:
        break
    # A failed request may yield None (or an empty list); treat it as
    # "no links on this page" and carry on with the next page.
    urls.update(get_page_info(ENDPOINT, page) or ())
    idx += 1
    print(f"|{idx}: ", len(urls))

with open(OUTPUT, "w") as file:
    file.writelines(f"{url}\n" for url in urls)

print(dt.datetime.now() - start)



200
|1:  75
200
|2:  150
200
|3:  223
200
|4:  297
200
|5:  370
200
|6:  445
200
|7:  517
200
|8:  589
200
|9:  663
200
|10:  736
200
|11:  810
200
|12:  884
200
|13:  954
200
|14:  1023
200
|15:  1090
200
|16:  1160
200
|17:  1227
200
|18:  1301
200
|19:  1373
200
|20:  1446
200
|21:  1516
200
|22:  1556
200
|23:  1565
200
|24:  1576
200
|25:  1587
200
|26:  1600
200
|27:  1607
200
|28:  1617
200
|29:  1668
200
|30:  1738
200
|31:  1785
200
|32:  1853
200
|33:  1918
200
|34:  1988
200
|35:  2057
200
|36:  2129
200
|37:  2201
200
|38:  2273
200
|39:  2345
200
|40:  2417
200
|41:  2489
200
|42:  2556
200
|43:  2628
200
|44:  2700
200
|45:  2772
200
|46:  2844
200
|47:  2911
200
|48:  2983
200
|49:  3055
200
|50:  3122
200
|51:  3194
200
|52:  3259
200
|53:  3331
200
|54:  3403
200
|55:  3475
200
|56:  3547
200
|57:  3619
200
|58:  3686
200
|59:  3758
200
|60:  3830
200
|61:  3902
200
|62:  3974
200
|63:  4046
200
|64:  4118
200
|65:  4190
200
|66:  4262
200
|67:  4334
200
|68:  4406
200

# Data scraping from each single URL

In [167]:
# Download each offer page listed in URL_FILE, extract the offer details
# from the JSON embedded in the page, and append them to CSV_OUTPUT in
# blocks of BLOCK_SIZE rows.
OTODOM_ROOT = "https://www.otodom.pl"
TEST_LINK = "/pl/oferta/mieszkanie-w-samym-centrum-z-tarasem-ID4q2pi"
BLOCK_SIZE = 20
URL_FILE = "urls2.txt"
CSV_OUTPUT = "otodom_data_raw.csv"
N_OFFERS = 60          # upper bound on the number of offers scraped per run
REQUEST_TIMEOUT = 30   # seconds to wait for a single offer page
MISSING = "-1"         # placeholder written when a field has no value


def _first_value(entries, index):
    """Return the first item of ``entries[index]["values"]`` as a string, or "-1" if empty."""
    values = entries[index]["values"]
    return str(values[0]) if values else MISSING


def _parse_offer(js):
    """Build a PrimaryOfferInfo from the decoded ``__NEXT_DATA__`` JSON of one offer page.

    Fields are read by position from ``topInformation`` and
    ``additionalInformation``; this assumes the site keeps that order
    stable (TODO confirm). A missing key or position raises, and the
    caller then skips the offer.
    """
    ad = js["props"]["pageProps"]["ad"]
    target = ad["target"]

    # Only the price is optional; a missing city rejects the offer.
    try:
        price = str(target["Price"])
    except KeyError:
        price = MISSING
    city = str(target["City"])

    top = ad["topInformation"]
    area = _first_value(top, 0)
    ownership = _first_value(top, 1)
    n_rooms = _first_value(top, 2)
    construction_status = _first_value(top, 3)
    floor = _first_value(top, 4)

    # Outdoor amenities are encoded as "1"/"0" flags.
    outdoor_amenities = top[5]["values"]
    balcony = str(int("extras_types::balcony" in outdoor_amenities))
    terrace = str(int("extras_types::terrace" in outdoor_amenities))
    garden = str(int("extras_types::garden" in outdoor_amenities))

    # Keep only the digits of the rent value.
    rent_values = top[6]["values"]
    if rent_values:
        rent_price = ''.join(filter(str.isdigit, str(rent_values[0])))
    else:
        rent_price = MISSING

    car = _first_value(top, 7)

    additional = ad["additionalInformation"]
    if additional:
        build_year = _first_value(additional, 3)
        lift = _first_value(additional, 6)
    else:
        build_year = MISSING
        lift = MISSING

    return PrimaryOfferInfo(price, area, ownership, n_rooms, construction_status,
                            floor, balcony, terrace, garden, rent_price, car,
                            build_year, lift, city)


def _flush_rows(rows):
    """Append the given offers to CSV_OUTPUT, one CSV line per offer."""
    with open(CSV_OUTPUT, "a") as file:
        file.writelines(row.csv_format() + "\n" for row in rows)


start = dt.datetime.now()

with open(URL_FILE, "r") as file:
    URLs = file.read().splitlines()

data = []

# Never index past the end of the URL list.
for i in range(min(N_OFFERS, len(URLs))):
    try:
        response = requests.get(OTODOM_ROOT+URLs[i], headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        json_text = soup.find("script", {"id": "__NEXT_DATA__"})
        js = json.loads(json_text.contents[0])

        info = _parse_offer(js)
        data.append(info)
    except Exception as e:
        # Best-effort scraping: report the failing offer and move on.
        print("ERROR", e, i)
        continue

    # Flush on buffer size rather than on the loop index, so a failed
    # iteration cannot cause a block save to be skipped.
    if len(data) >= BLOCK_SIZE:
        print("BLOCKSAVING", i)
        _flush_rows(data)
        data = []

    if i % 10 == 1: print(i)

# Write whatever is still buffered so the last partial block is not lost.
if data:
    print("BLOCKSAVING", "remaining", len(data))
    _flush_rows(data)
    data = []

print("EXECUTION:", dt.datetime.now() - start)
    


1
11
BLOCKSAVING 19
21
31
BLOCKSAVING 39
41
51
BLOCKSAVING 59
EXECUTION: 0:00:31.924276


''