# Data extraction

In this section I am going to webscrape data about flats across Poland from https://www.otodom.pl.

In [2]:
from bs4 import BeautifulSoup
import requests
import json
import numpy as np
from typing import NamedTuple
import datetime as dt
import time

In [69]:
class PrimaryOfferInfo(NamedTuple):
    # PRIMARY
    price: str
    area: str
    ownership: str
    n_rooms: int
    construction_status: str
    floor: int
    balcony: str
    terrace: str
    garden: str
    rent_price: str
    car: str
    # ADDITIONAL
    build_year: str
    lift: str
    city: str
    longitude: float
    latitude: float
        
    def csv_format(self):
        return ",".join([self.price, self.area, self.ownership, self.n_rooms, self.construction_status,
                         self.floor, self.balcony, self.terrace, self.garden, self.rent_price,
                         self.car, self.build_year, self.lift, self.city, self.longitude, self.latitude])
        
    def __str__(self):
        return f"| Price={self.price} | Area={self.area} | Ownership={self.ownership} | No.Rooms={self.n_rooms} | Status={self.construction_status} |\
         Floor={self.floor} | Balcony={self.balcony} | Terrace={self.terrace} | Garden={self.garden} | RentPrice={self.rent_price} | Car={self.car} | \
         BuildYear={self.build_year} | Lift={self.lift} | City={self.city} | Longitude={self.longitude} | Latitude={self.latitude}"

# Headers needed for the correct filtering

Simple **User-Agent** was not sufficient for filtered data ( only for Krakow city ), the Otodom site kept on returning the results from the whole country. I had to investigate **headers and payload** of the **GET** requests i have been sending via browser to replicate it and use it in the script.

In [44]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36"
}

headers = {
  'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  'accept-language': 'pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7',
  'cache-control': 'max-age=0',
  'cookie': 'lang=pl; laquesisff=euads-4389#gre-12226#rer-165#rer-166#rst-73#rst-74; dfp_user_id=43c7f907-e9fb-4f73-ac51-172615941f23; _gcl_au=1.1.421783841.1712652353; st_userID=GA1.2.1804371638.1712652353__unlogged; OptanonAlertBoxClosed=2024-04-09T08:45:54.308Z; eupubconsent-v2=CP8zBjAP8zBjAAcABBENAvE8AP_gAAAAAAYgJ9NX_H_fbX9j8Xp0aft0eY1f99j7rsQxBhfJk-4FyLvW_JwX32EzNA16pqYKmRIEu3bBIQFlHIDUDUigaogVrTDMakWMgTNKJ6BEiFMRe2dYCF5vmQFD-QKY5tpvd3d52Te9_dv83dzyz4Vnn3Kp_-e1WJCdA5cgAAAAAAAAAAAAAAAQAAAAAAAAAQAIAAAAAAAAAAAAAAAAAAAAA_cBf78AAABgSCEAAgABcAFAAVAA4AB4AEEALwAwgBkAGoAPAAiABMACqAGYAN4AegA_ACEgEMARIAjgBLACaAGAAMOAZQBlgDZAHPAO4A74B7AHxAPsA_YB_gIAARSAi4CMAEagJEAksBPwFBgKgAq4BcwC9AGKANEAbQA3ABxIEegSIAnYBQ4CjwFIgLYAXIAu8BeYDBgGGwMjAyQBk4DMwGcwNXA1kBt4DcwG6gOCAcmA5cCbgQAuAA4AEgARwCDgEcAJoAX0BKwCbQFIAK5AWEAsQBbgC8gGIAMWAZCA0YBqYDaAG3AN0HALAAEQAOAA8AC4AJAAfgBHACgAGgARwA5ACAQEHAQgAiIBHACaAFQAOOAdIBKwCYgEygJtAUnArkCuwFiALUAW4AugBggDEAGLAMhAZMA0YBqYDXgG0ANsAbdA3MDdAHHgOWgc6Bz4E2x0E4ABcAFAAVAA4ACCAFwAagA8ACIAEwAKsAXABdADEAGYAN4AegA_QCGAIkASwAmgBRgDAAGGAMoAaIA2QBzwDuAO8Ae0A-wD9AH_ARQBGICOgJLAT8BQYCogKuAWIAucBeQF6AMUAbQA3ABxADqAH2ARfAj0CRAEyAJ2AUPAo8CkAFNAKsAWLAtgC2QFugLgAXIAu0Bd4C8wF9AMGAYaAx6BkYGSAMnAZUAywBmYDOQGmwNXA1gBt4DdQHFgOTAcuBNwCbwE4SABYABAADwA0ADkAI4AWIAvoCbQFJgK5AWIAvIBggDPAGjANTAbYA24BugDlgHPgTbIQIgAFgAUABcADEAGoATAAqgBcADEAG8APQAjgBgADngHcAd4A_wCKAEpAKDAVEBVwC5gGKANoAdQBHoCmgFWALFAWiAuABcgDIwGTgM5JQJQAEAALAAoAByAGAAYgA8ACIAEwAKoAXAAxQCGAIkARwAowBgADZAHeAPyAqICrgFzAMUAdQBEwCL4EegSIAo8BTQCxQFsALzgZGBkgDJwGcgNYAbeBNwCcJIAkABcAI4A7gCAAEHAI4AVABKwCYgE2gKTAW4AxYBlgDPAG6AOWAm2UARgAKAAuACQAFwARwAtgCOAHIAO4AfYBAACDgFiALqAa8A7YB_wExAJtAVIArsBbgC6AF5AMEAYsAyYBngDRgGpgNegbmBugDlgJtgThKQPAAFwAUABUADgAIIAYABqADwAIgATAAqgBiADMAH6AQwBEgCjAGAAMoAaIA2QBzgDvgH4AfoBFgCMQEdASUAoMBUQFXALmAXkAxQBtADcAHUAPaAfYBEwCL4EegSIAnYBQ4CkAFNAKsAWKAtgBcAC5AF2gLzAX0Aw2BkYGSAMnAZYAzmBrAGsgNvAbqA4IByYE3i0AoAGoAjgBgADuAL0AfYBTQCrAGZgTcLACgBlgEcAR6AmIBNoCuQGjANTAboA5YAA.f_wAAAAAAAAA; laquesissu=666@pin_click|1#666@zoom_map|1#666@pan_map|1; PHPSESSID=rmb36bnalblmn4g7bbar16lnd6; mobile_default=desktop; ninja_user_status=unlogged; smcx_430910767_last_shown_at=1715524368033; _gid=GA1.2.932160579.1716542441; laquesis=eure-19653@b#eure-19720@b#eure-25578@b#eure-25610@a#eure-26485@a#resl-427@b#resl-648@a#seore-998@b#sfs-1183@a#smr-3411@a; lqstatus=1716547712|18fa9e7044cx294e3764|eure-19720#resl-648#smr-3411#sfs-1183#eure-26485#eure-25578#eure-19653||; OptanonConsent=isGpcEnabled=0&datestamp=Fri+May+24+2024+12%3A34%3A44+GMT%2B0200+(czas+%C5%9Brodkowoeuropejski+letni)&version=202401.2.0&browserGpcFlag=0&isIABGlobal=false&hosts=&consentId=1add18f4-8ac2-4629-9787-a862c4ed7a1e&interactionCount=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0002%3A1%2CC0003%3A1%2CC0004%3A1%2Cgad%3A1&geolocation=%3B&AwaitingReconsent=false; _ga=GA1.1.1804371638.1712652353; _ga_20T1C2M3CQ=GS1.1.1716542441.20.1.1716546885.58.0.0; _ga_6PZTQNYS5C=GS1.1.1716544661.21.1.1716546885.58.0.0; onap=18ec208f9bcx4af0f441-16-18fa9e7044cx294e3764-301-1716549068; __gads=ID=8697554daeb46e78:T=1712652354:RT=1716547266:S=ALNI_MbVJygBMTOjQ9oRQ1l-kRF8vSzxfQ; __gpi=UID=00000de810a1f421:T=1712652354:RT=1716547266:S=ALNI_MbNDORPHcmuZrna2Ff3jVOdjiBu1g; __eoi=ID=0bb58e9662bc0efe:T=1713095891:RT=1716547266:S=AA-AfjbirfVdeNo6-j0PeaMkbRt4',
  'sec-ch-ua': '"Opera GX";v="109", "Not:A-Brand";v="8", "Chromium";v="123"',
  'sec-ch-ua-mobile': '?1',
  'sec-ch-ua-platform': '"Android"',
  'sec-fetch-dest': 'document',
  'sec-fetch-mode': 'navigate',
  'sec-fetch-site': 'none',
  'sec-fetch-user': '?1',
  'upgrade-insecure-requests': '1',
  'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36'
}

payload = {}



def get_page_info(endpoint: str, n_page: str) -> list:
    try:
        response = requests.get(endpoint+n_page, headers=headers, data=payload)
        response.raise_for_status()
        print(response.status_code, end="")
        soup = BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        print("ERROR", e)
        return None

    res = soup.find_all("a")
    return [r["href"] for r in res if "/pl/oferta/" in r["href"] and r["href"].startswith("/pl")]
    

# URLs scraper

In [47]:
#%xmode verbose
URL_NUMBER = 7800
OUTPUT = "urls_100k.txt"
OUTPUT_KRK = "urls_krk.txt"

ENDPOINT = "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/cala-polska?viewType=listing&limit=72&page="
ENDPOINT_KRK = "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/malopolskie/krakow/krakow/krakow?limit=72&areaMin=15&areaMax=100&viewType=listing&page="

start = dt.datetime.now()

urls = set()

page = 1
while len(urls) < URL_NUMBER:
    urls |= set(get_page_info(ENDPOINT_KRK, str(page)))
    page += 1
    print(f"|{idx}: ", len(urls))

with open(OUTPUT_KRK, "w") as file:
    for url in urls:
        file.write(f"{url}\n")



    
print(dt.datetime.now() - start)



200|3:  73
200|3:  148
200|3:  221
200|3:  293
200|3:  365
200|3:  439
200|3:  510
200|3:  559
200|3:  631
200|3:  704
200|3:  777
200|3:  848
200|3:  921
200|3:  992
200|3:  1063
200|3:  1130
200|3:  1202
200|3:  1273
200|3:  1344
200|3:  1415
200|3:  1486
200|3:  1558
200|3:  1627
200|3:  1699
200|3:  1771
200|3:  1843
200|3:  1915
200|3:  1977
200|3:  1982
200|3:  1987
200|3:  2018
200|3:  2090
200|3:  2162
200|3:  2234
200|3:  2306
200|3:  2378
200|3:  2450
200|3:  2522
200|3:  2594
200|3:  2644
200|3:  2716
200|3:  2786
200|3:  2858
200|3:  2930
200|3:  3002
200|3:  3073
200|3:  3145
200|3:  3217
200|3:  3289
200|3:  3361
200|3:  3431
200|3:  3503
200|3:  3575
200|3:  3647
200|3:  3692
200|3:  3760
200|3:  3832
200|3:  3904
200|3:  3976
200|3:  4047
200|3:  4119
200|3:  4191
200|3:  4263
200|3:  4325
200|3:  4394
200|3:  4453
200|3:  4525
200|3:  4597
200|3:  4669
200|3:  4736
200|3:  4790
200|3:  4862
200|3:  4934
200|3:  5006
200|3:  5078
200|3:  5150
200|3:  5222
200|3:  5272
2

# Data scraping from each single URL

In [71]:
OTODOM_ROOT = "https://www.otodom.pl"
BLOCK_SIZE = 200
URL_FILE = "urls2.txt"
CSV_OUTPUT = "otodom_krk_raw.csv"

start = dt.datetime.now()

with open(OUTPUT_KRK, "r") as file:
    URLs = file.read().splitlines()
    
data = []
    
for i in range(len(URLs)):
    try:
        response = requests.get(OTODOM_ROOT+URLs[i], headers=HEADERS)
        response.raise_for_status()
        #print(response.status_code)
            
        soup = BeautifulSoup(response.text, "html.parser")

        json_text = soup.find("script", {"id": "__NEXT_DATA__"})
        js = json.loads(json_text.contents[0])
        #print(js)
        try:
            price = str(js["props"]["pageProps"]["ad"]["target"]["Price"])
        except:
            price = "-1"
        city = str(js["props"]["pageProps"]["ad"]["target"]["City"])
        
        area = js["props"]["pageProps"]["ad"]["topInformation"][0]["values"]
        if area: area = str(area[0])
        else: area = "-1"
            
        ownership = js["props"]["pageProps"]["ad"]["topInformation"][1]["values"]
        if ownership: ownership = str(ownership[0])
        else: ownership = "-1"
            
        n_rooms = js["props"]["pageProps"]["ad"]["topInformation"][2]["values"]
        if n_rooms: n_rooms = str(n_rooms[0])
        else: n_rooms = "-1"
            
        construction_status = js["props"]["pageProps"]["ad"]["topInformation"][3]["values"]
        if construction_status: construction_status = str(construction_status[0])
        else: construction_status = "-1"
            
        floor = js["props"]["pageProps"]["ad"]["topInformation"][4]["values"]
        if floor: floor = str(floor[0])
        else: floor = "-1"
            
        outdoor_amenities = js["props"]["pageProps"]["ad"]["topInformation"][5]["values"]
        balcony = str(int("extras_types::balcony" in outdoor_amenities))
        terrace = str(int("extras_types::terrace" in outdoor_amenities))
        garden = str(int("extras_types::garden" in outdoor_amenities))
        
        
        rent_price = js["props"]["pageProps"]["ad"]["topInformation"][6]["values"]
        if rent_price: rent_price = ''.join(filter(str.isdigit, str(rent_price[0])))
        else: rent_price = "-1"
            
        car = js["props"]["pageProps"]["ad"]["topInformation"][7]["values"]
        if car: car = str(car[0])
        else: car = "-1"
        
        if js["props"]["pageProps"]["ad"]["additionalInformation"]:
            build_year = js["props"]["pageProps"]["ad"]["additionalInformation"][3]["values"]
            if build_year: build_year = str(build_year[0])
            else: build_year = "-1"
                
            lift = js["props"]["pageProps"]["ad"]["additionalInformation"][6]["values"]
            if lift: lift = str(lift[0])
            else: lift = "-1"
        else:
            build_year = "-1"
            lift = "-1"
        
        coords = js["props"]["pageProps"]["ad"]["location"]["coordinates"]
        if coords:
            long = coords["longitude"]
            lat = coords["latitude"]
        else:
            long, lat = (-1,-1)
        
    
        info = PrimaryOfferInfo(price, area, ownership, n_rooms, construction_status,
                                floor, balcony, terrace, garden, rent_price, car,
                                build_year, lift, city, long, lat)
        
        data.append(info)
        
        if i % BLOCK_SIZE == BLOCK_SIZE-1:
            print("BLOCKSAVING", i)
            with open(CSV_OUTPUT, "a") as file:
                for info in data:
                    file.write(info.csv_format()+"\n")
            data = []
        
        if i % 10 == 1: print(i)
        


    except Exception as e:
        print("ERROR", e, i)

print("EXECUTION:", dt.datetime.now() - start)
    


1
11
21
31
41
51
61
71
81
91
101
111
121
131
141
151
161
171
181
191
BLOCKSAVING 199
ERROR sequence item 14: expected str instance, float found 199
201
211
221
231
241
251
261
271
281
291
301
311
321
331
341
351
361
371
381
391
BLOCKSAVING 399
ERROR sequence item 14: expected str instance, float found 399
401
411
421
431
441
451
461
471
481
491
501
511
521
531
541
551
561
ERROR 502 Server Error: Bad Gateway for url: https://www.otodom.pl/pl/oferta/38-17-m-bez-prowizji-bez-pcc-ID4q4nO 568
571
581
591
BLOCKSAVING 599
ERROR sequence item 14: expected str instance, float found 599
601
611
621
631
641
651
661
671
681
691
701
711
721
731
741
751
761
771
781
791
BLOCKSAVING 799
ERROR sequence item 14: expected str instance, float found 799
801
811
821
831
841
851
861
871
881
891
901
911
921
931
941
951
961
971
981
991
BLOCKSAVING 999
ERROR sequence item 14: expected str instance, float found 999
1001
1011
1021
1031
1041
1051
1061
1071
1081
1091
1101
1111
1121
1131
1141
1151
1161
1171
1181
119

5831
5841
5851
5861
5871
5881
5891
5901
5911
5921
5931
5941
5951
5961
5971
5981
5991
BLOCKSAVING 5999
ERROR sequence item 14: expected str instance, float found 5999
6001
6011
6021
6031
6041
6051
6061
6071
6081
6091
6101
6111
6121
6131
6141
6151
6161
6171
6181
6191
BLOCKSAVING 6199
ERROR sequence item 14: expected str instance, float found 6199
6201
6211
6221
6231
6241
6251
6261
6271
6281
6291
6301
6311
6321
6331
6341
6351
6361
6371
6381
6391
BLOCKSAVING 6399
ERROR sequence item 14: expected str instance, float found 6399
6401
6411
6421
6431
6441
6451
ERROR 410 Client Error: Gone for url: https://www.otodom.pl/pl/oferta/osiedle-piltza-3-pokojowe-50-m2-iii-pietro-ID4pC7K 6456
6461
6471
6481
6491
ERROR 410 Client Error: Gone for url: https://www.otodom.pl/pl/oferta/2-pokojowe-mieszkanie-40m2-balkon-ID4qmJO 6501
6511
ERROR 502 Server Error: Bad Gateway for url: https://www.otodom.pl/pl/oferta/3pokoje-balkon-m-parkingowe-zamkniete-osiedle-ID4qy5V 6517
6521
6531
6541
6551
6561
6571
6581
659

In [48]:
test_url = "https://www.otodom.pl/pl/oferta/mieszkanie-60m2-4pok-balkon-piwnica-ID4nhOC"

In [50]:
OTODOM_ROOT = "https://www.otodom.pl"
response = requests.get(test_url, headers=HEADERS)

soup = BeautifulSoup(response.text, parser="html")

In [54]:
with open("TESTa.txt", "w", encoding="utf-8") as file:
    file.write(response.text)

In [55]:
response = requests.get("https://www.otodom.pl/pl/oferta/mieszkanie-60m2-4pok-balkon-piwnica-ID4nhOC", headers=HEADERS)
response.raise_for_status()
#print(response.status_code)
            
soup = BeautifulSoup(response.text, "html.parser")

json_text = soup.find("script", {"id": "__NEXT_DATA__"})
js = json.loads(json_text.contents[0])

In [68]:
js["props"]["pageProps"]["ad"]["location"]["coordinates"]["latitude"]

49.6593548