In [58]:
import requests
import json
import pandas as pd
import requests
import re
import pickle

from bs4 import BeautifulSoup
from zeep import Client
from otodom import category, utils

In [59]:
def get_page_count(soup: BeautifulSoup) -> int:
    text = str(soup)

    # Search for "page_count":XX in the text
    match = re.search('"page_count":([0-9]+)', text)

    # If a match is found, convert it to an integer
    if match:
        page_count = int(match.group(1))
        return page_count
    else:
        raise Exception('No match found.')

In [237]:
def get_page(page_number: int = 1, region: str = "pomorskie/gdansk/gdansk/gdansk") -> str:
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'
    }

    params = {
        "viewType": "listing",
        "limit": 72,
        "page": page_number
    }

    r = requests.get(
        f"https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/{region}",
        headers=headers,
        params=params,
    )
    print(page_number, end=" ")
    if not r.ok:
        raise Exception(r.text)

    return r.text


def get_pages(load_saved: bool = False, save: bool = True) -> list:
    import os
    if os.path.exists("pages.pkl") and load_saved:
        with open("pages.pkl", "rb") as f:
            return pickle.load(f)
    else:
        soup = BeautifulSoup(get_page(), "lxml")

        page_count = get_page_count(soup)

        pages = []
        for i in range(1, page_count):
            import time
            time.sleep(3)
            pages.append(get_page(i))

    if save:
        def save_pages():
            with open("pages.pkl", "wb+") as f:
                pickle.dump(pages, f)
        save_pages()
    return pages

In [159]:
def get_offers_from_page(page: str) -> list:
    """
    @deprecated

    uses the 'application/ld+json' script tag with '@graph' in it that contains
    all of the search results

    this is a brittle approach as the SEO data is only given on the first page
    of the search results

    this is deprecated in favor of the function that uses __NEXT_DATA__ script
    tag

    :param page: HTML of the page
    :return: list of offers
    """
    soup = BeautifulSoup(page, "lxml")
    ld_seo_scripts = soup.find_all('script', {'type': 'application/ld+json'})
    print(ld_seo_scripts)
    if ld_seo_scripts is not None and len(ld_seo_scripts) > 0:
        for script in ld_seo_scripts:
            data = json.loads(script.string)
            # there is just one product below
            # indexing like that might cause an error in the future
            offers = [d for d in data['@graph'] if d['@type']
                      == 'Product'][0]['offers']['offers']
            return offers

In [251]:
def get_items_from_page(page: str, save: bool = False) -> list:
    """
    uses the __NEXT_DATA__ script tag to get the search results

    :param page: HTML of the page
    :param save: if True, saves the data to data.json, useful for debugging
    :return: list of items
    """
    soup = BeautifulSoup(page, "lxml")
    next_data_script = soup.find('script', {'id': '__NEXT_DATA__'})
    data = json.loads(next_data_script.string)
    if save:
        with open("data.json", "w+") as f:
            f.write(json.dumps(data))
    return data['props']['pageProps']['data']['searchAds']['items']

In [238]:
pages = get_pages()

1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 

In [252]:
all_items = []
for page in pages:
    items = get_items_from_page(page)
    if items is not None and len(items):
        all_items += items

In [264]:
def number_of_rooms_to_int(number_of_rooms: str) -> int:
    if number_of_rooms == "ONE":
        return 1
    elif number_of_rooms == "TWO":
        return 2
    elif number_of_rooms == "THREE":
        return 3
    elif number_of_rooms == "FOUR":
        return 4
    elif number_of_rooms == "FIVE":
        return 5


def preprocess_items_df(df: pd.DataFrame):
    _df = pd.DataFrame()
    _df['title'] = df['title']
    _df['price_hidden'] = df['hidePrice']
    _df['price'] = df.apply(lambda x: x['totalPrice']['value']
                            if x['hidePrice'] == False else None, axis=1)
    _df['price_per_m2'] = df.apply(
        lambda x: x['pricePerSquareMeter']['value'] if x['hidePrice'] == False else None, axis=1,
    )
    _df['floor_size'] = df['areaInSquareMeters']
    _df['city'] = df['location'].apply(lambda x: x['address']['city']['name'])
    _df['address'] = df['location'].apply(lambda x: "{} {}".format(
        x['address']['street']['name'],
        x['address']['street']['number'],
    ) if x['address']['street'] is not None else None)
    _df['number_of_rooms'] = df['roomsNumber'].apply(
        lambda x: number_of_rooms_to_int(x))
    # fill the number of rooms with the floor size divided by 35
    # TODO 35 is an arbitrary number
    # could use the sample mean instead
    # the price and floor_size are still relevant from those entries, so it's
    # better to keep them
    _df['number_of_rooms'].fillna(_df['floor_size'] / 35, inplace=True)
    _df['url'] = df['slug'].map(
        lambda x: "https://otodom.pl/pl/oferta/{}".format(x))

    # get rid of properties with hidden price
    _df = _df[_df['price_hidden'] == False]

    return _df

In [265]:
df = pd.DataFrame(all_items)
df.columns
df = preprocess_items_df(df)
df.shape

(5081, 9)

In [162]:
def preprocess_offers_df(df: pd.DataFrame):
    """
    @deprecated

    :param df: offers dataframe
    :return: clean dataframe with just the relevant information
    """
    def address_to_string(address: dict) -> str:
        return "{} {} {} {}".format(
            address["addressCountry"],
            address["addressLocality"],
            address["addressRegion"],
            address["streetAddress"] if "streetAddress" in address else "",
        )

    floor_sizes = []
    numbers_of_rooms = []
    addresses = []
    cities = []
    # regions = []
    for i in range(len(df)):
        item = df['itemOffered'][i]
        floor_sizes.append(item['floorSize']['value'])
        numbers_of_rooms.append(item['numberOfRooms'])
        addresses.append(address_to_string(item['address']))
        cities.append(item['address']['addressLocality'])
        # regions.append(item['address']['addressRegion'])

    df['floorSize'] = floor_sizes
    df['address'] = addresses
    df['numberOfRooms'] = numbers_of_rooms
    df['city'] = cities
    # df['region'] = region

    clean_df = df[['price', 'name', 'floorSize',
                   'address', 'numberOfRooms', 'city']]

    with open("offers_df.pkl", "wb+") as f:
        pickle.dump(clean_df, f)

    return clean_df

In [268]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [269]:
X = df[['floor_size', 'number_of_rooms']].to_numpy()
y = df['price'].to_numpy()

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)

print(X_train, y_train)

# Step 3: Model Training
model = LinearRegression()
model.fit(X_train, y_train)

# Generate predictions and plot regression line
# x_range = np.linspace(X.min(), X.max(), 100)
# y_pred_line = model.predict(x_range)

# Step 4: Model Evaluation
y_pred = model.predict(X_test)
print(
    f"Root Mean Squared Error: {mean_squared_error(y_test, y_pred, squared=False)}")

[[ 65.     2.  ]
 [ 76.98   3.  ]
 [ 63.6    3.  ]
 ...
 [ 45.     2.  ]
 [ 65.     3.  ]
 [100.     4.  ]] [1880000.  869000.  589000. ...  455500.  699000. 1280000.]
Root Mean Squared Error: 412960.2907052206


In [160]:
%pip install -q matplotlib

Note: you may need to restart the kernel to use updated packages.


In [273]:
import matplotlib.pyplot as plt
import pandas as pd

# Assuming you have already created the 'city' column in your dataframe
# df['city'] = ...

# Scatter plot
fig, ax = plt.subplots()
ax.scatter(df['floor_size'], df['price'])
ax.set_xlabel('Floor Size (m^2)')
ax.set_ylabel('Price (PLN)')
ax.set_title('Price vs Floor Size')

plt.show()

In [250]:
len(df['title'].unique())

47

In [282]:
pd.set_option('max_colwidth', 100)

In [None]:
# TODO calculate distance to the center to use the location factor also
# calculate the distance to the nearest SKM station
# calculate distance to the nearest store, train station etc
# those coordinates are available in the page data I suppose

In [292]:
df[df['address'].notna()].sort_values(by="price_per_m2")[(
    df['price'] >= 400_000) & (df['price'] <= 500_000)].head(50)

  df[df['address'].notna()].sort_values(by="price_per_m2")[(df['price'] >= 400_000) & (df['price'] <= 500_000)].head(50)


Unnamed: 0,title,price_hidden,price,price_per_m2,floor_size,city,address,number_of_rooms,url
4089,Mieszkanie 3 pokojowe z garażem- licytacja,False,496500.0,4728.0,105.01,Gdańsk,ul. Zeusa,3.0,https://otodom.pl/pl/oferta/mieszkanie-3-pokojowe-z-garazem-licytacja-ID4miat
5841,Okazja Dwupoziomowe Mieszkanie 100 M2,False,500000.0,5000.0,100.0,Gdańsk,ul. Rycerza Blizbora,4.0,https://otodom.pl/pl/oferta/okazja-dwupoziomowe-mieszkanie-100-m2-ID4kWAe
4109,Nowe mieszkanie w gdańskim Borkowie!,False,425171.0,5990.0,70.98,Gdańsk,ul. gen. Elżbiety Zawackiej,4.0,https://otodom.pl/pl/oferta/nowe-mieszkanie-w-gdanskim-borkowie-ID4mhu0
4079,"Nowy Port! 2 pokoje, osobna kuchnia!",False,459000.0,6955.0,66.0,Gdańsk,ul. Oliwska,2.0,https://otodom.pl/pl/oferta/nowy-port-2-pokoje-osobna-kuchnia-ID4iISh
1314,2 Pokoje Blisko Morza Do Remontu,False,459000.0,6959.0,65.96,Gdańsk,ul. Oliwska,2.0,https://otodom.pl/pl/oferta/2-pokoje-blisko-morza-do-remontu-ID4mKus
5225,Mieszkanie na ostatnim IV piętrze Gdańsk Kokoszki,False,439000.0,7109.0,61.75,Gdańsk,ul. Fundamentowa,3.0,https://otodom.pl/pl/oferta/mieszkanie-na-ostatnim-iv-pietrze-gdansk-kokoszki-ID4gu0A
5732,2 pokoje z widokiem na Park Oliwski,False,459000.0,7237.0,63.42,Gdańsk,ul. Opata Jacka Rybińskiego,2.0,https://otodom.pl/pl/oferta/2-pokoje-z-widokiem-na-park-oliwski-ID4lgZs
6329,***Rabat 113 000zł.**** 20min do Centrum Gdańska.,False,421392.0,7631.0,55.22,Gdańsk,ul. Władysława Jagiełły,2.0,https://otodom.pl/pl/oferta/rabat-113-000zl-20min-do-centrum-gdanska-ID4jhWM
6353,2 pokoje z odzielną kuchnią | Blisko Morza,False,429000.0,7800.0,55.0,Gdańsk,ul. Jeremiasza Falck Polonusa,2.0,https://otodom.pl/pl/oferta/2-pokoje-z-odzielna-kuchnia-blisko-morza-ID4jesW
1063,GOTOWE 3 pokoje +OGRÓDEK_ Widna kuchnia_duży salon,False,482195.0,7800.0,61.82,Gdańsk,Maćkowy,3.0,https://otodom.pl/pl/oferta/gotowe-3-pokoje-ogrodek-widna-kuchnia-duzy-salon-ID4mKYd


In [293]:
GOOGLE_MAPS_API_KEY = 'AIzaSyA8uokgwZ1o3EKtZX37iuOCAbwxBoVRYv8'