In [336]:
import os
import requests
import json
import pandas as pd
import requests
import re
import pickle
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup

In [None]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [59]:
def get_page_count(soup: BeautifulSoup) -> int:
    text = str(soup)

    # Search for "page_count":XX in the text
    match = re.search('"page_count":([0-9]+)', text)

    # If a match is found, convert it to an integer
    if match:
        page_count = int(match.group(1))
        return page_count
    else:
        raise Exception('No match found.')

In [237]:
def get_page(page_number: int = 1, region: str = "pomorskie/gdansk/gdansk/gdansk") -> str:
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'
    }

    params = {
        "viewType": "listing",
        "limit": 72,
        "page": page_number
    }

    r = requests.get(
        f"https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/{region}",
        headers=headers,
        params=params,
    )
    print(page_number, end=" ")
    if not r.ok:
        raise Exception(r.text)

    return r.text


def get_pages(load_saved: bool = False, save: bool = True) -> list:
    import os
    if os.path.exists("pages.pkl") and load_saved:
        with open("pages.pkl", "rb") as f:
            return pickle.load(f)
    else:
        soup = BeautifulSoup(get_page(), "lxml")

        page_count = get_page_count(soup)

        pages = []
        for i in range(1, page_count):
            import time
            time.sleep(3)
            pages.append(get_page(i))

    if save:
        def save_pages():
            with open("pages.pkl", "wb+") as f:
                pickle.dump(pages, f)
        save_pages()
    return pages

In [159]:
def get_offers_from_page(page: str) -> list:
    """
    @deprecated

    uses the 'application/ld+json' script tag with '@graph' in it that contains
    all of the search results

    this is a brittle approach as the SEO data is only given on the first page
    of the search results

    this is deprecated in favor of the function that uses __NEXT_DATA__ script
    tag

    :param page: HTML of the page
    :return: list of offers
    """
    soup = BeautifulSoup(page, "lxml")
    ld_seo_scripts = soup.find_all('script', {'type': 'application/ld+json'})
    print(ld_seo_scripts)
    if ld_seo_scripts is not None and len(ld_seo_scripts) > 0:
        for script in ld_seo_scripts:
            data = json.loads(script.string)
            # there is just one product below
            # indexing like that might cause an error in the future
            offers = [d for d in data['@graph'] if d['@type']
                      == 'Product'][0]['offers']['offers']
            return offers

In [307]:
def get_items_from_page(page: str, save: bool = True) -> list:
    """
    uses the __NEXT_DATA__ script tag to get the search results

    :param page: HTML of the page
    :param save: if True, saves the data to data.json, useful for debugging
    :return: list of items
    """
    soup = BeautifulSoup(page, "lxml")
    next_data_script = soup.find('script', {'id': '__NEXT_DATA__'})
    data = json.loads(next_data_script.string)
    if save:
        with open("data.json", "w+") as f:
            f.write(json.dumps(data))
    return data['props']['pageProps']['data']['searchAds']['items']


get_items_from_page(pages[2])

[{'id': 62521691,
  'title': '3-pokojowe mieszkanie 73m2 + ogródek',
  'slug': '3-pokojowe-mieszkanie-73m2-ogrodek-ID4ekKn',
  'estate': 'FLAT',
  'developmentId': 62521565,
  'developmentTitle': 'Dąbka 33',
  'developmentUrl': 'https://www.otodom.pl/pl/oferta/dabka-33-ID4ekIl.html',
  'transaction': 'SELL',
  'location': {'mapDetails': {'radius': 0, '__typename': 'MapDetails'},
   'address': {'street': {'name': 'ul. Świętokrzyska',
     'number': '',
     '__typename': 'Street'},
    'city': {'name': 'Gdańsk', '__typename': 'City'},
    'province': {'name': 'pomorskie', '__typename': 'Province'},
    '__typename': 'Address'},
   'reverseGeocoding': {'locations': [{'fullName': 'pomorskie',
      '__typename': 'BasicLocationObject'},
     {'fullName': 'Gdańsk, pomorskie', '__typename': 'BasicLocationObject'},
     {'fullName': 'Ujeścisko-Łostowice, Gdańsk, pomorskie',
      '__typename': 'BasicLocationObject'}],
    '__typename': 'ReverseGeocoding'},
   '__typename': 'LocationDetails'},

In [238]:
pages = get_pages()

1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 

In [308]:
all_items = []
for page in pages:
    items = get_items_from_page(page)
    if items is not None and len(items):
        all_items += items

In [373]:
def number_of_rooms_to_int(number_of_rooms: str) -> int:
    if number_of_rooms == "ONE":
        return 1
    elif number_of_rooms == "TWO":
        return 2
    elif number_of_rooms == "THREE":
        return 3
    elif number_of_rooms == "FOUR":
        return 4
    elif number_of_rooms == "FIVE":
        return 5


def preprocess_items_df(df: pd.DataFrame):
    _df = pd.DataFrame()
    _df['title'] = df['title']
    _df['price_hidden'] = df['hidePrice']
    _df['price'] = df.apply(lambda x: x['totalPrice']['value']
                            if x['hidePrice'] == False else None, axis=1)
    _df['price_per_m2'] = df.apply(
        lambda x: x['pricePerSquareMeter']['value'] if x['hidePrice'] == False else None, axis=1,
    )
    _df['floor_size'] = df['areaInSquareMeters']
    _df['city'] = df['location'].apply(lambda x: x['address']['city']['name'])
    _df['address'] = df['locationLabel'].apply(lambda x: x['value'])
    _df['number_of_rooms'] = df['roomsNumber'].apply(
        lambda x: number_of_rooms_to_int(x))
    # fill the number of rooms with the floor size divided by 35
    # TODO 35 is an arbitrary number
    # could use the sample mean instead
    # the price and floor_size are still relevant from those entries, so it's
    # better to keep them
    _df['number_of_rooms'].fillna(_df['floor_size'] / 35, inplace=True)
    _df['url'] = df['slug'].map(
        lambda x: "https://otodom.pl/pl/oferta/{}".format(x))

    # get rid of properties with hidden price
    _df = _df[_df['price_hidden'] == False]

    return _df

In [374]:
all_items_df = pd.DataFrame(all_items)
df.columns

df = preprocess_items_df(all_items_df)
df.shape

(5081, 9)

In [162]:
def preprocess_offers_df(df: pd.DataFrame):
    """
    @deprecated

    :param df: offers dataframe
    :return: clean dataframe with just the relevant information
    """
    def address_to_string(address: dict) -> str:
        return "{} {} {} {}".format(
            address["addressCountry"],
            address["addressLocality"],
            address["addressRegion"],
            address["streetAddress"] if "streetAddress" in address else "",
        )

    floor_sizes = []
    numbers_of_rooms = []
    addresses = []
    cities = []
    # regions = []
    for i in range(len(df)):
        item = df['itemOffered'][i]
        floor_sizes.append(item['floorSize']['value'])
        numbers_of_rooms.append(item['numberOfRooms'])
        addresses.append(address_to_string(item['address']))
        cities.append(item['address']['addressLocality'])
        # regions.append(item['address']['addressRegion'])

    df['floorSize'] = floor_sizes
    df['address'] = addresses
    df['numberOfRooms'] = numbers_of_rooms
    df['city'] = cities
    # df['region'] = region

    clean_df = df[['price', 'name', 'floorSize',
                   'address', 'numberOfRooms', 'city']]

    with open("offers_df.pkl", "wb+") as f:
        pickle.dump(clean_df, f)

    return clean_df

In [363]:
def run_preds(df: pd.DataFrame):
    X = df[['floor_size', 'number_of_rooms']].to_numpy()
    y = df['price'].to_numpy()

    # Step 2: Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    print(X_train.shape, y_train.shape)

    # Step 3: Model Training
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Step 4: Model Evaluation
    y_pred = model.predict(X_test)
    print(
        f"Root Mean Squared Error: {mean_squared_error(y_test, y_pred, squared=False)}")

In [321]:
def plot(df: pd.DataFrame):
    # Scatter plot
    fig, ax = plt.subplots()
    ax.scatter(df['floor_size'], df['price'])
    ax.set_xlabel('Floor Size (m^2)')
    ax.set_ylabel('Price (PLN)')
    ax.set_title('Price vs Floor Size')

    plt.show()

In [282]:
pd.set_option('max_colwidth', 100)

In [325]:
%pip install -qU googlemaps

Note: you may need to restart the kernel to use updated packages.


In [404]:
df['address'].unique().shape

(1008,)

In [448]:
# TODO calculate distance to the center to use the location factor also
# calculate the distance to the nearest SKM station
# calculate distance to the nearest store, train station etc
import googlemaps

Coords = [float, float]


class HowCloseIsItService:
    """
    defining a class for this for caching
    """

    GOOGLE_MAPS_API_KEY = os.getenv("GOOGLE_MAPS_API_KEY")

    def __init__(self):
        self._cache = self._load_cache()
        self.gmaps = googlemaps.Client(key=self.GOOGLE_MAPS_API_KEY)

    def _load_cache(self) -> dict:
        if os.path.exists("cache.pkl"):
            with open("cache.pkl", "rb") as f:
                cache = pickle.load(f)
                print(f"Loaded cache of length: {len(cache)}")
                return cache
        else:
            return {}

    def _is_cached(self, key: str) -> bool:
        return key in self._cache if self._cache is not None else False

    def _get_from_cache(self, key: str) -> Coords:
        return self._cache[key]

    def _write_to_cache(self, key: str, value: Coords):
        self._cache[key] = value
        with open("cache.pkl", "wb+") as f:
            pickle.dump(self._cache, f)

    def city_center(self) -> Coords:
        return self.response_to_coords(self.geocode("Gdańsk"))

    def geocode(self, address: str) -> Coords:
        if self._is_cached(address):
            return self._get_from_cache(address)
        geocode_result = self.gmaps.geocode(address)
        if geocode_result is not None:
            self._write_to_cache(address, geocode_result)
        return geocode_result

    def get_car_distance(self, origin: Coords, dest: Coords) -> Coords:
        pass

    def get_walking_distance(self):
        pass

    def get_distance(self, origin, dest):
        pass

    def get_distance_to_center(self, coords: Coords) -> Coords:
        gdansk_coords = self.city_center()
        return self.get_distance(coords, gdansk_coords)
        pass

    def get_distance_to_nearest_skm_station(self, coords: Coords) -> Coords:
        res = self.gmaps.places(
            type="train_station",
            query="SKM",
            location=self.city_center(),
            radius=10_000
        )
        if 'next_page_token' in res:
            pass


    def get_distance_to_nearest_zabka(self, coords: Coords) -> Coords:
        pass

    def get_distance_to_nearest_store(self, coords: Coords) -> Coords:
        pass

    def get_distance_to_nearest_train_station(self, coords: Coords) -> Coords:
        pass

    def response_to_coords(self, response: dict) -> Coords:
        return response[0]['geometry']['location']['lat'], response[0]['geometry']['location']['lng']


how_close_service = HowCloseIsItService()

coords = []
for address in df['address'].values:
    res = how_close_service.geocode(address)
    coords.append(how_close_service.response_to_coords(res))

df['coords'] = coords
out = how_close_service.get_distance_to_nearest_skm_station(df.iloc[0].coords)
with open("out.json", "w+") as f:
    f.write(json.dumps(out))
!open out.json

Loaded cache of length: 1013


ValueError: Must provide API key or enterprise credentials when creating client.

In [405]:
how_close_service.geocode("Gdansk, Centrum")

[{'address_components': [{'long_name': 'Gdańsk',
    'short_name': 'Gdańsk',
    'types': ['locality', 'political']},
   {'long_name': 'Gdańsk',
    'short_name': 'Gdańsk',
    'types': ['administrative_area_level_2', 'political']},
   {'long_name': 'Pomeranian Voivodeship',
    'short_name': 'Pomeranian Voivodeship',
    'types': ['administrative_area_level_1', 'political']},
   {'long_name': 'Poland',
    'short_name': 'PL',
    'types': ['country', 'political']},
   {'long_name': '80',
    'short_name': '80',
    'types': ['postal_code', 'postal_code_prefix']}],
  'formatted_address': 'Gdańsk, Poland',
  'geometry': {'bounds': {'northeast': {'lat': 54.44721879999999,
     'lng': 18.9512795},
    'southwest': {'lat': 54.27495589999999, 'lng': 18.4287748}},
   'location': {'lat': 54.35202520000001, 'lng': 18.6466384},
   'location_type': 'APPROXIMATE',
   'viewport': {'northeast': {'lat': 54.44721879999999, 'lng': 18.9512795},
    'southwest': {'lat': 54.27495589999999, 'lng': 18.4287

In [362]:
df[df['address'].notna()].sort_values(by="price_per_m2")[
    (df['price'] >= 0) & (df['price'] <= 500_000)].head()

  df[df['address'].notna()].sort_values(by="price_per_m2")[(df['price'] >= 0) & (df['price'] <= 500_000)].head()


Unnamed: 0,title,price_hidden,price,price_per_m2,floor_size,city,address,number_of_rooms,url
4089,Mieszkanie 3 pokojowe z garażem- licytacja,False,496500.0,4728.0,105.01,Gdańsk,ul. Zeusa,3.0,https://otodom.pl/pl/oferta/mieszkanie-3-pokojowe-z-garazem-licytacja-ID4miat
5841,Okazja Dwupoziomowe Mieszkanie 100 M2,False,500000.0,5000.0,100.0,Gdańsk,ul. Rycerza Blizbora,4.0,https://otodom.pl/pl/oferta/okazja-dwupoziomowe-mieszkanie-100-m2-ID4kWAe
1529,Tanie mieszkanie Gdańsk - Rudniki ul. Rzęsna,False,248000.0,5345.0,46.4,Gdańsk,ul. Rzęsna 1A,2.0,https://otodom.pl/pl/oferta/tanie-mieszkanie-gdansk-rudniki-ul-rzesna-ID4mdBE
4109,Nowe mieszkanie w gdańskim Borkowie!,False,425171.0,5990.0,70.98,Gdańsk,ul. gen. Elżbiety Zawackiej,4.0,https://otodom.pl/pl/oferta/nowe-mieszkanie-w-gdanskim-borkowie-ID4mhu0
3957,mieszkanie z ogródkiem w Gdańsku!,False,317000.0,6265.0,50.6,Gdańsk,ul. Nowiny,2.0,https://otodom.pl/pl/oferta/mieszkanie-z-ogrodkiem-w-gdansku-ID4mfpC
