### **Scraping data about real estate prices in Katowice**

### Libraries

In [3]:
import requests
import pandas as pd
import numpy as np
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import seaborn as sns
from selenium.webdriver.support import expected_conditions as EC
import matplotlib.pyplot as plt

### Scraping the listings from Otodom

In [34]:
# Scrape price, price per square metre and area for flats in Katowice listed on otodom.pl.
# The result is `all_data`: one dict per visited listing.
url = "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/slaskie/katowice/katowice/katowice"
# Raw string: "\w" and "\c" are not valid escape sequences, so the plain literal
# only worked by accident and triggered a SyntaxWarning.
s = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=s)


def _text_or_nan(driver, xpath):
    """Return the stripped text of the first element matching `xpath` on the current page.

    Returns np.nan when the lookup fails, so a single missing field does not
    abort the whole scrape.
    """
    try:
        return driver.find_element(By.XPATH, xpath).text.strip()
    except Exception:  # best-effort, but unlike a bare `except` this lets Ctrl+C through
        return np.nan


# Not populated in this cell; kept in case another cell refers to it.
all_real_estate_links = []
all_data = []

try:
    driver.get(url)

    # Accept the cookie banner. Waiting until the button is clickable (not merely
    # present in the DOM) avoids clicking it before it can receive the click.
    accept_cookies = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, '//div[@id="onetrust-button-group"]//button[1]'))
    )
    accept_cookies.click()
    driver.refresh()

    # The implicit wait applies to the whole session, so set it once instead of on every page.
    driver.implicitly_wait(3)

    # Result pages 1 through 10.
    for page in range(1, 11):
        page_url = f"{url}?page={page}"
        driver.get(page_url)
        print(f'Currently getting data from page number {page}')

        # Collect the hrefs first: navigating away invalidates the link elements.
        # NOTE(review): the class names in these XPaths look auto-generated — verify
        # they still match the live site before re-running.
        all_links_elements = driver.find_elements(By.XPATH, './/div[@class="css-13gthep eeungyz2"]//a')
        all_links = [link.get_attribute('href') for link in all_links_elements]

        for link in all_links:
            driver.get(link)
            # Each field falls back to NaN independently, so a failed lookup can neither
            # overwrite another field nor reuse the value from the previous listing.
            price = _text_or_nan(driver, '//strong[@class="css-t3wmkv e9aa0kv0"]')
            price_square_meter = _text_or_nan(driver, '//div[@class="css-1h1l5lm e1csqp8m9"]')
            area = _text_or_nan(
                driver,
                '(//div[@class="css-2vlfd7 evcbp5k0"]//div[@class="css-1ivc1bc ewb0mtf1"]//div[@class="css-1wi2w6s ewb0mtf5"])[1]',
            )
            all_data.append({"link": link,
                             "price": price,
                             "price_sqm": price_square_meter,
                             "area": area})
            time.sleep(1)  # pause between listing requests
        time.sleep(1)
finally:
    # Always close the browser, even if the scrape is interrupted or fails part-way;
    # rows collected so far remain available in `all_data`.
    driver.quit()

  s = Service('C:\webdrivers\chromedriver.exe')


Currently getting data from page number 1
Currently getting data from page number 2
Currently getting data from page number 3
Currently getting data from page number 4
Currently getting data from page number 5
Currently getting data from page number 6
Currently getting data from page number 7
Currently getting data from page number 8
Currently getting data from page number 9
Currently getting data from page number 10


### Cleaning the data

In [104]:
# Build the raw listings table. Naming the columns keeps the schema stable even when
# the scrape returned no rows, so column lookups such as df['price'] do not raise KeyError.
df = pd.DataFrame(all_data, columns=["link", "price", "price_sqm", "area"])

In [105]:
# Preview the raw scraped table before any cleaning.
df

Unnamed: 0,link,price,price_sqm,area
0,https://www.otodom.pl/pl/oferta/2-pokojowe-mie...,Zapytaj o cenę,,"79,09 m²"
1,https://www.otodom.pl/pl/oferta/4-pokoje-z-wid...,595 000 zł,9 597 zł/m²,62 m²
2,https://www.otodom.pl/pl/oferta/katowice-bryno...,695 000 zł,8 688 zł/m²,80 m²
3,https://www.otodom.pl/pl/oferta/4-pokojowe-mie...,735 968 zł,10 900 zł/m²,"67,52 m²"
4,https://www.otodom.pl/pl/oferta/srodmiescie-no...,345 000 zł,13 254 zł/m²,"26,03 m²"
...,...,...,...,...
345,https://www.otodom.pl/pl/oferta/2-pokojowe-mie...,576 828 zł,12 600 zł/m²,"45,78 m²"
346,https://www.otodom.pl/pl/oferta/nowe-mieszkani...,513 428 zł,9 879 zł/m²,"51,97 m²"
347,https://www.otodom.pl/pl/oferta/4-pokojowy-apa...,885 701 zł,9 207 zł/m²,"96,2 m²"
348,https://www.otodom.pl/pl/oferta/nowy-apartamen...,778 480 zł,9 255 zł/m²,"84,11 m²"


In [106]:
def fix_price_column(x):
    """Normalise a scraped price to a bare digit string.

    Parameters
    ----------
    x : str or missing value
        Raw price text, e.g. "595 000 zł".

    Returns
    -------
    str or float
        The text with spaces and the "zł" suffix removed (e.g. "595000"), or
        np.nan when `x` is not a string or contains no "zł" (e.g. "Zapytaj o cenę").
    """
    # The scraper stores np.nan (a float) when the price lookup fails; a membership
    # test on a float raises TypeError, so treat every non-string as missing.
    if not isinstance(x, str) or "zł" not in x:
        return np.nan
    return x.replace(" ", "").replace("zł", "")

In [107]:
# Normalise every raw price; entries without a "zł" amount become NaN.
df['price'] = df['price'].map(fix_price_column)

In [108]:
# Preview the price-per-m² text with surrounding whitespace removed (not assigned back).
# The .str accessor passes missing values through, whereas a per-element
# x.strip() raises AttributeError on NaN.
df['price_sqm'].str.strip()

0                  
1       9 597 zł/m²
2       8 688 zł/m²
3      10 900 zł/m²
4      13 254 zł/m²
           ...     
345    12 600 zł/m²
346     9 879 zł/m²
347     9 207 zł/m²
348     9 255 zł/m²
349     9 697 zł/m²
Name: price_sqm, Length: 350, dtype: object

In [109]:
# Remove the " zł/m²" unit and the thousands-separator spaces, leaving digits only.
# The vectorised .str accessor keeps missing values as NaN instead of raising
# AttributeError the way a per-element str.replace does.
df['price_sqm'] = (
    df['price_sqm']
    .str.replace(' zł/m²', '', regex=False)
    .str.replace(' ', '', regex=False)
)

In [110]:
# Remove the " m²" unit and any remaining spaces from the area text.
# The vectorised .str accessor keeps missing values as NaN (the scraper stores
# np.nan when the area lookup fails) instead of raising AttributeError.
df['area'] = (
    df['area']
    .str.replace(' m²', '', regex=False)
    .str.replace(' ', '', regex=False)
)

In [111]:
# Treat empty or whitespace-only cells as missing values.
df = df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

In [112]:
# Discard every row that still has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [113]:
# Switch the decimal separator from a comma to a dot in every column after `link`.
df.iloc[:, 1:] = df.iloc[:, 1:].apply(
    lambda column: column.map(lambda cell: cell.replace(',', '.'))
)

In [114]:
# Inspect the column dtypes before the numeric conversion.
df.dtypes

link         object
price        object
price_sqm    object
area         object
dtype: object

In [116]:
# Parse every column after `link` from text into a numeric dtype.
numeric_columns = df.columns[1:].tolist()
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors="raise")

In [118]:
# Inspect the column dtypes again to confirm the numeric conversion.
df.dtypes

link          object
price        float64
price_sqm      int64
area         float64
dtype: object

In [119]:
# Display the cleaned table.
df

Unnamed: 0,link,price,price_sqm,area
1,https://www.otodom.pl/pl/oferta/4-pokoje-z-wid...,595000.0,9597,62.00
2,https://www.otodom.pl/pl/oferta/katowice-bryno...,695000.0,8688,80.00
3,https://www.otodom.pl/pl/oferta/4-pokojowe-mie...,735968.0,10900,67.52
4,https://www.otodom.pl/pl/oferta/srodmiescie-no...,345000.0,13254,26.03
5,https://www.otodom.pl/pl/oferta/centrum-1-dzie...,857000.0,19930,43.00
...,...,...,...,...
345,https://www.otodom.pl/pl/oferta/2-pokojowe-mie...,576828.0,12600,45.78
346,https://www.otodom.pl/pl/oferta/nowe-mieszkani...,513428.0,9879,51.97
347,https://www.otodom.pl/pl/oferta/4-pokojowy-apa...,885701.0,9207,96.20
348,https://www.otodom.pl/pl/oferta/nowy-apartamen...,778480.0,9255,84.11
