# 🏠 Sarajevo Flats Scraper
This notebook demonstrates how to collect real estate data (flats in Sarajevo Canton) from **NEKRETNINE.ba**, a popular Bosnian classifieds platform.

The goal is to:
- Collect key property details (title, price, size, location, rooms, equipment, description…)
- Store them in a structured dataset (`data/sarajevo_flats_nekretnine.csv`)
- Prepare the dataset for future analysis or machine learning (e.g. AI price estimation)

We'll use **Selenium** for dynamic page loading and **BeautifulSoup** for parsing HTML.


In [9]:
import os
import time
import csv
import re
import random
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

In [10]:
# Firefox + Geckodriver setup
# The defaults below are the original machine-specific locations. Set the
# FIREFOX_BINARY / GECKODRIVER_PATH environment variables to run the notebook
# on another machine without editing this cell.
firefox_binary = os.environ.get("FIREFOX_BINARY", "/usr/bin/firefox")
geckodriver_binary = os.environ.get(
    "GECKODRIVER_PATH", "/home/mustafasinanovic/miniforge3/bin/geckodriver"
)

# Scraper settings
# Search-results URL template; "{}" receives the page number.
BASE_URL = "https://nekretnine.ba/listing.php?lang=ba&sel=nekretnine&grad=65&naselje=&kat=3&subjekt=2&cij1=&cij2=&pov1=&pov2=&spr1=&spr2=&firma=&page={}"
OUTPUT_CSV = "data/sarajevo_flats_nekretnine.csv"
MAX_PAGES = 88  # search-result pages are requested for 1..MAX_PAGES
REQUEST_DELAY = (2, 5)  # (min, max) seconds slept after each listing request

# Multithreading settings
MAX_WORKERS = 3  # Number of parallel browser instances (don't set too high to avoid blocking)

# Create the folder that will hold OUTPUT_CSV. It is derived from the path so
# the directory and the output file cannot drift apart.
os.makedirs(os.path.dirname(OUTPUT_CSV) or ".", exist_ok=True)

The scraper will fetch up to 88 pages of listings from the NEKRETNINE.ba search results for *Sarajevo Canton flats*.  
All results are stored in `data/sarajevo_flats_nekretnine.csv`.  
We use randomized delays between requests to reduce the risk of blocking.

In [11]:
def clean_text(s):
    """Collapse every run of whitespace in ``s`` into a single space.

    Returns ``None`` for an empty string or ``None``. A string made only of
    whitespace yields ``""``.
    """
    if not s:
        return None
    words = s.split()
    return " ".join(words).strip()

def extract_price(text):
    """Build an integer from all ASCII digits found in ``text``.

    Every non-digit character is discarded, so thousands and decimal
    separators are both dropped ("1.250,50" becomes 125050).

    Returns ``None`` when ``text`` is empty/``None`` or contains no digits.
    """
    digits = "".join(re.findall(r"[0-9]", text)) if text else ""
    return int(digits) if digits else None

def extract_number(text):
    """Return the first run of digits in ``text`` as an ``int``.

    Returns ``None`` when ``text`` is empty/``None`` or holds no digits.
    """
    if text:
        found = re.search(r"(\d+)", text)
        if found is not None:
            return int(found.group(1))
    return None

In [12]:
def fetch_page_source(url, driver, short_wait=10):
    """
    Open ``url`` in ``driver``, pause, and return the markup the browser holds.

    No readiness check is made here: the function sleeps for ``short_wait``
    seconds after ``driver.get`` and then reads ``driver.page_source``, so the
    returned HTML may be only partially loaded.

    Parameters:
        url (str): URL to load
        driver (webdriver): Selenium WebDriver instance
        short_wait (int or float): seconds to sleep after opening the page

    Returns:
        str or None: the page source as reported by the driver (possibly
        empty), or None when loading raised an exception.
    """
    try:
        print(f"[+] Attempting to load URL quickly: {url}")
        driver.get(url)
        time.sleep(short_wait)  # fixed pause so some content can render
        page_html = driver.page_source
        status = (
            f"[+] HTML fetched (may be partial): {url}"
            if page_html
            else f"[!] No HTML returned for {url}"
        )
        print(status)
        return page_html
    except (TimeoutException, WebDriverException, OSError) as err:
        # Expected failure modes: driver/network problems.
        print(f"[!] Failed to load page: {url} → {err}")
    except Exception as err:
        # Anything else is still logged so the caller can keep going.
        print(f"[!] Unexpected error loading page: {url} → {err}")
    return None


This function uses Selenium to load pages dynamically.
If a page fails (timeout, network error, etc.), we log the issue but continue scraping.

In [13]:
def _find_value_tag(soup, label):
    """Return the first <div> following the <b> whose text is exactly ``label``, or None."""
    label_tag = soup.find("b", string=label)
    if label_tag is None:
        return None
    return label_tag.find_next("div")


def _labeled_text(soup, label):
    """Return the cleaned text of the value <div> that follows ``label``, or None if absent."""
    value_tag = _find_value_tag(soup, label)
    if value_tag is None:
        return None
    return clean_text(value_tag.get_text())


def _parse_area(area_text):
    """Convert the first run of digits/commas/dots in ``area_text`` to a float, or None."""
    area_match = re.search(r'([\d,\.]+)', area_text)
    if not area_match:
        return None
    # A comma is read as the decimal separator.
    area_str = area_match.group(1).replace(',', '.')
    try:
        return float(area_str)
    except ValueError:
        # e.g. "1.200,5" becomes "1.200.5", which is not a valid float
        return None


def parse_detail_page(url, driver):
    """
    Load one listing page and extract its fields.

    Parameters:
        url (str): listing URL
        driver (webdriver): Selenium WebDriver instance used to fetch the page

    Returns:
        dict or None: a record with the keys title, url, price_numeric,
        municipality, property_type, ad_type, rooms, square_m2, equipment and
        description. Fields whose element is missing are None (equipment is an
        empty string). None is returned when the page could not be fetched or
        parsing raised an unexpected error.
    """
    html = fetch_page_source(url, driver)
    if not html:
        return None

    try:
        soup = BeautifulSoup(html, "lxml")

        # Extract title
        title_elem = soup.select_one("div.listing-titlebar-title h2")
        if title_elem is not None:
            # Remove the tag span so its text does not end up in the title
            tag_span = title_elem.find("span", class_="listing-tag")
            if tag_span is not None:
                tag_span.decompose()
            title = clean_text(title_elem.get_text())
        else:
            title = None

        # Extract municipality (address/location)
        municipality_elem = soup.select_one("a.listing-address")
        municipality = clean_text(municipality_elem.get_text()) if municipality_elem is not None else None

        # Extract price
        price_elem = soup.select_one("span.re-slidep")
        price_numeric = extract_price(price_elem.get_text()) if price_elem is not None else None

        # Labelled fields: a <b>LABEL</b> followed by a <div> holding the value.
        # A missing label or value only blanks that field instead of aborting
        # the whole record.
        property_type = _labeled_text(soup, "TIP")
        ad_type = _labeled_text(soup, "SUBJEKT")  # prodaja/izdavanje
        rooms = _labeled_text(soup, "BROJ SOBA")

        # Extract square meters
        area_tag = _find_value_tag(soup, "POVRŠINA")
        square_m2 = _parse_area(area_tag.get_text(strip=True)) if area_tag is not None else None

        # Extract description: first <p> after the "Opis nekretnine" heading
        description_head = soup.find("h3", string=re.compile("Opis nekretnine"))
        description_tag = description_head.find_next("p") if description_head is not None else None
        description = clean_text(description_tag.get_text(" ")) if description_tag is not None else None

        # Extract equipment/amenities, skipping empty list items
        equipment_list = [clean_text(li.get_text()) for li in soup.select("ul.listing-features li")]
        equipment = ", ".join([e for e in equipment_list if e])

        details = {
            "title": title,
            "url": url,
            "price_numeric": price_numeric,
            "municipality": municipality,
            "property_type": property_type,
            "ad_type": ad_type,
            "rooms": rooms,
            "square_m2": square_m2,
            "equipment": equipment,
            "description": description
        }

        print("Parsed:", details)
        return details
    except Exception as e:
        print(f"[!] Failed to parse details for {url} → {e}")
        return None


In [14]:
def create_driver():
    """
    Start a headless Firefox WebDriver.

    The driver uses ``pageLoadStrategy = "none"`` and a 10 second page-load
    timeout.

    Returns:
        webdriver.Firefox or None: the started driver, or None if start-up or
        configuration failed (the error is printed).
    """
    print("[*] Initializing Firefox WebDriver...")
    driver = None
    try:
        options = Options()
        options.binary_location = firefox_binary
        options.add_argument("--headless")

        # Selenium 4+ way of setting pageLoadStrategy
        options.set_capability("pageLoadStrategy", "none")

        service = Service(executable_path=geckodriver_binary)
        driver = webdriver.Firefox(service=service, options=options)
        driver.set_page_load_timeout(10)
        print("[+] WebDriver started successfully.")
        return driver
    except Exception as e:
        print(f"[!] Failed to start Firefox driver: {e}")
        if driver is not None:
            # The browser started but configuring it failed: shut it down so
            # no orphaned Firefox process is left behind.
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"[!] Failed to shut down partially started driver: {quit_error}")
        return None


## Multithreaded Scraping Functions

We'll use ThreadPoolExecutor to run multiple Selenium instances in parallel. Each thread gets its own WebDriver instance to avoid conflicts.

In [15]:
def scrape_listing(link, driver):
    """
    Parse one listing page and report the outcome.

    After the attempt (successful or not) the function sleeps for a random
    duration drawn from REQUEST_DELAY. Returns the parsed record, or None when
    parsing produced nothing or raised.
    """
    try:
        record = parse_detail_page(link, driver)
        outcome = "✔ Scraped" if record else "✖ Failed"
        print(f"      {outcome}: {link}")
        pause = random.uniform(*REQUEST_DELAY)
        time.sleep(pause)
        return record
    except Exception as exc:
        print(f"[!] Error scraping {link}: {exc}")
        return None


def scrape_page_listings(page_num, driver):
    """
    Collect the listing links shown on one search-results page.

    Returns a list of absolute listing URLs; the list is empty when the page
    could not be fetched, could not be parsed, or held no matching links.
    """
    page_url = BASE_URL.format(page_num)
    print(f"\n[+] Fetching search page {page_num}: {page_url}")
    html = fetch_page_source(page_url, driver)

    if not html:
        print(f"[!] No HTML for page {page_num}, skipping.")
        return []

    try:
        soup = BeautifulSoup(html, "lxml")
        # Listing anchors are recognised by their relative href prefix.
        href_pattern = re.compile(r"^real-estate\.php\?lang=ba&sel=nekretnine&view=")
        links = []
        for anchor in soup.find_all("a", href=href_pattern):
            links.append(urljoin("https://nekretnine.ba/", anchor["href"]))

        print(f"  → Found {len(links)} listings on page {page_num}")

        if len(links) == 0:
            print(f"[!] No links found on page {page_num}. Possible structure change?")

        return links
    except Exception as exc:
        print(f"[!] Failed to parse search page {page_num} → {exc}")
        return []


def scrape_with_threading():
    """
    Multithreaded scraping function.

    Steps:
      1. One driver walks search pages 1..MAX_PAGES and collects listing URLs.
      2. The URLs are split into one batch per worker; each worker thread
         starts its own driver and scrapes its batch.
      3. After all workers finish, the collected records are written to
         OUTPUT_CSV from the calling thread (appended if the file already
         existed, otherwise created with a header row).

    Returns None. Progress and failures are reported with print().
    """
    fieldnames = ["title", "url", "price_numeric", "municipality", "property_type", "ad_type", "rooms", "square_m2", "equipment", "description"]

    # Create main driver for collecting listing URLs
    print("[*] Creating main driver for collecting listing URLs...")
    main_driver = create_driver()
    if not main_driver:
        print("[!] Failed to create main driver. Exiting.")
        return

    # Collect all listing URLs first
    print(f"[*] Collecting listing URLs from {MAX_PAGES} pages...")
    all_listing_urls = []
    try:
        for page in range(1, MAX_PAGES + 1):
            all_listing_urls.extend(scrape_page_listings(page, main_driver))
            time.sleep(random.uniform(1, 2))  # Small delay between pages
    finally:
        # Always release the browser, even if collection is interrupted.
        main_driver.quit()

    # Drop repeated URLs while keeping first-seen order, so a listing that
    # shows up on more than one search page is scraped only once.
    all_listing_urls = list(dict.fromkeys(all_listing_urls))
    print(f"\n[+] Collected {len(all_listing_urls)} total listings to scrape.")

    if not all_listing_urls:
        print("[!] No listings found. Exiting.")
        return

    # Decide up front whether the CSV needs a header row
    write_header = not os.path.exists(OUTPUT_CSV)

    def worker_scrape(url_batch):
        """Scrape every URL in ``url_batch`` with a driver owned by this thread."""
        driver = create_driver()
        if not driver:
            print("[!] Failed to create worker driver")
            return []

        results = []
        try:
            for url in url_batch:
                data = scrape_listing(url, driver)
                if data:
                    results.append(data)
        finally:
            driver.quit()
        return results

    # Split listings into one batch per worker. Ceiling division keeps every
    # worker busy when the URL count divides evenly.
    worker_count = max(1, min(MAX_WORKERS, len(all_listing_urls)))
    batch_size = -(-len(all_listing_urls) // worker_count)
    url_batches = [all_listing_urls[i:i + batch_size] for i in range(0, len(all_listing_urls), batch_size)]

    print(f"\n[*] Starting multithreaded scraping with {worker_count} workers...")
    print(f"[*] Processing {len(url_batches)} batches...")

    # Use ThreadPoolExecutor for parallel scraping
    all_results = []
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        # Map each future back to its batch index for progress messages
        futures = {executor.submit(worker_scrape, batch): i for i, batch in enumerate(url_batches)}

        # Process results as they complete
        for future in as_completed(futures):
            batch_num = futures[future]
            try:
                batch_results = future.result()
                all_results.extend(batch_results)
                print(f"[+] Batch {batch_num + 1}/{len(url_batches)} completed. Scraped {len(batch_results)} listings.")
            except Exception as e:
                print(f"[!] Batch {batch_num + 1} failed: {e}")

    # All workers have finished, so only this thread touches the file and no
    # lock is required.
    print(f"\n[*] Writing {len(all_results)} results to CSV...")
    with open(OUTPUT_CSV, "w" if write_header else "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
            print(f"[+] Created new CSV file: {OUTPUT_CSV}")

        for data in all_results:
            writer.writerow(data)

    print(f"\n✅ Finished scraping. Data saved to: {OUTPUT_CSV}")
    print(f"✅ Total listings scraped: {len(all_results)}/{len(all_listing_urls)}")


def scrape():
    """
    Original single-threaded scraping function (kept for reference).

    Walks search pages 1..MAX_PAGES with a single driver and appends each
    parsed listing to OUTPUT_CSV as soon as it is scraped. A header row is
    written only when the file did not exist yet.

    Returns None. Progress and failures are reported with print().
    """
    driver = create_driver()
    if not driver:
        # Without this check every page fetch fails and driver.quit() raises.
        print("[!] Failed to create driver. Exiting.")
        return

    fieldnames = ["title", "url", "price_numeric", "municipality", "property_type", "ad_type", "rooms", "square_m2", "equipment", "description"]

    try:
        write_header = not os.path.exists(OUTPUT_CSV)
        with open(OUTPUT_CSV, "a", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if write_header:
                writer.writeheader()
                print(f"[+] Created new CSV file: {OUTPUT_CSV}")
            else:
                print(f"[+] Appending to existing CSV: {OUTPUT_CSV}")

            print(f"[*] Starting scraping of up to {MAX_PAGES} pages...")

            for page in range(1, MAX_PAGES + 1):
                # Shared helper: fetches the page, logs progress and returns
                # [] when the page is unavailable, unparsable or has no links.
                links = scrape_page_listings(page, driver)

                for i, link in enumerate(links, start=1):
                    print(f"    [{i}/{len(links)}] Scraping listing: {link}")
                    try:
                        data = parse_detail_page(link, driver)
                        if data:
                            writer.writerow(data)
                            print("      ✔ Saved listing data to CSV.")
                        else:
                            print("      ✖ No data parsed, skipping.")
                        time.sleep(random.uniform(*REQUEST_DELAY))
                    except Exception as e:
                        print(f"[!] Error scraping {link}: {e}")
    finally:
        # Release the browser even when the run is interrupted or fails.
        driver.quit()

    print(f"\n✅ Finished scraping. Data saved to: {OUTPUT_CSV}")

## Run the Scraper

Choose which scraper to run:
- `scrape_with_threading()` - **Multithreaded version** (faster, uses 3 parallel browsers)
- `scrape()` - Single-threaded version (slower, but more stable)

In [16]:
# Entry point for the scraping run. Executing this cell starts the scrape
# (the captured output below shows the guard passing in the notebook kernel).
if __name__ == "__main__":
    # Use multithreaded version for faster scraping
    scrape_with_threading()
    
    # Or use single-threaded version (comment above, uncomment below)
    # scrape()

[*] Creating main driver for collecting listing URLs...
[*] Initializing Firefox WebDriver...
[+] WebDriver started successfully.
[*] Collecting listing URLs from 88 pages...

[+] Fetching search page 1: https://nekretnine.ba/listing.php?lang=ba&sel=nekretnine&grad=65&naselje=&kat=3&subjekt=2&cij1=&cij2=&pov1=&pov2=&spr1=&spr2=&firma=&page=1
[+] Attempting to load URL quickly: https://nekretnine.ba/listing.php?lang=ba&sel=nekretnine&grad=65&naselje=&kat=3&subjekt=2&cij1=&cij2=&pov1=&pov2=&spr1=&spr2=&firma=&page=1
[+] WebDriver started successfully.
[*] Collecting listing URLs from 88 pages...

[+] Fetching search page 1: https://nekretnine.ba/listing.php?lang=ba&sel=nekretnine&grad=65&naselje=&kat=3&subjekt=2&cij1=&cij2=&pov1=&pov2=&spr1=&spr2=&firma=&page=1
[+] Attempting to load URL quickly: https://nekretnine.ba/listing.php?lang=ba&sel=nekretnine&grad=65&naselje=&kat=3&subjekt=2&cij1=&cij2=&pov1=&pov2=&spr1=&spr2=&firma=&page=1
[+] HTML fetched (may be partial): https://nekretnine.

## 📊 Data Inspection

Let's load and inspect the scraped data from the CSV file.

In [33]:
import pandas as pd

# Read the scraped listings into a DataFrame
df = pd.read_csv('../data/sarajevo_flats_nekretnine.csv')

# Report the size and column layout of the dataset
rule = "=" * 80
n_rows, n_cols = df.shape
print(rule)
print("DATASET OVERVIEW")
print(rule)
print(f"Total records: {n_rows}")
print(f"Total columns: {n_cols}")
print(f"\nColumn names: {list(df.columns)}")
print("\n" + rule)

DATASET OVERVIEW
Total records: 1723
Total columns: 10

Column names: ['title', 'url', 'price_numeric', 'municipality', 'property_type', 'ad_type', 'rooms', 'square_m2', 'equipment', 'description']



In [31]:
# Preview the first rows; the trailing expression is rendered as a table
print("FIRST 5 ROWS:", "=" * 80, sep="\n")
df.head()

FIRST 5 ROWS:


Unnamed: 0,title,url,price_numeric,municipality,property_type,ad_type,rooms,square_m2,equipment,description,price_per_m2
0,Sarajevo,https://nekretnine.ba/real-estate.php?lang=ba&...,,,Stambeni prostor,Prodaja,Dvosoban,82.0,"Garaža, Balkon, Centralno grijanje, Telefonski...",Agencija za nekretnine Stanpromet.ba izdvaja p...,
1,"Sarajevo, Sarajevo – Stari grad",https://nekretnine.ba/real-estate.php?lang=ba&...,339000.0,,Stambeni prostor,Prodaja,Četverosoban,94.0,"Plin, Telefonski priključak, Struja, Namješten...",Rental prodaje troiposoban salonski stan od 94...,3606.382979
2,Sarajevo,https://nekretnine.ba/real-estate.php?lang=ba&...,333000.0,,Stambeni prostor,Prodaja,Dvosoban,73.0,"Centralno grijanje, Telefonski priključak, Str...","Realno, za ponudu najboljih nekretnina treba V...",4561.643836
3,Sarajevo,https://nekretnine.ba/real-estate.php?lang=ba&...,,,Stambeni prostor,Prodaja,Dvosoban,81.0,"Garaža, Balkon, Centralno grijanje, Telefonski...",Stanpromet.ba agencija za nekretnine najavljuj...,
4,Sarajevo,https://nekretnine.ba/real-estate.php?lang=ba&...,,,Stambeni prostor,Prodaja,Dvosoban,75.0,"Garaža, Balkon, Centralno grijanje, Telefonski...",Stanpromet.ba agencija za nekretnine najavljuj...,


In [19]:
# Column dtypes and non-null counts
print("DATA TYPES AND MISSING VALUES:", "=" * 80, sep="\n")
df.info()

DATA TYPES AND MISSING VALUES:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1723 entries, 0 to 1722
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          1723 non-null   object 
 1   url            1723 non-null   object 
 2   price_numeric  1134 non-null   float64
 3   municipality   941 non-null    object 
 4   property_type  1723 non-null   object 
 5   ad_type        1723 non-null   object 
 6   rooms          1720 non-null   object 
 7   square_m2      1723 non-null   float64
 8   equipment      1526 non-null   object 
 9   description    1472 non-null   object 
dtypes: float64(2), object(8)
memory usage: 134.7+ KB


In [30]:
# Descriptive statistics for the numeric columns
print("STATISTICAL SUMMARY (Numeric Columns):", "=" * 80, sep="\n")
df.describe()

STATISTICAL SUMMARY (Numeric Columns):


  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,price_numeric,square_m2,price_per_m2
count,1134.0,1723.0,1134.0
mean,280190.9,1309.934997,inf
std,527676.3,9248.47691,
min,1.0,0.0,0.005784
25%,122250.0,51.0,1785.714286
50%,200000.0,69.0,2886.752137
75%,339000.0,103.0,4327.586207
max,16000000.0,200000.0,inf


In [28]:
# Count missing values per column and express them as a share of all rows
rule = "=" * 80
print("MISSING VALUES PER COLUMN:")
print(rule)
missing_values = df.isna().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_df = pd.DataFrame(
    {
        'Missing Count': missing_values,
        'Percentage': missing_percentage.round(2),
    }
)
# Only columns that actually have gaps are listed
has_gaps = missing_df['Missing Count'] > 0
print(missing_df[has_gaps])
print("\n" + rule)

MISSING VALUES PER COLUMN:
               Missing Count  Percentage
price_numeric            589       34.18
municipality             782       45.39
rooms                      3        0.17
equipment                197       11.43
description              251       14.57
price_per_m2             589       34.18



In [29]:
# Frequency tables for the categorical columns
print("VALUE COUNTS FOR CATEGORICAL COLUMNS:")
print("=" * 80)

# (heading, column, number of rows to show; None = all)
count_sections = [
    ("\n1. Property Type Distribution:", 'property_type', None),
    ("\n2. Ad Type Distribution:", 'ad_type', None),
    ("\n3. Rooms Distribution:", 'rooms', None),
    ("\n4. Municipality Distribution (Top 10):", 'municipality', 10),
]
for heading, column, top_n in count_sections:
    print(heading)
    counts = df[column].value_counts()
    print(counts if top_n is None else counts.head(top_n))

print("\n" + "=" * 80)

VALUE COUNTS FOR CATEGORICAL COLUMNS:

1. Property Type Distribution:
property_type
Stambeni prostor    1723
Name: count, dtype: int64

2. Ad Type Distribution:
ad_type
Prodaja    1723
Name: count, dtype: int64

3. Rooms Distribution:
rooms
Dvosoban              469
Jednosoban            407
Trosoban              338
-                     142
Četverosoban          116
Garsonjera             63
Petosoban              42
Osmosoban              34
Dvoiposoban            31
Jednoiposoban          16
Šestosoban             15
Dvoipoiposoban         11
Troiposoban             9
Desetosoban             6
Jednoipoiposoban        6
Četveroiposoban         4
Sedmosoban              4
Troipoiposoban          4
Četveroipoiposoban      2
Devetosoban             1
Name: count, dtype: int64

4. Municipality Distribution (Top 10):
municipality
Breka                     14
-                         11
Put Mladih Muslimana 2     6
Stupska bb                 6
Semira Fraste              6
Skenderpašina 2

In [23]:
# Summary statistics of the asking price
print("PRICE ANALYSIS:")
print("=" * 80)
price_series = df['price_numeric']
# (label, name of the Series method that computes the statistic)
price_stats = (
    ("Average Price", "mean"),
    ("Median Price", "median"),
    ("Min Price", "min"),
    ("Max Price", "max"),
    ("Standard Deviation", "std"),
)
for label, method in price_stats:
    print(f"{label}: {getattr(price_series, method)():.2f} KM")
print("\n" + "=" * 80)

PRICE ANALYSIS:
Average Price: 280190.93 KM
Median Price: 200000.00 KM
Min Price: 1.00 KM
Max Price: 16000000.00 KM
Standard Deviation: 527676.25 KM



In [24]:
# Summary statistics of the floor area
print("SQUARE METERS ANALYSIS:")
print("=" * 80)
area_series = df['square_m2']
# (label, name of the Series method that computes the statistic)
area_stats = (
    ("Average Area", "mean"),
    ("Median Area", "median"),
    ("Min Area", "min"),
    ("Max Area", "max"),
    ("Standard Deviation", "std"),
)
for label, method in area_stats:
    print(f"{label}: {getattr(area_series, method)():.2f} m²")
print("\n" + "=" * 80)

SQUARE METERS ANALYSIS:
Average Area: 1309.93 m²
Median Area: 69.00 m²
Min Area: 0.00 m²
Max Area: 200000.00 m²
Standard Deviation: 9248.48 m²



In [34]:
# Filter out unrealistic property sizes (> 300 m²)
MAX_REALISTIC_AREA_M2 = 300  # size cut-off used by this cell

print("FILTERING UNREALISTIC PROPERTY SIZES:")
print("=" * 80)
print(f"Records before filtering: {len(df)}")

# Show properties that will be removed
too_large = df['square_m2'] > MAX_REALISTIC_AREA_M2
large_properties = df[too_large]
if len(large_properties) > 0:
    print(f"\n⚠️ Found {len(large_properties)} properties with area > {MAX_REALISTIC_AREA_M2} m²:")
    print(large_properties[['title', 'square_m2', 'property_type', 'url']])

    # Remove exactly the rows reported above. Negating the mask (rather than
    # testing "<= 300") keeps rows with a missing area, so the printed count
    # always matches what was removed. .copy() makes df an independent frame,
    # so adding columns to it later does not trigger SettingWithCopyWarning.
    df = df[~too_large].copy()
    print(f"\n✅ Filtered out {len(large_properties)} properties")
else:
    print(f"\n✅ No properties with area > {MAX_REALISTIC_AREA_M2} m² found")

print(f"Records after filtering: {len(df)}")
print("\n" + "=" * 80)

FILTERING UNREALISTIC PROPERTY SIZES:
Records before filtering: 1723

⚠️ Found 178 properties with area > 300 m²:
                                title  square_m2     property_type  \
376                  Sarajevo, Ilijaš   157034.0  Stambeni prostor   
382    Sarajevo, Sarajevo – Novi grad    71000.0  Stambeni prostor   
383                  Sarajevo, Ilijaš     4517.0  Stambeni prostor   
384   Sarajevo, Sarajevo – Stari grad    11528.0  Stambeni prostor   
387                 Sarajevo, Vogošća     3000.0  Stambeni prostor   
...                               ...        ...               ...   
1708      Sarajevo, Sarajevo – Centar     1050.0  Stambeni prostor   
1710      Sarajevo, Sarajevo – Centar     3000.0  Stambeni prostor   
1711      Sarajevo, Sarajevo – Centar    20000.0  Stambeni prostor   
1713   Sarajevo, Sarajevo – Novi grad    50000.0  Stambeni prostor   
1715      Sarajevo, Sarajevo – Centar   200000.0  Stambeni prostor   

                                             

In [36]:
# Persist the current state of df to a separate "cleaned" CSV (no index column)
output_file = '../data/sarajevo_flats_nekretnine_cleaned.csv'
rule = "=" * 80
print("SAVING CLEANED DATA:", rule, sep="\n")
df.to_csv(output_file, index=False)
print(f"✅ Cleaned data saved to: {output_file}")
print(f"Total records saved: {len(df)}")
print("\n" + rule)

SAVING CLEANED DATA:
✅ Cleaned data saved to: ../data/sarajevo_flats_nekretnine_cleaned.csv
Total records saved: 1545



In [40]:
# Price per square meter analysis (filter out invalid data first)
print("PRICE PER SQUARE METER ANALYSIS:")
print("=" * 80)

area = df['square_m2']
price = df['price_numeric']

# Check for zero or null square_m2 / price values
print(f"Properties with square_m2 = 0 or NaN: {((area == 0) | area.isna()).sum()}")
print(f"Properties with price_numeric = 0 or NaN: {((price == 0) | price.isna()).sum()}")

# A row is usable only when both values are strictly positive. Comparisons
# with NaN evaluate to False, so missing values are excluded by the same mask.
is_valid = (area > 0) & (price > 0)
valid_df = df[is_valid].copy()

print(f"\nValid properties for price/m² analysis: {len(valid_df)}/{len(df)}")
print("=" * 80)

# Calculate price per m² only on valid data
valid_df['price_per_m2'] = valid_df['price_numeric'] / valid_df['square_m2']

print(f"\nAverage Price per m²: {valid_df['price_per_m2'].mean():.2f} KM/m²")
print(f"Median Price per m²: {valid_df['price_per_m2'].median():.2f} KM/m²")
print(f"Min Price per m²: {valid_df['price_per_m2'].min():.2f} KM/m²")
print(f"Max Price per m²: {valid_df['price_per_m2'].max():.2f} KM/m²")
print(f"Standard Deviation: {valid_df['price_per_m2'].std():.2f} KM/m²")

# Add price_per_m2 back to the main dataframe. The division is vectorized and
# .where() blanks (NaN) every row that failed the validity mask, including
# rows where a zero area would otherwise produce inf.
df['price_per_m2'] = (price / area).where(is_valid)

print("\n" + "=" * 80)

PRICE PER SQUARE METER ANALYSIS:
Properties with square_m2 = 0 or NaN: 5
Properties with price_numeric = 0 or NaN: 512

Valid properties for price/m² analysis: 1029/1545

Average Price per m²: 3864.43 KM/m²
Median Price per m²: 3100.00 KM/m²
Min Price per m²: 0.01 KM/m²
Max Price per m²: 192000.00 KM/m²
Standard Deviation: 6386.08 KM/m²



In [38]:
# Look for listings that appear more than once (same URL)
rule = "=" * 80
print("DUPLICATE RECORDS CHECK:")
print(rule)
repeated_url = df.duplicated(subset=['url'])
duplicates = repeated_url.sum()
print(f"Number of duplicate URLs: {duplicates}")

if duplicates > 0:
    print("\nDuplicate URLs found:")
    # keep=False marks every member of a duplicate group, not just the repeats
    every_copy = df.duplicated(subset=['url'], keep=False)
    print(df[every_copy][['title', 'url', 'price_numeric']])
else:
    print("No duplicate URLs found!")

print("\n" + rule)

DUPLICATE RECORDS CHECK:
Number of duplicate URLs: 0
No duplicate URLs found!



In [41]:
# Remove columns with constant values (property_type and ad_type)
print("REMOVING CONSTANT COLUMNS:")
print("=" * 80)

# Candidate columns and the label used when printing their unique values
candidate_columns = {'property_type': "Property Type", 'ad_type': "Ad Type"}

# Check unique values; a column is dropped only if it is present and really
# holds a single value. Skipping absent columns makes the cell safe to re-run.
columns_to_drop = []
for column, label in candidate_columns.items():
    if column not in df.columns:
        continue
    unique_values = df[column].unique()
    print(f"{label} unique values:", unique_values)
    if len(unique_values) <= 1:
        columns_to_drop.append(column)

# Drop columns that are always the same
df = df.drop(columns=columns_to_drop)

print(f"\n✅ Removed columns: {columns_to_drop}")
print(f"Remaining columns: {list(df.columns)}")
print("\n" + "=" * 80)

REMOVING CONSTANT COLUMNS:
Property Type unique values: ['Stambeni prostor']
Ad Type unique values: ['Prodaja']

✅ Removed columns: ['property_type', 'ad_type']
Remaining columns: ['title', 'url', 'price_numeric', 'municipality', 'rooms', 'square_m2', 'equipment', 'description', 'price_per_m2']



In [42]:
# Write the current df (after the column removal) to a second cleaned CSV;
# note the "_1" suffix: this is a new file, not an overwrite of the first one.
output_file = '../data/sarajevo_flats_nekretnine_cleaned_1.csv'
rule = "=" * 80
print("UPDATING CLEANED DATA FILE:", rule, sep="\n")
df.to_csv(output_file, index=False)
remaining_columns = list(df.columns)
print(f"✅ Updated cleaned data saved to: {output_file}")
print(f"Total columns: {len(df.columns)}")
print(f"Column names: {remaining_columns}")
print("\n" + rule)

UPDATING CLEANED DATA FILE:
✅ Updated cleaned data saved to: ../data/sarajevo_flats_nekretnine_cleaned_1.csv
Total columns: 9
Column names: ['title', 'url', 'price_numeric', 'municipality', 'rooms', 'square_m2', 'equipment', 'description', 'price_per_m2']

