In [22]:
import pandas as pd

In [88]:
df = pd.read_csv("data/raw/imotbg/2025_01_11_12_53_42_0.csv")
df.columns

Index(['listing_id', 'price', 'title', 'location', 'description',
       'contact_info', 'agency_url', 'details_url', 'num_photos', 'date_added',
       'offer_type', 'search_url', 'total_offers'],
      dtype='object')

In [113]:
from datetime import datetime
from typing import Dict, List, Optional, Union

import pandas as pd
from bs4 import BeautifulSoup, Tag
from pydantic import BaseModel, Field, ValidationError
import enum
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Enums for consistent representation
class ListingSite(enum.Enum):
    IMOTI_NET = "imoti.net"
    HOMES_BG = "homes.bg"
    IMOT_BG = "imot.bg"

class Currency(enum.Enum):
    BGN = "BGN"
    EUR = "EUR"

class OfferType(enum.Enum):
    SELL = "продава"
    RENT = "наем"

class PropertyType(enum.Enum):
    EDNOSTAEN = "едностаен"
    DVUSTAEN = "двустаен"
    TRISTAEN = "тристаен"
    CHETIRISTAEN = "четиристаен"
    MEZONET = "мезонет"
    MNOGOSTAEN = "многостаен"
    LAND = "земя"

# Model for validated data
class ListingData(BaseModel):
    raw_title: Optional[str] = Field(default="")
    raw_description: Optional[str] = Field(default="")
    price: Optional[float] = Field(default=0.0)
    currency: Optional[str] = Field(default="")
    without_dds: Optional[bool] = Field(default=False)
    offer_type: Optional[str] = Field(default="")
    property_type: Optional[str] = Field(default="")
    city: Optional[str] = Field(default="")
    neighborhood: Optional[str] = Field(default="")
    contact_info: Optional[str] = Field(default="")
    agency: Optional[str] = Field(default="")
    agency_url: Optional[str] = Field(default="")
    details_url: Optional[str] = Field(default="")
    num_photos: Optional[float] = Field(default=0)
    date_time_added: Optional[datetime] = Field(default=None)
    search_url: Optional[str] = Field(default="")
    site: Optional[str] = Field(default="")
    total_offers: Optional[int] = Field(default=0)
    date: Optional[datetime] = Field(default=None)
    ref_no: Optional[str] = Field(default="")
    time: Optional[str] = Field(default="")
    price_per_m2: Optional[str] = Field(default="")
    area: Optional[str] = Field(default="")
    floor: Optional[str] = Field(default="")

# Helper functions for transformation
def to_property_type(x: str) -> Optional[PropertyType]:
    property_map = {
        "1-СТАЕН": PropertyType.EDNOSTAEN,
        "2-СТАЕН": PropertyType.DVUSTAEN,
        "3-СТАЕН": PropertyType.TRISTAEN,
        "4-СТАЕН": PropertyType.CHETIRISTAEN,
        "МЕЗОНЕТ": PropertyType.MEZONET,
        "МНОГОСТАЕН": PropertyType.MNOGOSTAEN,
        "ЗЕМЕДЕЛСКА ЗЕМЯ": PropertyType.LAND,
    }
    return property_map.get(x, None)

def to_offer_type(x: str) -> Optional[OfferType]:
    offer_map = {
        "продава": OfferType.SELL,
        "дава под наем": OfferType.RENT,
    }
    return offer_map.get(x.lower(), None)

def to_currency(x: str) -> Optional[Currency]:
    currency_map = {
        "eur": Currency.EUR,
        "bgn": Currency.BGN,
    }
    return currency_map.get(x.lower(), None)

def to_price(x: str) -> float:
    if x is None:
        return 0.0
    if not x:
        return 0.0
    if isinstance(x, str):
        x = x.replace(" ", "")
    try:
        return float(x)
    except ValueError:
        return 0.0

def to_listing_data(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        logging.warning("Input DataFrame is empty.")
        return pd.DataFrame()

    listing_df = pd.DataFrame()

    # Process columns
    listing_df["raw_title"] = df["title"]
    listing_df["raw_description"] = df["description"]

    # Extract price and currency
    price = df["price"].str.extract(r"(\d{1,3}(?: \d{3})*|\d+)")
    currency = df["price"].str.extract(r"(?i)(eur|bgn)").fillna("")
    dds = df["price"].str.contains(r"(?:ДДС|без ДДС)", case=False, na=False)

    listing_df["price"] = price[0].apply(to_price)
    listing_df["currency"] = currency[0].apply(to_currency)
    listing_df["without_dds"] = dds

    # Process title for offer type and property type
    process_title = df["title"].str.split()
    listing_df["offer_type"] = process_title.str.get(0).apply(to_offer_type)
    listing_df["property_type"] = process_title.str.get(1).apply(to_property_type)

    # Process location
    processed_location = df["location"].str.split(",")
    listing_df["city"] = processed_location.str.get(0).str.replace(r"\bград\b", "", regex=True).str.strip()
    listing_df["neighborhood"] = processed_location.str.get(1)

    # Map remaining columns
    listing_df["contact_info"] = df["contact_info"]
    listing_df["agency"] = None
    listing_df["agency_url"] = df["agency_url"]
    listing_df["details_url"] = df["details_url"]
    listing_df["num_photos"] = df["num_photos"]
    listing_df["search_url"] = df["search_url"]
    listing_df["site"] = ListingSite.IMOT_BG
    listing_df["total_offers"] = df["total_offers"]
    listing_df["ref_no"] = df["listing_id"]
    listing_df["date_time_added"] = pd.to_datetime(df["date_added"], errors='coerce')
    listing_df["date"] = listing_df["date_time_added"].dt.date

    validated_rows = []
    for index, row in listing_df.iterrows():
        try:
            validated_row = ListingData(**row.to_dict())
            validated_rows.append(validated_row.model_dump())
        except ValidationError as e:
            logging.error(f"Validation error at row {index}: {e}. Row data: {row.to_dict()}")
    result_df = pd.DataFrame(validated_rows)
    result_df = result_df.fillna("")
    return result_df

    
    return listing_df
to_listing_data(df)

Unnamed: 0,raw_title,raw_description,price,currency,without_dds,offer_type,property_type,city,neighborhood,contact_info,...,date_time_added,search_url,site,total_offers,date,ref_no,time,price_per_m2,area,floor
0,Продава 2-СТАЕН,"71 кв.м, 4-ти ет. от 6, ТЕЦ, Тухла 1940 г., ИР...",248000.0,EUR,False,продава,двустаен,София,Докторски паметник,0877332218,...,2025-01-11 12:53:45.960586,https://www.imot.bg/pcgi/imot.cgi?act=3&slink=...,imot.bg,735,2025-01-11,1b172911705986137,,,,
1,Продава 2-СТАЕН,"71 кв.м, 4-ти ет. от 6, ТЕЦ, Тухла 1940 г., ИР...",248000.0,EUR,False,продава,двустаен,София,Докторски паметник,0877332218,...,2025-01-11 12:53:45.961220,https://www.imot.bg/pcgi/imot.cgi?act=3&slink=...,imot.bg,735,2025-01-11,1b172911705986137,,,,
2,Продава 2-СТАЕН,"78 кв.м, ул. Шейново, 1-ви ет. от 4, ТЕЦ, Тухл...",300000.0,EUR,False,продава,двустаен,София,Докторски паметник,0896611040,...,2025-01-11 12:53:45.961389,https://www.imot.bg/pcgi/imot.cgi?act=3&slink=...,imot.bg,735,2025-01-11,1b173191520062987,,,,
3,Продава 2-СТАЕН,"78 кв.м, 1-ви ет. от 5, ТЕЦ, Тухла 1965 г., ...",310000.0,EUR,False,продава,двустаен,София,Докторски паметник,0888426291,...,2025-01-11 12:53:45.961565,https://www.imot.bg/pcgi/imot.cgi?act=3&slink=...,imot.bg,735,2025-01-11,1b172129722757947,,,,
4,Продава 2-СТАЕН,"62 кв.м, 2-ри ет. от 7, Тухла 1970 г., СРЕДЕН ...",144999.0,EUR,False,продава,двустаен,София,Оборище,0875363330,...,2025-01-11 12:53:45.961719,https://www.imot.bg/pcgi/imot.cgi?act=3&slink=...,imot.bg,735,2025-01-11,1b173461600031474,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730,Продава МЕЗОНЕТ,"350 кв.м, 6-ти ет. от 7, ТЕЦ, Тухла 2017 г., Р...",,,False,продава,мезонет,София,Оборище,0883 093 093,...,2025-01-11 12:54:11.145179,https://www.imot.bg/pcgi/imot.cgi?act=3&slink=...,imot.bg,735,2025-01-11,1f168638602574423,,,,
731,Продава МЕЗОНЕТ,"149 кв.м, 8-ми ет. от 9, Тухла 2024 г., БЕЗ КО...",250000.0,EUR,False,продава,мезонет,София,Подуяне,0877888234,...,2025-01-11 12:54:11.145332,https://www.imot.bg/pcgi/imot.cgi?act=3&slink=...,imot.bg,735,2025-01-11,1f173046696799533,,,,
732,Продава МЕЗОНЕТ,"148 кв.м, 7-ми ет. от 8, ТЕЦ, Тухла 2024 г., ...",250222.0,EUR,False,продава,мезонет,София,Подуяне,0886553046,...,2025-01-11 12:54:11.145473,https://www.imot.bg/pcgi/imot.cgi?act=3&slink=...,imot.bg,735,2025-01-11,1f170570345898128,,,,
733,Продава МЕЗОНЕТ,"183 кв.м, 3-ти ет. от 4, Тухла 2010 г., Агенци...",700000.0,EUR,False,продава,мезонет,София,Подуяне,0893372070,...,2025-01-11 12:54:11.145614,https://www.imot.bg/pcgi/imot.cgi?act=3&slink=...,imot.bg,735,2025-01-11,1f173436897427102,,,,
