In [None]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
# Let pandas display up to 120 columns before it starts eliding them.
pd.options.display.max_columns = 120

# Raw listings export; the path is relative to the notebook's working directory.
raw_path = "../data/raw/listings.csv.gz"

# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column's dtype is inferred from the whole column.
df = pd.read_csv(filepath_or_buffer=raw_path, low_memory=False)

# Cell output: number of rows and the first twelve column labels.
(df.shape[0], df.columns[:12])


In [None]:
# Turn the raw "price" strings into floats.
# Currency symbols (€, $), commas and any whitespace are removed first; the
# comma is treated as a thousands separator, not as a decimal mark.
# NOTE(review): the column is named price_eur but no currency conversion
# happens here -- "$" is simply stripped. Confirm the source is in EUR.
price_text = (df["price"].astype(str)
              .str.replace(r"[€$,]", "", regex=True)
              .str.replace(r"\s", "", regex=True))

# errors="coerce" turns anything that still is not a number (empty strings,
# stray text) into NaN instead of raising and aborting the whole cell.
# Missing prices are dropped further down together with the other essentials.
df["price_eur"] = pd.to_numeric(price_text, errors="coerce").astype(float)
df["price_eur"].describe()


In [None]:
# Derive a numeric bathroom count from the free-text "bathrooms_text" column.
# The step is skipped when the column is absent from the export.
if "bathrooms_text" in df.columns:
    bath_text = df["bathrooms_text"].astype(str)

    # First integer or decimal number in the text, e.g. "1.5 shared baths" -> 1.5.
    # Rows without any digit (including missing values) become NaN.
    bath_num = (bath_text
                .str.extract(r"(\d+(?:\.\d+)?)", expand=False)
                .astype(float))

    # Labels such as "Half-bath" / "Shared half-bath" carry no digit and would
    # otherwise end up NaN and later be overwritten by the median.
    # NOTE(review): assumes these labels mean 0.5 bathrooms -- verify against
    # the value counts of bathrooms_text in your export.
    is_half_bath = bath_text.str.lower().str.contains("half-bath", regex=False)

    df["bathrooms_num"] = bath_num.mask(bath_num.isna() & is_half_bath, 0.5)


In [None]:
# Columns carried forward into the cleaned data set, in output order.
keep = [
    "id",
    "latitude",
    "longitude",
    "neighbourhood_cleansed",
    "room_type",
    "property_type",
    "accommodates",
    "bedrooms",
    "bathrooms_num",
    "minimum_nights",
    "price_eur",
    "review_scores_rating",
    "number_of_reviews",
]

# A wanted column that is not in the frame is skipped rather than raising.
present = [name for name in keep if name in df.columns]

# .copy() detaches the selection from the wide frame so later column
# assignments operate on an independent object.
df = df.loc[:, present].copy()
df.head(3)


In [None]:
# Rows without a price or without coordinates are removed, then only the
# first row per listing id is kept.
essential = ["price_eur", "latitude", "longitude"]
df = df.dropna(subset=essential)
df = df.drop_duplicates(subset=["id"])

# Winsorise prices: values below the 1st percentile are raised to it and
# values above the 99th percentile are lowered to it (rows are kept).
q1 = df["price_eur"].quantile(0.01)
q99 = df["price_eur"].quantile(0.99)
df["price_eur"] = df["price_eur"].clip(lower=q1, upper=q99)

# Fill remaining gaps in the numeric columns with each column's own median.
# Columns that are not present in the frame are skipped.
numeric_fill = [
    "bedrooms",
    "bathrooms_num",
    "review_scores_rating",
    "number_of_reviews",
    "minimum_nights",
    "accommodates",
]
for col in [name for name in numeric_fill if name in df.columns]:
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)

# Cell output: summary statistics, one row per column (first twelve).
df.describe().T.head(12)


In [None]:
# Write the cleaned frame to disk without the index column.
out_path = "../data/processed/listings_clean.csv"

# to_csv does not create missing folders; make sure the target directory
# exists so the export also works on a fresh checkout.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)

df.to_csv(out_path, index=False)

# Cell output: where the file went and the final (rows, columns) shape.
out_path, df.shape


- Preis als float geparst; Ausreißer auf das 1. bzw. 99. Perzentil begrenzt (Clipping).
- Wichtige Spalten ausgewählt; fehlende numerische Werte mit Median gefüllt.
- Duplikate pro Listing-ID entfernt.
