In [1]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
# Let wide frames render up to 120 columns instead of being truncated.
pd.options.display.max_columns = 120

# Read the gzipped raw listings export into `df`. low_memory=False asks
# pandas to parse the file in one pass rather than in internal chunks.
raw_path = "../data/raw/listings.csv.gz"
df = pd.read_csv(filepath_or_buffer=raw_path, low_memory=False)

# Quick sanity check: number of rows and the first twelve column names.
(df.shape[0], df.columns[:12])


(14187,
 Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
        'description', 'neighborhood_overview', 'picture_url', 'host_id',
        'host_url', 'host_name'],
       dtype='object'))

In [2]:
# Turn the raw `price` strings into floats, one cleaning step at a time.
# Missing prices become the string "nan" after astype(str); the final float
# cast turns that back into NaN, so they stay missing.
# NOTE(review): no currency conversion happens here ("$" is stripped just like
# "€") — the amounts are assumed to already be in EUR; confirm against the source.
price_text = df["price"].astype(str)
price_text = price_text.str.replace(r"[€$,]", "", regex=True)  # currency signs, thousands commas
price_text = price_text.str.replace(r"\s", "", regex=True)     # any whitespace
df["price_eur"] = price_text.astype(float)

# Summary statistics of the parsed prices.
df["price_eur"].describe()


count     9183.000000
mean       211.728302
std       1653.597912
min          5.000000
25%         70.000000
50%        108.000000
75%        168.000000
max      50000.000000
Name: price_eur, dtype: float64

In [3]:
# Derive a numeric bathroom count from the free-text `bathrooms_text` column.
# The column is optional: when it is absent, `bathrooms_num` is simply not created.
if "bathrooms_text" in df.columns:
    bath_text = df["bathrooms_text"].astype(str)

    # First integer or decimal number in the text, e.g. "1.5 shared baths" -> 1.5.
    # Rows without any digit (including missing values, which are "nan" here) yield NaN.
    bath_num = bath_text.str.extract(r"(\d+(?:\.\d+)?)", expand=False).astype(float)

    # Inside Airbnb writes half baths without a digit ("Half-bath",
    # "Shared half-bath", "Private half-bath"). A digits-only regex would leave
    # those as NaN, so map them to 0.5 explicitly.
    is_half_bath = bath_text.str.contains("half-bath", case=False, regex=False)
    df["bathrooms_num"] = bath_num.mask(bath_num.isna() & is_half_bath, 0.5)


In [4]:
# Columns carried forward into the cleaned dataset, in output order.
keep = [
    "id",
    "latitude",
    "longitude",
    "neighbourhood_cleansed",
    "room_type",
    "property_type",
    "accommodates",
    "bedrooms",
    "bathrooms_num",
    "minimum_nights",
    "price_eur",
    "review_scores_rating",
    "number_of_reviews",
]

# Select only the wanted columns that actually exist (missing ones are skipped
# silently) and take a copy so later assignments do not touch the raw frame.
present = [name for name in keep if name in df.columns]
df = df.loc[:, present].copy()

# Preview the first three rows.
df.head(3)


Unnamed: 0,id,latitude,longitude,neighbourhood_cleansed,room_type,property_type,accommodates,bedrooms,bathrooms_num,minimum_nights,price_eur,review_scores_rating,number_of_reviews
0,3176,52.53471,13.4181,Prenzlauer Berg Südwest,Entire home/apt,Entire rental unit,2,1.0,1.0,63,105.0,4.63,149
1,9991,52.53269,13.41805,Prenzlauer Berg Südwest,Entire home/apt,Entire rental unit,7,4.0,2.5,6,135.0,5.0,7
2,14325,52.54813,13.40366,Prenzlauer Berg Nordwest,Entire home/apt,Entire rental unit,1,0.0,1.0,150,75.0,4.68,26


In [5]:
# Rows are only usable with a price and coordinates; after dropping the
# incomplete ones, keep a single row per listing id (the first occurrence).
essential = ["price_eur", "latitude", "longitude"]
df = df.dropna(subset=essential)
df = df.drop_duplicates(subset=["id"])

# Cap extreme prices: values below the 1st percentile or above the 99th
# percentile are pulled to those bounds (rows are kept, not removed).
price_lo = df["price_eur"].quantile(0.01)
price_hi = df["price_eur"].quantile(0.99)
df["price_eur"] = df["price_eur"].clip(lower=price_lo, upper=price_hi)

# Light imputation: fill gaps in each numeric column with that column's median.
numeric_cols = [
    "bedrooms",
    "bathrooms_num",
    "review_scores_rating",
    "number_of_reviews",
    "minimum_nights",
    "accommodates",
]
for name in numeric_cols:
    if name not in df.columns:
        continue  # optional column was not present in this export
    column_median = df[name].median()
    df[name] = df[name].fillna(column_median)

# Per-column summary statistics, one row per column (first 12 shown).
df.describe().T.head(12)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,9183.0,6.347248e+17,5.610167e+17,3176.0,33004860.0,7.70675e+17,1.162154e+18,1.447175e+18
latitude,9183.0,52.50866,0.03549185,52.340027,52.49111,52.50968,52.53158,52.65076
longitude,9183.0,13.40046,0.07363779,13.11815,13.35607,13.40628,13.43717,13.73727
accommodates,9183.0,3.312316,2.119453,1.0,2.0,2.0,4.0,16.0
bedrooms,9183.0,1.329413,0.8508024,0.0,1.0,1.0,2.0,14.0
bathrooms_num,9183.0,1.158445,0.4800488,0.0,1.0,1.0,1.0,15.0
minimum_nights,9183.0,26.73342,44.33157,1.0,2.0,3.0,30.0,999.0
price_eur,9183.0,136.5479,101.2566,28.0,70.0,108.0,168.0,659.54
review_scores_rating,9183.0,4.768335,0.3456945,0.0,4.73,4.85,4.93,5.0
number_of_reviews,9183.0,57.72634,114.4863,0.0,1.0,13.0,63.0,2877.0


In [6]:
# Write the cleaned listings to the processed-data folder.
out_path = "../data/processed/listings_clean.csv"

# to_csv does not create missing parent directories and raises OSError if the
# folder is absent (e.g. on a fresh checkout), so make sure it exists first.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index carries no information, so it is not written.
df.to_csv(out_path, index=False)

# Show where the file went and the shape of what was written.
out_path, df.shape


('../data/processed/listings_clean.csv', (9183, 13))

- Preis als float geparst, Ausreißer auf 1.–99. Perzentil gecappt.
- Wichtige Spalten ausgewählt; fehlende numerische Werte mit Median gefüllt.
- Duplikate pro Listing-ID entfernt.
