In [None]:
import pandas as pd
import numpy as np

# Location of the merged data set, relative to the notebook's directory.
PATH = "../data/processed/merged.csv"

# Read the table, then discard every row that has no target value.
df = pd.read_csv(PATH)
df = df.dropna(subset=["price_eur"])

n_rows, n_cols = df.shape
print("Rows:", n_rows, "| Cols:", n_cols)
df.head(2)


In [None]:
# Target variable
y = df["price_eur"]

# Candidate features; any column missing from df is silently skipped below.
feat_candidates = [
    "accommodates",
    "bedrooms",
    "bathrooms_num",
    "minimum_nights",
    "review_scores_rating",
    "number_of_reviews",
    "room_type",
    "property_type",
    "neighbourhood_cleansed",
    "latitude",
    "longitude",
    "unavailable_rate_90d",
]

# Keep the candidate order, restricted to columns that actually exist.
feats = []
for name in feat_candidates:
    if name in df.columns:
        feats.append(name)

# Work on a copy so later in-place imputation never touches df itself.
X = df[feats].copy()

print("Using features:", feats)


In [None]:
# Fill missing numeric values with the column median.
# NOTE(review): the medians are computed on the full data set, i.e. before
# the cross-validation split, so a little information from the validation
# folds leaks into training. Moving the imputation into the pipeline
# (e.g. sklearn's SimpleImputer(strategy="median")) would avoid that.
# NOTE(review): a column that is entirely NaN has a NaN median and therefore
# stays NaN after this loop.
for col in X.select_dtypes(include=[np.number]).columns:
    X[col] = X[col].fillna(X[col].median())

# Columns that can be handed to the estimators unchanged: numeric and bool.
# Everything else (object, category, string, ...) is treated as categorical.
# Selecting only dtype "object" would leave category/string columns in the
# numeric list, where they are passed through unencoded and break the models.
passthrough_cols = set(X.select_dtypes(include=[np.number, "bool"]).columns)

# Both lists keep the order of `feats`.
cat_cols = [c for c in feats if c not in passthrough_cols]
num_cols = [c for c in feats if c in passthrough_cols]

print("Categorical:", cat_cols)
print("Numeric    :", num_cols)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns; every other column is forwarded
# untouched (remainder="passthrough"). handle_unknown="ignore" keeps the
# encoder from raising on categories it did not see while fitting.
onehot = OneHotEncoder(handle_unknown="ignore")
pre = ColumnTransformer(
    transformers=[
        ("cat", onehot, cat_cols),
    ],
    remainder="passthrough",
)
pre


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Both pipelines share the same preprocessing step and differ only in the
# final estimator.
forest = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf_pipe = Pipeline([
    ("pre", pre),
    ("model", forest),
])

linear = LinearRegression()
lin_pipe = Pipeline([
    ("pre", pre),
    ("model", linear),
])

rf_pipe, lin_pipe


In [None]:
from sklearn.model_selection import cross_val_score

# Settings shared by both evaluations: 5 folds, R² score, all CPU cores.
r2_cv_options = dict(cv=5, scoring="r2", n_jobs=-1)

# One R² value per fold for each pipeline.
rf_r2 = cross_val_score(rf_pipe, X, y, **r2_cv_options)
lin_r2 = cross_val_score(lin_pipe, X, y, **r2_cv_options)

# Report the mean and standard deviation across folds.
print(f"RandomForest CV R²: {rf_r2.mean():.3f}  (+/- {rf_r2.std():.3f})")
print(f"LinearRegression CV R²: {lin_r2.mean():.3f}  (+/- {lin_r2.std():.3f})")


In [None]:
from sklearn.metrics import make_scorer, mean_absolute_error

# With greater_is_better=False the scorer reports the negated MAE, so the
# fold scores are negated again below to get positive errors in EUR.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

rf_neg_mae = cross_val_score(rf_pipe, X, y, cv=5, scoring=mae_scorer, n_jobs=-1)
rf_mae = -rf_neg_mae

lin_neg_mae = cross_val_score(lin_pipe, X, y, cv=5, scoring=mae_scorer, n_jobs=-1)
lin_mae = -lin_neg_mae

# Report the mean and standard deviation across folds.
print(f"RandomForest CV MAE: €{rf_mae.mean():.2f}  (+/- {rf_mae.std():.2f})")
print(f"LinearRegression CV MAE: €{lin_mae.mean():.2f}  (+/- {lin_mae.std():.2f})")


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt

# Bar chart comparing the mean cross-validated R² of both models.
labels = ["RandomForest", "LinearRegression"]
r2_vals = [rf_r2.mean(), lin_r2.mean()]

plt.figure(figsize=(5,3.2))
plt.bar(labels, r2_vals)
plt.axhline(0, color="black", linewidth=0.8)
plt.title("CV R² (5-fold)")
plt.ylabel("R²")

# R² can be negative when a model is worse than predicting the mean; extend
# the lower limit in that case so the bar is drawn instead of being clipped.
y_low = min(0.0, min(r2_vals) * 1.15)
y_high = max(0.01, max(r2_vals) * 1.15)
plt.ylim(y_low, y_high)
plt.tight_layout()

# Create the output directory first; savefig does not create missing folders.
fig_path = Path("../reports/figures/experiments_cv_r2.png")
fig_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(fig_path, dpi=160)
plt.show()
