# ⌚ End-to-End Machine Learning: Luxusuhren-Preisvorhersage

Dieses Projekt vereint zwei Datensätze über Luxusuhren, um mithilfe von maschinellem Lernen eine Preisvorhersage zu ermöglichen. Das Modell wird trainiert, evaluiert und in einer Gradio-Webanwendung deployt.




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import pickle

# Load both raw data sets.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column never ends up with a mix of parsed
# numbers and raw strings (the situation pandas reports as a DtypeWarning).
df_prices = pd.read_csv("watch_prices.csv", low_memory=False)
df_listings = pd.read_csv("watch_listings.csv", low_memory=False)

# Normalise column names (lower case, no surrounding whitespace)
df_prices.columns = df_prices.columns.str.lower().str.strip()
df_listings.columns = df_listings.columns.str.lower().str.strip()

# Clean the listings: keep only rows that carry a concrete price.
df_listings = df_listings[df_listings["price"].notna()]
# .copy() detaches the filtered rows from the original frame so that the
# column assignment below modifies a real DataFrame, not a view of a slice.
df_listings = df_listings[df_listings["price"].str.lower() != "price on request"].copy()
# Strip "$" and thousands separators, then parse the price as a float
df_listings["price"] = df_listings["price"].str.replace(r"[$,]", "", regex=True).astype(float)
df_listings = df_listings.dropna(subset=["brand", "model"])

# Merge on brand & model.
# NOTE(review): if either frame has several rows per (brand, model) pair the
# inner join produces every combination of them - confirm that the resulting
# row count is what is intended.
df = pd.merge(df_prices, df_listings, on=["brand", "model"], how="inner")
print(f"Zusammengeführt: {df.shape[0]} Einträge")

# Give the one column whose name contains a space an identifier-style name
df = df.rename(columns={"case material": "case_material"})


def _to_category_codes(column):
    """Replace missing values with "unknown" and return integer category codes."""
    return column.fillna("unknown").astype("category").cat.codes


# Feature preparation
df["brand"] = _to_category_codes(df["brand"])

# Take the first four-digit number found in 'yop' as the year; rows without
# one receive the median of the years that could be extracted.
yop_text = df["yop"].astype(str)
year_numeric = pd.to_numeric(yop_text.str.extract(r"(\d{4})")[0], errors="coerce")
df["year"] = year_numeric.fillna(year_numeric.median())

df["case_material"] = _to_category_codes(df["case_material"])
df["condition"] = _to_category_codes(df["cond"])

# 1 when the movement description mentions "automatic" (case-insensitive), else 0
mentions_automatic = df["mvmt"].fillna("").str.contains("automatic", case=False)
df["automatic"] = mentions_automatic.astype(int)

# Model inputs and target
feature_columns = ["case_material", "condition", "automatic", "brand", "year"]
X = df[feature_columns]
y = df["price"]

# Step 1: Convert all feature columns to numeric (unparseable values become NaN)
X_numeric = X.apply(pd.to_numeric, errors="coerce")

# Step 2: Train/test split.
# The split happens BEFORE imputation so that the imputation medians are
# computed from training rows only and no test-set statistic leaks into
# training. The same random_state and row count select the same rows as a
# split performed after imputation would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_numeric, y, test_size=0.2, random_state=42
)

# Step 3: Fit the median imputer on the training rows only
imputer = SimpleImputer(strategy="median")
imputer.fit(X_train_raw)

# Step 4: Label the imputed columns by name, not by position.
# SimpleImputer discards a feature whose statistic is NaN (no observed value
# in the fit data); selecting the surviving names via statistics_ keeps the
# labels correct no matter which column was dropped.
kept_columns = X_numeric.columns[~np.isnan(imputer.statistics_)]


def _impute(frame):
    """Apply the fitted imputer and return a DataFrame that keeps the row index."""
    return pd.DataFrame(imputer.transform(frame), columns=kept_columns, index=frame.index)


X_train = _impute(X_train_raw)
X_test = _impute(X_test_raw)

# Full imputed feature matrix (same row index as y) for steps that need all rows
X = _impute(X_numeric)
X_imputed = X.to_numpy()  # kept for code that still expects the raw array

# Step 5: Random Forest hyper-parameter search with 3-fold cross-validation
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=2
)
grid_search.fit(X_train, y_train)

# Step 6: Best RF model.
# GridSearchCV refits the best parameter combination on the data passed to
# fit() (refit=True is the default), so best_estimator_ has been trained on
# X_train / y_train only and has not seen the test rows.
rf_model = grid_search.best_estimator_

# Evaluate on the held-out split BEFORE any further fitting; predicting on
# X_test after fitting on all rows would score the model on data it trained on.
y_pred_rf = rf_model.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest RMSE: {rmse_rf:.2f}")
print(f"Random Forest R²: {r2_rf:.3f}")

# Only now refit on all rows, so the model object used from here on is
# trained on the complete data set while the metrics above stay honest.
rf_model.fit(X, y)

# Step 7: Linear Regression, fitted on the training split (fit() returns the estimator)
lr_model = LinearRegression().fit(X_train, y_train)

# Score on the held-out test split
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr, r2_lr = np.sqrt(mse_lr), r2_score(y_test, y_pred_lr)

print(f"Linear Regression RMSE: {rmse_lr:.2f}")
print(f"Linear Regression R²: {r2_lr:.3f}")

# Step 8: 5-fold cross-validation of both models on the training split
cv_settings = {"cv": 5, "scoring": "neg_root_mean_squared_error"}
rf_cv = cross_val_score(rf_model, X_train, y_train, **cv_settings)
lr_cv = cross_val_score(lr_model, X_train, y_train, **cv_settings)

# The scorer returns negated RMSE values, hence the sign flip when printing
print(f"Random Forest CV RMSE: {-np.mean(rf_cv):.2f}")
print(f"Linear Regression CV RMSE: {-np.mean(lr_cv):.2f}")

# Step 9: Side-by-side comparison of the test-split metrics
print("\nModel Comparison:")
print(f"Random Forest -> RMSE: {rmse_rf:.2f}, R²: {r2_rf:.3f}")
print(f"Linear Regression -> RMSE: {rmse_lr:.2f}, R²: {r2_lr:.3f}")

# Save the cleaned, feature-engineered data set
df.to_csv("cleaned_watch_data.csv", index=False)

# Save the models: each bundle pairs a model with the feature column names
rf_bundle = {"model": rf_model, "feature_names": X.columns.tolist()}
lr_bundle = {"model": lr_model, "feature_names": X.columns.tolist()}

for bundle_path, bundle in (
    ("watch_rf_model.pkl", rf_bundle),
    ("watch_lr_model.pkl", lr_bundle),
):
    with open(bundle_path, "wb") as f:
        pickle.dump(bundle, f)

print("Modelle gespeichert: watch_rf_model.pkl & watch_lr_model.pkl")
print("Modell & Daten gespeichert.")

# Model check: reload both bundles and show the stored feature names
with open("watch_rf_model.pkl", "rb") as f:
    rf_bundle = pickle.load(f)
print("Random Forest Feature Names:", rf_bundle["feature_names"])

with open("watch_lr_model.pkl", "rb") as f:
    lr_bundle = pickle.load(f)
print("Linear Regression Feature Names:", lr_bundle["feature_names"])



  df_listings = pd.read_csv("watch_listings.csv")


Zusammengeführt: 385303 Einträge
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=   7.7s
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=   7.4s
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=   7.5s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  15.6s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  14.5s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  14.4s
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=   7.3s
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=   7.3s
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=   7.3s
[CV] END max_depth=10, min_samples_split=5, n_estimators=200; total time=  14.3s
[CV] END max_depth=10, min_samples_split=5, n_estimators=200; total time=  14.3s
[CV] END max_dep

In [None]:
# Summary of the results: one section per model between two separator rules
separator = "=" * 50
print("\n" + separator)
print("Final Model Comparison Summary")
print(separator)

# (heading, test RMSE, test R², cross-validation scores) for each model
summary_rows = (
    ("Random Forest", rmse_rf, r2_rf, rf_cv),
    ("\nLinear Regression", rmse_lr, r2_lr, lr_cv),
)
for heading, rmse, r2, cv_scores in summary_rows:
    print(heading)
    print(f"   - RMSE:     {rmse:,.2f}")
    print(f"   - R²:       {r2:.3f}")
    # cross_val_score values are negated RMSE, so flip the sign for display
    print(f"   - CV-RMSE:  {-np.mean(cv_scores):,.2f}")
print(separator)


📊 Final Model Comparison Summary
🔸 Random Forest
   - RMSE:     89,185.20
   - R²:       0.348
   - CV-RMSE:  87,893.65

🔸 Linear Regression
   - RMSE:     110,268.60
   - R²:       0.004
   - CV-RMSE:  109,106.47
