# Version 1 Linear Regression

**Einfache lineare Regression ohne Optimierung (umgesetzt als Ridge-Regression mit Polynom-Features 2. Grades und festen Hyperparametern).**



In [1]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Fixed modelling settings, defined once so the values logged to MLflow
# cannot drift from the values actually used to build the pipeline.
REFERENCE_YEAR = 2025  # year from which "Year" is subtracted to get the car age
POLY_DEGREE = 2
RIDGE_ALPHA = 1.0

# Select the MLflow experiment that collects these runs.
mlflow.set_experiment("Car Price Prediction")

# Start an MLflow run. The context manager ends the run when the block exits,
# so no explicit mlflow.end_run() call is needed inside it.
with mlflow.start_run():
    print("🚀 MLflow Run gestartet!")

    # Load the data.
    df = pd.read_csv("car_price_dataset.csv")

    # Remove unrealistic values: keep rows whose price and mileage both lie
    # strictly between 1,000 and 500,000. The explicit .copy() makes the
    # filtered frame independent, so the column assignments below do not
    # operate on a slice of the original frame (SettingWithCopyWarning).
    plausible_rows = (
        (df["Price"] > 1000)
        & (df["Price"] < 500000)
        & (df["Mileage"] > 1000)
        & (df["Mileage"] < 500000)
    )
    df = df[plausible_rows].copy()

    # Convert categorical columns to integer category codes.
    categorical_columns = ["Brand", "Model", "Fuel_Type", "Transmission"]
    for col in categorical_columns:
        df[col] = df[col].astype("category").cat.codes

    # Feature engineering: square-root mileage and car age replace the raw
    # "Mileage" and "Year" columns.
    df["Mileage_sqrt"] = np.sqrt(df["Mileage"])
    df["Car_Age"] = REFERENCE_YEAR - df["Year"]
    df = df.drop(columns=["Year", "Mileage"])

    # Log-transform the target variable (inverted with np.expm1 below).
    df["Log_Price"] = np.log1p(df["Price"])

    # Create training and test data BEFORE any scaler is fitted, so that no
    # statistics of the test rows leak into the training features.
    X = df.drop(columns=["Price", "Log_Price"])
    y = df["Log_Price"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Standardisation: fit on the training rows only, then apply the same
    # transformation to the test rows.
    # NOTE(review): this scaler is not part of the logged pipeline, so anyone
    # serving "ridge_model" must apply the same preprocessing first.
    scaler = StandardScaler()
    numeric_features = ["Engine_Size", "Mileage_sqrt", "Car_Age", "Doors"]
    X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
    X_test[numeric_features] = scaler.transform(X_test[numeric_features])

    # MLflow: log parameters.
    mlflow.log_param("model_type", "Polynomial Ridge Regression")
    mlflow.log_param("poly_degree", POLY_DEGREE)
    mlflow.log_param("ridge_alpha", RIDGE_ALPHA)

    # Pipeline: polynomial features -> standardisation -> ridge regression.
    pipeline = Pipeline([
        ("poly", PolynomialFeatures(degree=POLY_DEGREE, include_bias=False)),
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=RIDGE_ALPHA)),
    ])

    # Train the model.
    pipeline.fit(X_train, y_train)

    # Predict and evaluate on the original price scale (undo the log1p).
    y_pred_log = pipeline.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    y_test_price = np.expm1(y_test)
    mse = mean_squared_error(y_test_price, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_price, y_pred)

    print(f"MSE: {mse:.2f}, RMSE: {rmse:.2f}, R²: {r2:.4f}")

    # MLflow: log metrics.
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # MLflow: store the fitted pipeline as a model artifact.
    mlflow.sklearn.log_model(pipeline, "ridge_model")

    # Save the residual histogram and attach it to the run. The figure is
    # closed after saving so repeated executions do not accumulate figures.
    residuals_fig = plt.figure(figsize=(8, 5))
    sns.histplot(y_test_price - y_pred, bins=50, kde=True)
    plt.xlabel("Residuen")
    plt.ylabel("Anzahl")
    plt.title("Verteilung der Residuen")
    plt.savefig("residuals_plot.png")
    plt.close(residuals_fig)
    mlflow.log_artifact("residuals_plot.png")

    # Save the actual-vs-predicted scatter plot and attach it to the run.
    scatter_fig = plt.figure(figsize=(8, 5))
    sns.scatterplot(x=y_test_price, y=y_pred)
    plt.xlabel("Tatsächlicher Preis")
    plt.ylabel("Vorhergesagter Preis")
    plt.title("Tatsächlicher vs. Vorhergesagter Preis")
    plt.savefig("actual_vs_predicted.png")
    plt.close(scatter_fig)
    mlflow.log_artifact("actual_vs_predicted.png")

    print("✅ MLflow Run abgeschlossen!")


2025/03/15 18:24:52 INFO mlflow.tracking.fluent: Experiment with name 'Car Price Prediction' does not exist. Creating a new experiment.


🚀 MLflow Run gestartet!


FileNotFoundError: [Errno 2] No such file or directory: 'car_price_dataset.csv'