In [1]:
import mlflow
# Address of the MLflow tracking server on the loopback interface; every run
# started later in this notebook is recorded there.
TRACKING_URI = "http://127.0.0.1:5000"

mlflow.set_tracking_uri(TRACKING_URI)

# Echo the URI MLflow actually holds, to confirm the setting took effect.
print("Tracking URI:", mlflow.get_tracking_uri())


Tracking URI: http://127.0.0.1:5000


In [2]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

from pathlib import Path

# Fetch the California housing dataset with pandas containers; `.frame` is the
# single DataFrame view of it (its columns include the MedHouseVal target).
data = fetch_california_housing(as_frame=True)
df = data.frame

# DataFrame.to_csv does not create missing parent directories, so make sure
# the destination folder exists before writing (a fresh checkout may lack it).
raw_csv_path = "../data/raw/housing.csv"
Path(raw_csv_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the file so it round-trips cleanly
# through pd.read_csv without an extra unnamed column.
df.to_csv(raw_csv_path, index=False)

print("Dataset saved to data/raw/housing.csv")
df.head()


Dataset saved to data/raw/housing.csv


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import mlflow
import mlflow.sklearn

# Read the housing CSV back from disk and separate the target from the features.
TARGET_COLUMN = "MedHouseVal"
df = pd.read_csv("../data/raw/housing.csv")
y = df[TARGET_COLUMN]
X = df.drop(TARGET_COLUMN, axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Everything logged inside this context belongs to one MLflow run
# named "LinearRegression".
with mlflow.start_run(run_name="LinearRegression"):
    # fit() returns the estimator itself, so construction and training chain.
    model = LinearRegression().fit(X_train, y_train)
    preds = model.predict(X_test)
    mse = mean_squared_error(y_true=y_test, y_pred=preds)

    # Record the held-out error and the fitted model as run artifacts.
    mlflow.log_metric("mse", mse)
    mlflow.sklearn.log_model(model, artifact_path="model")

    print("MSE:", mse)




MSE: 0.5558915986952443
🏃 View run LinearRegression at: http://127.0.0.1:5000/#/experiments/0/runs/fa3800e1216f4258955242e478d03e2a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


In [4]:
import joblib

from pathlib import Path

# Provisional save of the linear model. The comparison cell further down
# writes to this same path, so this file is overwritten once a winner is picked.
model_path = "../models/best_model.pkl"

# joblib.dump opens the target file directly and fails with FileNotFoundError
# if the folder is missing, so create it up front.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model, model_path)


['../models/best_model.pkl']

In [5]:
from sklearn.tree import DecisionTreeRegressor

# Hyperparameters live in one mapping so the estimator and the value logged
# to MLflow are read from the same place.
tree_params = {"max_depth": 5, "random_state": 42}

# Train and evaluate on the same split as the linear model, in its own run.
with mlflow.start_run(run_name="DecisionTreeRegressor"):
    model2 = DecisionTreeRegressor(**tree_params).fit(X_train, y_train)
    preds2 = model2.predict(X_test)
    mse2 = mean_squared_error(y_true=y_test, y_pred=preds2)

    # Record the depth setting, the held-out error, and the fitted tree.
    mlflow.log_param("max_depth", tree_params["max_depth"])
    mlflow.log_metric("mse", mse2)
    mlflow.sklearn.log_model(model2, artifact_path="model")

    print("Decision Tree MSE:", mse2)




Decision Tree MSE: 0.5245146178314735
🏃 View run DecisionTreeRegressor at: http://127.0.0.1:5000/#/experiments/0/runs/e26f062281bc4fa2853777161b79afe6
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


In [6]:
# Compare the held-out MSE of the two fitted models and keep the lower one.
print(f"Linear Regression MSE: {mse}")
print(f"Decision Tree MSE: {mse2}")

# Strict `<`: on an exact tie the Decision Tree branch is taken.
if mse < mse2:
    best_model = model
    print("✅ Using Linear Regression as best model.")
else:
    best_model = model2
    print("✅ Using Decision Tree as best model.")

# Save the best model
import joblib
from pathlib import Path

best_model_path = "../models/best_model.pkl"

# joblib.dump does not create missing directories; make sure the target
# folder exists so the save works on a fresh checkout.
Path(best_model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, best_model_path)
print("✅ Best model saved to models/best_model.pkl")


Linear Regression MSE: 0.5558915986952443
Decision Tree MSE: 0.5245146178314735
✅ Using Decision Tree as best model.
✅ Best model saved to models/best_model.pkl
