upload dataset

In [None]:
# Colab-only upload helper. The recorded output below ("Saving housing.csv to
# housing.csv") shows the chosen file being written into the working directory.
from google.colab import files
# `uploaded` keeps whatever files.upload() returns; the later cells read
# 'housing.csv' from disk by name rather than from this variable.
uploaded = files.upload() #housing.csv

Saving housing.csv to housing.csv


install dependencies

In [None]:
!pip install scikit-learn xgboost joblib pandas mlflow

Collecting mlflow
  Downloading mlflow-3.1.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.0 (from mlflow)
  Downloading mlflow_skinny-3.1.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading databricks_sdk-0.56.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading opentele

data loading, preprocessing, training

In [None]:
import pandas as pd
import joblib
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import hashlib
import shutil
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


load data

In [None]:
# Read the uploaded housing table from the working directory.
df = pd.read_csv('housing.csv')

# Quick sanity report: dimensions first, then the dtype of every column.
print(f"dataset shape: {df.shape}")
print("feature types:\n", df.dtypes)

# The target is the median house value; every remaining column is a feature.
y = df['median_house_value']
X = df.drop('median_house_value', axis=1)

# Write an index-free copy of the frame to disk and attach it to MLflow.
# NOTE(review): no run is started explicitly in this cell, so the artifact
# lands in whatever run MLflow's fluent API considers active - confirm that
# is the intended run.
snapshot_path = 'housing_snapshot.csv'
df.to_csv(snapshot_path, index=False)
mlflow.log_artifact(snapshot_path)

# generate data hash for versioning
def get_data_hash(file_path, chunk_size=1 << 20):
  """Return the SHA-256 hex digest of the file at ``file_path``.

  The file is opened in binary mode and fed to the hasher ``chunk_size`` bytes
  at a time, so memory use stays bounded no matter how large the file is.

  Args:
    file_path: Path of the file to fingerprint.
    chunk_size: Number of bytes read per step (default 1 MiB). Must be > 0.

  Returns:
    The digest as a lowercase hexadecimal string.

  Raises:
    ValueError: If ``chunk_size`` is not positive.
    OSError: If the file cannot be opened or read.
  """
  if chunk_size <= 0:
    # read(0) would return b'' immediately and yield the hash of "nothing".
    raise ValueError("chunk_size must be a positive integer")

  digest = hashlib.sha256()
  with open(file_path, 'rb') as f:
    # iter(callable, sentinel) keeps reading until read() returns b'' (EOF).
    for chunk in iter(lambda: f.read(chunk_size), b''):
      digest.update(chunk)
  return digest.hexdigest()

# Fingerprint the snapshot file, then record the digest as an MLflow tag so
# the exact data version can be looked up later.
data_hash = get_data_hash(file_path=snapshot_path)
mlflow.set_tag(key='data_hash', value=data_hash)


dataset shape: (20640, 10)
feature types:
 longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object


In [None]:
# Split the feature columns into numeric and categorical groups.
# Selecting on the generic 'number' kind (rather than float64 only) also picks
# up integer-typed columns; any column listed in neither group is not passed
# to either sub-pipeline of the ColumnTransformer below.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric branch: fill gaps, standardise, then project onto 5 components.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')), # handle missing data
    ('scaler', StandardScaler()), # normalization
    # random_state only matters if PCA picks a randomized solver; fixing it
    # keeps the projection reproducible, using the same seed as the split below.
    ('pca', PCA(n_components=5, random_state=42)) # component reduction
])

# Categorical branch: fill gaps with the mode, then one-hot encode.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown="ignore": categories unseen during fit encode as all zeros
    ("onehot", OneHotEncoder(handle_unknown="ignore")) # convert categorical to numeric
])

pipeline = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Hold out 20% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the preprocessing on the training rows only, then apply it unchanged to
# the held-out rows so no test-set statistics leak into the transform.
X_train_proc = pipeline.fit_transform(X_train)
X_test_proc = pipeline.transform(X_test)


In [None]:
# Candidate estimators and the hyper-parameter grid searched for each.
# Both estimators get a fixed random_state (the same seed used for the
# train/test split) so repeated runs of the notebook produce the same models.
models = {
    "rf": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "n_estimators": [50, 100], # number of decision trees
            "max_depth": [5, 10, None] # depth of trees (None = unlimited)
        }
    },
    "xgb": {
        "model": XGBRegressor(random_state=42),
        "params": {
            "n_estimators": [50, 100],
            "max_depth": [3, 6],
            "learning_rate": [0.1, 0.05]
        }
    }
}

# Trackers for the best candidate found by the tuning loop.
best_score = float('inf')   # lowest RMSE seen so far (lower is better)
best_model = None           # fitted estimator that achieved best_score
best_model_name = None      # key in `models` of that estimator
best_pipeline = pipeline    # same object as `pipeline`, not a copy


In [None]:
# Close whatever MLflow run is currently active before the tuning loop opens
# its own named runs. The data-loading cell calls mlflow.log_artifact and
# mlflow.set_tag without an explicit start_run - presumably that left an
# implicit run open; verify against the MLflow fluent-API docs.
mlflow.end_run()

In [None]:
# Reset the "best so far" trackers in this cell so that re-running it starts
# from a clean slate instead of comparing against a score left over from a
# previous execution.
best_score = float('inf')
best_model = None
best_model_name = None

for name, config in models.items():
  print(f"Tuning {name}...")
  # 5-fold grid search; the scorer is negated RMSE, so "higher is better".
  grid = GridSearchCV(config["model"], config["params"], cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
  grid.fit(X_train_proc, y_train)
  preds = grid.predict(X_test_proc)

  # Held-out metrics for the best configuration found by the search.
  mse = mean_squared_error(y_test, preds)
  rmse = np.sqrt(mse) # root of mse
  mae = mean_absolute_error(y_test, preds)
  r2 = r2_score(y_test, preds)

  print(f"{name} RMSE: {rmse: .2f}")
  print(f"{name} MAE: {mae: .2f}")
  # Precision is ".2f" like the two lines above; the previous " 2f" spec
  # meant "width 2" and printed six decimals.
  print(f"{name} R2: {r2: .2f}")

  # MLflow logging: one named run per candidate model.
  with mlflow.start_run(run_name=name):
    mlflow.log_params(grid.best_params_)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    mlflow.set_tag("model_type", name)
    mlflow.set_tag("dataset_version", "v1.0")
    mlflow.set_tag("feature_types", str(df.dtypes.to_dict()))
    mlflow.set_tag("data_hash", data_hash)

    # Supplying a signature and an input example avoids MLflow's warning about
    # models logged without them. The example is a single preprocessed row,
    # densified first if the transformer produced a sparse matrix.
    input_example = pd.DataFrame(X_test_proc[:1].toarray() if hasattr(X_test_proc, "toarray") else X_test_proc[:1])
    signature = infer_signature(X_test_proc, preds)

    # log model
    mlflow.sklearn.log_model(
        sk_model=grid.best_estimator_,
        artifact_path="model",
        signature=signature,
        input_example=input_example
    )

    # Store the fitted preprocessing next to the model: the logged estimator
    # expects already-transformed features.
    joblib.dump(best_pipeline, "pipeline.pkl")
    mlflow.log_artifact("pipeline.pkl")

  # Keep the candidate with the lowest held-out RMSE.
  # NOTE(review): the winner is chosen on the same split the metrics above are
  # reported on, so the winning score is an optimistic estimate.
  if rmse < best_score:
    best_score = rmse
    best_model = grid.best_estimator_
    best_model_name = name

if best_model is None:
  # Only reachable when `models` is empty; avoids a NameError/format error.
  print("No models were evaluated.")
else:
  print(f"Best model: {best_model_name} with RMSE: {best_score: .2f}")



Tuning rf...




rf RMSE:  58439.49
rf MAE:  39363.90
rf R2:  0.739381
Tuning xgb...




xgb RMSE:  56844.48
xgb MAE:  38601.74
xgb R2:  0.753413
Best model: xgb with RMSE:  56844.48


In [None]:
# Persist the winning estimator and the preprocessing transformer to the
# working directory. `best_pipeline` was bound to the `pipeline` object before
# it was fitted, so this writes that same (now fitted) transformer.
joblib.dump(value=best_model, filename="model.pkl")
joblib.dump(value=best_pipeline, filename="pipeline.pkl")

['pipeline.pkl']

download

In [None]:
from google.colab import files

# Send both saved artifacts to the browser, model first and pipeline second.
for artifact_name in ('model.pkl', 'pipeline.pkl'):
  files.download(artifact_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

archive and download the MLflow run logs (mlruns/)

In [None]:
# Bundle the local MLflow tracking directory and send it to the browser.
# shutil.make_archive writes a fresh mlruns.zip on every execution, whereas
# `zip -r` onto an existing archive updates it in place and keeps entries for
# runs that have since been deleted. Entries keep their "mlruns/" prefix.
from google.colab import files

shutil.make_archive("mlruns", "zip", root_dir=".", base_dir="mlruns")
files.download("mlruns.zip")

  adding: mlruns/ (stored 0%)
  adding: mlruns/0/ (stored 0%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/ (stored 0%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/artifacts/ (stored 0%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/artifacts/pipeline.pkl (deflated 50%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/params/ (stored 0%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/params/max_depth (stored 0%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/params/n_estimators (stored 0%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/metrics/ (stored 0%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/metrics/r2 (deflated 44%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/metrics/mae (deflated 43%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/metrics/rmse (deflated 43%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/outputs/ (stored 0%)
  adding: mlruns/0/b17bee53d421424d8a4b04546b79c033/outputs/m-1ccbfcddc3ee4cf0a275d8da3e

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>