In [3]:
import mlflow
import pandas as pd
from sklearn.datasets import fetch_california_housing
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
california = fetch_california_housing(as_frame=True)
df = california.frame
df

In [None]:
df.describe()

In [None]:
# %%writefile script.py

import mlflow
import pandas as pd
from sklearn.datasets import fetch_california_housing
import matplotlib.pyplot as plt
import seaborn as sns


mlflow.set_tracking_uri('http://13.51.140.113:5000/')



# Initialiser l'expérience MLflow
mlflow.set_experiment("california_housing_regression")

with mlflow.start_run(run_name="data_exploration") as run:
    # Charger le jeu de données
    california = fetch_california_housing(as_frame=True)
    df = california.frame
    mlflow.log_param("dataset_shape", df.shape)

    # Analyse descriptive
    desc = df.describe()
    print(desc)
    mlflow.log_text(desc.to_string(), "description.txt")
    mlflow.log_artifact("script.py")

    # Visualisation des distributions (exemples)
    plt.figure(figsize=(10, 6))
    sns.histplot(df['MedHouseVal'], bins=50)
    plt.title('Distribution des Prix des Maisons')
    plt.savefig("histogram_price.png")
    mlflow.log_artifact("histogram_price.png")

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x="MedInc", y="MedHouseVal", data=df)
    plt.title("Relation entre Revenu Médian et Prix")
    plt.savefig("scatter_income_price.png")
    mlflow.log_artifact("scatter_income_price.png")

    # Log des infos, observations, etc.
    mlflow.log_text("Observation: Le prix des maisons a une distribution non normale...", "data_insights.txt")
    print("Exploration terminée")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

with mlflow.start_run(run_name="logistic_regression_baseline") as run:
    # Préparation des données pour la classification (exemple: prix > moyenne devient 1, sinon 0)
    df['target_class'] = (df['MedHouseVal'] > df['MedHouseVal'].mean()).astype(int)
    X = df.drop(['MedHouseVal','target_class'], axis=1)
    y = df['target_class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Mise à l'échelle des features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Entraînement du modèle de régression logistique
    model = LogisticRegression(solver='liblinear', random_state=42)  #solver to prevent warnings
    model.fit(X_train_scaled, y_train)

    # Prédiction et évaluation
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Log des paramètres, métriques et modèle
    mlflow.log_param("solver", "liblinear")
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_dict(report, "classification_report.json")
    mlflow.sklearn.log_model(model, "logistic_model")
    print("Modèle Logistic Regression Entraîné")


In [47]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=30, random_state=42)
}

# Prepare Data
X = df.drop(['MedHouseVal','target_class'], axis=1) # using the target_class column for now
y = df['MedHouseVal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for name, model in models.items():
    with mlflow.start_run(run_name=name) as run:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mlflow.log_param("name", name)


        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("r2", r2)

        input_example = pd.DataFrame(df).sample(1)
        mlflow.sklearn.log_model(sk_model=model, artifact_path='model',input_example=input_example)

        print(f"{name}: MSE = {mse:.2f}, R2 = {r2:.2f}")


  "dataframe_split": {
    "columns": [
      "MedInc",
      "HouseAge",
      "AveRooms",
      "AveBedrms",
      "Population",
      "AveOccup",
      "Latitude",
      "Longitude",
      "MedHouseVal",
      "target_class"
    ],
    "data": [
      [
        4.875,
        33.0,
        6.2809917355371905,
        0.987603305785124,
        658.0,
        2.71900826446281,
        33.78,
        -117.96,
        2.693,
        1
      ]
    ]
  }
}. Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- MedHouseVal
- target_clas

LinearRegression: MSE = 0.56, R2 = 0.58
🏃 View run LinearRegression at: http://13.51.140.113:5000/#/experiments/972577552072586799/runs/537442f3ca17468d9f4e51404b6186c4
🧪 View experiment at: http://13.51.140.113:5000/#/experiments/972577552072586799


  "dataframe_split": {
    "columns": [
      "MedInc",
      "HouseAge",
      "AveRooms",
      "AveBedrms",
      "Population",
      "AveOccup",
      "Latitude",
      "Longitude",
      "MedHouseVal",
      "target_class"
    ],
    "data": [
      [
        1.2281,
        25.0,
        5.503978779840849,
        1.1538461538461537,
        991.0,
        2.6286472148541113,
        39.77,
        -123.23,
        0.603,
        0
      ]
    ]
  }
}. Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- MedHouseVal
- target_c

Ridge: MSE = 0.56, R2 = 0.58
🏃 View run Ridge at: http://13.51.140.113:5000/#/experiments/972577552072586799/runs/6ebbd008f5d94862b4ed2456af74a237
🧪 View experiment at: http://13.51.140.113:5000/#/experiments/972577552072586799


  "dataframe_split": {
    "columns": [
      "MedInc",
      "HouseAge",
      "AveRooms",
      "AveBedrms",
      "Population",
      "AveOccup",
      "Latitude",
      "Longitude",
      "MedHouseVal",
      "target_class"
    ],
    "data": [
      [
        3.875,
        35.0,
        5.546568627450981,
        1.0465686274509804,
        1001.0,
        2.4534313725490198,
        32.84,
        -117.19,
        1.72,
        0
      ]
    ]
  }
}. Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- MedHouseVal
- target_cl

GradientBoosting: MSE = 0.29, R2 = 0.78
🏃 View run GradientBoosting at: http://13.51.140.113:5000/#/experiments/972577552072586799/runs/e6f8fe53256042498bb6fb942c28e875
🧪 View experiment at: http://13.51.140.113:5000/#/experiments/972577552072586799


  "dataframe_split": {
    "columns": [
      "MedInc",
      "HouseAge",
      "AveRooms",
      "AveBedrms",
      "Population",
      "AveOccup",
      "Latitude",
      "Longitude",
      "MedHouseVal",
      "target_class"
    ],
    "data": [
      [
        1.625,
        4.0,
        3.0,
        0.5,
        8.0,
        1.3333333333333333,
        35.22,
        -117.76,
        2.75,
        1
      ]
    ]
  }
}. Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- MedHouseVal
- target_class



In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

with mlflow.start_run(run_name="random_forest_tuning") as run:
  rf_model = RandomForestRegressor(random_state=42)
  param_dist = {
        "n_estimators": np.arange(20, 60),
        "max_depth": [5, 10, 15, None],
        "min_samples_split": np.arange(2, 10),
        "min_samples_leaf": np.arange(1, 5)
  }
  rf_random = RandomizedSearchCV(rf_model, param_dist, n_iter=2, cv=3, random_state=42, scoring='neg_mean_squared_error')
  rf_random.fit(X_train, y_train)

  best_model = rf_random.best_estimator_
  best_params = rf_random.best_params_
  y_pred = best_model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)

  mlflow.log_params(best_params)
  mlflow.log_metric("mse", mse)
  mlflow.log_metric("r2", r2)
  mlflow.sklearn.log_model(best_model, "best_model")
  print(f"Best Random Forest Model (Tuned): MSE = {mse:.2f}, R2 = {r2:.2f}")

In [None]:
import mlflow.models
from mlflow.models.signature import infer_signature

with mlflow.start_run(run_name="linear_regression_signature") as run:
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    signature = infer_signature(X_train, y_pred) # this also works on Pandas DataFrames
    mlflow.sklearn.log_model(model, "model", signature=signature)
    print("Model with signature logged.")


In [None]:
from mlflow import MlflowClient
with mlflow.start_run(run_name="register_best_model") as run:

    # Retrieve Best Model Run
    client = MlflowClient()
    best_run_id = run.info.run_id

    # register the model
    model_uri = f"runs:/758f873092154a7e90ce88cf6766e4ff/best_model"
    registered_model = mlflow.register_model(model_uri, "california_housing_model")
    print(f"Model registered in model registry with name:{registered_model.name}, version: {registered_model.version}")


In [30]:
import mlflow
from mlflow.tracking import MlflowClient


"""
Liste les expériences, les runs, sélectionne un run et affiche ses métadonnées.
"""
client = MlflowClient()

print("------------------- Experiments --------------------")
experiments = client.search_experiments()
for exp in experiments:
    print(f"Experiment Name: {exp.name}, ID: {exp.experiment_id}")

experiments



------------------- Experiments --------------------
Experiment Name: california_housing_regression, ID: 972577552072586799
Experiment Name: Diabetes-3, ID: 985020413726465322
Experiment Name: Diabetes-1, ID: 411509918485022525
Experiment Name: Diabetes, ID: 653027391595642036
Experiment Name: Default, ID: 0


[<Experiment: artifact_location='mlflow-artifacts:/972577552072586799', creation_time=1734426297101, experiment_id='972577552072586799', last_update_time=1734426297101, lifecycle_stage='active', name='california_housing_regression', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/985020413726465322', creation_time=1734425081209, experiment_id='985020413726465322', last_update_time=1734425081209, lifecycle_stage='active', name='Diabetes-3', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/411509918485022525', creation_time=1734424687459, experiment_id='411509918485022525', last_update_time=1734424687459, lifecycle_stage='active', name='Diabetes-1', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/653027391595642036', creation_time=1734358806562, experiment_id='653027391595642036', last_update_time=1734358806562, lifecycle_stage='active', name='Diabetes', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1734357804055, exp

In [32]:
experiment_id = 985020413726465322
print("--------------------- Runs -----------------------")
all_runs = client.search_runs(experiment_id)
if not all_runs:
    print("No runs found for the current experiment. Please run an MLflow experiment first.")

for run in all_runs:
    print(f"Run ID: {run.info.run_name},; Run ID: {run.info.run_id}, Experiment ID: {run.info.experiment_id}")

all_runs

--------------------- Runs -----------------------
Run ID: diabetes_RForest_55,; Run ID: 2933bc841b164630ba16f1067fda24a9, Experiment ID: 985020413726465322
Run ID: diabetes_RForest_50,; Run ID: de4a8a6f05dd4a169c8ae16b4391bc1c, Experiment ID: 985020413726465322
Run ID: wise-dolphin-93,; Run ID: a3ed7b5b17324fb7af364966f9ad109d, Experiment ID: 985020413726465322
Run ID: unleashed-eel-385,; Run ID: 16927c6f7f824923ab132448b1e0052a, Experiment ID: 985020413726465322
Run ID: diabetes_RForest_55,; Run ID: 181829ec2ff94854a465047cc95943ca, Experiment ID: 985020413726465322
Run ID: diabetes_RForest_50,; Run ID: 51eb9909f7f94f11b004e50e41d3ef5b, Experiment ID: 985020413726465322
Run ID: diabetes_RForest_50,; Run ID: 34b4b53b952e4e17ad1546a232c348e3, Experiment ID: 985020413726465322


[<Run: data=<RunData: metrics={'mse': 3068.8356356207637}, params={'n_estimators': '55', 'random_state': '500'}, tags={'mlflow.log-model.history': '[{"run_id": "2933bc841b164630ba16f1067fda24a9", '
                              '"artifact_path": "model", "utc_time_created": '
                              '"2024-12-17 08:56:37.246735", "model_uuid": '
                              '"06102e81c0d54455aacc5da2b20f60f3", "flavors": '
                              '{"python_function": {"model_path": "model.pkl", '
                              '"predict_fn": "predict", "loader_module": '
                              '"mlflow.sklearn", "python_version": "3.10.12", '
                              '"env": {"conda": "conda.yaml", "virtualenv": '
                              '"python_env.yaml"}}, "sklearn": '
                              '{"pickled_model": "model.pkl", '
                              '"sklearn_version": "1.6.0", '
                              '"serialization_format": "cloudp

In [33]:
selected_run_id = input("Enter the run ID of the run you want to inspect :")

# Rechercher le run sélectionné
try:
    selected_run = client.get_run(selected_run_id)
except Exception as e:
    print(f"Error: Run with ID '{selected_run_id}' not found or an error occured. Details: {e}")

    
print("----------------- Selected Run Metadata ------------------")
print(f"Run ID: {selected_run.info.run_id}")
print(f"Experiment ID: {selected_run.info.experiment_id}")
print(f"Start Time: {selected_run.info.start_time}")
print(f"Status: {selected_run.info.status}")
print("--------------------- Parameters -----------------------")
for key, value in selected_run.data.params.items():
    print(f"  {key}: {value}")

print("--------------------- Metrics -----------------------")
for key, value in selected_run.data.metrics.items():
    print(f"  {key}: {value}")

print("--------------------- Tags -----------------------")
for key, value in selected_run.data.tags.items():
    print(f"  {key}: {value}")

print("--------------------- Artifacts -----------------------")
artifacts_list = client.list_artifacts(selected_run_id)
for artifact in artifacts_list:
    print(f"  {artifact.path}")


----------------- Selected Run Metadata ------------------
Run ID: 2933bc841b164630ba16f1067fda24a9
Experiment ID: 985020413726465322
Start Time: 1734425796508
Status: FINISHED
--------------------- Parameters -----------------------
  random_state: 500
  n_estimators: 55
--------------------- Metrics -----------------------
  mse: 3068.8356356207637
--------------------- Tags -----------------------
  mlflow.log-model.history: [{"run_id": "2933bc841b164630ba16f1067fda24a9", "artifact_path": "model", "utc_time_created": "2024-12-17 08:56:37.246735", "model_uuid": "06102e81c0d54455aacc5da2b20f60f3", "flavors": {"python_function": {"model_path": "model.pkl", "predict_fn": "predict", "loader_module": "mlflow.sklearn", "python_version": "3.10.12", "env": {"conda": "conda.yaml", "virtualenv": "python_env.yaml"}}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "1.6.0", "serialization_format": "cloudpickle", "code": null}}}, {"run_id": "2933bc841b164630ba16f1067fda24a9", "artifa

In [35]:
client = MlflowClient()

# register the model
model_uri = f"runs:/{selected_run_id}/model"
registered_model = mlflow.register_model(model_uri, "auto_registred_california_housing_model")
print(f"Model registered in model registry with name:{registered_model.name}, version: {registered_model.version}")


Registered model 'auto_registred_california_housing_model' already exists. Creating a new version of this model...
2024/12/17 11:19:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: auto_registred_california_housing_model, version 1


Model registered in model registry with name:auto_registred_california_housing_model, version: 1


Created version '1' of model 'auto_registred_california_housing_model'.


In [45]:
all_registered_models = client.search_registered_models()
all_registered_models

[<RegisteredModel: aliases={}, creation_timestamp=1734428182771, description='', last_updated_timestamp=1734431907524, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1734428182930, current_stage='None', description='', last_updated_timestamp=1734428182930, name='CHousing-1', run_id='758f873092154a7e90ce88cf6766e4ff', run_link='', source='mlflow-artifacts:/972577552072586799/758f873092154a7e90ce88cf6766e4ff/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>,
  <ModelVersion: aliases=[], creation_timestamp=1734431821420, current_stage='Production', description='', last_updated_timestamp=1734431907524, name='CHousing-1', run_id='2933bc841b164630ba16f1067fda24a9', run_link='', source='models:/auto_registred_california_housing_model/1', status='READY', status_message='', tags={}, user_id='', version='2'>], name='CHousing-1', tags={'Evaluation': 'High', 'Stage': 'Dev'}>,
 <RegisteredModel: aliases={}, creation_timestamp=1734430723367, descr

In [None]:
# Tache parcours les runs de l'expriment "Diabetes-3" pour comparer les metrics (les afficher , choisir le metric)
# enregistrer le model avec les meilleurs  metrics
