In [2]:
import boto3
import sagemaker
from sagemaker.sklearn import SKLearn
from sagemaker.session import Session
import mlflow
import mlflow.sklearn
import os


# --- Credentials -------------------------------------------------------------
# AWS credentials are deliberately NOT assigned in this notebook. boto3 resolves
# them through its standard provider chain (exported environment variables,
# ~/.aws/credentials, SSO, instance role, ...). Writing literals into
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY either leaks secrets into the
# notebook or, when the literals are empty strings, overwrites credentials that
# were already exported in the shell.

# --- AWS and MLflow configuration ---------------------------------------------
# `region_name` and `role` are kept as aliases so later cells may use either name.
aws_region = region_name = "eu-north-1"  # e.g. us-east-1
bucket_name = "formation-mlflow"
# Execution role assumed by the SageMaker training job.
role_arn = role = "arn:aws:iam::622333992348:role/service-role/AmazonSageMaker-ExecutionRole-20241206T115846"
mlflow_tracking_uri = "http://13.51.140.113:5000"  # URL of your MLflow tracking server (replace if needed)

# Provide a default region for clients created without an explicit one, but
# never override a non-empty value that the environment already supplies.
if not os.environ.get("AWS_DEFAULT_REGION"):
    os.environ["AWS_DEFAULT_REGION"] = aws_region

mlflow.set_tracking_uri(mlflow_tracking_uri)


# --- AWS and SageMaker sessions -------------------------------------------------
boto_session = boto3.Session(region_name=aws_region)
sagemaker_session = sagemaker.Session(boto_session=boto_session, default_bucket=bucket_name)





sagemaker.config INFO - Not applying SDK defaults from location: /home/mmby/.config/kdedefaults/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/mmby/.config/sagemaker/config.yaml


In [4]:
%%writefile script.py

import argparse
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split 
from sklearn.datasets import fetch_california_housing

def parse_args(argv=None):
    """Parse the command-line arguments of the training script.

    Each hyperparameter is accepted in both its dashed and its underscored
    spelling (``--n-estimators`` / ``--n_estimators``). A client that sends
    keys such as ``n_estimators`` would otherwise be silently ignored, because
    unknown options are discarded by ``parse_known_args``.

    Args:
        argv: List of argument strings; ``None`` means ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``n_estimators``, ``min_samples_leaf``,
        ``max_depth``, ``random_state``, ``model_dir``, ``train``, ``test``,
        ``train_file`` and ``test_file``.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n-estimators", "--n_estimators", type=int, default=100)
    parser.add_argument("--min-samples-leaf", "--min_samples_leaf", type=int, default=2)
    # None keeps the previous behaviour (no depth limit) when the option is absent.
    parser.add_argument("--max-depth", "--max_depth", type=int, default=None)
    # 42 was the value previously hard-coded in the model constructor.
    parser.add_argument("--random-state", "--random_state", type=int, default=42)

    # Data, model, and output directories.
    # Fall back to the current directory when SM_MODEL_DIR is not set, so the
    # script does not crash on os.path.join(None, ...).
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR") or ".")
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))  # not used: data is fetched below
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))  # not used: data is fetched below
    parser.add_argument("--train-file", type=str, default="california-housing-train.csv")
    parser.add_argument("--test-file", type=str, default="california-housing-test.csv")

    # parse_known_args: tolerate extra arguments this script does not declare.
    args, _ = parser.parse_known_args(argv)
    return args


def main():
    """Fetch the California housing data, train a random forest and persist it.

    Prints the train/test MSE and writes ``model.joblib`` into ``--model-dir``.
    """
    print("extracting arguments")
    args = parse_args()

    print("loading and preparing data")

    # Load the dataset directly; with as_frame=True `.frame` holds features and target.
    housing = fetch_california_housing(as_frame=True)
    housing_df = housing.frame

    # Split the data into training and testing sets (fixed seed for a stable split).
    train_df, test_df = train_test_split(housing_df, test_size=0.2, random_state=42)

    print("saving training and testing datasets")

    # Save both splits as CSVs in the working directory, honouring
    # --train-file / --test-file (defaults match the former hard-coded names).
    train_df.to_csv(args.train_file, index=False)
    test_df.to_csv(args.test_file, index=False)

    print("building training and testing datasets")

    # Separate features from the target column.
    X_train = train_df.drop("MedHouseVal", axis=1)
    X_test = test_df.drop("MedHouseVal", axis=1)
    y_train = train_df["MedHouseVal"]
    y_test = test_df["MedHouseVal"]

    # Train model
    print("training model")

    model = RandomForestRegressor(
        n_estimators=args.n_estimators,
        min_samples_leaf=args.min_samples_leaf,
        max_depth=args.max_depth,
        n_jobs=-1,
        random_state=args.random_state,
    )

    model.fit(X_train, y_train)

    # Print MSE
    print("validating model")

    mse_train = mean_squared_error(y_train, model.predict(X_train))
    mse_test = mean_squared_error(y_test, model.predict(X_test))

    print(f"Train MSE: {mse_train:.3f}")
    print(f"Test MSE: {mse_test:.3f}")

    # Persist model
    os.makedirs(args.model_dir, exist_ok=True)
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)


if __name__ == "__main__":
    main()

Writing script.py


In [8]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Keep a local snapshot of the California housing dataset.
housing = fetch_california_housing(as_frame=True)
housing_data = pd.DataFrame(housing.frame)

# to_csv does not create missing parent directories.
os.makedirs("data", exist_ok=True)
housing_data.to_csv("data/california_housing.csv")

# MLflow configuration
mlflow.set_experiment("SageMaker_MLflow_Experiment")


# Model hyperparameters.
# NOTE(review): these keys reach script.py as command-line options. Confirm that
# its argparse declares options spelled exactly like the keys below; options it
# does not recognise are discarded by parse_known_args without any error.
hyperparameters = {
    "max_depth": 4,
    "n_estimators": 120,
    "random_state": 42
}

# Path of the local training script (uploaded to SageMaker by the estimator).
script_path = "script.py"  # Make sure this file exists (written by the %%writefile cell)

# Regexes applied by SageMaker to the training logs; they match the
# "Train MSE: ..." / "Test MSE: ..." lines printed by script.py.
metric_definitions = [
    {"Name": "train_mse", "Regex": r"Train MSE: ([0-9.]+)"},
    {"Name": "test_mse", "Regex": r"Test MSE: ([0-9.]+)"},
]

# SageMaker estimator configuration for Scikit-learn
estimator = SKLearn(
    entry_point=script_path,
    framework_version="1.0-1",
    instance_type="ml.m5.large",
    instance_count=1,
    role=role_arn,
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    sagemaker_session=sagemaker_session,
)

# Run the MLflow experiment
with mlflow.start_run(run_name="sagemaker-randomForest") as run:
    # Record the hyperparameters in MLflow
    mlflow.log_params(hyperparameters)

    # Launch the SageMaker training job and block until it finishes
    train_input = f"s3://{bucket_name}/train_data/"
    estimator.fit({"train": train_input}, wait=True)

    # Record the trained model in MLflow.
    # mlflow.sklearn.log_model needs an in-memory estimator object (`sk_model`),
    # not an S3 URI, so the archive produced by SageMaker is downloaded and
    # attached to the run as a plain artifact instead.
    model_uri = estimator.model_data  # S3 path of the trained model archive
    model_local_path = "model.tar.gz"
    mlflow.set_tag("sagemaker_model_uri", model_uri)

    # Split "s3://<bucket>/<key>" into its bucket and key parts.
    model_bucket, _, model_key = model_uri[len("s3://"):].partition("/")
    boto_session.client("s3").download_file(model_bucket, model_key, model_local_path)
    mlflow.log_artifact(model_local_path, artifact_path="model")

    # Record the metrics SageMaker extracted from the training logs, replacing
    # the former hard-coded placeholder values.
    job_description = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=estimator.latest_training_job.name
    )
    for final_metric in job_description.get("FinalMetricDataList", []):
        mlflow.log_metric(final_metric["MetricName"], final_metric["Value"])

    print(f"Run ID: {run.info.run_id}")



2024-12-18 10:10:00 Starting - Starting the training job...
2024-12-18 10:10:15 Starting - Preparing the instances for training...
2024-12-18 10:11:04 Downloading - Downloading the training image......
2024-12-18 10:12:00 Training - Training image download completed. Training in progress..2024-12-18 10:12:05,286 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-12-18 10:12:05,290 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-12-18 10:12:05,292 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-12-18 10:12:05,307 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-12-18 10:12:05,526 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-12-18 10:12:05,530 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-12-18 10:12:05,547 sagemaker-training-toolkit INFO    

In [9]:
# Display the S3 location of the trained model archive (taken from estimator.model_data).
model_uri

's3://formation-mlflow/sagemaker-scikit-learn-2024-12-18-10-09-57-804/output/model.tar.gz'

In [7]:
# Display the local file name chosen for the model archive.
model_local_path

'model.tar.gz'