In [None]:
# Import pieces from codeflare-sdk
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication


In [None]:
# Create the authentication object that carries the user's cluster permissions.
# If this cell is not used, the SDK will automatically check for a default kubeconfig,
# then for an in-cluster config.
# KubeConfigFileAuthentication can also be used to specify a kubeconfig path manually.
auth = TokenAuthentication(
    token = "TOKEN",    # placeholder: substitute a real token at run time; never commit real credentials
    server = "SERVER",  # placeholder: substitute the cluster API server address
    skip_tls=True       # NOTE(review): presumably disables TLS verification — confirm this is acceptable outside demos
)
auth.login()

In [None]:
# Describe the cluster first (name, head/worker sizing, image), then build the
# cluster object (and appwrapper) from that description.
cluster_name = 'terrestrial-raytest'
cluster_config = ClusterConfiguration(
    name=cluster_name,
    head_cpus=1,
    head_memory=4,
    num_workers=2,
    min_cpus=1,
    max_cpus=1,
    min_memory=4,
    max_memory=4,
    num_gpus=0,
    image="quay.io/modh/ray:2.35.0-py311-cu121",
)
cluster = Cluster(cluster_config)

In [None]:
# Bring up the cluster
cluster.up()
# Called right after up(); presumably blocks until the cluster reports ready — confirm against the SDK docs
cluster.wait_ready()

In [None]:
# Query the status of the cluster created above
cluster.status()

In [None]:
from codeflare_sdk import generate_cert
# Create required TLS cert and export the environment variables to enable TLS.
# Both calls receive the cluster name and the namespace taken from the cluster's config.
generate_cert.generate_tls_cert(cluster_name, cluster.config.namespace)
generate_cert.export_env(cluster_name, cluster.config.namespace)

In [None]:
!pip install numpy
!pip install ray[default]==2.35.0
!pip install onnxruntime
!pip install ml_metadata

In [278]:
import ray

# Address of the cluster created above, as returned by the SDK's cluster object
ray_cluster_uri = cluster.cluster_uri()
# Close any Ray connection still held by this kernel (presumably so the next cell can call ray.init() cleanly)
ray.shutdown()

In [None]:
# Additional libs for the job, plus the environment variables its workers need
runtime_env = {
    "pip": ["ipython", "torch", "onnx", "ray[train]", "ml_metadata", "protobuf==3.20.1"],
    # PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION is a protobuf environment variable, not a
    # Ray system-config key, so it is passed to the workers through env_vars instead
    # of _system_config.
    "env_vars": {"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python"},
}

ray.init(address=ray_cluster_uri, runtime_env=runtime_env)

print("Ray cluster is up and running: ", ray.is_initialized())

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import ray
from ray import tune
import time
import os

import grpc
import ml_metadata as mlmd
import ml_metadata.metadata_store.pywrap.metadata_store_extension.metadata_store
from grpc import insecure_channel
from ml_metadata.metadata_store import metadata_store
from ml_metadata.proto import metadata_store_pb2
from ml_metadata.proto import metadata_store_service_pb2
from ml_metadata.proto import metadata_store_service_pb2_grpc
from contextlib import contextmanager

## **  SET UP THE CONNECTION CONFIGURATION FOR THE DATABASE ** ##
# Set up the connection configuration for the MySQL database
#connection_config = metadata_store_pb2.ConnectionConfig()
#connection_config.mysql.host = 'mlmetadataservice'
#connection_config.mysql.port = 3306
#connection_config.mysql.database = 'mlmetadata'
#connection_config.mysql.user = 'usermetadata'
#connection_config.mysql.password = 'password'

## ** USE CONTEXTMANAGER WHILE CREATING METADATA STORE CONNECTION TO AVOID SERIALIZATION ** ##
@contextmanager
def get_metadata_store(connection_config=None):
    """Yield a MetadataStore client, created on demand inside the function.

    Args:
        connection_config: optional ``metadata_store_pb2.ConnectionConfig``
            (direct database access) or ``MetadataStoreClientConfig`` (gRPC).
            When omitted, a gRPC client config for ``mlmetadataservice:8080``
            is built, which is the endpoint the notebook's gRPC channel uses.

    Yields:
        The ``metadata_store.MetadataStore`` instance.
    """
    if connection_config is None:
        # The module-level `connection_config` is never defined (the MySQL
        # settings are commented out), so the default is built here rather
        # than read from a global that would raise NameError.
        connection_config = metadata_store_pb2.MetadataStoreClientConfig()
        connection_config.host = 'mlmetadataservice'
        connection_config.port = 8080

    # Upgrade migration is only requested for a direct database connection;
    # for a gRPC client the schema is owned by the server side.
    uses_database = isinstance(connection_config, metadata_store_pb2.ConnectionConfig)
    mdstore = metadata_store.MetadataStore(
        connection_config, enable_upgrade_migration=uses_database
    )
    yield mdstore

# Create a gRPC channel with the appropriate options:
# both the receive and the send message-size limits are set to 512 * 1024 * 1024 bytes.
channel = grpc.insecure_channel(
    'mlmetadataservice:8080',
    options=[('grpc.max_receive_message_length', 512 * 1024 * 1024),
             ('grpc.max_send_message_length', 512 * 1024 * 1024)]
)
# Service stub bound to the channel above.
# NOTE(review): neither `channel` nor `metadata_store_stub` is referenced again in the visible cells — confirm they are still needed.
metadata_store_stub = metadata_store_service_pb2_grpc.MetadataStoreServiceStub(channel)


# NOTE(review): `global` statements at module scope have no effect in Python; a function
# that assigns one of these names still creates a local unless it declares the name
# `global` inside its own body.
global data_type_id, hpo_config_id, model_type_id, trainer_type_id
global experiment_type_id 
global trial_type_id

## ** CREATE ARTIFACT , CONTEXT AND EXECUTION TYPES FOR YOUR MODEL **##

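'''
For the model in this code, create_mdtypes() registers the types below.

DataSet: artifact type
Trainer: execution type
Trial: context type

The Experiment context type (HPO experiment) and the HPOConfig and Model
artifact types are currently commented out in create_mdtypes().
'''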

def create_mdtypes():
    """Register the MLMD types used by this notebook and publish their ids.

    Registers the "DataSet" artifact type, the "Trainer" execution type and
    the "Trial" context type, and stores the ids returned by the metadata
    store in the module-level names ``data_type_id``, ``trainer_type_id`` and
    ``trial_type_id``, which the logging helpers read.
    """
    # Without this declaration the assignments below only create function
    # locals, and the logging helpers fail with NameError when they look the
    # ids up. Add any re-enabled id (hpo_config_id, model_type_id,
    # experiment_type_id) to this list when uncommenting its block.
    global data_type_id, trainer_type_id, trial_type_id

    with get_metadata_store() as mds:
        # Create ArtifactTypes for DataSet
        data_type = metadata_store_pb2.ArtifactType()
        data_type.name = "DataSet"
        data_type.properties["ds_input_size"] = metadata_store_pb2.INT
        data_type.properties["ds_output_size"] = metadata_store_pb2.INT
        data_type_id = mds.put_artifact_type(data_type)

        # Create ArtifactTypes for HPOConfig
        #hpo_config = metadata_store_pb2.ArtifactType()
        #hpo_config.name = "HPOConfig"
        #hpo_config.properties["hp_hidden_size"] = metadata_store_pb2.INT
        #hpo_config.properties["hp_lr"] = metadata_store_pb2.DOUBLE
        #hpo_config_id = mds.put_artifact_type(hpo_config)

        # Create ArtifactTypes for Model
        #model_type = metadata_store_pb2.ArtifactType()
        #model_type.name = "Model"
        #model_type.properties["model_version"] = metadata_store_pb2.INT
        #model_type.properties["model_name"] = metadata_store_pb2.STRING
        #model_type.properties["model_accuracy"] = metadata_store_pb2.DOUBLE
        #model_type_id = mds.put_artifact_type(model_type)

        # Register execution types for all steps in the ML workflow
        # Create an ExecutionType, Trainer
        trainer_type = metadata_store_pb2.ExecutionType()
        trainer_type.name = "Trainer"
        trainer_type.properties["trainer_state"] = metadata_store_pb2.STRING
        trainer_type_id = mds.put_execution_type(trainer_type)

        # Create a ContextType for HPO experiment
        #experiment_type = metadata_store_pb2.ContextType()
        #experiment_type.name = "Experiment"
        #experiment_type.properties["parent_exp"] = metadata_store_pb2.STRING
        #experiment_type_id = mds.put_context_type(experiment_type)

        # Create a ContextType for HPOTrial
        trial_type = metadata_store_pb2.ContextType()
        trial_type.name = "Trial"
        trial_type.properties["trial_id"] = metadata_store_pb2.STRING
        trial_type_id = mds.put_context_type(trial_type)

## ** CREATE EXPERIMENT AS PARENT AND TRIALS AS CHILD ** ##

#def create_Parent_Context():
#    with get_metadata_store() as mds:
#        # Create a Context for the experiment 
#        experiment_context = metadata_store_pb2.Context()
#        experiment_context.type_id = experiment_type_id
#        experiment_context.name = raytune_experiment_name
#        experiment_context.properties["parent_exp"].string_value = "Overall experiment details"
#        [experiment_id] = mds.put_contexts([experiment_context])
#        return experiment_id


## ** LOG INPUT EVENTS INTO MDSTORE , UPDATE EXECUTION STATUS ** ##
def log_inputs(input_size,output_size,config,trial_name,trial_id,exp_name):
    """Record the start of a Trainer run together with its input dataset.

    Stores a Trainer execution in state "RUNNING", a DataSet artifact carrying
    the input/output sizes, and a DECLARED_INPUT event that links the two.
    ``config``, ``trial_name``, ``trial_id`` and ``exp_name`` are accepted but
    only referenced by the HPOConfig logging that is currently commented out.

    Returns:
        A ``(run_id, trainer_run, artifact_ids)`` tuple: the id of the stored
        execution, the Execution message that was stored, and the ids returned
        for the stored artifacts.
    """
    with get_metadata_store() as store:
        # Execution describing this Trainer run
        execution = metadata_store_pb2.Execution()
        execution.type_id = trainer_type_id
        execution.properties["trainer_state"].string_value = "RUNNING"
        stored_execution_ids = store.put_executions([execution])
        [run_id] = stored_execution_ids

        # Artifact describing the dataset
        dataset = metadata_store_pb2.Artifact()
        dataset.type_id = data_type_id
        dataset.uri = ""  # Replace with the actual path or identifier
        dataset.properties["ds_input_size"].int_value = input_size
        dataset.properties["ds_output_size"].int_value = output_size

        # Artifact describing the hyperparameter tuning (disabled)
        #hpo_config_artifact = metadata_store_pb2.Artifact()
        #hpo_config_artifact.type_id = hpo_config_id
        #hpo_config_artifact.uri = ""  # Replace with the actual path or identifier
        #hpo_config_artifact.properties["hp_hidden_size"].int_value = config["hidden_size"]
        #hpo_config_artifact.properties["hp_lr"].double_value = config["lr"]
        #hpo_config_artifact.custom_properties["trial_name"].string_value = trial_name
        #hpo_config_artifact.custom_properties["trial_id"].string_value = trial_id
        #hpo_config_artifact.custom_properties["experiment_name"].string_value = exp_name

        #artifact_ids = store.put_artifacts([dataset, hpo_config_artifact])
        artifact_ids = store.put_artifacts([dataset])

        # Input event: the dataset artifact is a declared input of the run
        dataset_input = metadata_store_pb2.Event(
            artifact_id=artifact_ids[0],
            execution_id=run_id,
            type=metadata_store_pb2.Event.DECLARED_INPUT,
        )

        #input_event_hpo_config = metadata_store_pb2.Event()
        #input_event_hpo_config.artifact_id = artifact_ids[1]
        #input_event_hpo_config.execution_id = run_id
        #input_event_hpo_config.type = metadata_store_pb2.Event.DECLARED_INPUT

        # Record the input events in the metadata store
        #store.put_events([dataset_input, input_event_hpo_config])
        store.put_events([dataset_input])
        return run_id, execution, artifact_ids

## ** LOG OUTPUT EVENTS INTO MDSTORE , UPDATE EXECUTION STATUS ** ##
def log_output_artifacts(accuracy,run_id,trainer_run,trial_name, artifact_ids, trial_id=None):
    """Record the result of a Trainer run and attach it to a Trial context.

    Stores a Model artifact holding ``accuracy``, a DECLARED_OUTPUT event for
    it, marks the execution "COMPLETED", creates a Trial context named
    ``trial_name`` and links the execution and the model to that context.

    Args:
        accuracy: value stored in the model artifact's "model_accuracy" property.
        run_id: id of the execution stored when the run started.
        trainer_run: the Execution message for the run; it is updated in place.
        trial_name: name given to the Trial context.
        artifact_ids: ids of the input artifacts; only referenced by the
            HPOConfig attribution that is currently commented out.
        trial_id: value stored in the context's "trial_id" property. When
            omitted it is read from the current Ray train context.
    """
    if trial_id is None:
        # `trial_id` used to be read from an undefined name; existing callers
        # do not pass it, so fall back to the running Ray session.
        trial_id = ray.train.get_context().get_trial_id()

    # The store is bound to `mds` so the imported `metadata_store` module is not shadowed.
    with get_metadata_store() as mds:
        # `model_type_id` is not defined anywhere else in the notebook, so the
        # "Model" artifact type is registered (or looked up, if it is already
        # stored) right here. The two flags tolerate a stored "Model" type
        # that carries a different set of properties.
        model_type = metadata_store_pb2.ArtifactType()
        model_type.name = "Model"
        model_type.properties["model_accuracy"] = metadata_store_pb2.DOUBLE
        model_type_id = mds.put_artifact_type(
            model_type, can_add_fields=True, can_omit_fields=True
        )

        # Log metadata about the model
        model_artifact = metadata_store_pb2.Artifact()
        model_artifact.type_id = model_type_id
        model_artifact.uri = ""  # Replace with the actual path or identifier
        model_artifact.properties["model_accuracy"].double_value = accuracy
        [model_artifact_id] = mds.put_artifacts([model_artifact])

        # Declare the output event
        output_event = metadata_store_pb2.Event()
        output_event.artifact_id = model_artifact_id
        output_event.execution_id = run_id
        output_event.type = metadata_store_pb2.Event.DECLARED_OUTPUT

        # Submit output event to the Metadata Store
        mds.put_events([output_event])

        # Setting the id makes put_executions update the stored execution
        trainer_run.id = run_id
        trainer_run.properties["trainer_state"].string_value = "COMPLETED"
        mds.put_executions([trainer_run])

        # Create a Context for the trial
        trial_context = metadata_store_pb2.Context()
        trial_context.type_id = trial_type_id
        trial_context.name = trial_name  # Use the trial name as the context name
        trial_context.properties["trial_id"].string_value = trial_id
        [context_trial_id] = mds.put_contexts([trial_context])

        #experiment_contexts = []
        #experiment_context_obj = metadata_store_pb2.ParentContext()
        #experiment_context_obj.child_id = context_trial_id
        #experiment_context_obj.parent_id = experiment_id

        #experiment_contexts.append(experiment_context_obj)
        #mds.put_parent_contexts(experiment_contexts)

        # Link the execution and the model artifact to the trial context
        association = metadata_store_pb2.Association()
        association.execution_id = run_id
        association.context_id = context_trial_id

        attribution = metadata_store_pb2.Attribution()
        attribution.artifact_id = model_artifact_id
        attribution.context_id = context_trial_id

        #attribution_hpo = metadata_store_pb2.Attribution()
        #attribution_hpo.artifact_id = artifact_ids[1]
        #attribution_hpo.context_id = context_trial_id

        #mds.put_attributions_and_associations([attribution, attribution_hpo], [association])
        mds.put_attributions_and_associations([attribution], [association])


# Define a simple neural network
class SimpleNet(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are created in the order in which forward() applies them
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return fc2(relu(fc1(x)))."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Define a function to train and evaluate the model
def train_evaluate(config,trial_name=None, trial_id=None):
    """Train a SimpleNet on random data for one trial and log it to the metadata store.

    This is the function handed to ``tune.run``.

    Args:
        config: hyperparameters for the trial; reads ``config["hidden_size"]``
            and ``config["lr"]``.
        trial_name: ignored; the value is always taken from the Ray train context.
        trial_id: ignored; the value is always taken from the Ray train context.

    Returns:
        A dict with the (dummy) ``"accuracy"`` and the trained ``"model"``.
    """
    input_size = 10
    output_size = 1

    # Instantiate the neural network with the hyperparameters
    model = SimpleNet(input_size, config["hidden_size"], output_size)

    # Dummy dataset for illustration purposes: 100 random samples
    X = torch.randn(100, input_size)
    y = torch.randn(100, output_size)
    dataloader = DataLoader(TensorDataset(X, y), batch_size=8, shuffle=True)

    # Define loss and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    # Fetch the Ray context once; only the values that are actually used
    # afterwards are read from it.
    trial_context = ray.train.get_context()
    trial_id = trial_context.get_trial_id()
    trial_name = trial_context.get_trial_name()
    exp_name = trial_context.get_experiment_name()

    # Log input artifacts, events, executions
    run_id, trainer_run, artifact_ids = log_inputs(input_size,output_size,config,trial_name,trial_id,exp_name)

    # NOTE(review): fixed 10 s pause before training; its purpose is not visible here — confirm it is still needed
    time.sleep(10)

    # Training loop
    for _epoch in range(10):  # Adjust as needed
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

    # Evaluate the model (for simplicity, just return a dummy accuracy)
    accuracy = torch.rand(1).item()

    # Log output artifacts, events, executions
    log_output_artifacts(accuracy,run_id,trainer_run,trial_name,artifact_ids)

    # Return a dictionary containing the accuracy and the model
    return {"accuracy": accuracy, "model": model}


# Name passed to tune.run below as the experiment name
raytune_experiment_name = "raytune_hyperparameter_tuning"

# Define the hyperparameter search space:
# hidden_size is picked from three fixed values, lr is sampled log-uniformly in [1e-4, 1e-1]
search_space = {
    "hidden_size": tune.choice([5, 10, 20]),
    "lr": tune.loguniform(1e-4, 1e-1),
}

# Create MLMD Artifact/Context Types (must run before the trials start logging)
create_mdtypes()
# Uncomment while adding ParentContext
#experiment_id = create_Parent_Context()

# Perform hyperparameter tuning with Ray Tune
analysis = tune.run(
    train_evaluate,
    config=search_space,
    num_samples=6,  # Number of trials
    resources_per_trial={"cpu": 1},
    name=raytune_experiment_name,)


# Get the best configuration and result:
# the trial whose last reported "accuracy" is the highest
best_trial = analysis.get_best_trial("accuracy", "max", "last")
best_config = best_trial.config
best_accuracy = best_trial.last_result["accuracy"]
# The trained model is taken from the result dict returned by train_evaluate
best_model = best_trial.last_result["model"]
trial_id = best_trial.trial_id 

print(f"Best model from trial ID: {trial_id}")
print(f"Best hyperparameters: {best_config}")
print(f"Best accuracy: {best_accuracy}")

In [None]:
## Save the best model
# Create a directory to save the optimal HPO model (no error if it already exists)
hpo_folder = "models/hpo/"
os.makedirs(hpo_folder, exist_ok=True)
onnx_model_path = os.path.join(hpo_folder, "model.onnx")

# Save the best model to a file in ONNX format.
# The example input is a single row of 10 values, matching the input_size used in train_evaluate.
dummy_input = torch.tensor([[0.3111400080477545, 1.9459399775518593, 1.0, 0.0, 0.0, 1.2, 3.4, -0.5, 0.8, -2.0]])
torch.onnx.export(best_model, dummy_input, onnx_model_path, verbose=True)

print(f"Best model saved to {onnx_model_path} in ONNX format")


In [None]:
import os
import boto3
import botocore

# Connection settings are read from the environment; os.environ.get returns None for any variable that is not set
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
endpoint_url = os.environ.get('AWS_S3_ENDPOINT')
region_name = os.environ.get('AWS_DEFAULT_REGION')
bucket_name = os.environ.get('AWS_S3_BUCKET')

# Session carrying the credentials read above
session = boto3.session.Session(aws_access_key_id=aws_access_key_id,
                                aws_secret_access_key=aws_secret_access_key)

# S3 resource for the configured endpoint/region, using signature version 's3v4'
s3_resource = session.resource(
    's3',
    config=botocore.client.Config(signature_version='s3v4'),
    endpoint_url=endpoint_url,
    region_name=region_name)

# Bucket handle used by upload_directory_to_s3 and list_objects below
bucket = s3_resource.Bucket(bucket_name)
print(bucket)

def upload_directory_to_s3(local_directory, s3_prefix):
    """Upload every file below ``local_directory`` to the bucket.

    Each file is stored under ``s3_prefix`` followed by its path relative to
    ``local_directory``; the mapping is printed before each upload.

    Args:
        local_directory: directory that is walked recursively.
        s3_prefix: key prefix under which the files are stored.

    Returns:
        True once all files have been handed to ``bucket.upload_file``.
    """
    import posixpath  # local import: object keys are always '/'-separated

    for root, _dirs, files in os.walk(local_directory):
        for filename in files:
            file_path = os.path.join(root, filename)
            relative_path = os.path.relpath(file_path, local_directory)
            # Build the key from the path components so the result does not
            # depend on the local OS path separator.
            s3_key = posixpath.join(s3_prefix, *relative_path.split(os.sep))
            print(f"{file_path} -> {s3_key}")
            bucket.upload_file(file_path, s3_key)
    return True

def list_objects(prefix):
    """Print the key of every bucket object selected by the ``Prefix=prefix`` filter."""
    matching_objects = bucket.objects.filter(Prefix=prefix)
    for s3_object in matching_objects.all():
        print(s3_object.key)

In [None]:
# List the objects in the bucket under the "models" prefix
list_objects("models")

In [None]:
# Upload the local "models" directory to the bucket under the "models" prefix
upload_directory_to_s3("models", "models")

In [246]:
# Details to access the model through REST API
deployed_model_name = "hpo"
# NOTE(review): hard-coded service host — presumably specific to one namespace; adjust to where the model is served
rest_url = "http://modelmesh-serving.pcelesti:8008"
# Inference endpoint built from the base URL and the deployed model name
infer_url = f"{rest_url}/v2/models/{deployed_model_name}/infer"

In [None]:
## Get the input_names from the model
# `onnx` is used below but was never imported in the notebook, which raised NameError.
# NOTE(review): the `onnx` package must be installed in the kernel environment.
import onnx

# Load the ONNX model
onnx_model = onnx.load("models/hpo/model.onnx")

# Print input names (the loop variable no longer shadows the built-in `input`)
input_names = [graph_input.name for graph_input in onnx_model.graph.input]
print("Input Names:", input_names)

In [None]:
import requests
import numpy as np

def onnx_rest_request(data, infer_url, input_name="onnx::Gemm_0", timeout=30):
    """Send one sample to the model's REST inference endpoint.

    Args:
        data: 10 numeric values; they are converted to float32 and reshaped to (1, 10).
        infer_url: URL the JSON payload is POSTed to.
        input_name: name of the model input tensor placed in the payload.
            Defaults to the name that was previously hard-coded.
        timeout: seconds to wait for the server before ``requests`` raises,
            so an unresponsive endpoint cannot block the notebook forever.

    Returns:
        The ``data`` field of the first output in the response when the status
        code is 200, otherwise None (after printing the status code).
    """
    # Convert the input data to a float32 array of shape (1, 10)
    input_array = np.array(data, dtype=np.float32).reshape(1, 10)

    # Create the JSON payload for the REST request; shape and data are turned
    # into plain lists for JSON serialization
    json_data = {
        "inputs": [
            {
                "name": input_name,
                "shape": list(input_array.shape),
                "datatype": "FP32",
                "data": input_array.tolist(),
            }
        ]
    }

    # Make the REST request
    response = requests.post(infer_url, json=json_data, timeout=timeout)
    print(response.content)

    # Check for successful response (status code 200)
    if response.status_code == 200:
        # Extract and return the predictions from the response
        return response.json()['outputs'][0]['data']

    # Print an error message for unsuccessful requests
    print(f"Error: {response.status_code}")
    return None

In [None]:
# Predict for the given data through REST (one sample of 10 values)
data = [0.3111400080477545, 1.9459399775518593, 1.0, 2.0, 3.0, 1.2, 0.4, 0.5, 0.8, 2.0]
# onnx_rest_request returns None when the response status is not 200
prediction = onnx_rest_request(data,infer_url)
print("Model Prediction:", prediction)