In [None]:
!pip install codeflare_sdk
# Import pieces from codeflare-sdk
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication

In [None]:
# Create authentication object for user permissions.
# If unused, the SDK will automatically check for a default kubeconfig, then in-cluster config.
# KubeConfigFileAuthentication can also be used to specify the kubeconfig path manually.
# "TOKEN" and "SERVER" are placeholders: substitute real values before running and
# avoid saving a real token inside the notebook.
auth = TokenAuthentication(
    token = "TOKEN",
    server = "SERVER",
    # NOTE(review): skip_tls=True presumably turns off TLS verification for the API server — confirm this is acceptable for the target cluster.
    skip_tls=True
)
auth.login()

In [None]:
# Create and configure our cluster object (and appwrapper).
# The request below is for a single CPU-only worker (num_workers=1, num_gpus=0)
# with identical min/max CPU and memory values.
# NOTE(review): the memory values are presumably expressed in GB — confirm against the SDK docs.
# NOTE(review): the image uses a "latest-*" tag, so the Ray version inside it may drift
# from the ray version pip-installed later in this notebook — consider pinning.
cluster = Cluster(ClusterConfiguration(
    name='hpo-raytest',
    num_workers=1,
    min_cpus=1,
    max_cpus=1,
    min_memory=4,
    max_memory=4,
    num_gpus=0,
    image="quay.io/project-codeflare/ray:latest-py39-cu118",
    instascale=False
))

In [None]:
# Bring up the cluster described by the configuration in the previous cell.
# NOTE(review): presumably this only submits the request; use cluster.status() below to check on it.
cluster.up()

In [None]:
# Check on the cluster requested by cluster.up() in the previous cell.
cluster.status()

In [None]:
from codeflare_sdk import generate_cert
# Create the required TLS cert and export the environment variables to enable TLS.
# The cluster name and namespace are read from the configuration of the `cluster`
# object created above: this notebook never defines standalone `cluster_name` /
# `namespace` variables, so referencing them directly raised NameError on a
# fresh kernel.
cluster_name = cluster.config.name
namespace = cluster.config.namespace
generate_cert.generate_tls_cert(cluster_name, namespace)
generate_cert.export_env(cluster_name, namespace)

In [None]:
!pip install numpy
!pip install ray[default]==2.7.0
!pip install onnxruntime
!pip install ml_metadata

In [7]:
import ray

# Address of the cluster created above; passed to ray.init() in the next cell.
ray_cluster_uri = cluster.cluster_uri()
# Drop any Ray connection left over from an earlier run of this notebook before reconnecting.
ray.shutdown()

In [None]:
# Runtime environment for the remote Ray workers.
#  - "pip": additional libraries the training function needs on the workers.
#  - "env_vars": PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION is an environment variable
#    read by the protobuf package, so it is set in the workers' environment here.
#    It was previously passed through `_system_config`, which is reserved for Ray's
#    internal configuration keys and does not set environment variables.
runtime_env = {
    "pip": ["ipython", "torch", "onnx", "ray[train]", "ml_metadata", "protobuf==3.20.1"],
    "env_vars": {"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python"},
}

ray.init(address=ray_cluster_uri, runtime_env=runtime_env)

print("Ray cluster is up and running: ", ray.is_initialized())

In [None]:
## Create all the required metadata types required for HPO PoC
from grpc import insecure_channel
from ml_metadata.proto import metadata_store_service_pb2_grpc

# Open an unencrypted gRPC channel to the metadata service and build the client stub
# that create_mdtypes() below uses (it reads the module-level name `mds`).
# NOTE(review): the host name looks deployment-specific — adjust to your model registry service.
channel = insecure_channel("modelregistry-sample:9090")
mds = metadata_store_service_pb2_grpc.MetadataStoreServiceStub(channel)

def create_mdtypes():
    """Register every metadata type the HPO PoC relies on and return their ids.

    The types are registered through the module-level stub ``mds`` in this
    order: ``odh.HPOConfig`` (artifact), ``odh.HPOExperiment`` (context),
    ``odh.HPOTrial`` (context, with a STRING ``trial_id`` property),
    ``odh.Dataset`` (artifact), ``odh.Train`` (execution) and ``odh.Metrics``
    (artifact).

    Returns:
        A 6-tuple ``(hpo_config_type_id, hpo_experiment_type_id,
        hpo_trial_type_id, data_type_id, trainer_type_id, metrics_type_id)``.
    """
    # Imported locally: the notebook only imports these two modules in a later
    # cell, so this function raised NameError when run on a fresh kernel.
    from ml_metadata.proto import metadata_store_pb2
    from ml_metadata.proto import metadata_store_service_pb2

    def _put_artifact_type(name):
        # Register one ArtifactType by name and return its type id.
        artifact_type = metadata_store_pb2.ArtifactType()
        artifact_type.name = name
        request = metadata_store_service_pb2.PutArtifactTypeRequest()
        request.all_fields_match = True
        request.artifact_type.CopyFrom(artifact_type)
        return mds.PutArtifactType(request).type_id

    def _put_context_type(name, properties=None):
        # Register one ContextType (optionally with typed properties) and return its type id.
        context_type = metadata_store_pb2.ContextType()
        context_type.name = name
        for property_name, property_type in (properties or {}).items():
            context_type.properties[property_name] = property_type
        request = metadata_store_service_pb2.PutContextTypeRequest()
        request.context_type.CopyFrom(context_type)
        return mds.PutContextType(request).type_id

    def _put_execution_type(name):
        # Register one ExecutionType by name and return its type id.
        execution_type = metadata_store_pb2.ExecutionType()
        execution_type.name = name
        request = metadata_store_service_pb2.PutExecutionTypeRequest()
        request.execution_type.CopyFrom(execution_type)
        return mds.PutExecutionType(request).type_id

    # The registration order is kept identical to the original cell.
    hpo_config_type_id = _put_artifact_type("odh.HPOConfig")
    hpo_experiment_type_id = _put_context_type("odh.HPOExperiment")
    hpo_trial_type_id = _put_context_type(
        "odh.HPOTrial", {"trial_id": metadata_store_pb2.STRING}
    )
    data_type_id = _put_artifact_type("odh.Dataset")
    # Execution type for the training step of the ML workflow
    trainer_type_id = _put_execution_type("odh.Train")
    metrics_type_id = _put_artifact_type("odh.Metrics")

    return hpo_config_type_id, hpo_experiment_type_id, hpo_trial_type_id, data_type_id, trainer_type_id, metrics_type_id
    
# Register the types and keep the returned ids as notebook-level variables.
# NOTE(review): the HPO cell further down reassigns these six names to hard-coded
# numbers, which replaces the values obtained here — keep the two in sync.
hpo_config_type_id, hpo_experiment_type_id, hpo_trial_type_id, data_type_id, trainer_type_id, metrics_type_id = create_mdtypes()
print(hpo_config_type_id, hpo_experiment_type_id, hpo_trial_type_id, data_type_id, trainer_type_id, metrics_type_id)
    


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import ray
from ray import tune
import time
import os

from grpc import insecure_channel
from ml_metadata.proto import metadata_store_pb2
from ml_metadata.proto import metadata_store_service_pb2
from ml_metadata.proto import metadata_store_service_pb2_grpc

# NOTE(review): presumably turns on Ray's verbose pickling diagnostics. `ray` is already
# imported above, so confirm the variable is still honoured when set at this point.
os.environ['RAY_PICKLE_VERBOSE_DEBUG'] = '1'

from contextlib import contextmanager

@contextmanager
def get_metadata_store():
    """Yield a MetadataStoreService stub and close its gRPC channel afterwards.

    A new insecure channel to ``modelregistry-sample:9090`` is opened for every
    ``with`` block. The channel is closed when the block exits, including when
    the body raises; previously it was never closed, so one channel leaked per
    call.

    Yields:
        A ``MetadataStoreServiceStub`` bound to the freshly opened channel.
    """
    channel = insecure_channel("modelregistry-sample:9090")
    try:
        yield metadata_store_service_pb2_grpc.MetadataStoreServiceStub(channel)
    finally:
        channel.close()


'''
HPOExperiment: context type
HPOTrial: context type
HPOConfig: artifact type
DataSet: artifact type
Model: artifact type
Metrics: artifact type
'''

# The logging helpers below read these type ids as module-level globals; the values are
# assigned further down in this cell, before tuning starts. At module scope a `global`
# statement has no effect, so these two lines only document which names are shared.
global data_type_id , model_type_id , trainer_type_id , metrics_type_id
global hpo_config_type_id, hpo_experiment_type_id, hpo_trial_type_id
        
def add_hpo_experiment(hpo_experiment_name):
    """Create a Context for an HPO experiment and return its id.

    Args:
        hpo_experiment_name: name given to the new context.

    Returns:
        The first context id reported by the PutContexts call.

    The context's type id is read from the module-level ``hpo_experiment_type_id``.
    """
    # Describe the experiment context
    context = metadata_store_pb2.Context()
    context.name = hpo_experiment_name
    context.type_id = hpo_experiment_type_id

    put_request = metadata_store_service_pb2.PutContextsRequest()
    put_request.contexts.extend([context])

    with get_metadata_store() as store:
        created_ids = store.PutContexts(put_request).context_ids
        return created_ids[0]


def log_inputs(input_size, output_size, config, trial_name, trial_id, exp_name):
    """Record the start of a trial: one RUNNING execution plus its input artifacts.

    Args:
        input_size: stored as the dataset artifact's ``ds_input_size``.
        output_size: stored as the dataset artifact's ``ds_output_size``.
        config: trial hyperparameters; ``config["hidden_size"]`` and
            ``config["lr"]`` are stored on the HPO-config artifact.
        trial_name, trial_id, exp_name: stored as string properties on the
            HPO-config artifact.

    Returns:
        ``(run_id, trainer_run, dataset_artifact_id, hpo_config_artifact_id)``
        where ``trainer_run`` is the local Execution message that was sent.

    Type ids come from the module-level ``trainer_type_id``, ``data_type_id``
    and ``hpo_config_type_id``.
    """
    with get_metadata_store() as store:
        # --- Execution for this trainer run, initially RUNNING ---
        trainer_run = metadata_store_pb2.Execution()
        trainer_run.type_id = trainer_type_id
        trainer_run.custom_properties["trainer_state"].string_value = "RUNNING"
        executions_request = metadata_store_service_pb2.PutExecutionsRequest()
        executions_request.executions.extend([trainer_run])
        run_id = store.PutExecutions(executions_request).execution_ids[0]

        # --- Dataset artifact ---
        dataset_artifact = metadata_store_pb2.Artifact()
        dataset_artifact.type_id = data_type_id
        dataset_artifact.uri = ""  # Replace with the actual path or identifier
        for size_key, size_value in (
            ("ds_input_size", input_size),
            ("ds_output_size", output_size),
        ):
            dataset_artifact.custom_properties[size_key].int_value = size_value

        # --- Hyperparameter configuration artifact ---
        hpo_config_artifact = metadata_store_pb2.Artifact()
        hpo_config_artifact.type_id = hpo_config_type_id
        hpo_config_artifact.uri = ""  # Replace with the actual path or identifier
        hpo_props = hpo_config_artifact.custom_properties
        hpo_props["hp_hidden_size"].int_value = config["hidden_size"]
        hpo_props["hp_lr"].double_value = config["lr"]
        for text_key, text_value in (
            ("trial_name", trial_name),
            ("trial_id", trial_id),
            ("experiment_name", exp_name),
        ):
            hpo_props[text_key].string_value = text_value

        # Both artifacts are stored with a single call; ids come back in request order.
        artifacts_request = metadata_store_service_pb2.PutArtifactsRequest()
        artifacts_request.artifacts.extend([dataset_artifact, hpo_config_artifact])
        stored_ids = store.PutArtifacts(artifacts_request).artifact_ids
        dataset_artifact_id = stored_ids[0]
        hpo_config_artifact_id = stored_ids[1]

        return run_id, trainer_run, dataset_artifact_id, hpo_config_artifact_id
        
def log_output_artifacts(accuracy,run_id,trainer_run,trial_name, trial_id, dataset_artifact_id, hpo_config_artifact_id):
    """Record the outputs of a finished trial and link them in the metadata store.

    Steps, in order: store a model artifact, store a metrics artifact, mark the
    trainer execution COMPLETED, create a trial context, attach the artifacts
    and the execution to that context, then register the trial context as a
    child of the experiment context.

    Args:
        accuracy: stored as the metrics artifact's ``accuracy`` (double).
        run_id: id of the execution returned by ``log_inputs``.
        trainer_run: the Execution message returned by ``log_inputs``; its id
            and ``trainer_state`` are updated in place.
        trial_name: used as the trial context's name.
        trial_id: stored as the trial context's ``trial_id`` property.
        dataset_artifact_id: accepted for symmetry with ``log_inputs``; not
            used by this function.
        hpo_config_artifact_id: attributed to the trial context.

    Reads the module-level ``model_type_id``, ``metrics_type_id``,
    ``hpo_trial_type_id`` and ``hpo_experiment_id``.
    """
    with get_metadata_store() as metadata_store:
        # Log metadata about the model
        model_artifact = metadata_store_pb2.Artifact()
        model_artifact.type_id = model_type_id
        model_artifact.uri = "s3://hpomodels"  # Replace with the actual path or identifier
        model_artifact.custom_properties["model_name"].string_value = "SNN"
        request = metadata_store_service_pb2.PutArtifactsRequest()
        request.artifacts.extend([model_artifact])
        response = metadata_store.PutArtifacts(request)
        model_artifact_id = response.artifact_ids[0]

        # Log metrics
        metrics_artifact = metadata_store_pb2.Artifact()
        metrics_artifact.type_id = metrics_type_id
        metrics_artifact.custom_properties["accuracy"].double_value = accuracy
        request = metadata_store_service_pb2.PutArtifactsRequest()
        request.artifacts.extend([metrics_artifact])
        response = metadata_store.PutArtifacts(request)
        metrics_artifact_id = response.artifact_ids[0]

        # Update the existing execution (identified by run_id) to COMPLETED
        trainer_run.id = run_id
        trainer_run.custom_properties["trainer_state"].string_value = "COMPLETED"
        request = metadata_store_service_pb2.PutExecutionsRequest()
        request.executions.extend([trainer_run])
        response = metadata_store.PutExecutions(request)
        run_id = response.execution_ids[0]

        # Create a Context for the trial
        trial_context = metadata_store_pb2.Context()
        trial_context.type_id = hpo_trial_type_id
        trial_context.name = trial_name # Use the trial name as the context name
        trial_context.properties["trial_id"].string_value = trial_id
        request = metadata_store_service_pb2.PutContextsRequest()
        request.contexts.extend([trial_context])
        response = metadata_store.PutContexts(request)
        context_trial_id = response.context_ids[0]

        # Attach the execution and the model / HPO config / metrics artifacts
        # to the trial context
        association = metadata_store_pb2.Association()
        association.execution_id = run_id
        association.context_id = context_trial_id

        attribution = metadata_store_pb2.Attribution()
        attribution.artifact_id = model_artifact_id
        attribution.context_id = context_trial_id

        attribution_hpo = metadata_store_pb2.Attribution()
        attribution_hpo.artifact_id = hpo_config_artifact_id
        attribution_hpo.context_id = context_trial_id

        attribution_metrics = metadata_store_pb2.Attribution()
        attribution_metrics.artifact_id = metrics_artifact_id
        attribution_metrics.context_id = context_trial_id

        request = metadata_store_service_pb2.PutAttributionsAndAssociationsRequest()
        request.attributions.extend([attribution, attribution_hpo, attribution_metrics])
        request.associations.extend([association])
        metadata_store.PutAttributionsAndAssociations(request)

        # Create a ParentContext to associate the trial context with the experiment context.
        # The experiment id is kept in the notebook-level `hpo_experiment_id`; the
        # previous code read an undefined `experiment_id` and raised NameError here.
        experiment_context_obj = metadata_store_pb2.ParentContext()
        experiment_context_obj.child_id = context_trial_id
        experiment_context_obj.parent_id = hpo_experiment_id

        request = metadata_store_service_pb2.PutParentContextsRequest()
        request.parent_contexts.extend([experiment_context_obj])
        metadata_store.PutParentContexts(request)


# A small fully connected network with one hidden layer
class SimpleNet(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features.
        hidden_size: width of the hidden layer.
        output_size: number of output values.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Apply fc1, the ReLU activation and fc2 to ``x`` and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


# Define a function to train and evaluate the model
def train_evaluate(config):
    """Ray Tune trainable: train a SimpleNet on random data and log the trial's metadata.

    Args:
        config: hyperparameters for this trial; ``config["hidden_size"]`` sets the
            hidden layer width and ``config["lr"]`` the Adam learning rate.

    Returns:
        ``{"accuracy": <float>, "model": <SimpleNet>}``. The accuracy is a random
        placeholder, not a measurement of the trained model.
    """
    input_size = 10
    output_size = 1

    # Instantiate the neural network with the hyperparameters
    model = SimpleNet(input_size, config["hidden_size"], output_size)

    # Define a dummy dataset for illustration purposes
    X = torch.randn(100, input_size)
    y = torch.randn(100, output_size)

    # Dummy DataLoader
    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

    # Define loss and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    # Identify the running trial. The context is fetched once; the trial
    # directory and metadata lookups were dropped because nothing used them.
    context = ray.train.get_context()
    trial_id = context.get_trial_id()
    trial_name = context.get_trial_name()
    exp_name = context.get_experiment_name()

    # Log input artifacts and the RUNNING execution
    run_id, trainer_run, dataset_artifact_id, hpo_config_artifact_id = log_inputs(
        input_size, output_size, config, trial_name, trial_id, exp_name
    )

    # NOTE(review): fixed 2s pause kept from the original; its purpose is not evident from this cell.
    time.sleep(2)

    # Training loop
    for epoch in range(10):  # Adjust as needed
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

    # Evaluate the model (for simplicity, just return a dummy accuracy)
    accuracy = torch.rand(1).item()

    # Log output artifacts and mark the execution COMPLETED
    log_output_artifacts(
        accuracy, run_id, trainer_run, trial_name, trial_id,
        dataset_artifact_id, hpo_config_artifact_id,
    )

    # Return a dictionary containing the accuracy and the model
    return {"accuracy": accuracy, "model": model}


def custom_trial_name_creator(trial):
    """Return ``"<experiment name>_<trial id>"`` for the given trial.

    The experiment name is read from the module-level ``raytune_experiment_name``;
    the trial id comes from ``trial.trial_id``.
    """
    return f"{raytune_experiment_name}_{trial.trial_id}"


# Name shared by the Ray Tune run, the experiment context and (as a prefix) every trial name.
raytune_experiment_name = "raytune_hyperparameter_tuning"  # Replace the experiment name for a new run

# Define the hyperparameter search space
search_space = {
    "hidden_size": tune.choice([5, 10, 20]),
    "lr": tune.loguniform(1e-4, 1e-1),
}

# Use the default metadata types for Model and other metadata types that are part of the metaData database.
# Replace these values based on the MR database using the code from previous block.
# NOTE(review): these assignments overwrite the ids returned by create_mdtypes() in the
# earlier cell; if the numbers below do not match the database, the logging helpers will
# reference the wrong types. model_type_id is not produced by create_mdtypes().
model_type_id = 12
hpo_config_type_id = 22
hpo_experiment_type_id = 23
hpo_trial_type_id = 24
data_type_id = 25
trainer_type_id = 26
metrics_type_id = 27
# Create the experiment context that acts as parent context for the trials, and keep its id.
hpo_experiment_id = add_hpo_experiment(raytune_experiment_name)


# Perform hyperparameter tuning with Ray Tune
analysis = tune.run(
    train_evaluate,
    config=search_space,
    num_samples=1,  # Number of trials
    resources_per_trial={"cpu": 1},
    name=raytune_experiment_name,
    trial_name_creator=custom_trial_name_creator, )


# Select the trial with the highest last-reported accuracy and unpack what it reported
best_trial = analysis.get_best_trial(metric="accuracy", mode="max", scope="last")
best_result = best_trial.last_result
best_config = best_trial.config
best_accuracy = best_result["accuracy"]
best_model = best_result["model"]
trial_id = best_trial.trial_id

for summary_line in (
    f"Best model from trial ID: {trial_id}",
    f"Best hyperparameters: {best_config}",
    f"Best accuracy: {best_accuracy}",
):
    print(summary_line)

In [10]:
# Client stub used by the query cells below (they read the module-level name `mdstore`).
# The channel is unencrypted and stays open for the rest of the notebook session.
channel = insecure_channel("modelregistry-sample:9090")
mdstore = metadata_store_service_pb2_grpc.MetadataStoreServiceStub(channel)

In [None]:
# Fetch every artifact and collect the distinct values of its `type` field
request = metadata_store_service_pb2.GetArtifactsRequest()
response = mdstore.GetArtifacts(request)
unique_artifact_types = {artifact.type for artifact in response.artifacts}

print("Available Artifact Types:")
for artifact_type_id in unique_artifact_types:
    print(artifact_type_id)

In [None]:
# Look up the definition of an artifact type by name
def get_artifact_type(type_name):
    """Return the ``artifact_type`` field of the GetArtifactType response for ``type_name``.

    The lookup goes through the module-level stub ``mdstore``.
    """
    lookup = metadata_store_service_pb2.GetArtifactTypeRequest()
    lookup.type_name = type_name
    return mdstore.GetArtifactType(lookup).artifact_type

# Example usage: the HPO config type is registered under the name "odh.HPOConfig"
# by create_mdtypes(), so that is the name to look up (plain "HPOConfig" is not registered
# by this notebook).
data_type = get_artifact_type("odh.HPOConfig")
print(f"Artifact Type: {data_type}")

In [None]:
# List the artifacts of every trial that belongs to one experiment.
request = metadata_store_service_pb2.GetContextsRequest()
response = mdstore.GetContexts(request)

# The experiment context is the first context whose name equals the experiment name
experiment_context_name = "raytune_hyperparameter_tuning"   #Replace the experiment name
experiment_context = next(
    filter(lambda context: context.name == experiment_context_name, response.contexts),
    None
)

if not experiment_context:
    print(f"No context found with the name '{experiment_context_name}'.")
else:
    # Trial contexts are recognised by name: they start with the experiment name
    # but are not the experiment context itself.
    child_contexts = []
    for candidate in response.contexts:
        if candidate.name == experiment_context_name:
            continue
        if candidate.name.startswith(experiment_context_name):
            child_contexts.append(candidate)

    # Print the artifacts attached to each trial context
    for child_context in child_contexts:
        artifacts_request = metadata_store_service_pb2.GetArtifactsByContextRequest(
            context_id=child_context.id
        )
        artifacts_response = mdstore.GetArtifactsByContext(artifacts_request)
        print(f"Trial Context: {child_context.name}")
        print(f"Artifacts: {artifacts_response.artifacts}")

In [None]:
## Extract artifact details of a given trial
context_request = metadata_store_service_pb2.GetContextByTypeAndNameRequest()
# Trial contexts are created with the "odh.HPOTrial" context type registered by
# create_mdtypes(); the previous value "Trial" does not match any type created here.
context_request.type_name = "odh.HPOTrial"
context_request.context_name = "raytune_hyperparameter_tuning_7e346_00000" # Replace the trial name

context_response = mdstore.GetContextByTypeAndName(context_request)
if context_response.HasField("context"):
    context_id = context_response.context.id
    artifacts_request = metadata_store_service_pb2.GetArtifactsByContextRequest()
    artifacts_request.context_id = context_id
    artifacts_response = mdstore.GetArtifactsByContext(artifacts_request)
    trial_artifacts = artifacts_response.artifacts
else:
    # No such trial: report it instead of querying artifacts for an unset context id.
    trial_artifacts = []
    print(
        f"No '{context_request.type_name}' context found with the name "
        f"'{context_request.context_name}'."
    )

print(trial_artifacts)

In [None]:
# Extract the artifacts whose logged accuracy exceeds a threshold
request = metadata_store_service_pb2.GetArtifactsRequest()
response = mdstore.GetArtifacts(request)

# Filter client-side. The accuracy is written to `custom_properties` by
# log_output_artifacts(), so that is the map to read; `properties` never holds it,
# which made the previous filter return an empty list. The membership test skips
# artifacts without an accuracy and avoids inserting a default entry into the map.
artifacts_with_conditions = [
    artifact for artifact in response.artifacts
    if "accuracy" in artifact.custom_properties
    and artifact.custom_properties["accuracy"].double_value > 0.1
]

print(artifacts_with_conditions)

In [None]:
## Save the best model
# Create a directory to save the optimal HPO model (no error if it already exists)
hpo_folder = "models/hpo/"
os.makedirs(hpo_folder, exist_ok=True)
onnx_model_path = os.path.join(hpo_folder, "model.onnx")

# Save the best model to a file in ONNX format.
# The example input has one row of 10 values, matching input_size = 10 used in train_evaluate().
dummy_input = torch.tensor([[0.3111400080477545, 1.9459399775518593, 1.0, 0.0, 0.0, 1.2, 3.4, -0.5, 0.8, -2.0]])
torch.onnx.export(best_model, dummy_input, onnx_model_path, verbose=True)

print(f"Best model saved to {onnx_model_path} in ONNX format")


In [None]:
import os
import boto3
import botocore

# S3 connection settings come from the environment; a missing variable yields None.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
endpoint_url = os.getenv('AWS_S3_ENDPOINT')
region_name = os.getenv('AWS_DEFAULT_REGION')
bucket_name = os.getenv('AWS_S3_BUCKET')

session = boto3.session.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# S3 resource bound to the configured endpoint/region, using SigV4 request signing
s3_resource = session.resource(
    's3',
    endpoint_url=endpoint_url,
    region_name=region_name,
    config=botocore.client.Config(signature_version='s3v4'),
)

# Bucket handle used by the upload/list helpers defined below
bucket = s3_resource.Bucket(bucket_name)
print(bucket)

def upload_directory_to_s3(local_directory, s3_prefix):
    """Upload every file under ``local_directory`` to the module-level ``bucket``.

    Args:
        local_directory: directory that is walked recursively.
        s3_prefix: key prefix; each file's path relative to ``local_directory``
            is appended to it.

    Returns:
        ``True`` once all files have been handed to ``bucket.upload_file``.

    Object keys always use ``/`` as separator. They were previously built with
    ``os.path.join``, which produces backslash-separated keys on Windows.
    """
    import posixpath  # local import: the notebook does not import it elsewhere

    for root, _dirs, files in os.walk(local_directory):
        for filename in files:
            file_path = os.path.join(root, filename)
            relative_path = os.path.relpath(file_path, local_directory)
            # Normalise the OS-specific separator before building the key
            s3_key = posixpath.join(s3_prefix, relative_path.replace(os.sep, "/"))
            print(f"{file_path} -> {s3_key}")
            bucket.upload_file(file_path, s3_key)
    return True

def list_objects(prefix):
    """Print the key of every object in the module-level ``bucket`` whose key starts with ``prefix``."""
    matching_objects = bucket.objects.filter(Prefix=prefix)
    for s3_object in matching_objects.all():
        print(s3_object.key)

In [None]:
# List the objects currently stored in the bucket under the "models" prefix
list_objects("models")

In [None]:
# Upload the local "models" directory (which contains the exported ONNX file)
# to the bucket under the "models" key prefix
upload_directory_to_s3("models", "models")

In [68]:
# Details to access the model through REST API
# NOTE(review): the host name below looks specific to one deployment — replace it with
# the serving endpoint of your own environment.
deployed_model_name = "hpo"
rest_url = "http://modelmesh-serving.pcelesti:8008"
# Inference endpoint for the deployed model, built from the two values above
infer_url = f"{rest_url}/v2/models/{deployed_model_name}/infer"

In [None]:
import onnx
## Read the graph input names from the exported model
# Load the ONNX file written by the export cell
onnx_model = onnx.load("models/hpo/model.onnx")

# Collect and print the name of each graph input
input_names = []
for graph_input in onnx_model.graph.input:
    input_names.append(graph_input.name)
print("Input Names:", input_names)

In [None]:
import requests
import numpy as np

def onnx_rest_request(data, infer_url, input_name="onnx::Gemm_0", timeout=30):
    """Send one sample to a REST inference endpoint and return its predictions.

    Args:
        data: flat sequence of numeric feature values for a single sample.
        infer_url: full URL of the model's ``/infer`` endpoint.
        input_name: name of the model's input tensor. Defaults to the value that
            was previously hard-coded; pass the name reported by the ONNX graph
            if the exported model uses a different one.
        timeout: seconds to wait for the server before ``requests`` raises a
            timeout error, so a stalled endpoint cannot hang the notebook.

    Returns:
        The ``data`` list of the first output on HTTP 200, otherwise ``None``
        (the status code and response body are printed in that case).
    """
    # One row holding all features; -1 lets the feature count follow the input
    # instead of being fixed at 10.
    input_array = np.array(data, dtype=np.float32).reshape(1, -1)

    # Create the JSON payload for the REST request
    json_data = {
        "inputs": [
            {
                "name": input_name,
                "shape": list(input_array.shape),
                "datatype": "FP32",
                "data": input_array.tolist(),
            }
        ]
    }

    # Make the REST request
    response = requests.post(infer_url, json=json_data, timeout=timeout)

    # Check for successful response (status code 200)
    if response.status_code == 200:
        # Extract and return the predictions from the response
        return response.json()['outputs'][0]['data']

    # Report unsuccessful requests together with the server's answer
    print(f"Error: {response.status_code}")
    print(response.content)
    return None

In [None]:
# Predict for the given data through REST.
# `data` holds one sample with 10 feature values; `prediction` is None when the request fails.
data = [0.3111400080477545, 1.9459399775518593, 1.0, 2.0, 3.0, 1.2, 0.4, 0.5, 0.8, 2.0]
prediction = onnx_rest_request(data,infer_url)
print("Model Prediction:", prediction)