# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [40]:
from azureml.core import Workspace, Experiment, Environment
from azureml.core import ScriptRunConfig
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
import os
import shutil

In [41]:
# Connect to the workspace described by the local Azure ML config file.
ws = Workspace.from_config()

# Name of the compute cluster that the training runs will target.
compute_name = "cpu-cluster"

# Reuse the cluster when the lookup succeeds; a ComputeTargetException from
# the lookup is handled by provisioning a new cluster under the same name.
try:
    trainCluster = ComputeTarget(workspace=ws, name=compute_name)
    print(f"{compute_name} exists already")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2_V2",
        max_nodes=2,
    )
    trainCluster = ComputeTarget.create(
        workspace=ws,
        name=compute_name,
        provisioning_configuration=compute_config,
    )
    # Block until provisioning finishes, streaming progress into the cell output.
    trainCluster.wait_for_completion(show_output=True)

cpu-cluster exists already


In [42]:


# Define the environment the training script runs in.
env = Environment(name="my-sklearn-env")

# Conda packages required by the training script (pandas is added explicitly).
conda_dep = CondaDependencies()
for conda_package in ("scikit-learn", "pandas"):
    conda_dep.add_conda_package(conda_package)
env.python.conda_dependencies = conda_dep

# Register the environment in the workspace; the value returned by
# register() is the last expression, so it is displayed as the cell output.
env.register(workspace=ws)

{
    "assetId": "azureml://locations/northcentralus/workspaces/14469e3d-16d9-46f1-b40b-c8dfbce3c1b7/environments/my-sklearn-env/versions/1",
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20240908.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "buildContext": null,
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "my-sklearn-env"

## Dataset

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [43]:
import os
from io import StringIO

import pandas as pd
import requests
from azureml.core import Workspace, Dataset

# The GitHub token is read from the GITHUB_TOKEN environment variable so that
# no credential is stored in the notebook.
# NOTE(review): a personal access token used to be hardcoded on this line and
# must be considered leaked -- revoke it in the GitHub settings.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
url = 'https://raw.githubusercontent.com/monaejam/Udacity/main/capston/heart_failure_clinical_records_dataset.csv'

# Fetch the raw CSV content from GitHub. The Authorization header is only sent
# when a token is configured; the timeout keeps the cell from hanging forever.
headers = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}
response = requests.get(url, headers=headers, timeout=30)

# Fail fast on a bad response: continuing would register an undefined (or
# stale, from an earlier kernel state) `df` below.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve file. Status code: {response.status_code}")

data = StringIO(response.text)
df = pd.read_csv(data)

# Print first few rows to confirm data retrieval
print(df.head())

# Connect to your workspace
ws = Workspace.from_config()  # Or provide parameters manually if needed

# Register the dataset in Azure ML Workspace
dataset = Dataset.Tabular.register_pandas_dataframe(df, target=ws.get_default_datastore(), name="heart_failure_dataset")

# Convert to pandas DataFrame and describe the dataset
df = dataset.to_pandas_dataframe()
df.describe()


    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           116    0   

   smoking  time  DEATH_EVENT  
0        0     4            1  
1        0     6            1  
2       

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


## Hyperdrive Configuration

TODO: Explain the model you are using and the reasons for choosing the different hyperparameters, termination policy, and config settings.

In [None]:
# Random search over the two hyperparameters that HyperDrive samples.
search_space = {
    "--C": choice(1, 2, 3, 4, 5),
    "--max_iter": choice(80, 100, 120, 150, 170, 200),
}
ps = RandomParameterSampling(search_space)

# Bandit early-termination policy.
policy = BanditPolicy(slack_factor=0.2, evaluation_interval=1, delay_evaluation=5)

# Make sure a ./training folder exists and place a copy of the script in it.
# NOTE(review): the run below snapshots './', not './training', so this copy
# is not what gets submitted -- confirm which source_directory is intended.
existing_entries = os.listdir()
if "training" not in existing_entries:
    os.mkdir("./training")
shutil.copy('train.py', './training')

# Script run configuration; the registered dataset is handed to the script
# as a named input following the --data-folder flag.
script_arguments = ['--data-folder', dataset.as_named_input('input')]
src = ScriptRunConfig(
    source_directory='./',
    script='train.py',
    arguments=script_arguments,
    compute_target=trainCluster,
    environment=env,
)

# Combine the run configuration, sampler and policy into the HyperDrive config.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=10,
    max_concurrent_runs=2,
)

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [36]:
from azureml.core import Workspace, Datastore, Dataset

# Connect to the workspace described by the local config file.
ws = Workspace.from_config()

# Look the datastore up by name (ws.get_default_datastore() is an alternative).
datastore = Datastore.get(ws, 'workspaceblobstore')

# Local files to push, and the folder inside the datastore that receives them.
local_csv_files = ['./heart_failure_clinical_records_dataset.csv']
datastore_folder = 'datasets/'

# Upload, replacing any file already present at the destination.
datastore.upload_files(
    local_csv_files,
    target_path=datastore_folder,
    overwrite=True,
)

print("Dataset uploaded to datastore.")


Uploading an estimated of 1 files
Uploading ./heart_failure_clinical_records_dataset.csv
Uploaded ./heart_failure_clinical_records_dataset.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Dataset uploaded to datastore.


In [37]:
# Location of the uploaded CSV, expressed as a (datastore, relative path) pair.
csv_location = (datastore, 'datasets/heart_failure_clinical_records_dataset.csv')

# Build a tabular dataset on top of the uploaded file.
dataset = Dataset.Tabular.from_delimited_files(path=[csv_location])

# Register it under the existing name, creating a new version.
dataset = dataset.register(
    workspace=ws,
    name='heart_failure_dataset',
    create_new_version=True,
)

print("Dataset registered as a new version.")



Dataset registered as a new version.


In [26]:
from azureml.core import Dataset, Workspace

# Workspace handle built from the local config file.
ws = Workspace.from_config()

# Fetch the registered dataset by its name.
dataset = Dataset.get_by_name(ws, 'heart_failure_dataset')

# Materialise the dataset as a pandas DataFrame.
df = dataset.to_pandas_dataframe()

print("Dataset loaded successfully.")


{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}
Dataset loaded successfully.


In [51]:
# Random search over the two hyperparameters that HyperDrive samples.
search_space = {
    "--C": choice(1, 2, 3, 4, 5),
    "--max_iter": choice(80, 100, 120, 150, 170, 200),
}
ps = RandomParameterSampling(search_space)

# Bandit early-termination policy.
policy = BanditPolicy(slack_factor=0.2, evaluation_interval=1, delay_evaluation=5)

# Make sure a ./training folder exists and place a copy of the script in it.
# NOTE(review): the run below snapshots './', not './training', so this copy
# is not what gets submitted -- confirm which source_directory is intended.
existing_entries = os.listdir()
if "training" not in existing_entries:
    os.mkdir("./training")
shutil.copy('train.py', './training')

# Script run configuration; no --data-folder argument is passed to the script.
src = ScriptRunConfig(
    source_directory='./',
    script='train.py',
    compute_target=trainCluster,
    environment=env,
)

# Combine the run configuration, sampler and policy into the HyperDrive config.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=10,
    max_concurrent_runs=2,
)

**Submit the experiment and retrieve the best run**

In [52]:
# Submit the HyperDrive run
experiment_name = 'hyper-exp'
experiment = Experiment(workspace=ws, name=experiment_name)
hyperdrive_run = experiment.submit(config=hyperdrive_config)

# Wait for completion, streaming the HyperDrive log into the cell output.
hyperdrive_run.wait_for_completion(show_output=True)

# Retrieve the best child run. The lookup yields None when no child run has
# logged the primary metric, so fail with a clear message instead of an
# AttributeError on the next line.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    raise RuntimeError(
        "HyperDrive returned no best run: no child run logged the primary metric 'Accuracy'."
    )
best_run_metrics = best_run.get_metrics()

print('Best Run Id: ', best_run.id)
print('\nAccuracy:', best_run_metrics['Accuracy'])

RunId: HD_2e4d3d56-e61a-43c6-97bd-64e0538cf13a
Web View: https://ml.azure.com/runs/HD_2e4d3d56-e61a-43c6-97bd-64e0538cf13a?wsid=/subscriptions/d2d90bd8-e567-4097-88c9-9532cc375686/resourcegroups/cloud_shell/workspaces/Udacity_1&tid=f3822f31-4d32-4719-a061-c45fac0a64ab

Streaming azureml-logs/hyperdrive.txt

[2024-09-16T00:25:00.530352][GENERATOR][INFO]Trying to sample '2' jobs from the hyperparameter space
[2024-09-16T00:25:00.9425732Z][SCHEDULER][INFO]Scheduling job, id='HD_2e4d3d56-e61a-43c6-97bd-64e0538cf13a_0' 
[2024-09-16T00:25:01.0524194Z][SCHEDULER][INFO]Scheduling job, id='HD_2e4d3d56-e61a-43c6-97bd-64e0538cf13a_1' 
[2024-09-16T00:25:01.013217][GENERATOR][INFO]Successfully sampled '2' jobs, they will soon be submitted to the execution target.
[2024-09-16T00:25:01.4640758Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_2e4d3d56-e61a-43c6-97bd-64e0538cf13a_1' 
[2024-09-16T00:25:01.5575820Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_2e4d3d56-e61a-43c6-97bd-64e053

In [53]:
# Re-fetch the best child run and print its details, files and metrics, in that order.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_reports = (
    ("best run details :", best_run.get_details),
    ("best run file names :", best_run.get_file_names),
    ("best run metrics :", best_run.get_metrics),
)
for report_label, fetch_report in best_run_reports:
    print(report_label, fetch_report())

best run details : {'runId': 'HD_2e4d3d56-e61a-43c6-97bd-64e0538cf13a_2', 'target': 'cpu-cluster', 'status': 'Completed', 'startTimeUtc': '2024-09-16T00:26:58.924463Z', 'endTimeUtc': '2024-09-16T00:27:21.671315Z', 'services': {}, 'properties': {'_azureml.ComputeTargetType': 'amlctrain', '_azureml.ClusterName': 'cpu-cluster', 'ContentSnapshotId': '5fdff818-d85b-402a-9dad-e09b0e266cf8', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json'}, 'inputDatasets': [{'dataset': {'id': '0cb1973b-83a4-412e-bfb9-956ff3593d33'}, 'consumptionDetails': {'type': 'Reference'}}], 'outputDatasets': [], 'runDefinition': {'script': 'train.py', 'command': '', 'useAbsolutePath': False, 'arguments': ['--C', '5', '--max_iter', '80'], 'sourceDirectoryDataStore': None, 'framework': 'Python', 'communicator': 'None', 'target': 'cpu-cluster', 'dataReferences': {}, 'data': {}, 'outputData': {}, 'datacaches': [], 'jobName': None, 'maxRunDurationSeconds': 2592000,

In [54]:
# Register the best run's outputs folder as a model in the workspace.
registered_model_name = "hyperdrive_best_run.pkl"
run_outputs_path = './outputs/'
best_run.register_model(model_name=registered_model_name, model_path=run_outputs_path)

print(best_run)

# Download the trained model file from the run.
# NOTE(review): the artifact name carries a './' prefix -- confirm it matches
# the names reported by best_run.get_file_names().
model_artifact_name = './outputs/hyper-model.pkl'
best_run.download_file(name=model_artifact_name)

Run(Experiment: hyper-exp,
Id: HD_2e4d3d56-e61a-43c6-97bd-64e0538cf13a_2,
Type: azureml.scriptrun,
Status: Completed)


**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shut down all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.

