# Punto (a): Separar el Conjunto de Datos de Entrenamiento y Validación
# 

In [2]:
from azureml.core import Workspace, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
# Carga el dataset desde el workspace de Azure
workspace = Workspace.from_config()
dataset = Dataset.get_by_name(workspace, name='processed_data')

# Montar el dataset y leerlo con pandas
mount_context = dataset.mount('./dataset_mount')
mount_context.start()

df = pd.read_csv('./dataset_mount/processed_data.csv')

mount_context.stop()

# Divide el dataframe en conjuntos de entrenamiento y validación
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

# Convertir los dataframes de entrenamiento y validación de nuevo a datasets de Azure para usarlos más adelante
# Para ello, primero tendrías que guardar los dataframes como CSV y luego subirlos a tu datastore
train_df.to_csv('./train_data/train_data.csv', index=False)
valid_df.to_csv('./valid_data/valid_data.csv', index=False)

Message: Running rslex direct volume mount: RSLEX_DIRECT_VOLUME_MOUNT=None, RSLEX_DIRECT_VOLUME_WRITABLE_MOUNT=None, enable_rslex_mount=None
Payload: {"pid": 28596, "rslex_version": "2.19.6", "version": "4.12.5"}


In [11]:
import mltable

train_path = [
    {
        "file": "train_data.csv"
    }
]

train_data_mltable = mltable.from_delimited_files(paths=train_path)
train_data_mltable.save("./train_data")

validation_path = [
    {
        "file": "valid_data.csv"
    }
]

validation_data_mltable = mltable.from_delimited_files(paths=validation_path)
validation_data_mltable.save("./valid_data")

print(train_data_mltable)
print(validation_data_mltable)

paths:
- file: file:///mnt/batch/tasks/shared/LS_root/mounts/clusters/primer-proyecto/code/Users/mauricio.quezada/train_data.csv
transformations:
- read_delimited:
    delimiter: ','
    empty_as_string: false
    encoding: utf8
    header: all_files_same_headers
    include_path_column: false
    infer_column_types: true
    partition_size: 20971520
    path_column: Path
    support_multi_line: false
type: mltable

paths:
- file: file:///mnt/batch/tasks/shared/LS_root/mounts/clusters/primer-proyecto/code/Users/mauricio.quezada/train_data.csv
transformations:
- read_delimited:
    delimiter: ','
    empty_as_string: false
    encoding: utf8
    header: all_files_same_headers
    include_path_column: false
    infer_column_types: true
    partition_size: 20971520
    path_column: Path
    support_multi_line: false
type: mltable



Engine process terminated with returncode=-2


# Punto (b): Crear un Cluster de Computación en Azure ML
# 

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Definir la configuración del cluster
cluster_name = "co2-cluster"
try:
    cluster = ComputeTarget(workspace=workspace, name=cluster_name)
    print(f"Found existing cluster: {cluster_name}")
except ComputeTargetException:
    config = AmlCompute.provisioning_configuration(vm_size='Standard_D2_v2', 
                                                   max_nodes=4, 
                                                   min_nodes=1)
    cluster = ComputeTarget.create(workspace, cluster_name, config)

cluster.wait_for_completion(show_output=True)

InProgress..
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded........................
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


JUSTIFICACION:

    *Para este apartado utilizamos un mínimo de 1 y máximo 4 nodos, debido a que lo vimos más que suficiente para el trabajo requerido.
    *El recurso que se utilizó, es la libreria de MLTABLES, para poder utilizar nuestras tablas de validación y de entrenamiento.

# Punto (c): Crear un Trabajo de Entrenamiento con AutoML
# 

In [20]:
from azure.ai.ml import automl, Input, MLClient
from azure.ai.ml.entities import ResourceConfiguration
from azure.identity import DefaultAzureCredential
from azure.ai.ml.constants import AssetTypes

# Cargamos los datos (previamente separados) a un MLTable
train_mltable_path = "./train_data/" 
valid_mltable_path = "./valid_data/" 

# Define los inputs de MLTable
train_data_input = Input(type=AssetTypes.MLTABLE, path=train_mltable_path)
valid_data_input = Input(type=AssetTypes.MLTABLE, path=valid_mltable_path)

# Configuramos el job de AutoML para regresión
experiment_name = 'automl-co2-emission'
automl_job = automl.regression(
    experiment_name=experiment_name,
    training_data=train_data_input,
    validation_data=valid_data_input,
    target_column_name="CO2 Emissions(g/km)",
    primary_metric="normalized_root_mean_squared_error"
)

# Configura límites y recursos
automl_job.compute = cluster_name
automl_job.set_limits(timeout_minutes=60)

# Autenticación y envío del job
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential)
returned_job = ml_client.jobs.create_or_update(automl_job)

print(f"Job creado: {returned_job}")

Found the config file in: /config.json


Job creado: compute: azureml:/subscriptions/ba1f7bf8-2be6-4bed-b818-c745bda74905/resourceGroups/primer_proyecto/providers/Microsoft.MachineLearningServices/workspaces/primer_proyecto/computes/
creation_context:
  created_at: '2023-10-23T19:53:17.854583+00:00'
  created_by: MAURICIO ALEJANDRO QUEZADA BUSTILLO
  created_by_type: User
display_name: frosty_ear_pgq4rdwm95
experiment_name: automl-co2-emission
id: azureml:/subscriptions/ba1f7bf8-2be6-4bed-b818-c745bda74905/resourceGroups/primer_proyecto/providers/Microsoft.MachineLearningServices/workspaces/primer_proyecto/jobs/frosty_ear_pgq4rdwm95
limits:
  enable_early_termination: true
  max_concurrent_trials: 1
  max_cores_per_trial: -1
  max_nodes: 1
  max_trials: 1000
  timeout_minutes: 60
  trial_timeout_minutes: 30
log_verbosity: info
name: frosty_ear_pgq4rdwm95
outputs: {}
primary_metric: normalized_root_mean_squared_error
properties: {}
resources:
  instance_count: 1
  instance_type: Standard_D2_v2
  properties: {}
  shm_size: 2g
s

JUSTIFICACIÓN:
    
    *Utilizamos los parámetros por defecto que posee el Job, debido a que estamso trabajando con un modelo de regresión y esta no implica muchas carga y por ende se puede probar todos lo moedelos y encontrar el mejor, sin necesidad de mover parámetros del Job.
    
    

# Punto (d): Recuperar y Visualizar los Resultados del Trabajo con AutoML
# 

In [26]:
# Importar las bibliotecas necesarias
import mlflow
from mlflow.tracking.client import MlflowClient
from mlflow.artifacts import download_artifacts

# --- (d) Información del Job y Best Run ---

# 1. Obtener el trabajo
job = ml_client.jobs.get(name=returned_job.name)

# 2. Obtener la URL de seguimiento (tracking URI) de MLClient.
# Esta URL es necesaria para acceder a la información del experimento y los runs en MLflow.
MLFLOW_TRACKING_URI = ml_client.workspaces.get(
    name=ml_client.workspace_name
).mlflow_tracking_uri

print(f"URL de seguimiento de MLflow: {MLFLOW_TRACKING_URI}")

# 3. Configurar MLflow para usar la URL de seguimiento obtenida.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# 4. Inicializar el cliente de MLflow.
mlflow_client = MlflowClient()

# 5. Obtener el run principal del trabajo. En AutoML, este run tiene sub-runs para cada modelo probado.
mlflow_parent_run = mlflow_client.get_run(job.name)

# 6. Del run principal, extraer el ID del mejor sub-run (mejor modelo probado por AutoML).
best_child_run_id = mlflow_parent_run.data.tags["automl_best_child_run_id"]

# 7. Usar el ID del mejor sub-run para obtener toda la información sobre ese run.
best_run = mlflow_client.get_run(best_child_run_id)

# Mostrar la información del mejor run.
print(best_run)

URL de seguimiento de MLflow: azureml://eastus2.api.azureml.ms/mlflow/v1.0/subscriptions/ba1f7bf8-2be6-4bed-b818-c745bda74905/resourceGroups/primer_proyecto/providers/Microsoft.MachineLearningServices/workspaces/primer_proyecto
<Run: data=<RunData: metrics={'explained_variance': 0.9968423656109591,
 'mean_absolute_error': 1.960839788144388,
 'mean_absolute_percentage_error': 0.8030106191525185,
 'median_absolute_error': 1.4575644237046106,
 'normalized_mean_absolute_error': 0.004602910300808422,
 'normalized_median_absolute_error': 0.003421512731700964,
 'normalized_root_mean_squared_error': 0.007739424688184539,
 'normalized_root_mean_squared_log_error': 0.007782983700169502,
 'r2_score': 0.996839722977584,
 'root_mean_squared_error': 3.2969949171666135,
 'root_mean_squared_log_error': 0.013113319526022143,
 'spearman_correlation': 0.9981283121488365}, params={}, tags={'mlflow.parentRunId': 'good_diamond_8q33ypyzh7',
 'mlflow.rootRunId': 'good_diamond_8q33ypyzh7',
 'mlflow.runName': '

#### ¿Qué modelos intentó probar AutoML?
AutoML intentó probar varios algoritmos, algunos son:
- LightGBM
- XGBoostRegressor
- ElasticNet
- RandomForest
- ExtremeRandomTrees
- DecisionTree

#### ¿El mejor modelo es preciso o no?
El mejor modelo utilizado por AutoML es: LightGBM.

Basándonos en las métricas proporcionadas:
- R2 Score: 0.996839722977584
- Correlación de Spearman: 0.9981283121488365

Se pudo concluir que dicho modelo es el más adecuado por la alta precisión buscada para este modelo de regresión, poseyendo un R2 Score muy cercano a 1 que indica que el modelo desarrollado puede dar sentido y explicar una gran proporción de la variabilidad en los datos. Además, una correlación de Spearman bordeando al 1 infiere en una fuerte relación monótona entre las predicciones del sistema y los valores reales.

# Punto (e): Crear un Endpoint y Desplegar el Mejor Modelo
#

In [30]:
# import required libraries
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Model,
    Environment,
    CodeConfiguration,
    ProbeSettings,
)

# Creando un nombre de endpoint único con la fecha y hora actuales para evitar conflictos
import datetime

# Creando un nombre de endpoint único con la fecha y hora actuales para evitar conflictos
online_endpoint_name = "reg-" + datetime.datetime.now().strftime("%m%d%H%M")

# verificar la longitud del nombre del endpoint
assert 3 <= len(online_endpoint_name) <= 32, "El nombre del endpoint no cumple con la longitud requerida."

# crear un endpoint en línea
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="Endpoint online para desplegar el modelo de regresión",
    auth_mode="key",
    tags={"type": "regression"},
)
print(online_endpoint_name)

ml_client.begin_create_or_update(endpoint).result()

# Registramos el modelo de regresión que hemos entrenado
model_name = "regression-model"
model = Model(
    path=f"azureml://jobs/{best_run.info.run_id}/outputs/artifacts/outputs/mlflow-model/",
    name=model_name,
    description="Modelo de regresión",
    type=AssetTypes.MLFLOW_MODEL,
)
registered_model = ml_client.models.create_or_update(model)

# Desplegamos el modelo en el endpoint
deployment = ManagedOnlineDeployment(
    name="regression-deployment",
    endpoint_name=online_endpoint_name,
    model=registered_model.id,
    instance_type="Standard_E4s_v3",  # Puedes elegir otro tipo de instancia si lo prefieres
    instance_count=1)
ml_client.online_deployments.begin_create_or_update(deployment).result()



reg-10232239
.....................................................................................

Check: endpoint reg-10232239 exists


ManagedOnlineDeployment({'private_network_connection': None, 'provisioning_state': 'Succeeded', 'endpoint_name': 'reg-10232239', 'type': 'Managed', 'name': 'regression-deployment', 'description': None, 'tags': {}, 'properties': {'AzureAsyncOperationUri': 'https://management.azure.com/subscriptions/ba1f7bf8-2be6-4bed-b818-c745bda74905/providers/Microsoft.MachineLearningServices/locations/eastus2/mfeOperationsStatus/od:64353576-ec82-4005-a1d0-bdf99309bf66:833f0f17-cb74-4411-a5ba-3e799668d98e?api-version=2023-04-01-preview'}, 'print_as_yaml': True, 'id': '/subscriptions/ba1f7bf8-2be6-4bed-b818-c745bda74905/resourceGroups/primer_proyecto/providers/Microsoft.MachineLearningServices/workspaces/primer_proyecto/onlineEndpoints/reg-10232239/deployments/regression-deployment', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/primer-proyecto/code/Users/mauricio.quezada', 'creation_context': None, 'serialize': <msrest.serialization.Serializer object at 0

# Punto(f): Realizar una predicción

In [46]:
import json

# Datos de entrada basados en las columnas del CSV
request_json = {
    "input_data": {
        "columns": ["Make", "Model", "Vehicle Class", "Engine Size(L)", "Cylinders", "Transmission", "Fuel Type", "Fuel Consumption City (L/100 km)", "Fuel Consumption Hwy (L/100 km)", "Fuel Consumption Comb (L/100 km)", "Fuel Consumption Comb (mpg)"],
        "data": [{"Make": "Toyota", "Model": "Corolla", "Vehicle Class": "COMPACT", "Engine Size(L)": 1.8, "Cylinders": 4, "Transmission": "Automatic", "Fuel Type": "Gasoline", "Fuel Consumption City (L/100 km)": 8.5, "Fuel Consumption Hwy (L/100 km)": 6.2, "Fuel Consumption Comb (L/100 km)": 7.4, "Fuel Consumption Comb (mpg)": 38}]
    }
}

request_file_name = "sample_request_data.json"
with open(request_file_name, "w") as request_file:
    json.dump(request_json, request_file)

resp = ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    deployment_name=deployment.name,
    request_file=request_file_name,
)
print(resp)


[172.076099334576]


In [52]:
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    # bypass the server certificate verification on client side
    if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
        ssl._create_default_https_context = ssl._create_unverified_context

allowSelfSignedHttps(True) # this line is needed if you use self-signed certificate in your scoring service.

body = str.encode(json.dumps(request_json))

url = 'https://reg-10232239.eastus2.inference.ml.azure.com/score'
# Replace this with the primary/secondary key or AMLToken for the endpoint
api_key = 'E8Khe5C4eC3zWFYYBC8kwgz3NFhfoF4t'
if not api_key:
    raise Exception("A key should be provided to invoke the endpoint")

# The azureml-model-deployment header will force the request to go to a specific deployment.
# Remove this header to have the request observe the endpoint traffic rules
headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key), 'azureml-model-deployment': 'regression-deployment' }

req = urllib.request.Request(url, body, headers)

try:
    response = urllib.request.urlopen(req)

    result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    print(error.read().decode("utf8", 'ignore'))

b'[172.076099334576]'
