In [1]:
import os
from dotenv import load_dotenv
import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.models.signature import infer_signature
from mlflow.types.schema import Schema
from mlflow.types.schema import ParamSchema
from mlflow.types.schema import ParamSpec
from mlflow.types.schema import ColSpec
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from typing import Tuple

load_dotenv()

True

In [2]:
# conectar con mlflow y minio (por http)
mlflow.set_tracking_uri("http://127.0.0.1:5000")

os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://127.0.0.1:9000"
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = os.getenv('ACCESS_KEY')

# Schema

In [3]:
def get_train_data() -> Tuple[pd.DataFrame]:
    """
    Generate train and test data.

    :return: x_train,y_train
    """
    # Cargar el conjunto de datos de Boston Housing
    data = pd.read_csv("BostonHousing.csv")

    # Separar las características y la variable objetivo
    X = data.drop("medv", axis=1)
    y = data["medv"]

    return X, y

In [4]:
x_train, y_train = get_train_data()
x_train.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [5]:
cols_spec = []
data_map = {
    'int64': 'integer',
    'float64': 'double',
    'bool': 'boolean',
    'str': 'string',
    "date": 'datetime'
}

for name, dtype in x_train.dtypes.to_dict().items():
    cols_spec.append(ColSpec(name=name, type=data_map[str(dtype)]))

cols_spec

['crim': double (required),
 'zn': double (required),
 'indus': double (required),
 'chas': integer (required),
 'nox': double (required),
 'rm': double (required),
 'age': double (required),
 'dis': double (required),
 'rad': integer (required),
 'tax': integer (required),
 'ptratio': double (required),
 'b': double (required),
 'lstat': double (required)]

In [6]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 51.5 KB


In [7]:
y_train.info()

<class 'pandas.core.series.Series'>
RangeIndex: 506 entries, 0 to 505
Series name: medv
Non-Null Count  Dtype  
--------------  -----  
506 non-null    float64
dtypes: float64(1)
memory usage: 4.1 KB


In [8]:
input_schema = Schema(inputs=cols_spec)
output_schema = Schema([ColSpec(name="label", type="integer")])
input_schema, output_schema

(['crim': double (required), 'zn': double (required), 'indus': double (required), 'chas': integer (required), 'nox': double (required), 'rm': double (required), 'age': double (required), 'dis': double (required), 'rad': integer (required), 'tax': integer (required), 'ptratio': double (required), 'b': double (required), 'lstat': double (required)],
 ['label': integer (required)])

In [9]:
parameter = ParamSpec(name="model_name", dtype="string", default="logging_artifacts18")
param_schema = ParamSchema(params=[parameter])
parameter, param_schema

('model_name': string (default: logging_artifacts18),
 ['model_name': string (default: logging_artifacts18)])

In [10]:
model_signature = ModelSignature(inputs=input_schema, outputs=output_schema, params=param_schema)
print("MODEL SIGNATURE")
print(model_signature.to_dict())

MODEL SIGNATURE
{'inputs': '[{"type": "double", "name": "crim", "required": true}, {"type": "double", "name": "zn", "required": true}, {"type": "double", "name": "indus", "required": true}, {"type": "integer", "name": "chas", "required": true}, {"type": "double", "name": "nox", "required": true}, {"type": "double", "name": "rm", "required": true}, {"type": "double", "name": "age", "required": true}, {"type": "double", "name": "dis", "required": true}, {"type": "integer", "name": "rad", "required": true}, {"type": "integer", "name": "tax", "required": true}, {"type": "double", "name": "ptratio", "required": true}, {"type": "double", "name": "b", "required": true}, {"type": "double", "name": "lstat", "required": true}]', 'outputs': '[{"type": "integer", "name": "label", "required": true}]', 'params': '[{"name": "model_name", "type": "string", "default": "logging_artifacts18", "shape": null}]'}


In [11]:
model_signature = infer_signature(x_train, y_train, params={"model_name": "logging_artifacts16"})
print("MODEL SIGNATURE")
print(model_signature.to_dict())

MODEL SIGNATURE
{'inputs': '[{"type": "double", "name": "crim", "required": true}, {"type": "double", "name": "zn", "required": true}, {"type": "double", "name": "indus", "required": true}, {"type": "long", "name": "chas", "required": true}, {"type": "double", "name": "nox", "required": true}, {"type": "double", "name": "rm", "required": true}, {"type": "double", "name": "age", "required": true}, {"type": "double", "name": "dis", "required": true}, {"type": "long", "name": "rad", "required": true}, {"type": "long", "name": "tax", "required": true}, {"type": "double", "name": "ptratio", "required": true}, {"type": "double", "name": "b", "required": true}, {"type": "double", "name": "lstat", "required": true}]', 'outputs': '[{"type": "double", "name": "medv", "required": true}]', 'params': '[{"name": "model_name", "type": "string", "default": "logging_artifacts16", "shape": null}]'}




In [12]:
# experiment_id = create_mlflow_experiment(
#     experiment_name="Model Signature",
#     artifact_location="model_signature_artifacts",
#     tags={"purpose": "learning"},
# )

mlflow.set_experiment("mlflow_tracking")

mlflow.autolog()

# with mlflow.start_run(run_name="model_signature_run") as run:
with mlflow.start_run(run_name="logging_artifacts18") as run:
    print("RUN ID:", run.info.run_id)
    mlflow.sklearn.log_model(sk_model=RandomForestClassifier(), artifact_path="model_signature", signature=model_signature)

2024/06/19 09:43:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


RUN ID: e53c11b51e94411aaebd3974ebe6387c


In [13]:
print('tracking uri:', mlflow.get_tracking_uri())
print('artifact uri:', mlflow.get_artifact_uri())

tracking uri: http://127.0.0.1:5000
artifact uri: s3://mlflow/3/54e4c2ac89364b45927a7a107988be45/artifacts


In [14]:
mlflow.end_run()

In [15]:
print('ok_')

ok_
