### GCP ML - Pipeline

In [1]:
# Import packages
import kfp
from google.cloud import aiplatform
from kfp.v2 import dsl, compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, component, importer)
from typing import NamedTuple
from google.cloud import storage
import pandas as pd
import gcsfs
from google_cloud_pipeline_components.v1.vertex_notification_email import VertexNotificationEmailOp

#### Declare variables and pipeline root

In [2]:
# Notebook-level settings: GCP project, region, bucket and service account.
PROJECT_ID = "datapath-deploy-api-v1-434102"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
BUCKET_URI = "gs://demo_vertext_01"  # @param {type:"string"}
# Service-account e-mail; it lives in the project named above, so it is derived from it.
SERVICE_ACCOUNT = f"dev-mlops-vertex@{PROJECT_ID}.iam.gserviceaccount.com"

In [3]:
# One-time bucket creation, left disabled (uncomment to run it):
# ! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}
# Folder inside the bucket that is handed to the pipeline as its root.
PIPELINE_ROOT = f"{BUCKET_URI}/output_info"

#### Initialize AIPlatform

In [4]:
# Set the SDK defaults for this session. LOCATION is passed explicitly so the
# region comes from the notebook settings rather than from the SDK default.
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

#### First component: Preprocess data

In [5]:
@component(base_image="python:3.9", packages_to_install=["pandas", "google-cloud-storage"])
def preprocess_data(
    gcs_bucket_name: str,
    source_blob_name: str,
    proccesed_blob_name: str,
    output_dataset: Output[Dataset]
    ):
    """
    Download a raw price CSV from GCS, normalise it, and write it as the output dataset.

    The Spanish column headers are renamed to Date/Close/Open/High/Low, ``Date``
    is parsed from ``dd.mm.YYYY`` text, and the four price columns are converted
    from text that uses '.' as thousands separator and ',' as decimal separator.

    Args:
        gcs_bucket_name: Name of the bucket that holds the raw file.
        source_blob_name: Object path of the raw CSV inside the bucket.
        proccesed_blob_name: Not read by this component; kept so existing
            callers that pass it keep working.
        output_dataset: Output artifact; the processed CSV is written to
            ``f"{output_dataset.path}.csv"``.
    """
    from google.cloud import storage
    import pandas as pd
    import io

    # Fetch the raw CSV from the bucket as bytes.
    client = storage.Client()
    bucket = client.get_bucket(gcs_bucket_name)
    blob = bucket.blob(source_blob_name)
    downloaded_file = blob.download_as_bytes()

    # Read every column as text. Letting pandas infer dtypes would turn a value
    # such as "18.500" into the float 18.5 (treating the thousands separator as
    # a decimal point) and would make the text clean-up below fail on floats.
    dataset = pd.read_csv(io.BytesIO(downloaded_file), dtype=str)

    # Rename the Spanish headers to the names used by the rest of the pipeline.
    dataset = dataset.rename(columns={'Fecha': 'Date',
                                      'Último': 'Close',
                                      'Apertura': 'Open',
                                      'Máximo': 'High',
                                      'Mínimo': 'Low'})

    # "dd.mm.YYYY" -> "dd-mm-YYYY", then parse into real datetimes.
    dataset['Date'] = dataset['Date'].str.replace('.', '-', regex=False)
    dataset['Date'] = pd.to_datetime(dataset['Date'], format='%d-%m-%Y')

    # Convert "1.234,56" style text to floats. The .str accessor leaves missing
    # cells as NaN instead of raising, and astype(float) keeps them as NaN.
    for price_column in ('Open', 'High', 'Low', 'Close'):
        dataset[price_column] = (
            dataset[price_column]
            .str.replace('.', '', regex=False)   # drop thousands separators
            .str.replace(',', '.', regex=False)  # decimal comma -> decimal point
            .astype(float)
        )

    # Write the processed frame next to the artifact path with a ".csv" suffix
    # (the training component reads it back using the same suffix).
    dataset.to_csv(f"{output_dataset.path}.csv", index=False)
            

#### Second component: Train LSTM model

In [6]:
@component(base_image="python:3.9", 
           packages_to_install=[
               "scikit-learn",
               "pandas",
               "keras",
               "numpy",
               "tensorflow",
               "joblib"
               ])
def training_model(input_df: Input[Dataset],
                   name_file_model: str,
                   output_model: Output[Model]
                  ):
    """
    Train a two-layer LSTM on the min-max scaled 'Close' series and save it.

    Rows dated 2002-01-01 through 2021-12-31 are used for training. Each sample
    is a window of ``time_step`` consecutive scaled closes and the target is the
    value that follows the window.

    Args:
        input_df: Dataset artifact; the CSV is read from ``f"{input_df.path}.csv"``.
        name_file_model: Not read by this component; kept so existing callers
            that pass it keep working.
        output_model: Model artifact; the model is saved to
            ``f"{output_model.path}.keras"``.

    Raises:
        ValueError: If the training range has too few rows to build one window.
    """
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import MinMaxScaler
    from keras.models import Sequential
    from keras.layers import LSTM, Dense

    dataset_transformed = pd.read_csv(f"{input_df.path}.csv")

    # Training rows: 2002-01-01 .. 2021-12-31 (inclusive).
    # NOTE(review): 'Date' is compared as text here, which presumably relies on
    # the CSV holding ISO-formatted (YYYY-MM-DD) dates -- confirm.
    is_train_row = (dataset_transformed['Date'] >= '2002-01-01') & (dataset_transformed['Date'] <= '2021-12-31')
    # .copy() so the scaled column is written to an independent frame rather
    # than to a slice of dataset_transformed (avoids chained-assignment issues).
    train_data = dataset_transformed[is_train_row].copy()

    # Hold-out rows: 2022-01-01 onwards. Only counted here; not evaluated by this component.
    test_data = dataset_transformed[dataset_transformed['Date'] >= '2022-01-01']
    print(f"Train rows: {len(train_data)} | test rows: {len(test_data)}")

    # Fit the scaler on the training 'Close' column only and scale it to [0, 1].
    # NOTE(review): the fitted scaler is not persisted with the model.
    min_max_scaler = MinMaxScaler(feature_range=(0,1))
    train_data['Close'] = min_max_scaler.fit_transform(train_data[['Close']])

    print("Datos de entrenamiento con 'Close' escalado:")
    print(train_data['Close'].head(3))

    dataset_train_normalized = train_data['Close'].values
    print("ok training and test set filtered!!")

    # The LSTM takes "time_step" consecutive values as input and predicts the
    # next one; build the training windows accordingly.
    time_step = 60
    m = len(dataset_train_normalized)
    if m <= time_step:
        # Without this the reshape below fails with an opaque IndexError.
        raise ValueError(
            f"Need more than {time_step} training rows to build a window, got {m}"
        )

    X_train = []
    Y_train = []
    for i in range(time_step, m):
        # X: the time_step values before position i; Y: the value at position i.
        X_train.append(dataset_train_normalized[i - time_step:i])
        Y_train.append(dataset_train_normalized[i])

    X_train, Y_train = np.array(X_train), np.array(Y_train)
    print("Conversion de arrays exitosa") 

    # Keras expects (samples, time steps, features).
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

    # Network dimensions.
    dim_entrada = (X_train.shape[1], 1)  # time_step values of a single feature
    dim_salida = 1
    na = 50  # units per LSTM layer

    # Build the model: two stacked LSTM layers followed by a dense output.
    print("Inicio creacion red lstm")
    model_lstm = Sequential()
    model_lstm.add(LSTM(units=na, return_sequences=True, input_shape= dim_entrada))
    model_lstm.add(LSTM(units=na))
    model_lstm.add(Dense(dim_salida))

    print("compilacion")    
    model_lstm.compile(optimizer='rmsprop', loss='mean_squared_error')

    model_lstm.summary()

    print("inicio training")
    model_lstm.fit(X_train, Y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)
    print("Training had finished!")

    # Save in the native Keras format next to the artifact path; the prediction
    # component loads it back using the same ".keras" suffix.
    print(output_model.path)
    model_lstm.save(f"{output_model.path}.keras")

    print(f"Modelo guardado en: {output_model.path}")


#### Third component: Make predictions

In [7]:
@component(base_image="python:3.9", 
           packages_to_install=[
               "scikit-learn",
               "pandas",
               "keras",
               "numpy",
               "tensorflow",
               "joblib"
               ])
def make_predictions(input_model: Input[Model],
                     gcs_bucket_name: str,
                     source_blob_name: str,
                     proccesed_blob_name: str):
    """
    Load the saved Keras model and predict from a CSV stored in a bucket.

    The 'Close' column of the CSV is min-max scaled, fed to the model as a
    single sequence, and the prediction is mapped back to the original scale
    and printed.

    Args:
        input_model: Model artifact; the model is loaded from
            ``f"{input_model.path}.keras"``.
        gcs_bucket_name: Name of the bucket that holds the input file.
        source_blob_name: Object path of the input CSV inside the bucket.
        proccesed_blob_name: Not read by this component; the prediction is only
            printed. NOTE(review): presumably meant as the destination for the
            prediction -- confirm.
    """
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import MinMaxScaler
    from keras.models import load_model
    from google.cloud import storage
    import io

    # Fetch the input CSV from the bucket as bytes.
    client = storage.Client()
    bucket = client.get_bucket(gcs_bucket_name)
    blob = bucket.blob(source_blob_name)
    downloaded_file = blob.download_as_bytes()

    # Keep only 'Close'. Selecting it explicitly (instead of dropping a fixed
    # list of other columns) still works if the file has extra or fewer columns.
    dataset_predictions = pd.read_csv(io.BytesIO(downloaded_file))
    dataset_predictions = dataset_predictions[['Close']].copy()

    # NOTE(review): the scaler is fitted on the prediction file itself, not on
    # the data the model was trained with -- confirm this is intended.
    min_max_scaler = MinMaxScaler(feature_range=(0,1))
    dataset_predictions['Close'] = min_max_scaler.fit_transform(dataset_predictions[['Close']])

    print("Datos productivos escalados:")
    print(dataset_predictions['Close'].head(3))

    # Load the model saved in .keras format.
    print("Inicio cargue del modelo productivo:")
    modelo_lstm = load_model(f"{input_model.path}.keras")
    print("Fin del cargue del modelo productivo:")

    # Shape the series as (1 sample, n time steps, 1 feature). Convert to a
    # NumPy array first: reshaping the DataFrame directly to 3-D is not supported.
    close_values = dataset_predictions['Close'].to_numpy()
    dataset_predictions_formated = np.reshape(close_values, (1, close_values.shape[0], 1))
    print(f"dataset_predictions_formated: {dataset_predictions_formated.shape}")

    # Predict on the scaled sequence.
    prediccion_escalada = modelo_lstm.predict(dataset_predictions_formated)

    # Undo the scaling to get the value in original units.
    prediccion_final = min_max_scaler.inverse_transform(prediccion_escalada)

    print(f"Predicción: {prediccion_final[0][0]}")
    
    

    

#### Definir un pipeline de test

In [14]:
@dsl.pipeline(
    name="test-pipeline-2",
    pipeline_root=PIPELINE_ROOT
)
def csv_preprocessing_pipeline(
    project: str = "datapath-deploy-api-v1-434102",
    gcp_region: str = "us-central1"):
    """
    Preprocess the raw CSV, train the LSTM model, and run a prediction.

    The three steps run inside an exit handler, so the notification e-mail task
    is the handler's exit task.

    Args:
        project: Not read by the pipeline body; kept for interface compatibility.
        gcp_region: Not read by the pipeline body; kept for interface compatibility.
    """
    notify_email_task = VertexNotificationEmailOp(recipients=["michael.morapp@gmail.com"])

    with dsl.ExitHandler(notify_email_task):

        # Data preprocessing
        preprocess_task = preprocess_data(
            gcs_bucket_name="demo_vertext_01",
            source_blob_name="raw_info/ISA_Historical_Info_2002_2024.csv",
            proccesed_blob_name="output_info/processed_ISA_Historical_Info_2002_2024.csv",
        )
        preprocess_task.set_display_name("Preprocessing Data has finished")

        # Components are called with keyword arguments only (KFP SDK v2 rejects
        # positional ones). Consuming the upstream output already orders the
        # tasks, so no explicit .after() is needed.
        modeling_task = training_model(
            input_df=preprocess_task.output,
            name_file_model="model_lstm",
        )
        modeling_task.set_display_name("Training Model has finished")

        predictions_task = make_predictions(
            input_model=modeling_task.output,
            gcs_bucket_name="demo_vertext_01",
            source_blob_name="prod_info/ISA_Historical_Info_Prod.csv",
            proccesed_blob_name="prod_info/prediction_ISA.csv",
        )
        predictions_task.set_display_name("Making Predictions has finished")


#### Compilar el pipeline de test:

In [15]:
# Serialize the pipeline definition to pipeline_demo_test.json.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=csv_preprocessing_pipeline,
    package_path="pipeline_demo_test.json",
)



#### Run the pipeline

In [16]:
# Use the notebook-level settings (PROJECT_ID, LOCATION, SERVICE_ACCOUNT)
# instead of repeating their values, so there is one place to change them.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Build the job from the compiled pipeline file.
job = aiplatform.PipelineJob(
    display_name="test-pipeline-job-training",
    template_path="pipeline_demo_test.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=True,
    project=PROJECT_ID,
    location=LOCATION,
)

print("Submit pipeline job ....")
# Pass the service account by keyword so the call does not depend on
# the parameter order of submit().
job.submit(service_account=SERVICE_ACCOUNT)


Submit pipeline job ....
Creating PipelineJob
PipelineJob created. Resource name: projects/172483762390/locations/us-central1/pipelineJobs/test-pipeline-2-20241022032331
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/172483762390/locations/us-central1/pipelineJobs/test-pipeline-2-20241022032331')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/test-pipeline-2-20241022032331?project=172483762390
