In [3]:
import kfp
from kfp import dsl
from kfp.dsl import Input, Output, Dataset, Model, component
from azure.storage.blob import BlobServiceClient

@component(
    base_image='python:3.12.3',
    packages_to_install=[
        'pandas',
        'mlflow',
        'scikit-learn',
        'joblib',
        'minio',
        'azure-storage-blob'
    ]
)
def download_from_azure(
    x_train: Output[Dataset],
    x_test: Output[Dataset],
    y_train: Output[Dataset],
    y_test: Output[Dataset],
    azure_connection_string: str,
    container_name: str = 'csvstorage',
    x_train_blob: str="X_train.csv",
    x_test_blob: str="X_test.csv",
    y_train_blob: str="y_train.csv",
    y_test_blob: str="y_test.csv"
):
    import pandas as pd
    from azure.storage.blob import BlobServiceClient

    # Initialize Azure Blob Service Client
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=azure_connection_string)
    
    # Download function
    def download_blob_to_file(blob_name, output_path):
        blob_client = blob_service_client.get_blob_client(container='csvstorage', blob=blob_name)
        with open(output_path, "wb") as download_file:
            download_file.write(blob_client.download_blob().readall())
    
    # Download all files
    download_blob_to_file(x_train_blob, x_train.path)
    download_blob_to_file(x_test_blob, x_test.path)
    download_blob_to_file(y_train_blob, y_train.path)
    download_blob_to_file(y_test_blob, y_test.path)

@component(
    base_image='python:3.12.3',
    packages_to_install=[
        'pandas',
        'mlflow',
        'scikit-learn',
        'joblib',
        'minio',
        'dagshub',
        'prometheus-client'
    ]
)
def train_and_evaluate(
    x_train: Input[Dataset],
    x_test: Input[Dataset],
    y_train: Input[Dataset],
    y_test: Input[Dataset],
    best_model: Output[Model],
    metrics_port: int = 8000  # Port to expose Prometheus metrics
):
    import pandas as pd
    import mlflow
    import mlflow.sklearn
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LinearRegression, Ridge, Lasso
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import r2_score, mean_squared_error
    import joblib
    from minio import Minio
    import dagshub
    from prometheus_client import start_http_server, Gauge
    import time
    
    # Start Prometheus metrics server
    start_http_server(metrics_port)
    
    # Create Prometheus metrics
    r2_metric = Gauge('model_r2_score', 'Best model R2 score')
    mse_metric = Gauge('model_mse', 'Best model Mean Squared Error')
    rmse_metric = Gauge('model_rmse', 'Best model Root Mean Squared Error')
    model_type = Gauge('model_type', 'Best model type', ['model_name'])
    
    import os
    
    # Initialize DagsHub and MLflow
    dagshub.auth.add_app_token("c1b64f0e0a5268dae2ca62d0ae4bec20fdecb445")
    dagshub.init(repo_owner='manish-bagdwal1', repo_name='MLOps-Pipeline-Local-Batch-Training', mlflow=True)

    # Load Preprocessed Data
    X_train = pd.read_csv(x_train.path)
    X_test = pd.read_csv(x_test.path)
    y_train = pd.read_csv(y_train.path).values.ravel()
    y_test = pd.read_csv(y_test.path).values.ravel()

    # Define Models and Hyperparameter Grid
    models = {
        'LinearRegression': (LinearRegression(), {}),
        'Ridge': (Ridge(), {'alpha': [0.1, 1.0, 10.0]}),
        'Lasso': (Lasso(), {'alpha': [0.1, 1.0, 10.0]}),
        'RandomForest': (RandomForestRegressor(), {'n_estimators': [50, 100, 200]})
    }

    best_model_instance = None
    best_score = float('-inf')
    best_model_name = ''
    best_mse = float('inf')
    best_rmse = float('inf')

    mlflow.set_tracking_uri('https://dagshub.com/manish-bagdwal1/MLOps-Pipeline-Local-Batch-Training.mlflow')
    mlflow.set_experiment('kubeflow_experiment')

    # Train and Evaluate Models
    for model_name, (model, param_grid) in models.items():
        with mlflow.start_run(run_name=model_name):
            if param_grid:
                grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')
                grid_search.fit(X_train, y_train)
                model_instance = grid_search.best_estimator_
                best_params = grid_search.best_params_
            else:
                model_instance = model.fit(X_train, y_train)
                best_params = {}
            
            predictions = model_instance.predict(X_test)
            r2 = r2_score(y_test, predictions)
            mse = mean_squared_error(y_test, predictions)
            rmse = mean_squared_error(y_test, predictions, squared=False)
            
            mlflow.log_params(best_params)
            mlflow.log_metric('R2 Score', r2)
            mlflow.log_metric('MSE', mse)
            mlflow.log_metric('RMSE', rmse)
            mlflow.sklearn.log_model(model_instance, model_name)
            
            if r2 > best_score:
                best_score = r2
                best_mse = mse
                best_rmse = rmse
                best_model_instance = model_instance
                best_model_name = model_name

    # Save Best Model
    joblib.dump(best_model_instance, best_model.path)
    
    # Upload to MinIO
    client = Minio('minio-service:9000', access_key='minio', secret_key='minio123', secure=False)
    if not client.bucket_exists("models"):
        client.make_bucket("models")
    client.fput_object('models', f'{best_model_name}.pkl', best_model.path)
    
    # Update Prometheus metrics
    r2_metric.set(best_score)
    mse_metric.set(best_mse)
    rmse_metric.set(best_rmse)
    model_type.labels(model_name=best_model_name).set(1)
    
    
    
 

In [4]:
@dsl.pipeline(
    name='Regression Model Training Pipeline',
    description='A pipeline that downloads data from Azure Blob Storage and trains models'
)
def ml_pipeline(
    azure_connection_string: str ,
    container_name: str = "csvstorage",
    x_train_blob: str = "X_train.csv",
    x_test_blob: str = "X_test.csv",
    y_train_blob: str = "y_train.csv",
    y_test_blob: str = "y_test.csv",
    metrics_port: int = 8000
):
    # Download data from Azure
    download_task = download_from_azure(
        azure_connection_string=azure_connection_string,
        container_name=container_name,
        x_train_blob=x_train_blob,
        x_test_blob=x_test_blob,
        y_train_blob=y_train_blob,
        y_test_blob=y_test_blob
    )
    
    # Train models
    train_task = train_and_evaluate(
        x_train=download_task.outputs['x_train'],
        x_test=download_task.outputs['x_test'],
        y_train=download_task.outputs['y_train'],
        y_test=download_task.outputs['y_test']
    )
    
    # Configure Prometheus scraping
    train_task.add_pod_annotation("prometheus.io/scrape", "true")
    train_task.add_pod_annotation("prometheus.io/port", str(metrics_port))
    train_task.add_pod_annotation("prometheus.io/path", "/metrics")
    
    # Optional: Add timeout to ensure metrics are scraped
    train_task.set_timeout(180)  # 3 minutes timeout
    

if __name__ == '__main__':
    kfp.compiler.Compiler().compile(ml_pipeline, 'kubeflow_mlflow_pipeline_promv2.yaml')

AttributeError: 'PipelineTask' object has no attribute 'add_pod_annotation'