In [1]:
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \
                                        google-cloud-aiplatform



ERROR: Ignored the following versions that require a different python version: 2.0.0 Requires-Python >=3.7.0,<3.12.0; 2.0.0-beta.14 Requires-Python >=3.7.0,<3.12.0; 2.0.0-beta.17 Requires-Python >=3.7.0,<3.12.0; 2.0.0-rc.1 Requires-Python >=3.7.0,<3.12.0; 2.0.0-rc.2 Requires-Python >=3.7.0,<3.12.0; 2.0.0b15 Requires-Python >=3.7.0,<3.12.0; 2.0.0b16 Requires-Python >=3.7.0,<3.12.0; 2.0.0b2 Requires-Python >=3.7.0,<3.12.0; 2.0.0b3 Requires-Python >=3.7.0,<3.12.0; 2.0.0b4 Requires-Python >=3.7.0,<3.12.0; 2.0.0b5 Requires-Python >=3.7.0,<3.12.0; 2.0.1 Requires-Python >=3.7.0,<3.12.0; 2.1.0 Requires-Python >=3.7.0,<3.12.0; 2.1.1 Requires-Python >=3.7.0,<3.12.0; 2.1.2 Requires-Python >=3.7.0,<3.12.0; 2.1.3 Requires-Python >=3.7.0,<3.12.0; 2.10.0 Requires-Python >=3.7.0,<3.12.0; 2.11.0 Requires-Python <3.12.0,>=3.7.0; 2.12.0 Requires-Python <3.12.0,>=3.7.0; 2.13.0 Requires-Python <3.12.0,>=3.7.0; 2.13.1 Requires-Python <3.12.0,>=3.7.0; 2.14.0 Requires-Python <3.12.0,>=3.8.0; 2.14.1 Requires-P




In [2]:
# restart kernel
import os

if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

In [1]:
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! pip3 freeze | grep aiplatform
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

KFP SDK version: 2.9.0


'grep' is not recognized as an internal or external command,
operable program or batch file.


google_cloud_pipeline_components version: 1.0.33


In [2]:
import kfp
import typing
from typing import Dict
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,ModelDeployOp)
from google_cloud_pipeline_components.types import artifact_types

In [3]:
PROJECT_ID = "hip-lightning-435508-s1"  # Replace with your Google Cloud project ID
REGION = "us-central1"  # Adjust the region as needed
PIPELINE_ROOT = "gs://mlops_team4_de2024"  # Replace with your GCS bucket URI


In [4]:
# Train Test Split
@dsl.component(
    packages_to_install=["pandas", "scikit-learn==1.3.2"],
    base_image="python:3.10.7-slim"
)
def train_test_split(dataset: Input[Dataset], dataset_train: Output[Dataset], dataset_test: Output[Dataset]):
    '''Splits the California housing dataset into training and testing sets.'''
    import pandas as pd
    import logging 
    import sys
    from sklearn.model_selection import train_test_split as tts

    logging.basicConfig(stream=sys.stdout, level=logging.INFO) 
    
    # Load data from the dataset
    alldata = pd.read_csv(dataset.path, index_col=None)
    train, test = tts(alldata, test_size=0.3)
    
    # Save the splits
    train.to_csv(dataset_train.path + ".csv" , index=False, encoding='utf-8-sig')
    test.to_csv(dataset_test.path + ".csv" , index=False, encoding='utf-8-sig')


In [5]:
# Training Component
@dsl.component(
    packages_to_install=["pandas", "scikit-learn==1.3.2"],
    base_image="python:3.10.7-slim"
)
def train_regression_model(features: Input[Dataset], model: Output[Model]):
    '''Train a regression model using Linear Regression.'''
    import pandas as pd
    from sklearn.linear_model import LinearRegression        
    import pickle 
    
    # Load the training data
    data = pd.read_csv(features.path+".csv")
    
    # Train a Linear Regression model
    model_lr = LinearRegression()
    X = data.drop('median_house_value', axis=1)  # We can change 'median_house_value' to our target column name
    y = data['median_house_value']
    model_lr.fit(X, y)

    # Save the model to the specified path
    file_name = model.path + f".pkl"
    with open(file_name, 'wb') as file:  
        pickle.dump(model_lr, file)   


In [6]:
# Model Evaluation
@dsl.component(
    packages_to_install=["pandas", "scikit-learn==1.3.2", "numpy"],
    base_image="python:3.10.7-slim"
)
def evaluate_model(
    test_set: Input[Dataset],
    model: Input[Model],
    metrics: Output[Metrics]
):
    '''Evaluate the trained regression model.'''
    import pandas as pd
    import pickle
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    import numpy as np

    # Load test data and model
    data = pd.read_csv(test_set.path + ".csv")
    X_test = data.drop('median_house_value', axis=1)
    y_test = data['median_house_value']
    
    model_file = model.path + ".pkl"
    loaded_model = pickle.load(open(model_file, 'rb'))

    # Predictions
    y_pred = loaded_model.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Log metrics
    metrics.log_metric("mean_absolute_error", mae)
    metrics.log_metric("root_mean_squared_error", rmse)


In [9]:
@dsl.component(
    packages_to_install=["google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def upload_model_to_gcs(project_id: str, model_repo: str, model: Input[Model]):
    '''upload model to gsc'''
    from google.cloud import storage   
    import logging 
    import sys
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)    
  
    # upload the model to GCS
    client = storage.Client(project=project_id)
    bucket = client.bucket(model_repo)
    blob = bucket.blob(str(model.metadata["algo"]) + '_model' + str(model.metadata["file_type"])) 
    blob.upload_from_filename(model.path + str(model.metadata["file_type"]))       
    
    print("Saved the model to GCP bucket : " + model_repo)