In [7]:
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \ google-cloud-aiplatform

ERROR: Ignored the following versions that require a different python version: 2.0.0 Requires-Python >=3.7.0,<3.12.0; 2.0.0-beta.14 Requires-Python >=3.7.0,<3.12.0; 2.0.0-beta.17 Requires-Python >=3.7.0,<3.12.0; 2.0.0-rc.1 Requires-Python >=3.7.0,<3.12.0; 2.0.0-rc.2 Requires-Python >=3.7.0,<3.12.0; 2.0.0b15 Requires-Python >=3.7.0,<3.12.0; 2.0.0b16 Requires-Python >=3.7.0,<3.12.0; 2.0.0b2 Requires-Python >=3.7.0,<3.12.0; 2.0.0b3 Requires-Python >=3.7.0,<3.12.0; 2.0.0b4 Requires-Python >=3.7.0,<3.12.0; 2.0.0b5 Requires-Python >=3.7.0,<3.12.0; 2.0.1 Requires-Python >=3.7.0,<3.12.0; 2.1.0 Requires-Python >=3.7.0,<3.12.0; 2.1.1 Requires-Python >=3.7.0,<3.12.0; 2.1.2 Requires-Python >=3.7.0,<3.12.0; 2.1.3 Requires-Python >=3.7.0,<3.12.0; 2.10.0 Requires-Python >=3.7.0,<3.12.0; 2.11.0 Requires-Python <3.12.0,>=3.7.0; 2.12.0 Requires-Python <3.12.0,>=3.7.0; 2.13.0 Requires-Python <3.12.0,>=3.7.0; 2.13.1 Requires-Python <3.12.0,>=3.7.0; 2.14.0 Requires-Python <3.12.0,>=3.8.0; 2.14.1 Requires-P



In [9]:
# Print the version of the KFP SDK visible to the python3 interpreter.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
# Show the installed package line(s) whose name contains "aiplatform".
! pip3 freeze | grep aiplatform
# Print the version of google_cloud_pipeline_components visible to python3.
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

KFP SDK version: 2.9.0
google-cloud-aiplatform==1.70.0
google_cloud_pipeline_components version: 1.0.33


In [11]:
from typing import NamedTuple

import google.cloud.aiplatform as aiplatform
import kfp
from kfp import compiler, dsl
from kfp.dsl import Artifact, Dataset, Input, Metrics, Model, Output, component

In [13]:
# Google Cloud project ID -- replace with your own.
PROJECT_ID = "hip-lightning-435508-s1"
# Region used for Vertex AI -- adjust as needed.
REGION = "us-central1"
# GCS bucket URI -- replace with your own bucket.
PIPELINE_ROOT = "gs://mlops_team4_de2024"


In [14]:
# Point the Vertex AI SDK at the project and region configured above.
aiplatform.init(project=PROJECT_ID, location=REGION)

In [16]:
# Data Ingestion
@dsl.component(
    packages_to_install=["pandas","google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def download_data(project_id: str, bucket: str, file_name: str, dataset: Output[Dataset]):
    '''Download one object from GCS into the output dataset artifact.

    Args:
        project_id: Google Cloud project used to create the storage client.
        bucket: Name of the source bucket. A leading "gs://" and a trailing "/"
            are stripped, so both "my-bucket" and "gs://my-bucket" are accepted.
        file_name: Name of the object inside the bucket.
        dataset: Output artifact; the object is written to ``dataset.path + ".csv"``.

    Raises:
        Exception: any error raised while downloading is logged and re-raised.
    '''
    from google.cloud import storage
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # client.bucket() expects a bare bucket name, not a gs:// URI.
    bucket_name = bucket.removeprefix("gs://").rstrip("/")

    try:
        # Initialize GCS client
        logging.info(f"Initializing storage client for project: {project_id}")
        client = storage.Client(project=project_id)

        # Access the bucket and blob (kept in a separate name so the string
        # parameter is not shadowed by the Bucket object)
        logging.info(f"Accessing bucket: {bucket_name}, file: {file_name}")
        source_bucket = client.bucket(bucket_name)
        blob = source_bucket.blob(file_name)

        # Download the file; readers in this notebook open `<artifact path>.csv`
        file_path = dataset.path + ".csv"
        logging.info(f"Downloading file to: {file_path}")
        blob.download_to_filename(file_path)
        logging.info('Downloaded Data successfully!')
    except Exception as e:
        logging.error(f"Error in download_data: {str(e)}")
        raise  # Re-raise with the original traceback so the step fails visibly

In [19]:
# Training Linear Regression
@dsl.component(
    packages_to_install=['pandas', 'scikit-learn==1.3.2'],
    base_image="python:3.10.7-slim"
)
def train_lr(features: Input[Dataset], out_model: Output[Model]) -> NamedTuple('outputs', [('metrics', dict)]):
    '''Train a Linear Regression model and return performance metrics.

    Args:
        features: Input dataset; the CSV is read from ``features.path + ".csv"``
            and must contain a 'Median_House_Value' column used as the target.
        out_model: Output artifact; the fitted model is pickled to
            ``out_model.path + ".pkl"``.

    Returns:
        A named tuple with a single field ``metrics``: a dict holding
        "mean_absolute_error" and "mean_squared_error" computed on the
        held-out 30% split.
    '''
    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    import logging
    import pickle
    import sys  # required by logging.basicConfig(stream=sys.stdout) below
    from typing import NamedTuple

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Load dataset
    df = pd.read_csv(features.path + ".csv")
    logging.info(f"Columns in dataset: {df.columns}")

    # Split dataset into features and target variable
    X = df.drop(columns=['Median_House_Value'])  # Replace with your target column
    y = df['Median_House_Value']

    # Split the data into training and testing sets (fixed seed for repeatability)
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

    # Train the model
    model_lr = LinearRegression()
    model_lr.fit(x_train, y_train)

    # Calculate metrics; cast to plain floats so the dict holds built-in types only
    predictions = model_lr.predict(x_test)
    metrics_dict = {
        "mean_absolute_error": float(mean_absolute_error(y_test, predictions)),
        "mean_squared_error": float(mean_squared_error(y_test, predictions)),
    }
    logging.info(f"Metrics: {metrics_dict}")

    # Save the model
    model_file = out_model.path + ".pkl"
    with open(model_file, 'wb') as f:
        pickle.dump(model_lr, f)

    # Same shape as the return annotation: one field named `metrics`
    outputs = NamedTuple('outputs', [('metrics', dict)])
    return outputs(metrics_dict)

In [24]:
# Model Prediction
@dsl.component(
    packages_to_install=['pandas', 'scikit-learn==1.3.2'],
    base_image="python:3.10.7-slim"
)
def predict_lr(model: Input[Model], features: Input[Dataset], results: Output[Dataset]):
    """Predict house prices using the trained Linear Regression model.

    Args:
        model: Input model artifact; the pickle is read from ``model.path + ".pkl"``.
        features: Input dataset; the CSV is read from ``features.path + ".csv"``
            and must contain every feature column selected below.
        results: Output artifact; the input rows plus a 'predicted_price'
            column are written as CSV to ``results.path``.
    """
    import pandas as pd
    import pickle
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Load the features dataset
    df = pd.read_csv(features.path + ".csv")

    # Load the saved model; the context manager closes the file handle.
    # NOTE(review): pickle.load executes code embedded in the file -- presumably
    # the artifact always comes from the training step of this pipeline; confirm.
    filename = model.path + ".pkl"
    with open(filename, 'rb') as model_file:
        model_lr = pickle.load(model_file)

    # Select features for prediction (using the actual feature names)
    xNew = df[[
        'Median_Income', 'Median_Age', 'Tot_Rooms', 'Tot_Bedrooms', 'Population',
        'Households', 'Latitude', 'Longitude', 'Distance_to_coast',
        'Distance_to_LA', 'Distance_to_SanDiego', 'Distance_to_SanJose',
        'Distance_to_SanFrancisco'
    ]]

    # Make predictions
    df['predicted_price'] = model_lr.predict(xNew)
    logging.info(f"Predictions: {df['predicted_price'].tolist()}")

    # Save results to the output dataset
    df.to_csv(results.path, index=False, encoding='utf-8-sig')

In [5]:
# Training Component
@dsl.component(
    packages_to_install=["pandas", "scikit-learn==1.3.2"],
    base_image="python:3.10.7-slim"
)
def train_regression_model(features: Input[Dataset], model: Output[Model]):
    '''Fit a Linear Regression on the full input dataset and pickle it.

    The CSV is read from ``features.path + ".csv"``; 'Median_House_Value' is
    the target and every other column is a predictor. The fitted model is
    written to ``model.path + ".pkl"``.
    '''
    import pickle

    import pandas as pd
    from sklearn.linear_model import LinearRegression

    # Read the training table and separate target from predictors
    training_frame = pd.read_csv(features.path + ".csv")
    target = training_frame['Median_House_Value']
    predictors = training_frame.drop('Median_House_Value', axis=1)

    # fit() returns the estimator itself, so construction and fitting chain
    regressor = LinearRegression().fit(predictors, target)

    # Persist the fitted estimator next to the artifact path
    pickle_path = model.path + ".pkl"
    with open(pickle_path, 'wb') as handle:
        pickle.dump(regressor, handle)

In [22]:
# Model Evaluation
@dsl.component(
    packages_to_install=["pandas", "scikit-learn==1.3.2", "numpy"],
    base_image="python:3.10.7-slim"
)
def evaluate_model(
    test_set: Input[Dataset],
    model: Input[Model],
    metrics: Output[Metrics]
):
    '''Evaluate the trained regression model.

    Args:
        test_set: Input dataset; the CSV is read from ``test_set.path + ".csv"``
            and must contain a 'Median_House_Value' column used as ground truth.
        model: Input model artifact; the pickle is read from ``model.path + ".pkl"``.
        metrics: Output metrics artifact; receives "mean_absolute_error" and
            "root_mean_squared_error".
    '''
    import pandas as pd
    import pickle
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    import numpy as np

    # Load test data and model
    data = pd.read_csv(test_set.path + ".csv")
    X_test = data.drop('Median_House_Value', axis=1)
    y_test = data['Median_House_Value']

    # Context manager closes the model file after unpickling.
    # NOTE(review): pickle.load executes code embedded in the file -- presumably
    # the artifact always comes from a training step of this pipeline; confirm.
    model_file = model.path + ".pkl"
    with open(model_file, 'rb') as fh:
        loaded_model = pickle.load(fh)

    # Predictions
    y_pred = loaded_model.predict(X_test)

    # Calculate metrics as plain Python floats
    mae = float(mean_absolute_error(y_test, y_pred))
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    # Log metrics
    metrics.log_metric("mean_absolute_error", mae)
    metrics.log_metric("root_mean_squared_error", rmse)

In [23]:
# Model upload
@dsl.component(
    packages_to_install=["google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def upload_model_to_gcs(project_id: str, model_repo: str, model: Input[Model]):
    """Upload the trained model to Google Cloud Storage (GCS).

    Args:
        project_id: Google Cloud project used to create the storage client.
        model_repo: Name of the destination bucket. A leading "gs://" and a
            trailing "/" are stripped before the bucket is looked up.
        model: Input model artifact. The local file is ``model.path`` followed
            by ``model.metadata['file_type']`` when that key is set, otherwise
            by ".pkl".

    Raises:
        Exception: any error raised while uploading is logged and re-raised so
            the pipeline step fails instead of silently skipping the upload.
    """
    from google.cloud import storage
    import logging
    import sys  # required by logging.basicConfig(stream=sys.stdout) below

    # Set up logging
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Object name in the bucket; assumes a linear regression model saved as a pickle file
    model_file_name = "lr_model.pkl"

    # train_lr in this notebook writes `<path>.pkl` and sets no 'file_type'
    # metadata, so fall back to ".pkl" instead of raising KeyError.
    file_suffix = model.metadata.get('file_type', '.pkl')
    local_model_path = f"{model.path}{file_suffix}"

    # client.bucket() expects a bare bucket name, not a gs:// URI.
    bucket_name = model_repo.removeprefix("gs://").rstrip("/")

    try:
        # Initialize GCS client
        client = storage.Client(project=project_id)

        # Get the bucket and create a blob for the model
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(model_file_name)

        # Upload the model file
        blob.upload_from_filename(local_model_path)

        logging.info(f"Saved the model to GCP bucket: {model_repo} as {model_file_name}")

    except Exception as e:
        logging.error(f"Error uploading model to GCS: {str(e)}")
        raise  # surface the failure, matching download_data's error handling

In [32]:
# Import necessary modules
from kfp import dsl
from kfp import compiler

# Define your pipeline function
@dsl.pipeline(
    name="california-housing-pipeline"
)
def california_housing_pipeline(project_id: str, data_bucket: str, trainset_filename: str, model_repo: str, testset_filename: str):
    # Wiring: download train set -> train -> download test set -> predict -> upload model.

    # 1. Fetch the training set from the data bucket
    train_download_task = download_data(
        project_id=project_id, bucket=data_bucket, file_name=trainset_filename
    )

    # 2. Fit the linear regression on the downloaded training set
    training_task = train_lr(features=train_download_task.outputs["dataset"])

    # 3. Fetch the test set from the same bucket
    test_download_task = download_data(
        project_id=project_id, bucket=data_bucket, file_name=testset_filename
    )

    # The trained model artifact feeds both remaining steps
    trained_model = training_task.outputs['out_model']

    # 4. Score the test set with the trained model
    predict_lr(model=trained_model, features=test_download_task.outputs['dataset'])

    # 5. Upload the trained model (not the predictions) to the model repository bucket
    upload_model_to_gcs(project_id=project_id, model_repo=model_repo, model=trained_model)

# Serialize the pipeline definition to a YAML package in the working directory.
compiler.Compiler().compile(
    package_path="california_housing_pipeline.yaml",
    pipeline_func=california_housing_pipeline,
)

In [34]:
import google.cloud.aiplatform as aip

# Before initializing, make sure to set the GOOGLE_APPLICATION_CREDENTIALS
# environment variable to the path of your service account.
# You can set this in your environment or use the following command in the terminal:
# export GOOGLE_APPLICATION_CREDENTIALS="path/to/your/service-account-file.json"

# Initialize AI Platform
aip.init(
    project=PROJECT_ID,
    location=REGION,
)

# Prepare the pipeline job
job = aip.PipelineJob(
    display_name="california-housing-pipeline",
    enable_caching=False,
    template_path="california_housing_pipeline.yaml",
    # Derived from the config cell so the bucket is defined in one place;
    # with the current PIPELINE_ROOT this is "gs://mlops_team4_de2024/artifacts".
    pipeline_root=f"{PIPELINE_ROOT}/artifacts",
    location=REGION,
    parameter_values={
        'project_id': PROJECT_ID,
        # Bare bucket name (no "gs://" scheme): download_data passes this
        # value straight to storage.Client.bucket(), same as model_repo below.
        'data_bucket': 'mlops_team4_de2024',
        'trainset_filename': 'train_data.csv',
        'testset_filename': 'test_data.csv',
        'model_repo': 'mlops_team4_de2024'
    }
)

# Run the pipeline job (requires Application Default Credentials, see the comment above)
job.run()

DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.