In [None]:
! pip install kfp
!pip install google-cloud-pipeline-components
!pip install gcsfs
!pip install scikit-learn

# Set parameters and initialize aiplatform client library

In [None]:
# Set parameters: the Google Cloud project and region handed to the client below.
project_id = 'my-final-project-ise-543'
location = 'us-central1'

# Initialize the Vertex AI client library with the project/region defaults.
from google.cloud import aiplatform
aiplatform.init(project=project_id, location=location)

# Decorators used to declare the pipeline and its components
# (the original import listed `component` twice).
from kfp.v2.dsl import pipeline, component

  from kfp.v2.dsl import pipeline, component, component


# Define components

## Common dataset preparation steps

In [None]:

from kfp.v2.dsl import component, InputPath, OutputPath
@component(packages_to_install=["pandas", "numpy", "fsspec", "gcsfs"])
def perform_initial_data_preparation(input_dataset_path: str,
                                     output_dataset_path: OutputPath('Dataset')):
    """Clean the raw dataset and write the result as a CSV file.

    Three fixes are applied: missing ``cigsPerDay`` values of non-smokers are
    set to 0, missing ``education`` values get the extra label 0, and
    ``totChol`` is capped at 600.

    Args:
        input_dataset_path: Location of the raw CSV, read with ``pd.read_csv``.
        output_dataset_path: Path the cleaned CSV is written to (no index column).
    """
    import pandas as pd
    import numpy as np  # not referenced below; kept so the component's imports are unchanged

    data = pd.read_csv(input_dataset_path)

    # A non-smoker (currentSmoker == 0) with no recorded cigsPerDay smokes 0 per day.
    non_smoker_without_count = (data['currentSmoker'] == 0) & data['cigsPerDay'].isna()
    data['cigsPerDay'] = data['cigsPerDay'].mask(non_smoker_without_count, 0)

    # Missing education becomes its own label, 0.
    data['education'] = data['education'].fillna(0)

    # Cap totChol at 600 to remove outliers.
    data['totChol'] = data['totChol'].clip(upper=600)

    data.to_csv(output_dataset_path, index=False)



## Train / validation split

In [None]:
from kfp.v2.dsl import component, InputPath, OutputPath


@component(packages_to_install=["scikit-learn", "pandas"])
def split_dataset(input_dataset_path: InputPath('Dataset'),
                  train_data_path: OutputPath('Dataset'),
                  validation_data_path: OutputPath('Dataset')):
    """Split the dataset 80/20 into training and validation CSV files.

    The split is stratified on the ``TenYearCHD`` column so both parts keep the
    same class proportions.

    Args:
        input_dataset_path: CSV file containing a ``TenYearCHD`` column.
        train_data_path: Output path for the training rows (80%).
        validation_data_path: Output path for the validation rows (20%).
    """
    from sklearn.model_selection import train_test_split
    import pandas as pd

    df = pd.read_csv(input_dataset_path)

    # A fixed seed makes the split reproducible: without it every execution
    # of this step (e.g. on a cache miss) hands different rows to the
    # downstream training and evaluation steps.
    train_data, validation_data = train_test_split(
        df,
        test_size=0.2,
        stratify=df['TenYearCHD'],
        random_state=42,
    )

    train_data.to_csv(train_data_path, index=False)
    validation_data.to_csv(validation_data_path, index=False)


## Imputing values

In [None]:
from kfp.v2.dsl import Output
from kfp.v2.dsl import Artifact

@component(packages_to_install=["pandas", "joblib","scikit-learn","imbalanced-learn==0.11.0"])
def impute_median_training(training_dataset_path: InputPath('Dataset'),
                   imputed_dataset_path: OutputPath('Dataset'),
                   scaler_path: OutputPath('Artifact'),
                   median: OutputPath('Artifact'),
                           features: OutputPath('Artifact')):
    """Impute, scale and feature-select the training split.

    Steps, in order:
      1. Fill missing values of ``totChol``, ``BMI``, ``heartRate``, ``a1c``
         and ``glucose`` with the column median of the training data.
      2. Drop every row that still contains a missing value.
      3. Fit a ``StandardScaler`` on all feature columns and scale them.
      4. Keep the 10 features ranked best by ``SelectKBest(f_regression)``.

    Args:
        training_dataset_path: CSV with the training rows, including ``TenYearCHD``.
        imputed_dataset_path: Output CSV holding the selected, scaled features
            plus the ``TenYearCHD`` column.
        scaler_path: Output file for the fitted scaler (joblib).
        median: Output CSV with columns ``Column`` / ``Median`` holding the
            training medians used for imputation.
        features: Output file for the list of selected feature names (joblib).
    """
    from sklearn.preprocessing import StandardScaler
    import joblib
    import pandas as pd
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression

    # Load the training dataset
    df = pd.read_csv(training_dataset_path)

    # Median imputation; the medians are remembered so that the validation
    # data can later be filled with the *training* statistics.
    median_values = {}
    for column in ['totChol', 'BMI', 'heartRate', 'a1c', 'glucose']:
        med = df[column].median()
        df[column] = df[column].fillna(med)
        median_values[column] = med

    median_df = pd.DataFrame(list(median_values.items()), columns=['Column', 'Median'])

    # Drop the rows that still contain missing values and renumber the rows.
    # The renumbering matters: the scaled feature frame built below gets a
    # fresh 0..n-1 index, and pd.concat(axis=1) aligns on index labels. With
    # the gapped index left behind by dropna, features and labels of
    # different rows would be paired and NaN-padded rows would appear.
    df = df.dropna().reset_index(drop=True)

    target_column = df['TenYearCHD']
    features_to_scale = df.drop('TenYearCHD', axis=1)

    # Standardize every feature column
    scaler = StandardScaler()
    scaled_features_array = scaler.fit_transform(features_to_scale)
    scaled_features_df = pd.DataFrame(scaled_features_array, columns=features_to_scale.columns)

    # Rank the scaled features against the target and keep the best 10
    selector = SelectKBest(score_func=f_regression, k=10)  # You can adjust k as needed
    selector.fit(scaled_features_df, target_column)
    selected_features_indices = selector.get_support(indices=True)
    selected_features_names = list(scaled_features_df.columns[selected_features_indices])

    # Selected features + target; both share the same 0..n-1 index
    X_selected = scaled_features_df[selected_features_names]
    result_df = pd.concat([X_selected, target_column], axis=1)

    # Persist everything the validation step needs to repeat the same transformation
    joblib.dump(selected_features_names, features)
    joblib.dump(scaler, scaler_path)
    median_df.to_csv(median, index=False)

    # Save the processed training data
    result_df.to_csv(imputed_dataset_path, index=False)









## SMOTE oversampling of the training set

In [None]:
@component(packages_to_install=["pandas", "joblib", "scikit-learn","imbalanced-learn==0.11.0"])
def Smote_training(training_dataset_path: InputPath('Dataset'),
                        OS_dataset_path: OutputPath('Dataset')
                       ):
    """Balance the training data with SMOTE and write it as CSV.

    Rows containing missing values are dropped first. An earlier version also
    ran a ``RandomUnderSampler`` whose result was never used; that step has
    been removed because it had no effect on the output.

    Args:
        training_dataset_path: CSV with the feature columns and ``TenYearCHD``.
        OS_dataset_path: Output CSV with the oversampled features and
            ``TenYearCHD``.
    """
    import pandas as pd
    from imblearn.over_sampling import SMOTE

    # Load the training dataset
    df = pd.read_csv(training_dataset_path)
    df.dropna(inplace=True)

    y_train = df['TenYearCHD']
    X_selected = df.drop(columns=['TenYearCHD'])

    # Oversampling using SMOTE (fixed seed for reproducibility)
    smote = SMOTE(random_state=42)
    X_smote, y_smote = smote.fit_resample(X_selected, y_train)

    oversampled_df = pd.concat([X_smote, y_smote], axis=1)

    oversampled_df.to_csv(OS_dataset_path, index=False)







  return component_factory.create_component_from_func(


## Imputing the validation dataset

In [None]:
@component(packages_to_install=["pandas", "numpy", "scikit-learn", "scipy", "joblib"])
def impute_median_validation(
        validation_dataset_path: InputPath('Dataset'),
        median_path: InputPath('Artifact'),  # medians from training
        scaler_path: InputPath('Artifact'),  # scaler from training
        imputed_validation_dataset_path: OutputPath('Dataset'),
        FS_dataset_path: InputPath('Artifact')):
    """Apply the training-time preprocessing to the validation split.

    Nothing is fitted here: the medians, the scaler and the list of selected
    features are all loaded from files given as inputs.

    Args:
        validation_dataset_path: CSV with the validation rows, including
            ``TenYearCHD``.
        median_path: CSV with columns ``Column`` / ``Median`` used to fill
            missing values.
        scaler_path: joblib file holding the scaler applied to the features.
        imputed_validation_dataset_path: Output CSV with the selected, scaled
            features plus ``TenYearCHD``.
        FS_dataset_path: joblib file holding the list of feature names to keep.
    """
    import pandas as pd
    import joblib

    # Load the validation dataset
    df = pd.read_csv(validation_dataset_path)

    # Fill missing values with the medians recorded in the median file
    median_df = pd.read_csv(median_path)
    for column, median_value in zip(median_df['Column'], median_df['Median']):
        df[column] = df[column].fillna(median_value)

    # Drop the rows that still contain missing values and renumber the rows.
    # The renumbering keeps features and labels aligned: the scaled frame
    # built below has a fresh 0..n-1 index and pd.concat(axis=1) aligns on
    # index labels, so a gapped label index would pair labels with the wrong
    # feature rows and create NaN-padded rows.
    df = df.dropna().reset_index(drop=True)

    y_test = df['TenYearCHD']
    X_test = df.drop(columns=['TenYearCHD'])

    # Scale with the already-fitted scaler (transform only, no refit)
    scaler = joblib.load(scaler_path)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

    # Keep the features listed in the feature-name file
    selected_features_names = joblib.load(FS_dataset_path)
    selected_test_df = pd.concat([X_test_scaled[selected_features_names], y_test], axis=1)

    # Save the processed validation data
    selected_test_df.to_csv(imputed_validation_dataset_path, index=False)


  return component_factory.create_component_from_func(


## Selecting features for validation

In [None]:
@component(packages_to_install=["pandas", "joblib"])
def select_features_test(test_dataset_path: InputPath('Dataset'),
                         selected_test_features_path: OutputPath('Dataset')):
    """Copy the test dataset to the output path unchanged.

    Despite its name this step does not select anything: the CSV is read and
    written back as is (without an index column).

    Args:
        test_dataset_path: CSV file to read.
        selected_test_features_path: Path the same data is written to.
    """
    import pandas as pd

    # Load the test dataset
    test_df = pd.read_csv(test_dataset_path)

    # Write it back out without modification
    test_df.to_csv(selected_test_features_path, index=False)


## Train logistic regression

In [None]:
from kfp.v2.dsl import Output
from kfp.v2.dsl import Artifact
from kfp.v2.dsl import Model
from kfp.v2.dsl import Model
from kfp.v2.dsl import Input
from kfp.v2.dsl import InputPath
from kfp.v2.dsl import OutputPath
from kfp.v2.dsl import component

@component(packages_to_install=["pandas", "scikit-learn", "joblib"])
def train_logistic_regression(training_dataset_path: InputPath('Dataset'),
                              trained_model_artifact: Output[Model]):
    """Fit a logistic regression on the training CSV and store it with joblib.

    Args:
        training_dataset_path: CSV whose ``TenYearCHD`` column is the label and
            whose remaining columns are the features.
        trained_model_artifact: Output model artifact; the fitted estimator is
            written to ``model.joblib`` inside the artifact's path.
    """
    import os
    import joblib
    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    # Separate the label column from the features
    frame = pd.read_csv(training_dataset_path)
    labels = frame['TenYearCHD']
    predictors = frame.drop(columns=['TenYearCHD'])

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(predictors, labels)

    # The artifact path is used as a directory that holds model.joblib
    model_dir = trained_model_artifact.path
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(classifier, os.path.join(model_dir, "model.joblib"))

  return component_factory.create_component_from_func(


## Train the remaining models (KNN, random forest, naive Bayes, voting ensemble)

In [None]:
@component(packages_to_install=["pandas", "scikit-learn", "joblib"])
def train_knn(training_dataset_path: InputPath('Dataset'),
              trained_model_artifact: Output[Model]):
    """Fit a k-nearest-neighbours classifier (default settings) and store it.

    Args:
        training_dataset_path: CSV whose ``TenYearCHD`` column is the label and
            whose remaining columns are the features.
        trained_model_artifact: Output model artifact; the fitted estimator is
            written to ``model.joblib`` inside the artifact's path.
    """
    import os
    import joblib
    import pandas as pd
    from sklearn.neighbors import KNeighborsClassifier

    # Separate the label column from the features
    frame = pd.read_csv(training_dataset_path)
    labels = frame['TenYearCHD']
    predictors = frame.drop(columns=['TenYearCHD'])

    classifier = KNeighborsClassifier()
    classifier.fit(predictors, labels)

    # The artifact path is used as a directory that holds model.joblib
    model_dir = trained_model_artifact.path
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(classifier, os.path.join(model_dir, "model.joblib"))


@component(packages_to_install=["pandas", "scikit-learn", "joblib"])
def train_random_forest(training_dataset_path: InputPath('Dataset'),
                        trained_model_artifact: Output[Model]):
    """Fit a random forest classifier on the training CSV and store it.

    Args:
        training_dataset_path: CSV whose ``TenYearCHD`` column is the label and
            whose remaining columns are the features.
        trained_model_artifact: Output model artifact; the fitted estimator is
            written to ``model.joblib`` inside the artifact's path.
    """
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    import joblib
    import os

    # Load the training data
    train_df = pd.read_csv(training_dataset_path)

    y_train = train_df['TenYearCHD']
    X_train = train_df.drop('TenYearCHD', axis=1)

    # Seeded so that repeated runs on the same data build the same forest;
    # an unseeded forest gives slightly different metrics on every run.
    trained_model = RandomForestClassifier(random_state=42)
    trained_model.fit(X_train, y_train)

    # Save the model to the designated gcs output path
    os.makedirs(trained_model_artifact.path, exist_ok=True)
    joblib.dump(trained_model, os.path.join(trained_model_artifact.path, "model.joblib"))


@component(packages_to_install=["pandas", "scikit-learn", "joblib"])
def train_naive_bayes(training_dataset_path: InputPath('Dataset'),
                      trained_model_artifact: Output[Model]):
    """Fit a Gaussian naive Bayes classifier on the training CSV and store it.

    Args:
        training_dataset_path: CSV whose ``TenYearCHD`` column is the label and
            whose remaining columns are the features.
        trained_model_artifact: Output model artifact; the fitted estimator is
            written to ``model.joblib`` inside the artifact's path.
    """
    import os
    import joblib
    import pandas as pd
    from sklearn.naive_bayes import GaussianNB

    # Separate the label column from the features
    frame = pd.read_csv(training_dataset_path)
    labels = frame['TenYearCHD']
    predictors = frame.drop(columns=['TenYearCHD'])

    classifier = GaussianNB()
    classifier.fit(predictors, labels)

    # The artifact path is used as a directory that holds model.joblib
    model_dir = trained_model_artifact.path
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(classifier, os.path.join(model_dir, "model.joblib"))



In [None]:

@component(packages_to_install=["pandas","scikit-learn", "joblib"])
def voting_classifier(training_dataset_path: InputPath('Dataset'),
                      knn_model: Input[Model],
                       rf_model: Input[Model],
                       nb_model: Input[Model],
                       voting_model_artifact: Output[Model]):
    """Fit a soft-voting ensemble of the KNN, random forest and naive Bayes models.

    Args:
        training_dataset_path: CSV whose ``TenYearCHD`` column is the label and
            whose remaining columns are the features.
        knn_model: Model artifact containing ``model.joblib`` (KNN).
        rf_model: Model artifact containing ``model.joblib`` (random forest).
        nb_model: Model artifact containing ``model.joblib`` (naive Bayes).
        voting_model_artifact: Output model artifact; the fitted ensemble is
            written to ``model.joblib`` inside the artifact's path.
    """
    import os
    import joblib
    import pandas as pd
    from sklearn.ensemble import VotingClassifier

    # Separate the label column from the features
    frame = pd.read_csv(training_dataset_path)
    labels = frame['TenYearCHD']
    predictors = frame.drop('TenYearCHD', axis=1)

    # Load the three base estimators from their artifacts
    base_estimators = [
        (label, joblib.load(os.path.join(artifact.path, "model.joblib")))
        for label, artifact in (('knn', knn_model), ('rf', rf_model), ('nb', nb_model))
    ]

    # Soft voting. Per the scikit-learn documentation, fit() trains clones of
    # the supplied estimators, so the ensemble members are refit here.
    ensemble = VotingClassifier(estimators=base_estimators, voting='soft')
    ensemble.fit(predictors, labels)

    # The artifact path is used as a directory that holds model.joblib
    ensemble_dir = voting_model_artifact.path
    os.makedirs(ensemble_dir, exist_ok=True)
    joblib.dump(ensemble, os.path.join(ensemble_dir, "model.joblib"))


## Evaluate the models

In [None]:
from kfp.v2.dsl import Metrics

@component(packages_to_install=["pandas", "scikit-learn", "joblib"])
def evaluate_model(test_dataset_path: InputPath('Dataset'),
                   knn_model: Input[Model],
                   rf_model: Input[Model],
                   nb_model: Input[Model],
                   voting_model: Input[Model],
                   lr_model:  Input[Model],
                   best_model_metrics: Output[Metrics]):
    """Score all five models on the test data and report the best one.

    Accuracy and weighted F1 are computed for every model. The model with the
    highest weighted F1 is reported under the keys ``model``, ``accuracy`` and
    ``f1_score``; in addition ``<name>_accuracy`` and ``<name>_f1_score`` are
    logged for each of ``knn``, ``rf``, ``nb``, ``voting`` and ``lr``.

    Previously the logistic regression was scored but left out of the
    best-model selection, and the naive Bayes metrics were not logged; all
    five models are now treated the same way.

    Args:
        test_dataset_path: CSV whose ``TenYearCHD`` column is the label and
            whose remaining columns are the features.
        knn_model: Model artifact containing ``model.joblib`` (KNN).
        rf_model: Model artifact containing ``model.joblib`` (random forest).
        nb_model: Model artifact containing ``model.joblib`` (naive Bayes).
        voting_model: Model artifact containing ``model.joblib`` (voting ensemble).
        lr_model: Model artifact containing ``model.joblib`` (logistic regression).
        best_model_metrics: Output metrics artifact receiving the values above.
    """
    import os
    import pandas as pd
    import joblib
    from sklearn.metrics import accuracy_score, f1_score

    # Load the test dataset
    test_df = pd.read_csv(test_dataset_path)
    y_test = test_df['TenYearCHD']
    X_test = test_df.drop(columns=['TenYearCHD'])

    # Candidate order matters only for ties: max() keeps the first maximum.
    candidates = [('knn', knn_model),
                  ('rf', rf_model),
                  ('nb', nb_model),
                  ('voting', voting_model),
                  ('lr', lr_model)]

    # (name, accuracy, weighted F1) for every model
    scores = []
    for name, artifact in candidates:
        loaded_model = joblib.load(os.path.join(artifact.path, "model.joblib"))
        predictions = loaded_model.predict(X_test)
        scores.append((name,
                       accuracy_score(y_test, predictions),
                       f1_score(y_test, predictions, average='weighted')))

    # Determine the best model based on F1 score
    best_model = max(scores, key=lambda entry: entry[2])

    # Log the evaluation metrics of the best model
    best_model_metrics.log_metric("accuracy", best_model[1])
    best_model_metrics.log_metric("f1_score", best_model[2])
    best_model_metrics.log_metric("model", best_model[0])

    # Log the metrics of every individual model
    for name, acc, f1 in scores:
        best_model_metrics.log_metric(f"{name}_accuracy", acc)
        best_model_metrics.log_metric(f"{name}_f1_score", f1)




# Pipeline Definition

In [None]:
# Define pipeline
from kfp.v2.dsl import pipeline, Output, Dataset

@pipeline(name="Heart Disease Prediction Pipeline")
def heart_disease_prediction_pipeline(raw_dataset_path: str):
    """Wire the components into one pipeline: prepare, split, preprocess, train, evaluate.

    Args:
        raw_dataset_path: Location of the raw CSV passed to the data
            preparation step.
    """
    # Initial cleaning of the raw data
    prepared = perform_initial_data_preparation(input_dataset_path=raw_dataset_path)

    # Train / validation split
    split = split_dataset(input_dataset_path=prepared.output)

    # Training split: median imputation, scaling and feature selection
    train_prep = impute_median_training(
        training_dataset_path=split.outputs['train_data_path'])

    # Validation split: reuse the medians, scaler and feature list from training
    validation_prep = impute_median_validation(
        validation_dataset_path=split.outputs['validation_data_path'],
        median_path=train_prep.outputs['median'],
        scaler_path=train_prep.outputs['scaler_path'],
        FS_dataset_path=train_prep.outputs['features'])

    # Oversample the processed training data with SMOTE
    oversampling = Smote_training(
        training_dataset_path=train_prep.outputs['imputed_dataset_path'])

    # Pass the processed validation data through select_features_test
    validation_final = select_features_test(
        test_dataset_path=validation_prep.outputs['imputed_validation_dataset_path'])

    # Train the individual models on the oversampled training data
    balanced_training_data = oversampling.outputs['OS_dataset_path']
    lr_training = train_logistic_regression(training_dataset_path=balanced_training_data)
    knn_training = train_knn(training_dataset_path=balanced_training_data)
    rf_training = train_random_forest(training_dataset_path=balanced_training_data)
    nb_training = train_naive_bayes(training_dataset_path=balanced_training_data)

    knn_artifact = knn_training.outputs['trained_model_artifact']
    rf_artifact = rf_training.outputs['trained_model_artifact']
    nb_artifact = nb_training.outputs['trained_model_artifact']

    # Voting ensemble built from the KNN, random forest and naive Bayes models
    voting_training = voting_classifier(
        training_dataset_path=balanced_training_data,
        knn_model=knn_artifact,
        rf_model=rf_artifact,
        nb_model=nb_artifact)

    # Score every model on the processed validation data
    evaluation = evaluate_model(
        test_dataset_path=validation_final.output,
        knn_model=knn_artifact,
        rf_model=rf_artifact,
        nb_model=nb_artifact,
        voting_model=voting_training.outputs['voting_model_artifact'],
        lr_model=lr_training.outputs['trained_model_artifact'])











In [None]:

from kfp.v2 import compiler

# Compile the pipeline function into a job specification file.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=heart_disease_prediction_pipeline,
    package_path="heart_disease_prediction_pipeline.json",
)

# Describe the Vertex AI pipeline job that executes the compiled specification.
run_parameters = {
    'raw_dataset_path': 'gs://heart_prediction/Final Project Dataset (2).csv',
}
pipeline_job = aiplatform.PipelineJob(
    display_name='heart_disease_prediction',
    template_path='heart_disease_prediction_pipeline.json',
    pipeline_root='gs://heart_prediction',
    parameter_values=run_parameters,
    enable_caching=True,
)

# Submit the job.
pipeline_job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/757245801734/locations/us-central1/pipelineJobs/heart-disease-prediction-pipeline-20240503005801
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/757245801734/locations/us-central1/pipelineJobs/heart-disease-prediction-pipeline-20240503005801')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/heart-disease-prediction-pipeline-20240503005801?project=757245801734
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/757245801734/locations/us-central1/pipelineJobs/heart-disease-prediction-pipeline-20240503005801 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:Pi

In [None]:
from kfp.v2.dsl import Metrics

# NOTE(review): this definition reuses the name `evaluate_model` of the
# five-model evaluation component that heart_disease_prediction_pipeline
# calls with knn_model=..., rf_model=..., etc. Once this cell has run, the
# name refers to this single-model version, so re-running the pipeline cell
# afterwards would call a component that does not accept those arguments.
@component(packages_to_install=["pandas", "scikit-learn", "joblib"])
def evaluate_model(test_dataset_path: InputPath('Dataset'),
                   model: Input[Model],
                   metrics: Output[Metrics]):
    """Score a single model on the test data and log its metrics.

    ``accuracy`` and weighted ``f1_score`` are always logged. When the
    confusion matrix is 2x2, its cells are logged too as ``true_negatives``,
    ``false_positives``, ``false_negatives`` and ``true_positives``.

    Args:
        test_dataset_path: CSV whose ``TenYearCHD`` column is the label and
            whose remaining columns are the features.
        model: Model artifact containing ``model.joblib``.
        metrics: Output metrics artifact receiving the values above.
    """
    import pandas as pd
    import joblib
    from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

    # Load the test dataset
    test_df = pd.read_csv(test_dataset_path)
    X_test = test_df.drop(columns=['TenYearCHD'])
    y_test = test_df['TenYearCHD']

    # Load the trained model
    model_file_path = model.path + "/model.joblib"
    trained_model = joblib.load(model_file_path)

    # Make predictions
    y_pred = trained_model.predict(X_test)

    # Calculate and log the summary metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    metrics.log_metric("accuracy", accuracy)
    metrics.log_metric("f1_score", f1)

    # Log each component of the confusion matrix separately. The matrix is
    # built over the classes the model was trained on, so its shape does not
    # depend on which classes happen to occur in this test set; the four-way
    # unpacking is only meaningful for a binary (2x2) matrix.
    matrix = confusion_matrix(y_test, y_pred, labels=trained_model.classes_)
    if matrix.shape == (2, 2):
        tn, fp, fn, tp = (int(cell) for cell in matrix.ravel())
        metrics.log_metric("true_negatives", tn)
        metrics.log_metric("false_positives", fp)
        metrics.log_metric("false_negatives", fn)
        metrics.log_metric("true_positives", tp)

  return component_factory.create_component_from_func(
