## Define constants

In [None]:
from datetime import datetime
# Run identifier: local wall-clock time at cell execution, formatted as YYYYMMDDHHMMSS.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
# -> e.g. '20220305052457'

In [None]:
# Fill in PROJECT_ID and BUCKET_NAME before running the rest of the notebook.
PROJECT_ID = ""
REGION = "us-central1"
BUCKET_NAME = ""

# Values derived from the settings above.
PIPELINE_ROOT = f"gs://{BUCKET_NAME}"
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"
# -> 'us-central1-aiplatform.googleapis.com'
DISPLAY_NAME = f"titanic_{TIMESTAMP}"
KAGGLE_COMMIT_MESSAGE = f"update at {TIMESTAMP}"

In [None]:
# Object-name prefixes inside the bucket (no leading "gs://<bucket>/").
RAW_DATA_PATH = "input/titanic"
# All three outputs of one run share the same timestamped prefix;
# the files written under it have distinct names.
TRANSFORMED_DATA_PATH = f"output/titanic/{TIMESTAMP}"
TRAINED_MODEL_PATH = f"output/titanic/{TIMESTAMP}"
PREDICTED_DATA_PATH = f"output/titanic/{TIMESTAMP}"

In [None]:
# Container images used as base images for the three pipeline components.
BASE_IMAGE_TRANSFORM = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/kaggle/transform:latest"
BASE_IMAGE_TRAINER = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/kaggle/trainer:latest"
BASE_IMAGE_PREDICTOR = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/kaggle/predictor:latest"

## Define components and a pipeline

In [None]:
# Set up your Google Cloud project: point the gcloud CLI at PROJECT_ID.
# IPython expands $PROJECT_ID from the notebook namespace before running the shell command.
!gcloud config set project $PROJECT_ID

In [None]:
import google.cloud.aiplatform as aip

In [None]:
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import component

In [None]:
# Initialise the Vertex AI SDK defaults used by the PipelineJob created later.
# staging_bucket is documented as a "gs://..." URI, so pass PIPELINE_ROOT
# (gs://<BUCKET_NAME>) rather than the bare bucket name; location is set
# explicitly so jobs run in the same region as the component images.
aip.init(project=PROJECT_ID, location=REGION, staging_bucket=PIPELINE_ROOT)

In [None]:
@component(base_image=BASE_IMAGE_TRANSFORM)
def transform(project_id: str, bucket_name: str, raw_data_path: str, transformed_data_path: str) -> str:
    """Preprocess the raw Titanic CSVs and write model-ready CSVs back to the bucket.

    Reads ``train.csv`` and ``test.csv`` from ``raw_data_path``, encodes/imputes
    the feature columns, and writes ``y_train.csv``, ``X_train.csv`` and
    ``X_test.csv`` under ``transformed_data_path`` (row index included).

    Args:
        project_id: Project passed to the Cloud Storage client.
        bucket_name: Name of the bucket holding the input and output objects.
        raw_data_path: Object-name prefix of the raw CSV files.
        transformed_data_path: Object-name prefix for the transformed CSV files.

    Returns:
        ``transformed_data_path``, so downstream tasks can depend on this step.
    """
    # Imports stay inside the function: the component body runs in its own container.
    from io import BytesIO

    import numpy as np
    import pandas as pd
    from google.cloud import storage as gcs

    client = gcs.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    def read_csv(blob_path):
        # Download one object from the bucket and parse it as CSV.
        return pd.read_csv(BytesIO(bucket.blob(blob_path).download_as_string()))

    def write_csv(frame, blob_path):
        # Serialise with the index so the reader can restore it via index_col=0.
        bucket.blob(blob_path).upload_from_string(frame.to_csv(sep=","))

    # input (gender_submission.csv is not needed for the transform, so it is not downloaded)
    train = read_csv(raw_data_path + '/train.csv')
    test = read_csv(raw_data_path + '/test.csv')
    n_train = len(train)

    # run: process train and test together so both get identical encoding/imputation
    data = pd.concat([train, test], sort=False)

    # Assign results back instead of calling inplace methods on a column selection:
    # chained inplace updates do not modify `data` under pandas copy-on-write.
    data['Sex'] = data['Sex'].replace(['male', 'female'], [0, 1])

    data['Embarked'] = data['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

    # A single integer drawn from [mean - std, mean + std) fills every missing Age.
    # Bounds are truncated to int explicitly and the generator is seeded so that
    # repeated pipeline runs produce the same transformed data.
    age_avg = data['Age'].mean()
    age_std = data['Age'].std()
    age_fill = np.random.RandomState(0).randint(int(age_avg - age_std), int(age_avg + age_std))
    data['Age'] = data['Age'].fillna(age_fill)

    delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
    data = data.drop(columns=delete_columns)

    # Split back by position: the first n_train rows came from train.csv.
    train = data.iloc[:n_train]
    test = data.iloc[n_train:]

    y_train = train['Survived']
    X_train = train.drop(columns='Survived')
    X_test = test.drop(columns='Survived')

    # output
    write_csv(y_train, transformed_data_path + "/y_train.csv")
    write_csv(X_train, transformed_data_path + "/X_train.csv")
    write_csv(X_test, transformed_data_path + "/X_test.csv")

    return transformed_data_path

In [None]:
# debug local
# NOTE(review): `transform` is wrapped by @component, so this call may only build
# a task object instead of executing the function body -- confirm before relying on it.
transform(
    project_id=PROJECT_ID,
    bucket_name=BUCKET_NAME,
    raw_data_path=RAW_DATA_PATH,
    transformed_data_path=TRANSFORMED_DATA_PATH,
)

In [None]:
@component(base_image=BASE_IMAGE_TRAINER)
def trainer(project_id: str, bucket_name: str, transformed_data_path: str, trained_model_path: str) -> str:
    """Fit a logistic-regression model on the transformed training data.

    Reads ``y_train.csv`` and ``X_train.csv`` from ``transformed_data_path``,
    fits the model, pickles it to ``model_titanic.sav`` and uploads that file
    under ``trained_model_path``.

    Args:
        project_id: Project passed to the Cloud Storage client.
        bucket_name: Name of the bucket holding the input and output objects.
        transformed_data_path: Object-name prefix of the transformed CSV files.
        trained_model_path: Object-name prefix for the pickled model.

    Returns:
        ``trained_model_path``, so downstream tasks can depend on this step.
    """
    # Imports stay inside the function: the component body runs in its own container.
    from io import BytesIO
    import pickle

    import pandas as pd
    from google.cloud import storage as gcs
    from sklearn.linear_model import LogisticRegression

    local_model_file = 'model_titanic.sav'

    # input
    client = gcs.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    def read_csv(blob_path):
        # First CSV column is the row index written by the previous step.
        return pd.read_csv(BytesIO(bucket.blob(blob_path).download_as_string()), index_col=0)

    # The target file has a single data column; take it as a 1-D Series so the
    # estimator receives a 1-D target instead of a one-column frame.
    y_train = read_csv(transformed_data_path + "/y_train.csv").iloc[:, 0]
    X_train = read_csv(transformed_data_path + "/X_train.csv")

    # run
    model = LogisticRegression(penalty='l2', solver='sag', random_state=0)
    model.fit(X_train, y_train)

    # output: close the file before uploading so the pickle is fully flushed to disk
    with open(local_model_file, 'wb') as model_file:
        pickle.dump(model, model_file)
    trained_model_blob_model = bucket.blob(trained_model_path + "/model_titanic.sav")
    trained_model_blob_model.upload_from_filename(local_model_file)

    return trained_model_path

In [None]:
# debug local
# NOTE(review): `trainer` is wrapped by @component, so this call may only build
# a task object instead of executing the function body -- confirm before relying on it.
trainer(
    project_id=PROJECT_ID,
    bucket_name=BUCKET_NAME,
    transformed_data_path=TRANSFORMED_DATA_PATH,
    trained_model_path=TRAINED_MODEL_PATH,
)

In [None]:
@component(base_image=BASE_IMAGE_PREDICTOR)
def predictor(project_id: str, bucket_name: str, raw_data_path: str, transformed_data_path: str, trained_model_path: str, predicted_data_path: str, kaggle_commit_message: str) -> str:
    """Predict on the test set, submit to Kaggle and archive the submission file.

    Loads ``X_test.csv`` and the pickled model from the bucket, writes the
    predictions into the ``Survived`` column of ``gender_submission.csv``
    (used as the submission template), submits the resulting
    ``submission.csv`` to the ``titanic`` competition and uploads it under
    ``predicted_data_path``.

    Args:
        project_id: Project passed to the Cloud Storage client.
        bucket_name: Name of the bucket holding the input and output objects.
        raw_data_path: Object-name prefix of the raw CSV files.
        transformed_data_path: Object-name prefix of the transformed CSV files.
        trained_model_path: Object-name prefix of the pickled model.
        predicted_data_path: Object-name prefix for the submission file.
        kaggle_commit_message: Message attached to the Kaggle submission.

    Returns:
        ``predicted_data_path``.
    """
    # Imports stay inside the function: the component body runs in its own container.
    from io import BytesIO
    import pickle

    import pandas as pd
    from google.cloud import storage as gcs
    from kaggle.api.kaggle_api_extended import KaggleApi

    local_model_file = "model_titanic.sav"
    local_submission_file = "submission.csv"

    # input
    client = gcs.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    # gender_submission: submission template
    template_blob = bucket.blob(raw_data_path + '/gender_submission.csv')
    gender_submission = pd.read_csv(BytesIO(template_blob.download_as_string()))

    # X_test: first CSV column is the row index written by the transform step
    x_test_blob = bucket.blob(transformed_data_path + "/X_test.csv")
    X_test = pd.read_csv(BytesIO(x_test_blob.download_as_string()), index_col=0)

    # model
    bucket.blob(trained_model_path + "/model_titanic.sav").download_to_filename(local_model_file)
    # SECURITY: pickle.load executes arbitrary code from the file; only point
    # trained_model_path at artifacts produced by the trusted trainer step.
    with open(local_model_file, "rb") as model_file:
        loaded_model = pickle.load(model_file)

    # run
    y_pred = loaded_model.predict(X_test)

    # submit: predictions are assigned positionally, so the template rows are
    # assumed to be in the same order as X_test -- TODO confirm
    gender_submission["Survived"] = y_pred.astype(int)
    gender_submission.to_csv(local_submission_file, index=False)

    api = KaggleApi()
    api.authenticate()
    api.competition_submit(local_submission_file, message=kaggle_commit_message, competition='titanic')

    # output
    submission_blob = bucket.blob(predicted_data_path + "/submission.csv")
    submission_blob.upload_from_filename(local_submission_file)

    return predicted_data_path

In [None]:
# debug local
# NOTE(review): `predictor` is wrapped by @component, so this call may only build
# a task object instead of executing the function body -- confirm before relying on it.
predictor(
    project_id=PROJECT_ID,
    bucket_name=BUCKET_NAME,
    raw_data_path=RAW_DATA_PATH,
    transformed_data_path=TRANSFORMED_DATA_PATH,
    trained_model_path=TRAINED_MODEL_PATH,
    predicted_data_path=PREDICTED_DATA_PATH,
    kaggle_commit_message=KAGGLE_COMMIT_MESSAGE,
)

In [None]:
@dsl.pipeline(
    name="titanic",
    description="pipeline for titanic",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(
        project_id: str = PROJECT_ID,
        bucket_name: str = BUCKET_NAME,
        raw_data_path: str = RAW_DATA_PATH,
        transformed_data_path: str = TRANSFORMED_DATA_PATH,
        trained_model_path: str = TRAINED_MODEL_PATH,
        predicted_data_path: str = PREDICTED_DATA_PATH,
        kaggle_commit_message: str = KAGGLE_COMMIT_MESSAGE,
    ):
    """Chain the transform, trainer and predictor components.

    Each step consumes the path returned by the step before it, which is what
    makes the steps run in order: transform -> trainer -> predictor.
    Parameter defaults are taken from the notebook constants at definition time.
    """
    # Components are instantiated with keyword arguments: KFP SDK v2 rejects
    # positional arguments, and keywords behave the same on older SDKs.
    transform_task = transform(
        project_id=project_id,
        bucket_name=bucket_name,
        raw_data_path=raw_data_path,
        transformed_data_path=transformed_data_path,
    )
    trainer_task = trainer(
        project_id=project_id,
        bucket_name=bucket_name,
        transformed_data_path=transform_task.output,
        trained_model_path=trained_model_path,
    )
    predictor_task = predictor(
        project_id=project_id,
        bucket_name=bucket_name,
        raw_data_path=raw_data_path,
        transformed_data_path=transform_task.output,
        trained_model_path=trainer_task.output,
        predicted_data_path=predicted_data_path,
        kaggle_commit_message=kaggle_commit_message,
    )

## Compile the pipeline

In [None]:
from kfp.v2 import compiler  # noqa: F811

# Compile the pipeline function into a job spec file in the working directory.
# The file name contains no spaces, so no space-to-underscore substitution is needed.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="pipeline_titanic.json",
)

## Run the pipeline

In [None]:
# Create a Vertex AI pipeline job from the compiled spec and run it.
# template_path must match the package_path used when compiling the pipeline.
job = aip.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path="pipeline_titanic.json",
    pipeline_root=PIPELINE_ROOT,
)

job.run()