In [None]:
import google.cloud.aiplatform as aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

# --- Notebook configuration ---------------------------------------------------
# Staging bucket for the packaged trainer and for Vertex AI job artifacts.
BUCKET_URI = "gs://sid-vertex-mlops"
REGION = "us-central1"

# Machine type and prebuilt training container used by the worker pool.
TRAIN_COMPUTE = "n1-standard-4"
TRAIN_VERSION = "xgboost-cpu.1-1"

# Built from TRAIN_VERSION so the version tag and the image URI cannot drift apart.
TRAIN_IMAGE = f"us-docker.pkg.dev/vertex-ai/training/{TRAIN_VERSION}:latest"

# CPU-only worker: no accelerators are attached.
machine_spec = {"machine_type": TRAIN_COMPUTE, "accelerator_count": 0}

# Pass the region explicitly so jobs do not depend on the SDK/environment default.
aiplatform.init(project="udemy-mlops", location=REGION, staging_bucket=BUCKET_URI)

! rm -rf custom
! mkdir custom

setup_py = """
import setuptools

setuptools.setup(
    install_requires=[
        'cloudml-hypertune',
        'gcsfs',
        'category_encoders==2.6.1',
        'imbalanced-learn==0.11.0',
        'scikit-learn>=0.24.0',
    ],
    packages=setuptools.find_packages()
)
"""

# setup_py = "import setuptools\n\nsetuptools.setup(\n\n    install_requires=[\n\n        'cloudml-hypertune','gcsfs','category_encoders','imbalanced-learn',\n\n    ],\n\n    packages=setuptools.find_packages())"
! echo "$setup_py" > custom/setup.py

! mkdir custom/trainer
! touch custom/trainer/__init__.py

In [None]:
%%writefile custom/trainer/task.py
import pandas as pd
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from category_encoders import HashingEncoder
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from google.cloud import storage
import hypertune
import argparse

storage_client = storage.Client()
bucket = storage_client.bucket("sid-kubeflow-v1")

parser = argparse.ArgumentParser()
parser.add_argument("--n_estimators", dest="n_estimators",default=20, type=int, help="Number of estimators")
parser.add_argument("--learning_rate", dest="learning_rate",default=0.2, type=float, help="Learning Rate")

args = parser.parse_args()


def load_data(file_path):
    """Read a CSV into a DataFrame.

    Args:
        file_path: Location of the CSV; handed to ``pandas.read_csv`` unchanged.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path)

def preprocess_data(df):
    """Turn the raw coupon DataFrame into model features and labels.

    Steps, in order:
      1. Drop the ``car``, ``toCoupon_GEQ5min`` and ``direction_opp`` columns.
      2. Fill missing values with each column's first mode, then drop
         duplicate rows.
      3. Bucket ``age`` into ranges; any value not listed in the lookup table
         lands in ``'>50'``.
      4. Build ``passanger_destination``, ``marital_hasChildren`` and
         ``temperature_weather`` by joining the string form of their two
         source columns with ``-``, then drop the source columns.
      5. Drop ``gender`` and ``RestaurantLessThan20``.
      6. Replace the ordinal string categories with integer codes.

    Args:
        df: DataFrame holding the columns referenced above plus the label ``Y``.

    Returns:
        ``(x, y)`` where ``x`` is every processed column except ``Y`` and
        ``y`` is the ``Y`` column.
    """
    cleaned = df.drop(columns=['car', 'toCoupon_GEQ5min', 'direction_opp'])
    column_modes = cleaned.mode().iloc[0]
    cleaned = cleaned.fillna(column_modes).drop_duplicates()

    # Raw age label -> age range; anything else falls through to '>50'.
    age_buckets = {
        'below21': '<21',
        '21': '21-30', '26': '21-30',
        '31': '31-40', '36': '31-40',
        '41': '41-50', '46': '41-50',
    }
    features = cleaned.copy()
    features['age'] = [age_buckets.get(raw_age, '>50') for raw_age in cleaned['age']]

    # New column -> the pair of source columns it is built from.
    combined_columns = {
        'passanger_destination': ('passanger', 'destination'),
        'marital_hasChildren': ('maritalStatus', 'has_children'),
        'temperature_weather': ('temperature', 'weather'),
    }
    for combined, (left, right) in combined_columns.items():
        features[combined] = features[left].astype(str) + '-' + features[right].astype(str)

    consumed = ['passanger', 'destination', 'maritalStatus', 'has_children',
                'temperature', 'weather', 'Y']
    features = features.drop(columns=consumed)

    # Re-attach the label so it ends up as the last column.
    features = pd.concat([features, cleaned['Y']], axis=1)
    features = features.drop(columns=['gender', 'RestaurantLessThan20'])

    # The four visit-frequency columns share one ordinal scale.
    visit_frequency = {'never': 0, 'less1': 1, '1~3': 2, '4~8': 3, 'gt8': 4}
    ordinal_codes = {
        'expiration': {'2h': 0, '1d': 1},
        'age': {'<21': 0, '21-30': 1, '31-40': 2, '41-50': 3, '>50': 4},
        'education': {
            'Some High School': 0,
            'High School Graduate': 1,
            'Some college - no degree': 2,
            'Associates degree': 3,
            'Bachelors degree': 4,
            'Graduate degree (Masters or Doctorate)': 5,
        },
        'Bar': visit_frequency,
        'CoffeeHouse': visit_frequency,
        'CarryAway': visit_frequency,
        'Restaurant20To50': visit_frequency,
        'income': {
            'Less than $12500': 0,
            '$12500 - $24999': 1,
            '$25000 - $37499': 2,
            '$37500 - $49999': 3,
            '$50000 - $62499': 4,
            '$62500 - $74999': 5,
            '$75000 - $87499': 6,
            '$87500 - $99999': 7,
            '$100000 or More': 8,
        },
        'time': {'7AM': 0, '10AM': 1, '2PM': 2, '6PM': 3, '10PM': 4},
    }
    encoded = features.replace(ordinal_codes)

    return encoded.drop('Y', axis=1), encoded.Y

def train_model(x_train, y_train, learning_rate, n_estimators, max_depth=None):
    """Fit an ``XGBClassifier`` on the given training data and return it.

    Args:
        x_train: Training features.
        y_train: Training labels.
        learning_rate: Passed to ``XGBClassifier`` as ``learning_rate``.
        n_estimators: Passed to ``XGBClassifier`` as ``n_estimators``.
        max_depth: Passed to ``XGBClassifier`` as ``max_depth`` (``None`` by default).

    Returns:
        The fitted classifier.
    """
    hyperparams = dict(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        random_state=42,  # fixed seed
        use_label_encoder=False,
    )
    classifier = XGBClassifier(**hyperparams)
    classifier.fit(x_train, y_train)
    return classifier

def evaluate_model(model, x_test, y_test, x_sm_train_hashing, y_sm_train):
    """Score ``model`` on the held-out split.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Held-out features.
        y_test: Held-out labels.
        x_sm_train_hashing: Unused; kept so existing call sites keep working.
        y_sm_train: Unused; kept so existing call sites keep working.

    Returns:
        ``(accuracy, precision)`` computed on ``(x_test, y_test)``.
    """
    # Only the test-set class predictions feed the returned metrics, so the
    # probability and training-set predictions are not computed.
    y_pred = model.predict(x_test)
    return accuracy_score(y_test, y_pred), precision_score(y_test, y_pred)

def encode_features(x, n_components=27):
    """Hash-encode the categorical columns of ``x``.

    A new ``HashingEncoder`` is fitted on ``x`` itself on every call, then
    applied to ``x`` with its index reset to 0..n-1.

    Args:
        x: Feature DataFrame containing the five categorical columns listed below.
        n_components: Passed to ``HashingEncoder`` as ``n_components``.

    Returns:
        The encoder's transform of ``x`` (index reset).
    """
    categorical_cols = [
        'passanger_destination',
        'marital_hasChildren',
        'occupation',
        'coupon',
        'temperature_weather',
    ]
    encoder = HashingEncoder(cols=categorical_cols, n_components=n_components)
    encoder = encoder.fit(x)
    return encoder.transform(x.reset_index(drop=True))

def oversample_data(x_train_hashing, y_train):
    """Resample the training data with SMOTE (seeded with ``random_state=42``).

    Args:
        x_train_hashing: Encoded training features.
        y_train: Training labels.

    Returns:
        ``(features, labels)`` as returned by ``SMOTE.fit_resample``.
    """
    smote = SMOTE(random_state=42)
    resampled_x, resampled_y = smote.fit_resample(x_train_hashing, y_train)
    return resampled_x, resampled_y

def get_score(model, x, y, x_test, y_test):
    """Fit ``model`` on ``(x, y)`` and return its ROC AUC on ``(x_test, y_test)``.

    Note that ``model`` is (re)fitted in place. The score uses the second
    column of ``predict_proba``.
    """
    model.fit(x, y)
    second_column_proba = model.predict_proba(x_test)[:, 1]
    return roc_auc_score(y_test, second_column_proba)

# --- Training run -------------------------------------------------------------
input_file = "gs://sid-kubeflow-v1/coupon-recommendation/in-vehicle-coupon-recommendation.csv"
df = load_data(input_file)

# Hyperparameter values for this run, taken from the parsed command line.
n_estimators = args.n_estimators
learning_rate = args.learning_rate

x, y = preprocess_data(df)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Impute both splits with the training-set modes. The modes are computed once
# and the frames are reassigned rather than filled in place, which avoids
# mutating the frames handed back by train_test_split.
train_modes = x_train.mode().iloc[0]
x_train = x_train.fillna(train_modes)
x_test = x_test.fillna(train_modes)

model_name = 'xgboost'
print("Training and evaluating", model_name, "model:")
x_train_hashing = encode_features(x_train)
x_test_hashing = encode_features(x_test)

# Only the training split is oversampled; the test split is left as-is.
x_sm_train_hashing, y_sm_train = oversample_data(x_train_hashing, y_train)

pipeline = train_model(x_sm_train_hashing, y_sm_train, learning_rate, n_estimators, max_depth=None)

accuracy, precision = evaluate_model(pipeline, x_test_hashing, y_test, x_sm_train_hashing, y_sm_train)
# Print both metrics so the value computed for precision is not silently dropped.
print("accuracy:", accuracy, "precision:", precision)

# Report the metric under the tag 'accuracy'; the tuning job's metric_spec
# uses the same name.
hpt = hypertune.HyperTune()
hpt.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag='accuracy',
    metric_value=accuracy
)

In [None]:
! rm -f custom.tar custom.tar.gz
! tar cvf custom.tar custom
! gzip custom.tar
! gsutil cp custom.tar.gz $BUCKET_URI/xgboost_classification.tar.gz

In [None]:
# Boot disk attached to the training worker.
DISK_TYPE = "pd-ssd"
DISK_SIZE = 200
disk_spec = dict(boot_disk_type=DISK_TYPE, boot_disk_size_gb=DISK_SIZE)

# One replica that runs the `trainer.task` module from the uploaded package
# inside the prebuilt training image.
worker_pool_spec = [
    dict(
        replica_count=1,
        machine_spec=machine_spec,
        disk_spec=disk_spec,
        python_package_spec=dict(
            executor_image_uri=TRAIN_IMAGE,
            package_uris=[f"{BUCKET_URI}/xgboost_classification.tar.gz"],
            python_module="trainer.task",
        ),
    )
]

# The custom job describes a single trial; the tuning job below wraps it.
job = aiplatform.CustomJob(
    worker_pool_specs=worker_pool_spec,
    display_name="xgboost_hpt_tuning",
)

# Search space: each key is the name of a flag accepted by trainer/task.py.
search_space = {
    "n_estimators": hpt.IntegerParameterSpec(min=30, max=40, scale="linear"),
    "learning_rate": hpt.DoubleParameterSpec(min=0.2, max=0.5, scale="linear"),
}

hpt_job = aiplatform.HyperparameterTuningJob(
    display_name="xgboost_hpt_job",
    custom_job=job,
    # "accuracy" is the metric tag the trainer reports through hypertune.
    metric_spec={"accuracy": "maximize"},
    parameter_spec=search_space,
    search_algorithm=None,
    max_trial_count=2,
    parallel_trial_count=2,
)

hpt_job.run()