In [1]:
# Capture the active gcloud project via IPython shell capture; the result is a
# list of output lines, so the first element is taken as the project id.
project = !gcloud config get-value project
PROJECT_ID = project[0]
# Display the resolved project id as the cell output.
PROJECT_ID


'molten-unison-414815'

In [2]:
# --- Experiment identity -----------------------------------------------------
REGION = "us-central1"
DATANAME = "fraud"
NOTEBOOK = "02c"

# --- Resources: machine type used when deploying the model to an endpoint ----
DEPLOY_COMPUTE = "n1-standard-2"

# --- Model training ----------------------------------------------------------
VAR_TARGET = "Class"
# Columns to leave out of training; separate multiple names with spaces.
VAR_OMIT = "transaction_id"

In [3]:
# %pip install -U -q google-cloud-pipeline-components


In [4]:
from google.cloud import aiplatform
from datetime import datetime
import kfp
from kfp.v2 import compiler
#import kfp.v2.dsl as dsl
#import google_cloud_pipeline_components as gcc_aip
from google_cloud_pipeline_components.v1.dataset import TabularDatasetCreateOp
from google_cloud_pipeline_components.v1.automl.training_job import AutoMLTabularTrainingJobRunOp
from google_cloud_pipeline_components.v1.endpoint import EndpointCreateOp, ModelDeployOp

from google.cloud import bigquery
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value
import json
import numpy as np

  from kfp.v2 import compiler


In [5]:
# Initialise the Vertex AI SDK for this project/region.
aiplatform.init(project=PROJECT_ID, location=REGION)
# Pin the BigQuery client to the same project as Vertex AI: the queries below
# reference the dataset without a project qualifier, so they resolve against
# the client's project rather than whatever the ambient default happens to be.
bq = bigquery.Client(project=PROJECT_ID)

In [6]:
# Run identifier with second resolution, e.g. 20240220174303.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
# The bucket is named after the project.
BUCKET = PROJECT_ID
# Cloud Storage prefix for this notebook's model artifacts.
URI = "gs://" + "/".join([BUCKET, DATANAME, "models", NOTEBOOK])
# Local scratch directory for this notebook.
DIR = "temp/" + NOTEBOOK

In [7]:
# Read the active gcloud account (core.account) via shell capture and keep the
# first output line. It is later passed as the pipeline's service account.
# NOTE(review): core.account is whichever identity gcloud is using; outside a
# managed notebook this may be a user account rather than a service account — confirm.
SERVICE_ACCOUNT = !gcloud config list --format='value(core.account)' 
SERVICE_ACCOUNT = SERVICE_ACCOUNT[0]
# Display the resolved account as the cell output.
SERVICE_ACCOUNT

'279275268617-compute@developer.gserviceaccount.com'

In [8]:
# List the IAM roles bound to SERVICE_ACCOUNT on this project (one row per role).
!gcloud projects get-iam-policy $PROJECT_ID --filter="bindings.members:$SERVICE_ACCOUNT" --format='table(bindings.role)' --flatten="bindings[].members"


ROLE
roles/editor


In [9]:
# Start from an empty local scratch directory: remove any previous contents,
# then recreate it (including parent directories).
!rm -rf {DIR}
!mkdir -p {DIR}

In [10]:
@kfp.dsl.pipeline(
    name=f'kfp-{NOTEBOOK}-{DATANAME}-{TIMESTAMP}',
    pipeline_root=f'{URI}/{TIMESTAMP}/kfp/',
)
def pipeline(
    project: str,
    dataname: str,
    display_name: str,
    deploy_machine: str,
    bq_source: str,
    var_target: str,
    var_omit: str,
    features: dict,
    labels: dict,
):
    # Four steps: create a tabular dataset from BigQuery, train an AutoML
    # classification model on it, create an endpoint, and deploy the model.
    # `dataname` and `var_omit` are accepted as pipeline parameters but are not
    # referenced by any step below.

    # Step 1: tabular dataset backed by the BigQuery source table.
    dataset_task = TabularDatasetCreateOp(
        project=project,
        display_name=display_name,
        bq_source=bq_source,
        labels=labels,
    )

    # Step 2: AutoML tabular classification training on that dataset, using the
    # table's 'splits' column as the predefined split.
    training_task = AutoMLTabularTrainingJobRunOp(
        project=project,
        display_name=display_name,
        dataset=dataset_task.outputs['dataset'],
        target_column=var_target,
        column_specs=features,
        predefined_split_column_name='splits',
        optimization_prediction_type="classification",
        optimization_objective="maximize-au-prc",
        budget_milli_node_hours=1000,
        disable_early_stopping=False,
        labels=labels,
    )

    # Step 3: endpoint that will serve the trained model.
    endpoint_task = EndpointCreateOp(
        project=project,
        display_name=display_name,
        labels=labels,
    )

    # Step 4: deploy the model to the endpoint on a single fixed replica,
    # routing all traffic to the newly deployed model.
    ModelDeployOp(
        model=training_task.outputs["model"],
        endpoint=endpoint_task.outputs["endpoint"],
        dedicated_resources_machine_type=deploy_machine,
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=1,
        traffic_split={"0": 100},
    )

In [11]:
# Compile the pipeline function into a JSON spec in the local scratch directory.
compiler.Compiler().compile(pipeline_func=pipeline, package_path=f"{DIR}/{NOTEBOOK}.json")

In [12]:
# Copy the compiled pipeline spec to this run's folder under the model URI;
# the PipelineJob below reads its template from that location.
!gsutil cp {DIR}/{NOTEBOOK}.json {URI}/{TIMESTAMP}/kfp/

Copying file://temp/02c/02c.json [Content-Type=application/json]...
/ [1 files][ 46.5 KiB/ 46.5 KiB]                                                
Operation completed over 1 objects/46.5 KiB.                                     


In [13]:
# get feature names
query = f"SELECT * FROM {DATANAME}.INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{DATANAME}_prepped'"
schema = bq.query(query).to_dataframe()
OMIT = VAR_OMIT.split() + [VAR_TARGET, 'splits']
features = schema[~schema.column_name.isin(OMIT)].column_name.tolist()
features = dict.fromkeys(features, 'auto')

In [14]:
# Create the pipeline job from the compiled spec uploaded to Cloud Storage.
# Note: this rebinds the name `pipeline`, which until now referred to the
# pipeline function defined above.
pipeline = aiplatform.PipelineJob(
    display_name=f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}',
    template_path=f"{URI}/{TIMESTAMP}/kfp/{NOTEBOOK}.json",
    parameter_values=dict(
        project=PROJECT_ID,
        dataname=DATANAME,
        display_name=f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}',
        deploy_machine=DEPLOY_COMPUTE,
        bq_source=f'bq://{PROJECT_ID}.{DATANAME}.{DATANAME}_prepped',
        var_target=VAR_TARGET,
        var_omit=VAR_OMIT,
        features=features,
        labels=dict(notebook=NOTEBOOK),
    ),
    labels=dict(notebook=NOTEBOOK),
    enable_caching=False,
)

In [15]:
# Submit the job under the captured account; the recorded output shows the
# call polling the job state while it runs.
response = pipeline.run(service_account=SERVICE_ACCOUNT)

Creating PipelineJob
PipelineJob created. Resource name: projects/279275268617/locations/us-central1/pipelineJobs/kfp-02c-fraud-20240220174303-20240220174414
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/279275268617/locations/us-central1/pipelineJobs/kfp-02c-fraud-20240220174303-20240220174414')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/kfp-02c-fraud-20240220174303-20240220174414?project=279275268617
PipelineJob projects/279275268617/locations/us-central1/pipelineJobs/kfp-02c-fraud-20240220174303-20240220174414 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/279275268617/locations/us-central1/pipelineJobs/kfp-02c-fraud-20240220174303-20240220174414 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/279275268617/locations/us-central1/pipelineJobs/kfp-02c-fraud-20240220174303-20240220174414 current state:
PipelineState.PIPELINE_STATE_R

In [16]:
# Console link for the run; the run id is the last segment of the resource name.
print(
    "Review the Pipeline as it runs here:\n"
    f"https://console.cloud.google.com/vertex-ai/locations/{REGION}/pipelines/runs/"
    f"{pipeline.resource_name.rsplit('/', 1)[-1]}?project={PROJECT_ID}"
)


Review the Pipeline as it runs here:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/kfp-02c-fraud-20240220174303-20240220174414?project=molten-unison-414815


In [17]:
# Tabulate the runs of this pipeline; the name passed here is the same
# f-string used as the pipeline name in the @kfp.dsl.pipeline decorator.
aiplatform.get_pipeline_df(pipeline = f'kfp-{NOTEBOOK}-{DATANAME}-{TIMESTAMP}')


Unnamed: 0,pipeline_name,run_name,param.input:bq_source,param.input:labels,param.input:features,param.input:var_target,param.input:dataname,param.vmlmd_lineage_integration,param.input:deploy_machine,param.input:display_name,param.input:project,param.input:var_omit
0,kfp-02c-fraud-20240220174303,kfp-02c-fraud-20240220174303-20240220174414,bq://molten-unison-414815.fraud.fraud_prepped,{'notebook': '02c'},"{'V23': 'auto', 'V21': 'auto', 'Amount': 'auto...",Class,fraud,{'pipeline_run_component': {'parent_task_names...,n1-standard-2,02c_fraud_20240220174303,molten-unison-414815,transaction_id


In [18]:
# Find models carrying this notebook's label. Order newest-first explicitly:
# a later cell takes models[0] as the model of interest, and every run of this
# notebook adds another model with the same label.
models = aiplatform.Model.list(
    filter=f'labels.notebook={NOTEBOOK}',
    order_by='create_time desc',
)


In [19]:
# Work with the first model in the label-filtered list and show its resource name.
model = models[0]
model.resource_name

'projects/279275268617/locations/us-central1/models/2514624874165567488'

In [20]:
# Fetch the model's evaluation and convert it to a plain dict for inspection.
evaluation = model.get_model_evaluation().to_dict() # get first evaluation

In [21]:
# Top-level fields available on the evaluation.
evaluation.keys()

dict_keys(['name', 'metricsSchemaUri', 'metrics', 'createTime', 'sliceDimensions', 'modelExplanation'])

In [22]:
# Names of the metrics reported for this evaluation.
evaluation['metrics'].keys()

dict_keys(['auPrc', 'auRoc', 'logLoss', 'confusionMatrix', 'confidenceMetrics'])

In [23]:
# The evaluation's auPrc metric (the pipeline trains with "maximize-au-prc").
evaluation['metrics']['auPrc']

0.9998603

In [24]:
# Inspect one entry of the per-threshold metrics. Index 3 is positional; in the
# recorded run it is the entry with confidenceThreshold 0.01.
evaluation['metrics']['confidenceMetrics'][3]

{'falseNegativeCount': '4',
 'recallAt1': 0.9996136,
 'falsePositiveRateAt1': 0.00038641234,
 'truePositiveCount': '28463',
 'recall': 0.9998595,
 'trueNegativeCount': '28342',
 'confidenceThreshold': 0.01,
 'falsePositiveRate': 0.0043910495,
 'precisionAt1': 0.9996136,
 'confusionMatrix': {'rows': [[28415.0, 1.0, 0.0],
   [10.0, 41.0, 0.0],
   [0.0, 0.0, 0.0]],
  'annotationSpecs': [{'displayName': '0', 'id': '0'},
   {'displayName': '1', 'id': '1'},
   {'displayName': 'DROPPED', 'id': 'DROPPED'}]},
 'precision': 0.9956275,
 'f1ScoreAt1': 0.9996136,
 'f1ScoreMacro': 0.7592993,
 'f1Score': 0.997739,
 'falsePositiveCount': '125',
 'f1ScoreMicro': 0.997739}

In [25]:
# Console link to this model version's evaluation; the evaluation id is the
# last segment of the evaluation resource name.
print(
    "Review this model in the console:\n"
    f"https://console.cloud.google.com/vertex-ai/locations/{REGION}/models/{model.name}"
    f"/versions/{model.version_id}/evaluations/{evaluation['name'].rsplit('/', 1)[-1]}"
    f"?project={PROJECT_ID}"
)


Review this model in the console:
https://console.cloud.google.com/vertex-ai/locations/us-central1/models/2514624874165567488/versions/1/evaluations/4347793670839253050?project=molten-unison-414815


In [26]:
# Same auPrc lookup as shown earlier, repeated ahead of the per-label breakdown.
evaluation['metrics']['auPrc']

0.9998603

In [27]:
# Print each confusion-matrix row next to the display name of its true label.
confusion = evaluation['metrics']['confusionMatrix']
for i, spec in enumerate(confusion['annotationSpecs']):
    print('True Label = ', spec['displayName'], ' has Predicted labels = ', confusion['rows'][i])

True Label =  0  has Predicted labels =  [28415.0, 1.0, 0.0]
True Label =  1  has Predicted labels =  [10.0, 41.0, 0.0]
True Label =  DROPPED  has Predicted labels =  [0.0, 0.0, 0.0]


In [28]:
# Low-level model service client pointed at the regional API endpoint; used
# below to list evaluation slices.
model_client = aiplatform.gapic.ModelServiceClient(
    client_options=dict(api_endpoint=f'{REGION}-aiplatform.googleapis.com')
)

In [29]:
# List the evaluation slices under this evaluation's resource name.
slices = model_client.list_model_evaluation_slices(parent = evaluation['name'])


In [30]:
# Report auPrc per evaluation slice. The loop variable is deliberately not
# called `slice`: a top-level loop variable stays in the notebook namespace and
# would shadow the builtin `slice` for every later cell.
for eval_slice in slices:
    print('Label = ', eval_slice.slice_.value, 'has auPrc = ', eval_slice.metrics['auPrc'])

Label =  0 has auPrc =  0.9998934
Label =  1 has auPrc =  0.9018287


In [31]:
# Pull a few TEST-split rows to use as prediction instances, dropping the
# target, the split column and the omitted columns.
# VAR_OMIT is a space-delimited string, so it is split and re-joined with
# commas; interpolating it raw would produce invalid SQL as soon as it names
# more than one column (or is empty).
pred = bq.query(
    query = f"""
        SELECT * EXCEPT({', '.join([VAR_TARGET, 'splits', *VAR_OMIT.split()])})
        FROM {DATANAME}.{DATANAME}_prepped
        WHERE splits='TEST'
        LIMIT 10
    """
).to_dataframe()

In [32]:
# Preview the first four sampled rows.
pred.head(4)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,73378,1.267971,-0.071333,0.008482,-0.39262,-0.02784,-0.015306,-0.182631,0.109003,0.065205,...,-0.108434,-0.20456,-0.603748,0.051971,-0.627543,0.14416,0.903631,-0.078328,-0.017881,0.0
1,36686,-1.116637,0.766187,3.219085,1.051747,-0.047797,0.510289,0.417527,0.228175,-0.02857,...,-0.308429,0.085421,0.488005,-0.501597,0.54324,0.625938,-0.098797,-0.272474,-0.177326,0.0
2,80210,1.106933,0.208589,1.411978,2.514973,-0.564985,0.670207,-0.659376,0.326511,-0.145195,...,-0.133966,-0.055411,-0.033153,-0.009164,0.002618,0.316512,-0.040788,0.039501,0.022443,0.0
3,67511,-1.585593,1.084266,-0.181189,-0.37581,0.143916,-0.775504,-0.084983,0.913627,-0.070178,...,-0.245192,0.073028,0.223127,-0.163398,-0.239701,-0.36814,0.35491,0.300076,0.041391,0.0


In [33]:
# Convert the Time column to strings before building the request payload.
# NOTE(review): presumably the deployed model expects Time as a string — confirm.
pred['Time'] = pred['Time'].astype(str)
# One dict per row, keyed by column name.
newobs = pred.to_dict(orient='records')
# Show the first instance.
newobs[0]

{'Time': '73378',
 'V1': 1.26797105289876,
 'V2': -0.0713332661988698,
 'V3': 0.008482458695155421,
 'V4': -0.39262000048097107,
 'V5': -0.0278399146423438,
 'V6': -0.015306058654508998,
 'V7': -0.18263079164336604,
 'V8': 0.10900335113603098,
 'V9': 0.0652050570725012,
 'V10': -0.058482319672751,
 'V11': 1.01930048769987,
 'V12': 0.525212498581151,
 'V13': -0.41128953011088104,
 'V14': 0.553832811686634,
 'V15': 0.7037734428758959,
 'V16': 0.339737786247733,
 'V17': -0.457517927458409,
 'V18': -0.332333891216917,
 'V19': 0.36143531975172294,
 'V20': -0.10843437916062801,
 'V21': -0.204560407430768,
 'V22': -0.603747633165057,
 'V23': 0.0519706252828389,
 'V24': -0.627542564476426,
 'V25': 0.144160411448769,
 'V26': 0.903630574388861,
 'V27': -0.0783284922720522,
 'V28': -0.0178807425923178,
 'Amount': 0.0}

In [34]:
# Convert each record dict into a protobuf Value for the prediction request.
instances = list(map(lambda record: json_format.ParseDict(record, Value()), newobs))


In [35]:
# Show the endpoints carrying this notebook's label.
aiplatform.Endpoint.list(filter=f'labels.notebook={NOTEBOOK}')

[<google.cloud.aiplatform.models.Endpoint object at 0x7f2bcd7deb60> 
 resource name: projects/279275268617/locations/us-central1/endpoints/1180182795905925120]

In [36]:
# Select the endpoint for this notebook. Order newest-first explicitly so that
# [0] is the most recently created endpoint: every pipeline run creates another
# endpoint with the same label.
endpoint = aiplatform.Endpoint.list(
    filter=f'labels.notebook={NOTEBOOK}',
    order_by='create_time desc',
)[0]
# Display name of the selected endpoint.
endpoint.display_name

'02c_fraud_20240220174303'

In [37]:
# Request online predictions for the sampled instances and show the first result.
prediction = endpoint.predict(instances = instances) # or instances = newobs
prediction.predictions[0]

{'classes': ['0', '1'], 'scores': [0.9999477863311768, 5.220999446464702e-05]}

In [38]:
# Predicted class for the first instance: the class at the index of the highest score.
prediction.predictions[0]['classes'][np.argmax(prediction.predictions[0]['scores'])]


'0'

In [39]:
# Save a single-instance request body for the REST and CLI calls below.
with open(f'{DIR}/request.json', 'w') as fh:
    json.dump({"instances": [newobs[0]]}, fh)

In [40]:
# Same prediction over REST: POST the saved request body to the endpoint's
# :predict method, authenticating with an application-default access token.
!curl -X POST \
-H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
-H "Content-Type: application/json; charset=utf-8" \
-d @{DIR}/request.json \
https://{REGION}-aiplatform.googleapis.com/v1/{endpoint.resource_name}:predict

{
  "predictions": [
    {
      "scores": [
        0.99994778633117676,
        5.2209987188689411e-05
      ],
      "classes": [
        "0",
        "1"
      ]
    }
  ],
  "deployedModelId": "6973531552890224640",
  "model": "projects/279275268617/locations/us-central1/models/2514624874165567488",
  "modelDisplayName": "02c_fraud_20240220174303",
  "modelVersionId": "1"
}


In [41]:
# Same prediction through the gcloud CLI, passing the last path segment of
# endpoint.name as the endpoint id and the saved request body.
!gcloud beta ai endpoints predict {endpoint.name.rsplit('/',1)[-1]} --region={REGION} --json-request={DIR}/request.json


Using endpoint [https://us-central1-prediction-aiplatform.googleapis.com/]
[{'classes': ['0', '1'], 'scores': [0.9999477863311768, 5.220998718868941e-05]}]


In [43]:
# Request predictions with explanations for the sampled instances.
# NOTE(review): the recorded run of this cell failed with `InvalidArgument: 400`,
# so `explanation` was never bound and the cells that use it raised NameError.
# Confirm the deployed model supports online explanations before relying on it.
explanation = endpoint.explain(instances = instances)

InvalidArgument: 400 {"error": "b'{\"error\": \"\"}'"}

In [44]:
# First prediction from the explain response; requires the explain call to have succeeded.
explanation.predictions[0]


NameError: name 'explanation' is not defined

In [45]:
import matplotlib.pyplot as plt

# Plot the feature attributions of the first explained instance as a horizontal
# bar chart, sorted by attribution score.
# Local names are deliberately not `features`/`scores`: `features` already holds
# the column-spec dict passed to the pipeline, and rebinding it to a list here
# would break re-running the PipelineJob cell.
attributions = explanation.explanations[0].attributions[0].feature_attributions
# Sort (score, name) pairs ascending, so ties on score fall back to the name.
ranked = sorted((attributions[name], name) for name in attributions)
attr_scores = [score for score, _ in ranked]
attr_names = [name for _, name in ranked]

fig, ax = plt.subplots(figsize=(9, 9))
ax.barh(attr_names, attr_scores)
ax.set(
    title='Feature attributions for the first instance',
    xlabel='Attribution',
    ylabel='Feature',
)
# plt.show() rather than fig.show(): the latter needs a GUI backend and only
# emits a warning under the notebook's inline backend.
plt.show()

NameError: name 'explanation' is not defined

In [46]:
# Batch prediction (with explanations) over the prepped BigQuery table, using
# the model deployed on the endpoint and writing results back to BigQuery in
# this project.
batch = aiplatform.BatchPredictionJob.create(
    job_display_name=f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}',
    model_name=endpoint.list_models()[0].model,
    bigquery_source=f'bq://{PROJECT_ID}.{DATANAME}.{DATANAME}_prepped',
    bigquery_destination_prefix=PROJECT_ID,
    instances_format="bigquery",
    predictions_format="bigquery",
    generate_explanation=True,
    labels={'notebook': NOTEBOOK},
)

Creating BatchPredictionJob
BatchPredictionJob created. Resource name: projects/279275268617/locations/us-central1/batchPredictionJobs/8170465889481654272
To use this BatchPredictionJob in another session:
bpj = aiplatform.BatchPredictionJob('projects/279275268617/locations/us-central1/batchPredictionJobs/8170465889481654272')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/8170465889481654272?project=279275268617
BatchPredictionJob projects/279275268617/locations/us-central1/batchPredictionJobs/8170465889481654272 current state:
JobState.JOB_STATE_PENDING
BatchPredictionJob projects/279275268617/locations/us-central1/batchPredictionJobs/8170465889481654272 current state:
JobState.JOB_STATE_PENDING
BatchPredictionJob projects/279275268617/locations/us-central1/batchPredictionJobs/8170465889481654272 current state:
JobState.JOB_STATE_PENDING
BatchPredictionJob projects/279275268617/locations/us-central1/batchPredictionJobs/