In [207]:
import json
from importlib import reload
from pathlib import Path
from time import gmtime, strftime

import jsonlines
import pandas as pd
import sagemaker
from sagemaker.deserializers import CSVDeserializer
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.model_monitor import ModelQualityMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat
from sagemaker.predictor import Predictor
from sagemaker.s3 import S3Downloader, S3Uploader
from sagemaker.serializers import CSVSerializer

import utils

# Re-import the local `utils` module so that edits made to it after the first import
# take effect in this kernel. As the cell's last expression, the module repr is displayed.
reload(utils)

<module 'utils' from '/home/sagemaker-user/src/monitoring/utils.py'>

In [179]:
# Resource identifiers used throughout the notebook.
bucket_name = "team1-index-predictor-bucket"
endpoint_name = "index-predictor-endpoint"
feature_group_name = "index-predictor-feature-group-v7"
data_version = "2024-06-26-09-33"  # matched against the feature group's "version" column

# S3 URL assembled from the bucket and the data-capture prefix.
data_capture_prefix = "data-capture"
data_capture_s3_url = f"s3://{bucket_name}/{data_capture_prefix}"

# SageMaker session handed to the SDK objects created in later cells.
sm_session = sagemaker.Session()

In [180]:
# Client for the deployed endpoint: request payloads are serialized as CSV and
# responses are parsed from CSV.
# The notebook's session is passed explicitly so the predictor shares it with the
# other SDK objects instead of silently constructing a second default session.
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sm_session,
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer(),
)

In [181]:
### Downloading data from feature store

In [182]:
# Query the feature group's Athena table for every record of the pinned data version.
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sm_session)
query = feature_group.athena_query()

athena_sql = f"""SELECT * FROM "{query.table_name}" WHERE version = '{data_version}'"""
athena_output_uri = f"s3://{bucket_name}/model_monitor/data/"

query.run(query_string=athena_sql, output_location=athena_output_uri)
query.wait()  # block until the query has finished before reading its result

df = query.as_dataframe()

# Split on the "type" column; each split is an independent copy of its rows.
train_df, validation_df, test_df = (
    df.loc[df["type"] == split_name].copy()
    for split_name in ("train", "validation", "test")
)

# Fixed-seed sample of 100 test rows, used later to generate endpoint traffic.
selected_test_df = test_df.copy().sample(n=100, random_state=1)

INFO:sagemaker:Query 347ba82c-eccc-4c69-a667-fe1d0a5eddc8 is being executed.
INFO:sagemaker:Query 347ba82c-eccc-4c69-a667-fe1d0a5eddc8 successfully executed.


In [183]:
# Bookkeeping columns that are removed from every frame before scoring.
columns_to_drop = ["type", "version", "write_time", "api_invocation_time", "is_deleted", "datetime"]

# Same in-place drop applied to the full frame, each split, and the sampled test rows.
for frame in (df, train_df, validation_df, test_df, selected_test_df):
    frame.drop(columns=columns_to_drop, inplace=True)

# Sampled test rows without the ground-truth column.
selected_test_no_target = selected_test_df.drop(columns=["close_target"])

In [184]:
# Send the sampled test rows (target column removed) through `predictor`.
# NOTE(review): `generate_endpoint_traffic` lives in the local `utils` module, which is not
# shown here — presumably it invokes the endpoint once per row; confirm against utils.py.
utils.generate_endpoint_traffic(predictor, selected_test_no_target)

100%|██████████| 100/100 [00:01<00:00, 88.43it/s]


In [185]:
# Feature-only view of the test split: the ground-truth column is removed, test_df is untouched.
test_df_no_target = test_df.drop(labels=["close_target"], axis="columns")

In [198]:
# Score the whole test split in one call; the response is a sequence of rows.
predictions = predictor.predict(test_df_no_target)

# Flatten the rows into a single list of values, preserving order.
flat_predictions = []
for row in predictions:
    flat_predictions.extend(row)

In [201]:
# Attach the raw endpoint outputs, then cast them to floats.
test_df["probabilities"] = flat_predictions
probabilities = test_df["probabilities"].astype(float)
test_df["probabilities"] = probabilities

# Hard label: 1 when the probability is strictly above 0.5, otherwise 0.
test_df["predictions"] = probabilities.gt(0.5).astype(int)

In [204]:
# Write the scored test split to a local CSV, creating the scratch directory first so the
# cell also works on a fresh checkout where tmp/ does not exist yet.
local_baseline_path = Path("tmp") / "test_df.csv"
local_baseline_path.parent.mkdir(parents=True, exist_ok=True)
test_df.to_csv(local_baseline_path, index=False)

# S3Uploader.upload treats the destination as a prefix and appends the local file name,
# so the prefix must not already end in "test_df.csv" (that produced
# ".../test_df.csv/test_df.csv"). The returned value is the URI of the uploaded object.
test_df_s3_uri = S3Uploader.upload(
    str(local_baseline_path),
    f"s3://{bucket_name}/model_monitor/model_baseline",
    sagemaker_session=sm_session,
)

In [206]:
# Settings for the model-quality monitor: one ml.m5.xlarge instance, a 20 GB volume,
# and a 1800-second runtime limit, using the notebook's execution role and session.
monitor_job_settings = {
    "role": sagemaker.get_execution_role(),
    "instance_count": 1,
    "instance_type": "ml.m5.xlarge",
    "volume_size_in_gb": 20,
    "max_runtime_in_seconds": 1800,
    "sagemaker_session": sm_session,
}
model_monitor = ModelQualityMonitor(**monitor_job_settings)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: .
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [212]:
# Job name carries a UTC day-hour-minute-second suffix.
model_baseline_job_name = f"index-predictor-model-baselining-{strftime('%d-%H-%M-%S', gmtime())}"
baseline_results_s3_uri = f's3://{bucket_name}/model_monitor/model_baseline/results'

# Columns of the baseline CSV that the monitor reads.
baseline_columns = {
    "inference_attribute": "predictions",  # column holding the predicted labels
    "probability_attribute": "probabilities",  # column holding the predicted probabilities
    "ground_truth_attribute": "close_target",  # column holding the ground truth labels
}

model_baseline_job = model_monitor.suggest_baseline(
    job_name=model_baseline_job_name,
    baseline_dataset=test_df_s3_uri,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_results_s3_uri,
    problem_type="BinaryClassification",
    **baseline_columns,
)

# Block until the baselining job finishes, without streaming its logs.
model_baseline_job.wait(logs=False)

INFO:sagemaker:Creating processing-job with name index-predictor-model-baselining-27-07-32-25


............................................................!

In [213]:
# Show the constraints suggested by the most recent baselining job, one metric per row.
latest_model_baseline_job = model_monitor.latest_baselining_job
suggested_constraints = latest_model_baseline_job.suggested_constraints().body_dict
pd.DataFrame(suggested_constraints["binary_classification_constraints"]).transpose()

Unnamed: 0,threshold,comparison_operator
recall,0.812834,LessThanThreshold
precision,0.520548,LessThanThreshold
accuracy,0.509804,LessThanThreshold
true_positive_rate,0.812834,LessThanThreshold
true_negative_rate,0.176471,LessThanThreshold
false_positive_rate,0.823529,GreaterThanThreshold
false_negative_rate,0.187166,GreaterThanThreshold
auc,0.494652,LessThanThreshold
f0_5,0.560886,LessThanThreshold
f1,0.634656,LessThanThreshold


In [214]:
# Show the confusion matrix recorded in the baseline statistics.
baseline_metrics = latest_model_baseline_job.baseline_statistics().body_dict["binary_classification_metrics"]
pd.DataFrame(baseline_metrics["confusion_matrix"])

Unnamed: 0,0,1
0,30,35
1,140,152


In [215]:
### Generate endpoint traffic and ingest ground truth data into it