In [1]:
import os
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker import get_execution_role
from time import gmtime, strftime, time
from botocore.client import ClientError
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


### Set up Region, Session, Bucket, S3

In [2]:
session = boto3.session.Session()
region = session.region_name
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
s3 = boto3.Session().client(service_name="s3", region_name=region)
role = get_execution_role()
prefix = "spam-detection"
s3_key = f"s3://{bucket}/{prefix}"
script_path = sagemaker.Session().upload_data(
    path="spam_detection.py", 
    bucket=bucket, 
    key_prefix='scripts'
)

print("Default bucket: {}".format(bucket))

Default bucket: sagemaker-us-east-1-019877554860


### Confirm bucket

In [3]:
response = None
try:
    response = s3.head_bucket(Bucket=bucket)
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e:
    print("[ERROR] Cannot find bucket {} in {} due to {}.".format(bucket, response, e))

{'ResponseMetadata': {'RequestId': 'XG4VSBG9V37DN90B', 'HostId': 'oDyZndjAprvNJYyG1CLya+omujYoHa5cpj/8PTRnCoWilo2C0FM9CDnFW7ucTF5FPhiEqb93bf4=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'oDyZndjAprvNJYyG1CLya+omujYoHa5cpj/8PTRnCoWilo2C0FM9CDnFW7ucTF5FPhiEqb93bf4=', 'x-amz-request-id': 'XG4VSBG9V37DN90B', 'date': 'Wed, 16 Oct 2024 05:52:25 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'BucketRegion': 'us-east-1', 'AccessPointAlias': False}


### Load the dataset. Drop Receive, Date, URLs.

In [4]:
filename = 'CEAS_08.csv'
local_csv_path = 'dataset/CEAS_08.csv'

df = pd.read_csv(local_csv_path)

df['label'] = df['label'].astype(int)
df = df.drop(["receiver","date","urls"], axis=1)
df["body"] = df["body"].str.replace(r'\n', ' ')

folder_in_s3 = 'Dataset/'
s3_destination_dir = f's3://{bucket}/{folder_in_s3}'
s3 = boto3.client('s3')

bucket_name = bucket

In [5]:
num_missing = df['body'].isnull().sum()
print(f"Number of missing values in 'body' column: {num_missing}")
num_missing = df['body'].isna().sum()
print(f"Number of NaN values in 'body' column: {num_missing}")
total_rows = len(df)
non_null_count = df['body'].notnull().sum()
num_missing = total_rows - non_null_count

print(f"Number of NaN values in 'body' column: {num_missing}")

Number of missing values in 'body' column: 0
Number of NaN values in 'body' column: 0
Number of NaN values in 'body' column: 0


In [6]:
# Splitting 80% for training, 10% for testing, 10% for validation/
# First, allocate 80% to df_train and 20% to df_production.
df_train, df_production = train_test_split(df, test_size=0.20, random_state=42)
# Splitting df_production where 10% goes to df_test and 10% goes to df_validation.
df_test, df_validation = train_test_split(df_production, test_size=0.50, random_state=42)

# Print the sizes of each split to confirm
print(f"Training data: {len(df_train)} rows")
print(f"Test data: {len(df_test)} rows")
print(f"Validation data: {len(df_validation)} rows")
print(f"Production data: {len(df_production)} rows")

Training data: 31323 rows
Test data: 3915 rows
Validation data: 3916 rows
Production data: 7831 rows


In [7]:
# Save the training data to CSV
df_train[["sender", "subject", "body", "label"]].to_csv('train.csv', index=False)

sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

prefix = 'spam-detection'

train_input_path = sagemaker_session.upload_data('train.csv', bucket=bucket, key_prefix=f'{prefix}/train')

print(f'Training data uploaded to: {train_input_path}')

Training data uploaded to: s3://sagemaker-us-east-1-019877554860/spam-detection/train/train.csv


### Spam_detection.py trainiing script - Not to be run within Jupyter Notebook

In [8]:
# spam_detection.py - Not to be run within the Jupyter Notebook.
"""
import argparse
import os
import pandas as pd
import joblib

from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

def model_fn(model_dir):
    # Load the model for inference
    model = joblib.load(os.path.join(model_dir, "model.joblib"))
    vectorizer = joblib.load(os.path.join(model_dir, "vectorizer.joblib"))
    return model, vectorizer

def predict_fn(input_data, model_and_vectorizer):
    # Vectorize string input and make predictions
    model, vectorizer = model_and_vectorizer
    
    # Check if the input data is a string (email body)
    # Transform the input string to TF-IDF features
    input_tfidf = vectorizer.transform([str(input_data)])
    input_dense = input_tfidf.toarray()  # Convert to dense format for GaussianNB

    # Make predictions using the trained model
    prediction = model.predict(input_dense)
    
    return prediction

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    
    # SageMaker-specific arguments
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    
    args = parser.parse_args()
    
    # Read the training data
    train_data = pd.read_csv(os.path.join(args.train, 'train.csv'))
    
    # Separate features and target
    X_train = train_data.drop("label", axis=1).astype(str)
    y_train = train_data['label']
    
    # Create the TF-IDF vectorizer
    vectorizer = TfidfVectorizer(max_features=10000)    
    X_train_tfidf = vectorizer.fit_transform(X_train)
   
    # Convert sparse matrix to dense for GaussianNB
    X_train_dense = X_train_tfidf.toarray()
    
    # Train the model
    # Build a Naive Bayes Classifier
    model = GaussianNB()
    
    model.fit(X_train_dense, y_train)
    
    # Save the model
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
    joblib.dump(vectorizer, os.path.join(args.model_dir, "vectorizer.joblib"))
"""

'\nimport argparse\nimport os\nimport pandas as pd\nimport joblib\n\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\ndef model_fn(model_dir):\n    # Load the model for inference\n    model = joblib.load(os.path.join(model_dir, "model.joblib"))\n    vectorizer = joblib.load(os.path.join(model_dir, "vectorizer.joblib"))\n    return model, vectorizer\n\ndef predict_fn(input_data, model_and_vectorizer):\n    # Vectorize string input and make predictions\n    model, vectorizer = model_and_vectorizer\n    \n    # Check if the input data is a string (email body)\n    # Transform the input string to TF-IDF features\n    input_tfidf = vectorizer.transform([str(input_data)])\n    input_dense = input_tfidf.toarray()  # Convert to dense format for GaussianNB\n\n    # Make predictions using the trained model\n    prediction = model.predict(input_dense)\n    \n    return prediction\n\nif __name__ == \'__main__\':\n    parser = argparse.Argum

### Import SKLearn

In [9]:
from sagemaker.sklearn import SKLearn
import tempfile

# Upload the script to Amazon S3
s3_client = boto3.client('s3')
bucket = sagemaker.Session().default_bucket()
script_path = sagemaker.Session().upload_data(
    path='spam_detection.py', 
    bucket=bucket, 
    key_prefix='scripts'
)

### Upload training data to the S3 bucket

In [10]:
# Define the local path to your training data directory
train_local_path = 'train'  # Ensure 'train/train.csv' exists

# Upload training data to S3
train_data_path = sagemaker_session.upload_data(
    path=train_local_path,
    bucket=bucket,
    key_prefix=f"{prefix}/train"
)

### Create an estimator and train it.

In [None]:
from sagemaker.sklearn.estimator import SKLearn

role = sagemaker.get_execution_role()

estimator = SKLearn(
    entry_point='spam_detection.py',
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    framework_version='1.2-1',
    py_version='py3',
    output_path=f"s3://{bucket}/{prefix}/output",
    dependencies=['requirements.txt']
)

# Run the training job
# estimator.fit({'train': train_data_path})

### Create a model and train it.

In [16]:
from datetime import datetime, timedelta, timezone

# define endpoint name
endpoint_name = f"{prefix}-{datetime.utcnow():%Y-%m-%d-%H%M}"

# Initialize the SKLearnModel with an explicit model name
model = SKLearn(
    model_data=f"s3://{bucket}/{prefix}/output/sagemaker-scikit-learn-2024-10-16-03-58-23-641/output/model.tar.gz",
    role=role,
    entry_point="spam_detection.py",  # Your inference script
    name=endpoint_name,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    framework_version='1.2-1',
    py_version='py3',
    output_path=f"s3://{bucket}/{prefix}/output",
    dependencies=['requirements.txt']
)

model.fit({'train': train_data_path})

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2024-10-16-05-55-10-656


2024-10-16 05:55:12 Starting - Starting the training job...
2024-10-16 05:55:26 Starting - Preparing the instances for training...
2024-10-16 05:56:08 Downloading - Downloading the training image......
2024-10-16 05:56:54 Training - Training image download completed. Training in progress.[34m2024-10-16 05:57:00,373 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-10-16 05:57:00,376 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-10-16 05:57:00,379 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-10-16 05:57:00,396 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-10-16 05:57:00,612 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt[0m
[34mCollecting xgboost (from -r requirements.txt (line 3))
  Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl.metada

In [28]:
# Create a SageMaker Model entity
model_name = 'spam-detection-2024-10-16-0558'  # Must match the model_name in ModelConfig

sagemaker_model = model.create_model(
    name=model_name,
    entry_point="spam_detection.py",
    dependencies=['requirements.txt'],
    role=role
)

# **Add this line to create the model in SageMaker**
sagemaker_model.create(
    instance_type='ml.m5.xlarge'
)

# Check to make sure the model was actually created.
sagemaker_client = boto3.client('sagemaker')
# List existing models
response = sagemaker_client.list_models(
    NameContains='spam-detection-2024-10-16-0558'
)

if not response['Models']:
    print("Model does not exist. Please create the model before proceeding.")
else:
    print("Model exists. Ready for Clarify job.")

INFO:sagemaker:Creating model with name: spam-detection-2024-10-16-0558


Model exists. Proceeding with Clarify job.


### Create an endpoint and assign the model to it

In [17]:
from datetime import datetime, timedelta, timezone

# define endpoint name
endpoint_name = f"{prefix}-{datetime.utcnow():%Y-%m-%d-%H%M}"
print("EndpointName =", endpoint_name)

# Deploy the model to an endpoint
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    endpoint_name=endpoint_name   # Adding endpoint name for model monitoring purposes
)
print(f"Model successfully deployed to endpoint: {endpoint_name}")

EndpointName = spam-detection-2024-10-16-0558


INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2024-10-16-05-58-28-114
INFO:sagemaker:Creating endpoint-config with name spam-detection-2024-10-16-0558
INFO:sagemaker:Creating endpoint with name spam-detection-2024-10-16-0558


-------!Model successfully deployed to endpoint: spam-detection-2024-10-16-0558


### Start SHAP analysis - import the necessary components

In [18]:
from sagemaker import get_execution_role
from sagemaker.clarify import (
    DataConfig,
    ModelConfig,
    SHAPConfig,
    TextConfig,
    SageMakerClarifyProcessor,
    ModelPredictedLabelConfig,
)

### Define feature columns. Set up the necessary configurations. 

In [19]:
feature_columns = ["sender", "subject", "body"]
features = ",".join(feature_columns)

# Set up the configurations for SageMaker Clarify
data_config = DataConfig(
    s3_data_input_path=train_data_path,
    s3_output_path=f"s3://{bucket}/{prefix}/clarify-explainability",
    label="label",
    headers=True,
    features=features,
)

model_config = ModelConfig(
    model_name=endpoint_name,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    accept_type="text/csv",
    content_type='text/csv'
)

clarify_processor = sagemaker.clarify.SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=sagemaker_session,
)

bias_data_config = sagemaker.clarify.DataConfig(
    s3_data_input_path=train_data_path,
    s3_output_path=f"s3://{bucket}/{prefix}/clarify-explainability",
    label="label",
    headers=df_train.columns.to_list(),
    dataset_type="text/csv",
)

model_predicted_label_config = ModelPredictedLabelConfig(label="label")

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


### Configure explainability DataConfig and SHAPConfig

In [20]:
explainability_output_path = f"s3://{bucket}/{prefix}/clarify-explainability"
explainability_data_config = sagemaker.clarify.DataConfig(
    s3_data_input_path=train_data_path,
    s3_output_path=explainability_output_path,
    label="label",
    headers=df_train.columns.tolist(),
    dataset_type="text/csv",
    features=features,
)

# SHAP configuration
shap_config = SHAPConfig(
    baseline=[
        ["larry@google.com", "I have your data", "Turn on your cookies."],
    ],
    num_samples=15,
    agg_method="mean_abs",
    save_local_shap_values=True
)

In [23]:
sagemaker_client = boto3.client('sagemaker')
# List existing models
response = sagemaker_client.list_models(
    NameContains='spam-detection-2024-10-16-0558'
)

if not response['Models']:
    print("Model does not exist. Please create the model before proceeding.")
else:
    print("Model exists. Proceeding with Clarify job.")

Model does not exist. Please create the model before proceeding.


### Start the analysis (10 minutes)

In [30]:
clarify_processor.run_explainability(
    data_config=explainability_data_config,
    model_config=model_config,
    explainability_config=shap_config,
)

INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'features': 'sender,subject,body', 'headers': ['sender', 'subject', 'body', 'label'], 'label': 'label', 'predictor': {'model_name': 'spam-detection-2024-10-16-0558', 'instance_type': 'ml.m5.xlarge', 'initial_instance_count': 1, 'accept_type': 'text/csv', 'content_type': 'text/csv'}, 'methods': {'report': {'name': 'report', 'title': 'Analysis Report'}, 'shap': {'use_logit': False, 'save_local_shap_values': True, 'baseline': [['larry@google.com', 'I have your data', 'Turn on your cookies.']], 'num_samples': 15, 'agg_method': 'mean_abs'}}}
INFO:sagemaker:Creating processing-job with name Clarify-Explainability-2024-10-16-06-33-58-193


..................[34msagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml[0m
[34msagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml[0m
[34mINFO:sagemaker-clarify-processing:Starting SageMaker Clarify Processing job[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis config path: /opt/ml/processing/input/config/analysis_config.json[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis result path: /opt/ml/processing/output[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is algo-1.[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is the leader.[0m
[34mINFO:analyzer.data_loading.data_loader_util:Number of hosts in the cluster is 1.[0m
[34mINFO:sagemaker-clarify-processing:Running Python / Pandas based analyzer.[0m
[34mINFO:analyzer.data_loading.data_loader_factory:Dataset type: text/csv uri: /opt/ml/processing/input/data[0m
[34mINFO:sage

UnexpectedStatusException: Error for Processing job Clarify-Explainability-2024-10-16-06-33-58-193: Failed. Reason: ClientError: An error occurred (ModelError) when calling the InvokeEndpoint operation (reached max retries: 0): Received server error (500) from primary with message "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/sm-clarify-spam-detection-2024-10-16-0558-1729060624-c647 in account 019877554860 for more information., exit code: 1

In [None]:
# Define your S3 bucket and prefix
s3_bucket = bucket
s3_prefix = "clarify_output/"
# Ensure it ends with '/'

# List SHAP output JSON files
response = s3.list_objects_v2(Bucket=s3_bucket, Prefix=s3_prefix)

# Check if objects exist
#if 'Contents' not in response:
#    raise ValueError(f"No objects found with prefix '{s3_prefix}' in bucket '{s3_bucket}'.")

# Extract JSON file keys
response = s3.list_objects_v2(Bucket=s3_bucket, Prefix='empty_prefix/')

if 'Contents' in response:
    file_keys = [obj['Key'] for obj in response['Contents'] if obj['Key'].endswith('.json')]
    print(file_keys)
else:
    print("No objects found with the specified prefix.")

#if not file_keys:
#    raise ValueError(f"No JSON files found with prefix '{s3_prefix}' in bucket '{s3_bucket}'.")

print("SHAP output files:", file_keys)

In [None]:
import glob

# Download SHAP JSON files
local_shap_dir = 'shap_values'
os.makedirs(local_shap_dir, exist_ok=True)

for key in file_keys:
    filename = os.path.basename(key)
    local_path = os.path.join(local_shap_dir, filename)
    s3.download_file(s3_bucket, key, local_path)
    print(f"Downloaded {key} to {local_path}")

# Load SHAP values from JSON files
shap_values_list = []

json_files = glob.glob(os.path.join(local_shap_dir, '*.json'))

for file in json_files:
    with open(file, 'r') as f:
        data = json.load(f)
        for record in data:
            shap_values_list.append(record['shap_values'])  # Adjust based on actual JSON structure

In [None]:
# Convert to DataFrame
shap_df = pd.DataFrame(shap_values_list)
shap_df['instance_id'] = instance_ids
shap_df.set_index('instance_id', inplace=True)

# Load your training data (ensure it aligns with SHAP values)
train_data = pd.read_csv("train/train.csv")  # Replace with your actual data path

In [None]:
# Ensure the number of instances matches
if len(shap_df) != len(train_data):
    raise ValueError("The number of SHAP values does not match the number of instances in the training data.")

print("Number of SHAP values matches the number of training data instances.")

# Convert SHAP values to NumPy array
shap_values_array = shap_df.values

print(shap_values_array.shape)  # Should be [num_instances, num_features]

# Visualize Summary Plot
shap.summary_plot(shap_values_array, train_data)

# Visualize Force Plot for the first instance
shap.initjs()
instance_index = 0
shap.force_plot(
    base_value=None,
    shap_values=shap_values_array[instance_index],
    features=train_data.iloc[instance_index, :],
    feature_names=train_data.columns
)

### Going with AWS example

In [None]:
baseline_results_uri = f"{s3_key}/baselining"
validation_dataset = "validation_dataset.csv"
label = train_data["label"]

model_explainability_baselining_job_result_uri = f"{baseline_results_uri}/model_explainability"
model_explainability_data_config = DataConfig(
    s3_data_input_path=validation_dataset,
    s3_output_path=model_explainability_baselining_job_result_uri,
    label=label,
    dataset_type="text/csv",
)

In [None]:
test_dataframe = pd.read_csv("test_dataset.csv", header=None)
shap_baseline = [list(test_dataframe.mean())]

shap_config = SHAPConfig(
    baseline=shap_baseline,
    num_samples=100,
    agg_method="mean_abs",
    save_local_shap_values=False,
)