# KNN Model Training and Evaluation in Amazon SageMaker

In [None]:

import boto3
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer
from sagemaker.predictor import Predictor

# Set up the SageMaker session and the execution role used by the estimator below
role = get_execution_role()
sagemaker_session = sagemaker.Session()

# Define S3 bucket and prefix for storing data
# (the bucket name is a placeholder; replace it with a real bucket before running)
bucket = 'your-s3-bucket-name'
prefix = 'knn-example'

# Retrieve the SageMaker KNN container image.
# The region is taken from the SageMaker session instead of a fresh boto3 session,
# so the image URI is resolved for the same region the session itself talks to
# (a bare boto3.Session() may report no region, or a different one).
region = sagemaker_session.boto_region_name
container = image_uris.retrieve(region=region, framework='knn')

# Build the KNN Estimator on the container image retrieved above;
# model artifacts are written under the example prefix in S3
knn_output_location = f's3://{bucket}/{prefix}/output'
knn = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=knn_output_location,
    sagemaker_session=sagemaker_session,
)

# KNN hyperparameters, collected in one mapping and applied in a single call
knn_hyperparameters = {
    'k': 10,
    'sample_size': 5000,
    'predictor_type': 'classifier',
    'feature_dim': 10,  # Adjust based on actual dataset features
    'index_metric': 'COSINE',
}
knn.set_hyperparameters(**knn_hyperparameters)

# Define training and held-out data locations (CSV objects under the example prefix)
s3_input_train = TrainingInput(s3_data=f's3://{bucket}/{prefix}/train', content_type='text/csv')
s3_input_validation = TrainingInput(s3_data=f's3://{bucket}/{prefix}/validation', content_type='text/csv')

# Train the model.
# The built-in k-NN algorithm only accepts the 'train' and 'test' channels, so the
# held-out data is supplied as 'test' (a 'validation' channel is rejected when the
# training job validates its input configuration). The S3 location is unchanged.
knn.fit({'train': s3_input_train, 'test': s3_input_validation})


## Deploy the Trained Model

In [None]:

# Host the trained model on a single ml.m4.xlarge endpoint.
# The CSV serializer is handed to deploy() so the returned predictor is
# configured in one step instead of being patched afterwards.
knn_predictor = knn.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=CSVSerializer(),
)


## Load Test Data and Make Predictions

In [None]:

import json


def _parse_knn_labels(response):
    """Return the integer class labels contained in one k-NN endpoint response.

    The response is expected to be the k-NN JSON document
    ``{"predictions": [{"predicted_label": <float>}, ...]}``. It may arrive as
    raw bytes (when the predictor has no JSON deserializer) or as an
    already-parsed dict; both are accepted.
    """
    if isinstance(response, (bytes, bytearray)):
        response = json.loads(response.decode('utf-8'))
    return [int(record['predicted_label']) for record in response['predictions']]


# Load test data (replace with actual test data location)
test_data = pd.read_csv('test_data.csv')
X_test = test_data.drop(columns=['label'])
y_test = test_data['label']

# Get predictions.
# Rows are sent in fixed-size batches so a large test set does not go out as a
# single oversized request; results are concatenated in the original row order.
PREDICT_BATCH_ROWS = 500
test_features = X_test.to_numpy()
predicted_labels = []
for start in range(0, len(test_features), PREDICT_BATCH_ROWS):
    batch_response = knn_predictor.predict(test_features[start:start + PREDICT_BATCH_ROWS])
    predicted_labels.extend(_parse_knn_labels(batch_response))
predictions = np.array(predicted_labels, dtype=int)


## Compute Confusion Matrix and Performance Metrics

In [None]:

# Compute confusion matrix (rows: actual labels, columns: predicted labels)
cm = pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'])

# Read the four binary counts from a fixed 2x2 view of the matrix, so a class that
# never appears among the actual or predicted labels contributes 0 instead of
# needing a membership check per cell. `cm` itself is left as computed.
binary_counts = cm.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
TP = int(binary_counts.at[1, 1])
FP = int(binary_counts.at[0, 1])
TN = int(binary_counts.at[0, 0])
FN = int(binary_counts.at[1, 0])

# Compute evaluation metrics (all expressed as percentages).
# Every ratio falls back to 0 when its denominator is 0, including accuracy,
# which would otherwise divide by zero when there are no 0/1 samples at all.
total = TP + TN + FP + FN
accuracy = (TP + TN) / total * 100 if total > 0 else 0
precision = TP / (TP + FP) * 100 if (TP + FP) > 0 else 0
recall = TP / (TP + FN) * 100 if (TP + FN) > 0 else 0
# precision and recall are already percentages, so the harmonic mean is one too
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
specificity = TN / (TN + FP) * 100 if (TN + FP) > 0 else 0

# Print metrics
print("Confusion Matrix:")
print(cm)

print(f"\nAccuracy: {accuracy:.2f}%")
print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1 Score: {f1_score:.2f}%")
print(f"Specificity: {specificity:.2f}%")


## Visualize the Confusion Matrix

In [None]:

# Visualize confusion matrix.
# The two tick labels below assume a binary 0/1 problem, so the matrix is first
# laid out as a fixed 2x2 grid (missing classes filled with 0). Without this, a
# class that is absent from the predictions would leave fewer columns than labels.
cm_binary = cm.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
class_names = ["Not Prof", "Proficient"]  # label 0, label 1

plt.figure(figsize=(6, 4))
sns.heatmap(cm_binary, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


## Clean Up the Endpoint

In [None]:

# Delete the hosted endpoint once predictions are done; the endpoint configuration
# is removed along with it (the SDK default, spelled out here)
knn_predictor.delete_endpoint(delete_endpoint_config=True)
