In [1]:

import sagemaker
from sagemaker import get_execution_role
import boto3
import numpy as np
import pandas as pd
import os

# Define SageMaker session and role
sagemaker_session = sagemaker.Session()
role = get_execution_role()
bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/knn-example'

# Define local file path and S3 key
local_file_path = "PISA_cleaned_dataset.csv"
file_key = f"{prefix}/PISA_cleaned_dataset.csv"

# Upload data to S3 if not already present
s3_client = boto3.client("s3")
try:
    s3_client.head_object(Bucket=bucket, Key=file_key)
    print("File already exists in S3.")
except:
    print("Uploading data to S3...")
    s3_client.upload_file(local_file_path, bucket, file_key)
    print("Upload complete.")

s3_data_path = f"s3://{bucket}/{file_key}"

# Load data for preprocessing
data = pd.read_csv(local_file_path)

# Separate features and labels
features = data.drop(columns=["target_column"]).values  # Replace with actual target column
labels = data["target_column"].values

# Convert to CSV format for SageMaker
train_data = np.column_stack((labels, features))
train_file = "train_data.csv"
pd.DataFrame(train_data).to_csv(train_file, index=False, header=False)

# Upload processed training data to S3
train_s3_path = f"s3://{bucket}/{prefix}/train/train_data.csv"
s3_client.upload_file(train_file, bucket, f"{prefix}/train/train_data.csv")

# Specify the KNN model container
from sagemaker.image_uris import retrieve

knn_container = retrieve("knn", sagemaker_session.boto_region_name)

# Define KNN model parameters
knn_estimator = sagemaker.estimator.Estimator(
    knn_container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket}/{prefix}/output",
    sagemaker_session=sagemaker_session,
)

knn_estimator.set_hyperparameters(
    feature_dim=features.shape[1],
    k=10,
    sample_size=200,
    predictor_type="classifier",
    index_metric="euclidean",
)

# Train the model
knn_estimator.fit({"train": train_s3_path})

# Deploy the trained model for inference
predictor = knn_estimator.deploy(instance_type="ml.m4.xlarge", initial_instance_count=1)

# Make a prediction
sample_input = features[:1]  # Replace with a test sample
response = predictor.predict(sample_input)
print("Prediction:", response)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Uploading data to S3...


FileNotFoundError: [Errno 2] No such file or directory: 'PISA_cleaned_dataset.csv'

In [2]:
import os
print(os.path.exists("PISA_cleaned_dataset.csv"))


False
