## Load and Split the Data

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
import boto3
from sklearn.datasets import dump_svmlight_file
import sagemaker
from sagemaker import get_execution_role
from sagemaker.image_uris import retrieve
from sagemaker.serializers import LibSVMSerializer
from sagemaker.deserializers import JSONDeserializer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sagemaker.serializers import CSVSerializer

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the cleaned data
cleaned_data_path = 'cleaned_data_with_features.csv'
data_cleaned = pd.read_csv(cleaned_data_path)

# Split data into training (40%), validation (10%), test (10%), and production (40%)
train_data, temp_data = train_test_split(data_cleaned, test_size=0.60, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
prod_data = test_data  # Using the same 40% for test and production

# Save datasets to CSV files
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)
prod_data.to_csv('prod_data.csv', index=False)

print("Datasets have been split and saved as CSV files.")

Datasets have been split and saved as CSV files.


## Upload the Datasets to S3

In [3]:
import boto3

s3 = boto3.client('s3')
bucket_name = 'electronics-dataset'

# Define paths for S3
s3_train_path = 'sagemaker/benchmark-model/train/train_data.csv'
s3_val_path = 'sagemaker/benchmark-model/val/val_data.csv'
s3_test_path = 'sagemaker/benchmark-model/test/test_data.csv'
s3_prod_path = 'sagemaker/benchmark-model/prod/prod_data.csv'

# Upload the CSV files to S3
s3.upload_file('train_data.csv', bucket_name, s3_train_path)
s3.upload_file('val_data.csv', bucket_name, s3_val_path)
s3.upload_file('test_data.csv', bucket_name, s3_test_path)
s3.upload_file('prod_data.csv', bucket_name, s3_prod_path)

print(f"Training data uploaded to s3://{bucket_name}/{s3_train_path}")
print(f"Validation data uploaded to s3://{bucket_name}/{s3_val_path}")
print(f"Test data uploaded to s3://{bucket_name}/{s3_test_path}")
print(f"Production data uploaded to s3://{bucket_name}/{s3_prod_path}")

Training data uploaded to s3://electronics-dataset/sagemaker/benchmark-model/train/train_data.csv
Validation data uploaded to s3://electronics-dataset/sagemaker/benchmark-model/val/val_data.csv
Test data uploaded to s3://electronics-dataset/sagemaker/benchmark-model/test/test_data.csv
Production data uploaded to s3://electronics-dataset/sagemaker/benchmark-model/prod/prod_data.csv


## Convert Ratings to Binary Labels

In [4]:
import pandas as pd

def convert_to_binary_labels(input_csv_path, output_csv_path, threshold=3):
    df = pd.read_csv(input_csv_path)
    # Convert ratings to binary labels
    df['reviews.rating'] = (df['reviews.rating'] > threshold).astype(int)
    df.to_csv(output_csv_path, index=False)

# Convert the training, validation, and test data
convert_to_binary_labels('train_data.csv', 'train_data_binary.csv')
convert_to_binary_labels('val_data.csv', 'val_data_binary.csv')
convert_to_binary_labels('test_data.csv', 'test_data_binary.csv')
convert_to_binary_labels('prod_data.csv', 'prod_data_binary.csv')

## Convert Binary CSV to LibSVM Format

In [5]:
from sklearn.datasets import dump_svmlight_file
import pandas as pd

def convert_to_libsvm(input_csv_path, output_libsvm_path, label_column):
    df = pd.read_csv(input_csv_path)
    labels = df[label_column]
    features = df.select_dtypes(include=[float, int]).drop(columns=[label_column])  # Select only numeric columns
    dump_svmlight_file(features, labels, output_libsvm_path, zero_based=True)

# Convert the binary training data
convert_to_libsvm('train_data_binary.csv', 'train_data.libsvm', 'reviews.rating')
convert_to_libsvm('val_data_binary.csv', 'val_data.libsvm', 'reviews.rating')
convert_to_libsvm('test_data_binary.csv', 'test_data.libsvm', 'reviews.rating')
convert_to_libsvm('prod_data_binary.csv', 'prod_data.libsvm', 'reviews.rating')

##  Upload the LibSVM Files to S3

In [6]:
import boto3

s3 = boto3.client('s3')
bucket_name = 'electronics-dataset'

# Define paths for S3
s3_train_path = 'sagemaker/benchmark-model/train/train_data.libsvm'
s3_val_path = 'sagemaker/benchmark-model/val/val_data.libsvm'
s3_test_path = 'sagemaker/benchmark-model/test/test_data.libsvm'
s3_prod_path = 'sagemaker/benchmark-model/prod/prod_data.libsvm'

# Upload the LibSVM files to S3
s3.upload_file('train_data.libsvm', bucket_name, s3_train_path)
s3.upload_file('val_data.libsvm', bucket_name, s3_val_path)
s3.upload_file('test_data.libsvm', bucket_name, s3_test_path)
s3.upload_file('prod_data.libsvm', bucket_name, s3_prod_path)

print(f"Training data uploaded to s3://{bucket_name}/{s3_train_path}")
print(f"Validation data uploaded to s3://{bucket_name}/{s3_val_path}")
print(f"Test data uploaded to s3://{bucket_name}/{s3_test_path}")
print(f"Production data uploaded to s3://{bucket_name}/{s3_prod_path}")

Training data uploaded to s3://electronics-dataset/sagemaker/benchmark-model/train/train_data.libsvm
Validation data uploaded to s3://electronics-dataset/sagemaker/benchmark-model/val/val_data.libsvm
Test data uploaded to s3://electronics-dataset/sagemaker/benchmark-model/test/test_data.libsvm
Production data uploaded to s3://electronics-dataset/sagemaker/benchmark-model/prod/prod_data.libsvm


## Train the Benchmark Model


In [7]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.image_uris import retrieve

role = get_execution_role()
sess = sagemaker.Session()

bucket = 'electronics-dataset'
prefix = 'sagemaker/benchmark-model'
train_path = f's3://{bucket}/{prefix}/train/train_data.libsvm'
val_path = f's3://{bucket}/{prefix}/val/val_data.libsvm'

container = retrieve('xgboost', sess.boto_region_name, version='latest')

xgb = sagemaker.estimator.Estimator(container,
                                    role,
                                    instance_count=1,
                                    instance_type='ml.m5.xlarge',
                                    output_path=f's3://{bucket}/{prefix}/output',
                                    sagemaker_session=sess)

xgb.set_hyperparameters(objective='binary:logistic', num_round=100)

xgb.fit({'train': train_path, 'validation': val_path})

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker:Creating training-job with name: xgboost-2024-06-30-21-09-10-383


2024-06-30 21:09:10 Starting - Starting the training job...
2024-06-30 21:09:26 Starting - Preparing the instances for training...
2024-06-30 21:09:51 Downloading - Downloading input data...
2024-06-30 21:10:16 Downloading - Downloading the training image...
2024-06-30 21:11:07 Training - Training image download completed. Training in progress.
2024-06-30 21:11:07 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2024-06-30:21:10:59:INFO] Running standalone xgboost training.[0m
[34m[2024-06-30:21:10:59:INFO] File size need to be processed in the node: 0.16mb. Available memory size in the node: 8147.97mb[0m
[34m[21:10:59] S3DistributionType set as FullyReplicated[0m
[34m[21:10:59] 2850x5 matrix with 11400 entries loaded from /opt/ml/input/data/train[0m
[34m[21:10:59] S3DistributionType set as FullyReplicated[0m
[34m[21:10:59] 2138x5 matrix with 8552 entries loaded from /opt/ml/input/data/validation[0m
[34m[21:10:59] src/tree/updater_prune.cc:74: t

## Deploy the Model


In [None]:
# Deploy the model to an endpoint
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge'
)

INFO:sagemaker:Creating model with name: xgboost-2024-06-30-21-11-52-811
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-06-30-21-11-52-811
INFO:sagemaker:Creating endpoint with name xgboost-2024-06-30-21-11-52-811


---

## Make Predictions


In [31]:
from sagemaker.deserializers import StringDeserializer


In [37]:
def predict(predictor, data_path, threshold=0.5):
    df = pd.read_csv(data_path)
    labels = df['reviews.rating']
    features = df.select_dtypes(include=[float, int]).drop(columns=['reviews.rating'])
    
    # Predict
    predictor.serializer = CSVSerializer()
    predictor.deserializer = StringDeserializer()
    predictions = predictor.predict(features.to_numpy())
    
    # Extract predictions
    predictions_str = predictions.strip()
    predictions_list = predictions_str.split(',')
    predicted_labels = [int(float(prediction) > threshold) for prediction in predictions_list]
    return labels, predicted_labels

# Predict on test data
test_labels, test_predictions = predict(xgb_predictor, 'test_data_binary.csv')

# Evaluate the model
accuracy = accuracy_score(test_labels, test_predictions)
conf_matrix = confusion_matrix(test_labels, test_predictions)
class_report = classification_report(test_labels, test_predictions)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

# Clean up the endpoint
xgb_predictor.delete_endpoint()
print("Endpoint deleted.")

INFO:sagemaker:Deleting endpoint configuration with name: xgboost-2024-06-30-21-11-52-811


Accuracy: 0.8493919550982226
Confusion Matrix:
[[  37  265]
 [  57 1779]]
Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.12      0.19       302
           1       0.87      0.97      0.92      1836

    accuracy                           0.85      2138
   macro avg       0.63      0.55      0.55      2138
weighted avg       0.80      0.85      0.81      2138



INFO:sagemaker:Deleting endpoint with name: xgboost-2024-06-30-21-11-52-811


Endpoint deleted.
