In [12]:
import boto3
import pandas as pd
from sagemaker.predictor import Predictor
import sagemaker
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sklearn.metrics import accuracy_score
from sagemaker import get_execution_role
from sagemaker.model import Model
# Default boto3 session, plus a SageMaker client derived from it.
session = boto3.Session()
sagemaker_session = session.client("sagemaker")

# S3 client shared by the data-loading and evaluation cells below.
s3 = boto3.client("s3")


In [13]:
# Load the test feature CSV from S3 into a DataFrame.
bucket_name = "test-bucket-hamady"
test_data_key = "splitData/X_test.csv"

response = s3.get_object(Key=test_data_key, Bucket=bucket_name)
X_test = pd.read_csv(response["Body"])

# Keep only these eight columns, in exactly this order.
feature_columns = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked_Q",
    "Embarked_S",
]
X_test = X_test[feature_columns]


In [14]:
# Source object and the destination key for its header-less copy.
bucket_name = "test-bucket-hamady"
key = "splitData/X_test.csv"
new_key = "splitData/X_test_no_header.csv"

# Step 1: pull the CSV into memory as UTF-8 text.
response = s3.get_object(Key=key, Bucket=bucket_name)
data = response["Body"].read().decode("utf-8")

# Step 2: drop the first line (the header) and upload the remainder.
lines = data.splitlines()
data_without_header = "\n".join(lines[1:])

# Left as the final expression so the notebook displays the S3 response.
s3.put_object(Body=data_without_header, Key=new_key, Bucket=bucket_name)

{'ResponseMetadata': {'RequestId': 'PZEYDQACG8GP6QJT',
  'HostId': 'nk25H+22e31PNIp7cRF4A/io0qtl2JNKbFNP2NqWpsvHHXJWCE/XELkCk9JyXOdyoalTn0Y6prM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'nk25H+22e31PNIp7cRF4A/io0qtl2JNKbFNP2NqWpsvHHXJWCE/XELkCk9JyXOdyoalTn0Y6prM=',
   'x-amz-request-id': 'PZEYDQACG8GP6QJT',
   'date': 'Sat, 04 Jan 2025 11:08:06 GMT',
   'x-amz-version-id': '1br2QxLP1KSdSYMJZo8Py9k8OepKcb6Y',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"c28fed8875545f8e1c42b30dbb36cc83"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"c28fed8875545f8e1c42b30dbb36cc83"',
 'ServerSideEncryption': 'AES256',
 'VersionId': '1br2QxLP1KSdSYMJZo8Py9k8OepKcb6Y'}

In [25]:


# Register the trained XGBoost artifact as a SageMaker model so that a batch
# transform job can reference it by name.
sagemaker_client = boto3.client("sagemaker", region_name="eu-west-3")

model_name = "xgboost-model-for-batch-transform"
model_data = "s3://test-bucket-hamady/xgboost-output/sagemaker-xgboost-2025-01-04-05-14-02-316/output/model.tar.gz"  # Model artifact URI

# Built-in XGBoost inference image for this region/version.
container = sagemaker.image_uris.retrieve("xgboost", region="eu-west-3", version="1.5-1")

# Model names must be unique, so calling create_model again with the same name
# raises an error and breaks "Restart & Run All". Look for an exact name match
# first (NameContains is only a substring filter, hence the equality check).
model_pages = sagemaker_client.get_paginator("list_models").paginate(NameContains=model_name)
model_exists = any(
    summary["ModelName"] == model_name
    for page in model_pages
    for summary in page["Models"]
)

if model_exists:
    # NOTE(review): the existing model may point at a different artifact or
    # image than the values above; delete it first if those were changed.
    print(f"Model already exists with name: {model_name}")
else:
    response = sagemaker_client.create_model(
        ModelName=model_name,
        PrimaryContainer={
            "Image": container,
            "ModelDataUrl": model_data,
        },
        # NOTE(review): hard-coded role ARN; consider moving it to configuration.
        ExecutionRoleArn="arn:aws:iam::913524936566:role/SagemakerRoleAllowS3"
    )
    print(f"Model created with name: {model_name}")


Model created with name: xgboost-model-for-batch-transform


In [26]:
# Launch a batch transform job that scores the header-less test CSV with the
# registered model. boto3 and sagemaker are imported in the first cell.
sagemaker_client = boto3.client("sagemaker", region_name="eu-west-3")

# Name of the model registered in the previous cell.
model_name = "xgboost-model-for-batch-transform"

# Transform job names must be unique, so a fixed name makes every re-run of
# this cell fail. name_from_base appends a timestamp suffix to the base name.
transform_job_name = sagemaker.utils.name_from_base("titanicdata-batch-transform-job")
input_data_location = "s3://test-bucket-hamady/splitData/X_test_no_header.csv"
output_data_location = "s3://test-bucket-hamady/inference-output/"

response = sagemaker_client.create_transform_job(
    TransformJobName=transform_job_name,
    ModelName=model_name,
    TransformInput={
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": input_data_location,
            }
        },
        "ContentType": "text/csv",
        # BatchStrategy only takes effect when SplitType is set; without it the
        # whole file is sent to the container as a single request.
        "SplitType": "Line",
    },
    TransformOutput={
        "S3OutputPath": output_data_location,
        "Accept": "text/csv",
        # Newline-terminate each transformed record in the assembled output.
        "AssembleWith": "Line",
    },
    TransformResources={
        "InstanceType": "ml.m5.xlarge",
        "InstanceCount": 1,
    },
    BatchStrategy="MultiRecord"
)

print("Batch Transform Job initiated:", response["TransformJobArn"])


In [27]:
# The transform job runs asynchronously. Block until it reaches a terminal
# state so that the evaluation cell never reads missing or stale outputs when
# the notebook is executed top to bottom.
transform_waiter = sagemaker_client.get_waiter("transform_job_completed_or_stopped")
try:
    # Raises if the job ends in a failed state or the waiter times out.
    transform_waiter.wait(TransformJobName=transform_job_name)
finally:
    # Report the final status (and failure reason, if any) even when the
    # waiter raised, then let that error propagate to halt the run.
    response = sagemaker_client.describe_transform_job(TransformJobName=transform_job_name)
    job_status = response["TransformJobStatus"]
    print(f"Transform Job Status: {job_status}")

    if "FailureReason" in response:
        print(f"Failure Reason: {response['FailureReason']}")



Transform Job Status: Completed


In [28]:
# --- S3 locations ---
bucket_name = "test-bucket-hamady"
output_key = "inference-output/"
y_test_key = "splitData/y_test.csv"

# Probability cut-off used to turn continuous scores into 0/1 classes.
decision_threshold = 0.5

# --- Step 1: collect the predictions ---
# Batch transform writes one "<input file name>.out" object per input file.
# Restrict the listing to those so folder-marker objects or unrelated files
# under the prefix are not parsed as predictions, and paginate so a long
# listing is not silently truncated.
listing_pages = s3.get_paginator("list_objects_v2").paginate(
    Bucket=bucket_name, Prefix=output_key
)
prediction_files = sorted(
    content["Key"]
    for page in listing_pages
    for content in page.get("Contents", [])
    if content["Key"].endswith(".out")
)
if not prediction_files:
    raise FileNotFoundError(
        f"No '.out' prediction files found under s3://{bucket_name}/{output_key}"
    )

# Download every prediction file (no header row) and stack them with a clean
# 0..n-1 index.
predictions = [
    pd.read_csv(s3.get_object(Bucket=bucket_name, Key=file_key)["Body"], header=None)
    for file_key in prediction_files
]
predictions_df = pd.concat(predictions, axis=0, ignore_index=True)

# --- Step 2: load the true labels ---
# The first line of the file is a header row. Letting pandas consume it keeps
# the column numeric instead of reading strings and dropping row 0 by hand.
obj = s3.get_object(Bucket=bucket_name, Key=y_test_key)
y_test = pd.read_csv(obj["Body"]).iloc[:, 0].astype(int)

# --- Step 3: convert continuous predictions to binary classes ---
# Scores strictly above the threshold become 1, everything else 0.
y_pred = (predictions_df[0] > decision_threshold).astype(int)

# Fail with a clear message if predictions and labels do not line up
# (e.g. outputs left over from a different input file).
if len(y_pred) != len(y_test):
    raise ValueError(
        f"Got {len(y_pred)} predictions for {len(y_test)} labels; "
        "check the contents of the inference output prefix."
    )

# --- Step 4: compute accuracy ---
accuracy = accuracy_score(y_test, y_pred)

# --- Report ---
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.83
