<!-- Projeto Desenvolvido na Data Science Academy - www.datascienceacademy.com.br -->
# <font color='blue'>Data Science Academy</font>
## <font color='blue'>Cloud Computing Data Science</font>
## <font color='blue'>Projeto 6</font>
### <font color='blue'>Otimização de Hiperparâmetros em Modelo de Previsão de Churn de Clientes com AutoML</font>

## Pacotes Python Usados com SageMaker

In [1]:
# Imports
import io
import os
import sys
import time
import json
import boto3
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO
from time import gmtime, strftime, sleep
from IPython.display import display
from sagemaker import get_execution_role
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
pd.set_option("display.max_columns", 500)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## Definindo Região AWS, Role de Execução e Sessão SageMaker

In [2]:
# Define a região
region = boto3.Session().region_name
print(region)

us-east-2


In [3]:
# Define a role
role = get_execution_role()
print(role)

arn:aws:iam::890582101704:role/service-role/AmazonSageMaker-ExecutionRole-20240523T163775


In [4]:
# Cria sessão SageMaker
session = sagemaker.Session()

## Definindo o Bucket S3 com a Fonte de Dados

In [5]:
# Cria o cliente s3
s3 = boto3.client("s3")

In [6]:
# Download do arquivo do bucket para o ambiente local do SageMaker
s3.download_file(f"dsa-p6-890582101704", "dataset.txt", "churn.txt")

In [7]:
# Carrega o arquivo como dataframe do pandas
churn = pd.read_csv("./churn.txt")

In [8]:
churn.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,PA,163,806,403-2562,no,yes,300,8.162204,3,7.579174,3.933035,4,6.508639,4.065759,100,5.111624,4.92816,6,5.673203,3,True.
1,SC,15,836,158-8416,yes,no,0,10.018993,4,4.226289,2.325005,0,9.972592,7.14104,200,6.436188,3.221748,6,2.559749,8,False.
2,MO,131,777,896-6253,no,yes,300,4.70849,3,4.76816,4.537466,3,4.566715,5.363235,100,5.142451,7.139023,2,6.254157,4,False.
3,WY,75,878,817-5729,yes,yes,700,1.268734,3,2.567642,2.528748,5,2.333624,3.773586,450,3.814413,2.245779,6,1.080692,6,False.
4,WY,146,878,450-4942,yes,no,0,2.696177,3,5.908916,6.015337,3,3.670408,3.751673,250,2.796812,6.905545,4,7.134343,6,True.


## Separando os Dados em Treino e Teste

In [9]:
# Extrai uma amostra de 80% dos dados para treino
dsa_dados_treino = churn.sample(frac = 0.8, random_state = 200)

In [10]:
# Faz o drop dos dados de treino e o que sobrar vai para a amostra de teste
dsa_dados_teste = churn.drop(dsa_dados_treino.index)

In [11]:
# Cria um dataset de teste sem a variável target
dsa_dados_teste_no_target = dsa_dados_teste.drop(columns = ["Churn?"])

## Salvando as Amostras de Treino e Teste no Bucket S3

In [12]:
# Define o bucket usado pelo SageMaker e o prefixo
bucket = session.default_bucket()
prefix = "sagemaker/dsap6-automl-churn"

In [13]:
# Salvando os dados de treino no S3
train_file = "dsa_dados_treino.csv"
dsa_dados_treino.to_csv(train_file, index = False, header = True)
train_data_s3_path = session.upload_data(path = train_file, key_prefix = prefix + "/train")
print("Dataset de Treino Salvo em: " + train_data_s3_path)

Dataset de Treino Salvo em: s3://sagemaker-us-east-2-890582101704/sagemaker/dsap6-automl-churn/train/dsa_dados_treino.csv


In [14]:
# Salvando os dados de teste no S3
test_file = "dsa_dados_teste.csv"
dsa_dados_teste.to_csv(test_file, index = False, header = False)
test_data_s3_path = session.upload_data(path = test_file, key_prefix = prefix + "/test")
print("Dataset de Teste Salvo em: " + test_data_s3_path)

Dataset de Teste Salvo em: s3://sagemaker-us-east-2-890582101704/sagemaker/dsap6-automl-churn/test/dsa_dados_teste.csv


In [15]:
# Salvando os dados de teste no S3
test_file = "dsa_dados_teste_no_target.csv"
dsa_dados_teste_no_target.to_csv(test_file, index = False, header = False)
test_data_s3_path = session.upload_data(path = test_file, key_prefix = prefix + "/test")
print("Dataset de Teste Salvo em: " + test_data_s3_path)

Dataset de Teste Salvo em: s3://sagemaker-us-east-2-890582101704/sagemaker/dsap6-automl-churn/test/dsa_dados_teste_no_target.csv


## Configuração do Job com SageMaker AutoPilot

In [16]:
# Config
input_data_config = [
    {
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": "s3://{}/{}/train".format(bucket, prefix),
            }
        },
        "TargetAttributeName": "Churn?",
    }
]

In [17]:
# Config do output
output_data_config = {"S3OutputPath": "s3://{}/{}/output".format(bucket, prefix)}

In [18]:
# Timestamp sufixo
timestamp_suffix = strftime("%d-%H-%M-%S", gmtime())

In [19]:
auto_ml_job_name = "automl-churn-" + timestamp_suffix
print("AutoMLJobName: " + auto_ml_job_name)

AutoMLJobName: automl-churn-20-22-37-59


In [20]:
# Este é o cliente que usaremos para interagir com o SageMaker AutoPilot
sm = boto3.Session().client(service_name = "sagemaker", region_name = region)

In [21]:
# Cria o job
sm.create_auto_ml_job(AutoMLJobName = auto_ml_job_name,
                      InputDataConfig = input_data_config,
                      OutputDataConfig = output_data_config,
                      AutoMLJobConfig = {"CompletionCriteria": {"MaxCandidates": 20}},
                      RoleArn = role)

{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-2:890582101704:automl-job/automl-churn-20-22-37-59',
 'ResponseMetadata': {'RequestId': '25ffb823-e487-4551-aba2-4a62177c512f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '25ffb823-e487-4551-aba2-4a62177c512f',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '95',
   'date': 'Fri, 20 Dec 2024 22:37:59 GMT'},
  'RetryAttempts': 0}}

## Execução do Job no SageMaker AutoPilot

In [22]:
# Armazena o nome AutoMLJobName para uso em notebooks subsequentes
%store auto_ml_job_name
auto_ml_job_name

Stored 'auto_ml_job_name' (str)


'automl-churn-20-22-37-59'

A célula abaixo é demorada (aproximadamente 30 minutos). Pratique a paciência e aguarde.

In [23]:
%%time

print("DSA - Projeto 6 - JobStatus")  
print("------------------------------")  

# Recupera e imprime o status atual do job AutoML
describe_response = sm.describe_auto_ml_job(AutoMLJobName = auto_ml_job_name)

# Imprime o status do job e o status secundário
print(describe_response["AutoMLJobStatus"] + " - " + describe_response["AutoMLJobSecondaryStatus"])  
job_run_status = describe_response["AutoMLJobStatus"] 

# Loop para monitorar o status do job até que seja "Falha", "Concluído" ou "Interrompido"
while job_run_status not in ("Failed", "Completed", "Stopped"):
    
    # Recupera o status mais recente do job AutoML
    describe_response = sm.describe_auto_ml_job(AutoMLJobName = auto_ml_job_name)
    
    # Atualiza o status atual do job
    job_run_status = describe_response["AutoMLJobStatus"]
    
    # Imprime o status do job atualizado e o status secundário
    print(describe_response["AutoMLJobStatus"] + " - " + describe_response["AutoMLJobSecondaryStatus"])
    
    # Pausa o loop por 10 segundos antes de verificar o status novamente
    sleep(10)

DSA - Projeto 6 - JobStatus
------------------------------
InProgress - Starting
InProgress - Starting
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
I

## Visualizando os Jupyter Notebooks Gerados Pelo SageMaker AutoPilot

In [24]:
print(describe_response)

{'AutoMLJobName': 'automl-churn-20-22-37-59', 'AutoMLJobArn': 'arn:aws:sagemaker:us-east-2:890582101704:automl-job/automl-churn-20-22-37-59', 'InputDataConfig': [{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-2-890582101704/sagemaker/dsap6-automl-churn/train'}}, 'TargetAttributeName': 'Churn?', 'ContentType': 'text/csv;header=present', 'ChannelType': 'training'}], 'OutputDataConfig': {'S3OutputPath': 's3://sagemaker-us-east-2-890582101704/sagemaker/dsap6-automl-churn/output'}, 'RoleArn': 'arn:aws:iam::890582101704:role/service-role/AmazonSageMaker-ExecutionRole-20240523T163775', 'AutoMLJobConfig': {'CompletionCriteria': {'MaxCandidates': 20}}, 'CreationTime': datetime.datetime(2024, 12, 20, 22, 37, 59, 304000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 12, 20, 23, 12, 4, 463000, tzinfo=tzlocal()), 'LastModifiedTime': datetime.datetime(2024, 12, 20, 23, 12, 4, 496000, tzinfo=tzlocal()), 'BestCandidate': {'CandidateName': 'automl-ch

In [25]:
print(describe_response["AutoMLJobArtifacts"]["CandidateDefinitionNotebookLocation"])
print(describe_response["AutoMLJobArtifacts"]["DataExplorationNotebookLocation"])

s3://sagemaker-us-east-2-890582101704/sagemaker/dsap6-automl-churn/output/automl-churn-20-22-37-59/sagemaker-automl-candidates/automl-churn-20-22-37-59-pr-1-43a456ef226c46e2897cf0bcb4ef0e4f0/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb
s3://sagemaker-us-east-2-890582101704/sagemaker/dsap6-automl-churn/output/automl-churn-20-22-37-59/sagemaker-automl-candidates/automl-churn-20-22-37-59-pr-1-43a456ef226c46e2897cf0bcb4ef0e4f0/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb


In [26]:
candidate_nbk = describe_response["AutoMLJobArtifacts"]["CandidateDefinitionNotebookLocation"]
data_explore_nbk = describe_response["AutoMLJobArtifacts"]["DataExplorationNotebookLocation"]

In [27]:
# Define uma função para dividir o caminho do S3 em bucket e chave
def split_s3_path(s3_path):
    
    # Remove o prefixo "s3://" e divide o restante do caminho por "/"
    path_parts = s3_path.replace("s3://", "").split("/")
    
    # Extrai o nome do bucket (primeira parte do caminho)
    bucket = path_parts.pop(0)
    
    # Junta o restante do caminho para formar a chave (key)
    key = "/".join(path_parts)
    
    # Retorna o bucket e a chave como uma tupla
    return bucket, key

In [28]:
# Aplica a função
s3_bucket, candidate_nbk_key = split_s3_path(candidate_nbk)

In [29]:
# Aplica a função
_, data_explore_nbk_key = split_s3_path(data_explore_nbk)

In [30]:
print(s3_bucket, candidate_nbk_key, data_explore_nbk_key)

sagemaker-us-east-2-890582101704 sagemaker/dsap6-automl-churn/output/automl-churn-20-22-37-59/sagemaker-automl-candidates/automl-churn-20-22-37-59-pr-1-43a456ef226c46e2897cf0bcb4ef0e4f0/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb sagemaker/dsap6-automl-churn/output/automl-churn-20-22-37-59/sagemaker-automl-candidates/automl-churn-20-22-37-59-pr-1-43a456ef226c46e2897cf0bcb4ef0e4f0/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb


In [31]:
# Download do jupyter gerado pelo SageMaker AutoPilot
session.download_data(path = "./", bucket = s3_bucket, key_prefix = candidate_nbk_key)

['./SageMakerAutopilotCandidateDefinitionNotebook.ipynb']

In [32]:
# Download do jupyter gerado pelo SageMaker AutoPilot
session.download_data(path = "./", bucket = s3_bucket, key_prefix = data_explore_nbk_key)

['./SageMakerAutopilotDataExplorationNotebook.ipynb']

## Extraindo o Melhor Candidato Após a Otimização

In [33]:
# Extrai o melhor candidato
best_candidate = sm.describe_auto_ml_job(AutoMLJobName = auto_ml_job_name)["BestCandidate"]

In [34]:
# Extrai o melhor candidato
best_candidate_name = best_candidate["CandidateName"]

In [35]:
print(best_candidate)

{'CandidateName': 'automl-churn-20-22-37-59CZ7mQMmb-013-9bd940f1', 'FinalAutoMLJobObjectiveMetric': {'MetricName': 'validation:f1_binary', 'Value': 0.9273099899291992, 'StandardMetricName': 'F1'}, 'ObjectiveStatus': 'Succeeded', 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-2:890582101704:processing-job/automl-churn-20-22-37-59-db-1-cdbf2d3324134e5f8997e6777a6fb8c74', 'CandidateStepName': 'automl-churn-20-22-37-59-db-1-cdbf2d3324134e5f8997e6777a6fb8c74'}, {'CandidateStepType': 'AWS::SageMaker::TrainingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-2:890582101704:training-job/automl-churn-20-22-37-59-dpp5-1-15ebd079d93d45409b3400082bac108', 'CandidateStepName': 'automl-churn-20-22-37-59-dpp5-1-15ebd079d93d45409b3400082bac108'}, {'CandidateStepType': 'AWS::SageMaker::TransformJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-2:890582101704:transform-job/automl-churn-20-22-37-59-dpp5-rpb-1-165a4ca0b59f444c

In [36]:
print("CandidateName: " + best_candidate_name)

CandidateName: automl-churn-20-22-37-59CZ7mQMmb-013-9bd940f1


In [37]:
print("FinalAutoMLJobObjectiveMetricName: " + best_candidate["FinalAutoMLJobObjectiveMetric"]["MetricName"])

FinalAutoMLJobObjectiveMetricName: validation:f1_binary


In [38]:
print("FinalAutoMLJobObjectiveMetricValue: " + str(best_candidate["FinalAutoMLJobObjectiveMetric"]["Value"]))

FinalAutoMLJobObjectiveMetricValue: 0.9273099899291992


In [39]:
sm_dict = sm.list_candidates_for_auto_ml_job(AutoMLJobName = auto_ml_job_name)

In [40]:
# Itera sobre os candidatos no dicionário "Candidates" em sm_dict
for item in sm_dict["Candidates"]:
    
    # Imprime o nome do candidato e a métrica final do objetivo do AutoML
    print(item["CandidateName"], item["FinalAutoMLJobObjectiveMetric"])
    
    # Imprime a imagem do segundo contêiner de inferência, seguido por uma linha em branco
    print(item["InferenceContainers"][1]["Image"], "\n")

automl-churn-20-22-37-59CZ7mQMmb-020-d81bcba7 {'MetricName': 'validation:f1_binary', 'Value': 0.9235799908638, 'StandardMetricName': 'F1'}
257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.3-1-cpu-py3 

automl-churn-20-22-37-59CZ7mQMmb-019-2545cbd9 {'MetricName': 'validation:f1_binary', 'Value': 0.9121699929237366, 'StandardMetricName': 'F1'}
257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.3-1-cpu-py3 

automl-churn-20-22-37-59CZ7mQMmb-018-125ec6b3 {'MetricName': 'validation:f1_binary', 'Value': 0.8678600192070007, 'StandardMetricName': 'F1'}
257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.3-1-cpu-py3 

automl-churn-20-22-37-59CZ7mQMmb-017-39f78e00 {'MetricName': 'validation:f1_binary', 'Value': 0.9067299962043762, 'StandardMetricName': 'F1'}
257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.3-1-cpu-py3 

automl-churn-20-22-37-59CZ7mQMmb-016-5cc6043e {'MetricName': 'validation:f1_binary', 'Value': 0.922469973564148, 'Stand

## Criando a Versão Final do Modelo Após Otimização

In [41]:
# Sufixo
timestamp_suffix = strftime("%d-%H-%M-%S", gmtime())

In [42]:
# Gera o modelo com o melhor candidato
model_name = best_candidate_name + timestamp_suffix + "-model"

In [43]:
# Cria o modelo
model_arn = sm.create_model(Containers = best_candidate["InferenceContainers"], 
                            ModelName = model_name, 
                            ExecutionRoleArn = role)

## Preparando a Configuração do Endpoint

In [44]:
# Gera o nome da config do endpoint para o melhor modelo
epc_name = best_candidate_name + timestamp_suffix + "-epc"

In [45]:
# Cria a config do endpoint para o melhor modelo
ep_config = sm.create_endpoint_config(EndpointConfigName = epc_name,
                                      ProductionVariants = [{"InstanceType": "ml.m5.2xlarge",
                                                             "InitialInstanceCount": 1,
                                                             "ModelName": model_name,
                                                             "VariantName": "main"}])

In [46]:
# Gera o nome do endpoint para o melhor modelo
ep_name = best_candidate_name + timestamp_suffix + "-ep"

In [47]:
# Cria o endpoint para o melhor modelo
create_endpoint_response = sm.create_endpoint(EndpointName = ep_name, EndpointConfigName = epc_name)

In [48]:
%%time

# Esse comando pausa a execução do código até que o endpoint especificado por ep_name 
# esteja em estado "InService", ou seja, pronto para uso. 
sm.get_waiter("endpoint_in_service").wait(EndpointName = ep_name)

CPU times: user 44.2 ms, sys: 16 ms, total: 60.2 ms
Wall time: 3min 30s


## Criando o Preditor Para Inferência

In [49]:
# Cria o preditor
predictor = Predictor(endpoint_name = ep_name,
                      sagemaker_session = session,
                      serializer = CSVSerializer(),
                      deserializer = CSVDeserializer())

In [50]:
# Remova a coluna de destino dos dados de teste
test_data_inference = dsa_dados_teste.drop("Churn?", axis = 1)

In [51]:
# Obter previsões do endpoint do SageMaker
prediction = predictor.predict(test_data_inference.to_csv(sep = ",", header = False, index = False))

## Avaliando as Métricas

In [52]:
# Carrega a previsão no Pandas e compare com o valor real
prediction_df = pd.DataFrame(prediction)
accuracy = (dsa_dados_teste.reset_index()["Churn?"] == prediction_df[0]).sum() / len(test_data_inference)

In [53]:
print("Acurácia: {}".format(accuracy))

Acurácia: 0.932


In [54]:
# Calcula acurácia usando a função do Scikit-learn
accuracy = accuracy_score(dsa_dados_teste.reset_index()["Churn?"], prediction_df[0])

In [55]:
# Calcula precision usando a função do Scikit-learn
precision = precision_score(dsa_dados_teste.reset_index()["Churn?"], prediction_df[0], pos_label = "True.")

In [56]:
# Calcula recall usando a função do Scikit-learn
recall = recall_score(dsa_dados_teste.reset_index()["Churn?"], 
                      prediction_df[0], 
                      pos_label = "True.", 
                      average = "binary")

In [57]:
# Calcula f1 score usando a função do Scikit-learn
f1 = f1_score(dsa_dados_teste.reset_index()["Churn?"], prediction_df[0], pos_label = "True.")

In [58]:
print("Acurácia: {}".format(accuracy))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
print("F1-Score: {}".format(f1))

Acurácia: 0.932
Precision: 0.9205426356589147
Recall: 0.9462151394422311
F1-Score: 0.9332023575638507


# Fim