### Tratamento da base de dados

In [5]:
import pandas as pd
import numpy as np

In [6]:
df_houses = pd.read_csv('../data/house_prices.csv', sep=',', on_bad_lines='skip')
df_houses

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34212,263000018,20140521T000000,360000.0,3,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
34213,6600060120,20150223T000000,400000.0,4,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
34214,1523300141,20140623T000000,402101.0,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
34215,291310100,20150116T000000,400000.0,3,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [7]:
df_houses.drop(columns=['id', 'date', 'sqft_living15', 'sqft_lot15'], axis=1, inplace=True)

In [8]:
df_houses.dtypes

price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
dtype: object

In [9]:
df_houses = df_houses.drop_duplicates()

In [10]:
df_houses.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045


In [11]:
train_base = df_houses.iloc[0:15129, :]
train_base.shape

(15129, 17)

In [12]:
test_base = df_houses.iloc[15129:, :]
test_base.shape

(6479, 17)

In [13]:
X_test = test_base.iloc[:, 1:17].values
X_test

array([[ 4.00000e+00,  2.50000e+00,  2.97000e+03, ...,  9.81250e+04,
         4.77118e+01, -1.22290e+02],
       [ 2.00000e+00,  1.00000e+00,  7.90000e+02, ...,  9.80110e+04,
         4.77644e+01, -1.22198e+02],
       [ 6.00000e+00,  1.50000e+00,  2.14000e+03, ...,  9.80010e+04,
         4.72668e+01, -1.22252e+02],
       ...,
       [ 2.00000e+00,  7.50000e-01,  1.02000e+03, ...,  9.81440e+04,
         4.75944e+01, -1.22299e+02],
       [ 3.00000e+00,  2.50000e+00,  1.60000e+03, ...,  9.80270e+04,
         4.75345e+01, -1.22069e+02],
       [ 2.00000e+00,  7.50000e-01,  1.02000e+03, ...,  9.81440e+04,
         4.75941e+01, -1.22299e+02]])

In [14]:
y_test = test_base.iloc[:, 0].values
y_test

array([635000., 245500., 230000., ..., 402101., 400000., 325000.])

In [15]:
#No XGBOOST a variável alvo precisa estar na primeira posição do DF
train_base.to_csv('house_prices_train_xgboost.csv', header=False, index=False)
test_base.to_csv('house_prices_test_xgboost.csv', header=False, index=False)

### Configurações do SageMaker

In [16]:
import sagemaker
import boto3
from sagemaker import Session

In [17]:
session = sagemaker.Session()

In [18]:
bucket = 'aws-sagemaker-course-633'
subdir = 'models/house-prices/xgboost'
subdir_dataset = 'datasets/house-prices'
key_train = 'houses-train-data-xgboost'
key_test = 'houses-test-data-xgboost'
role = sagemaker.get_execution_role()
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, subdir_dataset, key_train)
s3_test_data = 's3://{}/{}/test/{}'.format(bucket, subdir_dataset, key_test)
output_location = 's3://{}/{}/output'.format(bucket, subdir)

print(f'Localização da base de treinamento: {s3_train_data}')
print(f'Localização da base de treinamento: {s3_test_data}')

Localização da base de treinamento: s3://aws-sagemaker-course-633/datasets/house-prices/train/houses-train-data-xgboost
Localização da base de treinamento: s3://aws-sagemaker-course-633/datasets/house-prices/test/houses-test-data-xgboost


In [19]:
s3 = boto3.Session().resource('s3')

In [20]:
import os
with open('house_prices_train_xgboost.csv', 'rb') as train_file:
    s3.Bucket(bucket).Object(f'{subdir_dataset}/train/{key_train}').upload_fileobj(train_file)

In [21]:
with open('house_prices_test_xgboost.csv', 'rb') as test_file:
    s3.Bucket(bucket).Object(f'{subdir_dataset}/test/{key_test}').upload_fileobj(test_file)

### Treinamento do XGBoost

In [22]:
from sagemaker import image_uris
# O SageMaker usa contêineres Docker para garantir que o ambiente de execução seja o mesmo para diferentes máquinas e instâncias. 
container = image_uris.retrieve(framework='xgboost', region=boto3.Session().region_name, version='latest')

In [23]:
# O estimator centraliza a configuração de todos os requisitos necessários para o treinamento de um modelo
xgboost = sagemaker.estimator.Estimator(image_uri = container,
                                       role = role,
                                       instance_count = 1,
                                       instance_type = 'ml.m5.large',
                                       output_path = output_location,
                                       sagemaker_session = session)

In [24]:
# Os hiperparâmetros permitem ajustar o comportamento do modelo de acordo com as características dos dados e o objetivo do problema.
xgboost.set_hyperparameters(num_round = 100)

In [25]:
# Usado para ensinar o modelo (ajustar os pesos) com base nos dados de entrada.
train_input = sagemaker.inputs.TrainingInput(s3_data = s3_train_data, content_type='csv', s3_data_type='S3Prefix')
# Usado para avaliar o desempenho do modelo durante o treinamento e evitar problemas como overfitting.
validation_input = sagemaker.inputs.TrainingInput(s3_data = s3_test_data, content_type='csv', s3_data_type='S3Prefix')
data_channels = {'train': train_input, 'validation': validation_input}

In [26]:
xgboost.fit(data_channels)

INFO:sagemaker:Creating training-job with name: xgboost-2025-01-08-19-24-09-421


2025-01-08 19:24:10 Starting - Starting the training job...
2025-01-08 19:24:25 Starting - Preparing the instances for training...
2025-01-08 19:25:04 Downloading - Downloading input data......
2025-01-08 19:25:50 Downloading - Downloading the training image...
2025-01-08 19:26:36 Training - Training image download completed. Training in progress.
2025-01-08 19:26:36 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2025-01-08:19:26:29:INFO] Running standalone xgboost training.[0m
[34m[2025-01-08:19:26:29:INFO] File size need to be processed in the node: 1.55mb. Available memory size in the node: 164.91mb[0m
[34m[2025-01-08:19:26:29:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:26:29] S3DistributionType set as FullyReplicated[0m
[34m[19:26:29] 15129x16 matrix with 242064 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2025-01-08:19:26:29:INFO] Determined delimiter of CSV input is ','[0m
[34m[

### Deploy, previsões e avaliação

In [27]:
from sagemaker.serializers import CSVSerializer

In [28]:
xgboost_regressor = xgboost.deploy(initial_instance_count=1, instance_type='ml.m5.large')

INFO:sagemaker:Creating model with name: xgboost-2025-01-08-19-27-26-873
INFO:sagemaker:Creating endpoint-config with name xgboost-2025-01-08-19-27-26-873
INFO:sagemaker:Creating endpoint with name xgboost-2025-01-08-19-27-26-873


------!

In [29]:
xgboost_regressor.serializer = CSVSerializer()

In [32]:
X_test.shape, type(X_test)

((6479, 16), numpy.ndarray)

In [38]:
#previsão dos preços das casas
#convertido pra utf-8 e float32
previsions = np.array(xgboost_regressor.predict(X_test).decode('utf-8').split(',')).astype(np.float32)
previsions

array([619424.06, 174189.36, 215866.77, ..., 343506.47, 479892.4 ,
       320657.4 ], dtype=float32)

In [39]:
#previsões, respostas reais
previsions.shape, y_test.shape

((6479,), (6479,))

In [40]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [41]:
mae = mean_absolute_error(y_test, previsions)
mse = mean_squared_error(y_test, previsions)

In [43]:
print('MAE: ', mae, '\nMSE: ', mse)

MAE:  69609.95510133701 
MSE:  15768176926.750965


In [44]:
#O algortimo pode errar a previsão até 69609.95 para cima ou para baixo

### Tuning
base de dados diferentes podem requerer parâmetros diferentes

In [50]:
tuning_job_config = {
    "ParameterRanges": {
      "CategoricalParameterRanges": [],
      "ContinuousParameterRanges": [
        {
          "MaxValue": "1",
          "MinValue": "0",
          "Name": "eta"
        },
        {
          "MaxValue": "2",
          "MinValue": "0",
          "Name": "alpha"
        },
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "min_child_weight"
        }
      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "max_depth"
        }
      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "100",
          "MinValue": "10",
          "Name": "num_round"
        }
      ]
    },
    "ResourceLimits": {
      "MaxNumberOfTrainingJobs": 9,
      "MaxParallelTrainingJobs": 3
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
      "MetricName": "validation:rmse",
      "Type": "Minimize"
    }
  }

In [51]:
training_job_definition = {
    "AlgorithmSpecification": {
      "TrainingImage": container,
      "TrainingInputMode": "File"
    },
    "InputDataConfig": [
      {
        "ChannelName": "train",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": s3_train_data
          }
        }
      },
      {
        "ChannelName": "validation",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": s3_test_data
          }
        }
      }
    ],
    "OutputDataConfig": {
      "S3OutputPath": "s3://{}/{}/output".format(bucket,subdir)
    },
    "ResourceConfig": {
      "InstanceCount": 1,
      "InstanceType": "ml.m5.large",
      "VolumeSizeInGB": 10
    },
    "RoleArn": role,
    "StaticHyperParameters": {
      "eval_metric": "rmse",
      "objective": "reg:linear",
      "rate_drop": "0.3",
      "tweedie_variance_power": "1.4"
    },
    "StoppingCondition": {
      "MaxRuntimeInSeconds": 43200
    }
}

In [53]:
smclient = boto3.client('sagemaker')
smclient.create_hyper_parameter_tuning_job(HyperParameterTuningJobName = 'xgboosttuninghousestest2',
                                          HyperParameterTuningJobConfig = tuning_job_config,
                                          TrainingJobDefinition = training_job_definition)

{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-1:537124933216:hyper-parameter-tuning-job/xgboosttuninghousestest2',
 'ResponseMetadata': {'RequestId': '03cd2636-225c-458e-afcd-f941da06c9cb',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '03cd2636-225c-458e-afcd-f941da06c9cb',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '125',
   'date': 'Wed, 08 Jan 2025 20:25:45 GMT'},
  'RetryAttempts': 0}}