In [1]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.4
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0
Note: you may 

In [13]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import optuna

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'omar-vargas-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key = 'insurance.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
insurance = pd.read_csv(file_content_stream)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [14]:
## Changing labels to numbers 
insurance['sex'] = np.where(insurance['sex'] == 'female', 0, 1)
insurance['smoker'] = np.where(insurance['smoker'] == 'no', 0, 1)

## Extracting region dummies
region_dummies = pd.get_dummies(insurance['region']).iloc[:, 0:3]

## Appending dummies 
insurance = pd.concat([insurance, region_dummies], axis = 1)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northeast,northwest,southeast
0,19,0,27.9,0,1,southwest,16884.924,0,0,0
1,18,1,33.77,1,0,southeast,1725.5523,0,0,1
2,28,1,33.0,3,0,southeast,4449.462,0,0,1
3,33,1,22.705,0,0,northwest,21984.47061,0,1,0
4,32,1,28.88,0,0,northwest,3866.8552,0,1,0


In [15]:
## Computing interactions from chapter 4
insurance['interaction_1'] = np.where((insurance['smoker'] == 0) & (insurance['age'] <= 32.5), 1, 0)
insurance['interaction_2'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 32.5) & (insurance['age'] <= 44.5), 1, 0)
insurance['interaction_3'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 44.5) & (insurance['age'] < 51.5), 1, 0)
insurance['interaction_4'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 51.5), 1, 0)

In [16]:
## Defining input and target 
X = insurance[['age', 'bmi', 'children', 'smoker', 'interaction_4']]
Y = insurance['charges']

## Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

# Random Forest

In [25]:
X = X_train
Y = Y_train

class Objective:
    
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10)
                      )
        
        scores = list()
        
        skf = KFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(X, Y):
            
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]
            
            RF_md = RandomForestRegressor(**params).fit(X_train, Y_train)
            
            pred_valid = RF_md.predict(X_valid)
            score = mean_squared_error(Y_valid, pred_valid)
            scores.append(score)
            
        return np.mean(scores)

In [26]:
SEED = 42
N_TRIALS = 20

## Excuting optuna
study = optuna.create_study(direction = 'minimize')
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-29 23:11:23,987][0m A new study created in memory with name: no-name-addf336b-0b3a-47bb-9af7-f386435e1cac[0m
[32m[I 2023-03-29 23:11:26,493][0m Trial 0 finished with value: 22039028.72462304 and parameters: {'n_estimators': 512, 'min_samples_split': 29, 'min_samples_leaf': 28, 'max_depth': 9}. Best is trial 0 with value: 22039028.72462304.[0m
[32m[I 2023-03-29 23:11:27,370][0m Trial 1 finished with value: 19786930.21360138 and parameters: {'n_estimators': 178, 'min_samples_split': 26, 'min_samples_leaf': 9, 'max_depth': 10}. Best is trial 1 with value: 19786930.21360138.[0m
[32m[I 2023-03-29 23:11:28,395][0m Trial 2 finished with value: 24199091.75579248 and parameters: {'n_estimators': 248, 'min_samples_split': 18, 'min_samples_leaf': 13, 'max_depth': 2}. Best is trial 1 with value: 19786930.21360138.[0m
[32m[I 2023-03-29 23:11:33,635][0m Trial 3 finished with value: 22072293.059934642 and parameters: {'n_estimators': 1173, 'min_samples_split': 23, 'min_sa

In [27]:
## Building the optimized model
RF_md = RandomForestRegressor(**study.best_trial.params).fit(X_train, Y_train)

## Predicting on test
RF_pred = RF_md.predict(X_test)

## Computing the mse
RF_mse = mean_squared_error(Y_test, RF_pred)
print('The mse of the random forest model is ', RF_mse)

The mse of the random forest model is  25367459.563626


# XGBoost

In [28]:
X = X_train
Y = Y_train

class Objective:
    
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                      max_depth = trial.suggest_int('max_depth', 2, 10),
                      min_child_weight = trial.suggest_int('min_child_weight', 2, 20),
                      learning_rate = trial.suggest_float('learning_rate', 0.01, 100, log = True),
                      gamma = trial.suggest_float('gamma', 1, 10),
                      colsample_bytree = trial.suggest_float('colsample_bytree', 0.2, 0.9),
                      subsample = trial.suggest_float('subsample', 0.2, 0.9)
                      )
        
        scores = list()
        
        skf = KFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(X, Y):
            
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]
            
            Y_train = np.log(Y_train)
            Y_valid = np.log(Y_valid)
            
            xgb_md = XGBRegressor(**params).fit(X_train, Y_train)
            
            pred_valid = np.log(xgb_md.predict(X_valid))
            score = mean_squared_error(Y_valid, pred_valid)
            scores.append(score)
            
        return np.mean(scores)

In [29]:
SEED = 42
N_TRIALS = 20

## Excuting optuna
study = optuna.create_study(direction = 'minimize')
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-29 23:13:57,684][0m A new study created in memory with name: no-name-733197dc-260b-4cd5-a9ef-07d7e1012ccc[0m
[33m[W 2023-03-29 23:13:58,322][0m Trial 0 failed with parameters: {'n_estimators': 1633, 'max_depth': 6, 'min_child_weight': 10, 'learning_rate': 19.451990737592613, 'gamma': 6.364167214410687, 'colsample_bytree': 0.20753302601773432, 'subsample': 0.8367662684215702} because of the following error: ValueError('Input contains NaN.').[0m
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_11915/381278057.py", line 35, in __call__
    score = mean_squared_error(Y_valid, pred_valid)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 442, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/home/ec

ValueError: Input contains NaN.

In [None]:
## Building the optimized model
xgb_md = XGBRegressor(**study.best_trial.params).fit(X_train, Y_train)

## Predicting on test
xgb_pred = xgb_md.predict(X_test)

## Computing the mse
xgb_mse = mean_squared_error(Y_test, xgb_pred)
print('The mse of the XGBoost model is', xgb_mse)