In [3]:
pip install xgboost optuna

Collecting optuna
  Downloading optuna-3.0.6-py3-none-any.whl (348 kB)
     |████████████████████████████████| 348 kB 26.1 MB/s            
Collecting alembic>=1.5.0
  Downloading alembic-1.7.7-py3-none-any.whl (210 kB)
     |████████████████████████████████| 210 kB 62.9 MB/s            
[?25hCollecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
     |████████████████████████████████| 81 kB 8.8 MB/s             
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting importlib-resources
  Downloading importlib_resources-5.4.0-py3-none-any.whl (28 kB)
Collecting Mako
  Downloading Mako-1.1.6-py2.py3-none-any.whl (75 kB)
     |████████████████████████████████| 75 kB 7.0 MB/s             
Collecting pbr!=2.1.0,>=2.0.0
  Downloading pbr-5.11.1-py2.py3-none-any.whl (112 kB)
     |████████████████████████████████| 112 kB 80.8 MB/s            
[?25hCollecting cmd2>=1.0.

In [7]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.feature_selection import RFE, RFECV
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

import optuna

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'Chapter4/insurance.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
insurance = pd.read_csv(file_content_stream)
insurance.head()



Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [8]:
## Changing labels to numbers 
insurance['sex'] = np.where(insurance['sex'] == 'female', 0, 1)
insurance['smoker'] = np.where(insurance['smoker'] == 'no', 0, 1)

## Extracting region dummies
region_dummies = pd.get_dummies(insurance['region']).iloc[:, 0:3]

## Appending dummies 
insurance = pd.concat([insurance, region_dummies], axis = 1)

insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northeast,northwest,southeast
0,19,0,27.9,0,1,southwest,16884.924,0,0,0
1,18,1,33.77,1,0,southeast,1725.5523,0,0,1
2,28,1,33.0,3,0,southeast,4449.462,0,0,1
3,33,1,22.705,0,0,northwest,21984.47061,0,1,0
4,32,1,28.88,0,0,northwest,3866.8552,0,1,0


In [9]:
## Engineering features from decision-tree
insurance['interaction_1'] = np.where((insurance['smoker'] == 0) & (insurance['age'] <= 32.5), 1, 0)
insurance['interaction_2'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 32.5) & (insurance['age'] <= 44.5), 1, 0)
insurance['interaction_3'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 44.5) & (insurance['age'] < 51.5), 1, 0)
insurance['interaction_4'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 51.5), 1, 0)

In [10]:
## Defining the input and target variables
X = insurance[['age', 'bmi', 'children', 'smoker', 'interaction_4']]
Y = insurance['charges']

## Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

# Random Forest with Optuna

In [11]:
X = X_train
Y = Y_train

class Objective:
    
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10)
                      )
        
        scores = list()
        
        skf = KFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(X, Y):
            
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]
            
            RF_md = RandomForestRegressor(**params).fit(X_train, Y_train)
            
            pred_valid = RF_md.predict(X_valid)
            score = mean_squared_error(Y_valid, pred_valid)
            scores.append(score)
            
        return np.mean(scores)

In [13]:
SEED = 42
N_TRIALS = 20

## Excuting optuna
study = optuna.create_study(direction = 'minimize')
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-29 17:43:40,187][0m A new study created in memory with name: no-name-7dde943c-8743-4c23-8c93-d1e8934c2d6a[0m
[32m[I 2023-03-29 17:43:45,609][0m Trial 0 finished with value: 21845695.666118626 and parameters: {'n_estimators': 1128, 'min_samples_split': 11, 'min_samples_leaf': 22, 'max_depth': 9}. Best is trial 0 with value: 21845695.666118626.[0m
[32m[I 2023-03-29 17:43:49,831][0m Trial 1 finished with value: 25317863.579828482 and parameters: {'n_estimators': 1065, 'min_samples_split': 28, 'min_samples_leaf': 25, 'max_depth': 2}. Best is trial 0 with value: 21845695.666118626.[0m
[32m[I 2023-03-29 17:43:59,218][0m Trial 2 finished with value: 21355859.717632037 and parameters: {'n_estimators': 1960, 'min_samples_split': 21, 'min_samples_leaf': 7, 'max_depth': 9}. Best is trial 2 with value: 21355859.717632037.[0m
[32m[I 2023-03-29 17:44:01,816][0m Trial 3 finished with value: 22241962.355024204 and parameters: {'n_estimators': 565, 'min_samples_split': 16, 

In [14]:
## Building the optimized model
RF_md = RandomForestRegressor(**study.best_trial.params).fit(X_train, Y_train)

## Predicting on test
RF_pred = RF_md.predict(X_test)

## Computing the mse
RF_mse = mean_squared_error(Y_test, RF_pred)
print('The mse of the random forest model is ', RF_mse)

The mse of the random forest model is  19904453.680625267


# XGBoost with Optuna

In [19]:
X = X_train
Y = Y_train

class Objective:
    
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                      max_depth = trial.suggest_int('max_depth', 2, 10),
                      min_child_weight = trial.suggest_int('min_child_weight', 2, 20),
                      learning_rate = trial.suggest_float('learning_rate', 0.01, 100, log = True),
                      gamma = trial.suggest_float('gamma', 1, 10),
                      colsample_bytree = trial.suggest_float('colsample_bytree', 0.2, 0.9),
                      subsample = trial.suggest_float('subsample', 0.2, 0.9)
                      )
        
        scores = list()
        
        skf = KFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(X, Y):
            
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]
            
            Y_train = np.log(Y_train)
            Y_valid = np.log(Y_valid)
            
            xgb_md = XGBRegressor(**params).fit(X_train, Y_train)
            
            pred_valid = np.log(xgb_md.predict(X_valid))
            score = mean_squared_error(Y_valid, pred_valid)
            scores.append(score)
            
        return np.mean(scores)

In [20]:
SEED = 42
N_TRIALS = 20

## Excuting optuna
study = optuna.create_study(direction = 'minimize')
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-29 17:53:03,033][0m A new study created in memory with name: no-name-d2aeaabf-8e5c-411a-a49f-fd1d2e4f0352[0m
[32m[I 2023-03-29 17:53:07,692][0m Trial 0 finished with value: 40806372.49541391 and parameters: {'n_estimators': 350, 'max_depth': 4, 'min_child_weight': 15, 'learning_rate': 0.312062633192075, 'gamma': 8.832460376568196, 'colsample_bytree': 0.24466878008572795, 'subsample': 0.23048676206987062}. Best is trial 0 with value: 40806372.49541391.[0m
[32m[I 2023-03-29 17:53:16,082][0m Trial 1 finished with value: 41587427.1088909 and parameters: {'n_estimators': 690, 'max_depth': 5, 'min_child_weight': 8, 'learning_rate': 0.04951591415176063, 'gamma': 3.859134421098504, 'colsample_bytree': 0.2063905073531767, 'subsample': 0.5690758700649292}. Best is trial 0 with value: 40806372.49541391.[0m
[32m[I 2023-03-29 17:53:23,006][0m Trial 2 finished with value: 26835907.850282643 and parameters: {'n_estimators': 574, 'max_depth': 8, 'min_child_weight': 10, 'learn

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
## Building the optimized model
xgb_md = XGBRegressor(**study.best_trial.params).fit(X_train, Y_train)

## Predicting on test
xgb_pred = RF_md.predict(X_test)

## Computing the mse
xgb_mse = mean_squared_error(Y_test, xgb_pred)
print('The mse of the random forest model is ', xgb_mse)