In [18]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd

# Hyperparameter search space handed to RandomizedSearchCV as
# `param_distributions`. Every entry is an explicit list of candidate values.
params = {
    "n_estimators": list(range(100, 201)),                    # 100, 101, ..., 200
    "max_depth": list(range(2, 11)),                          # 2, 3, ..., 10
    "learning_rate": [step / 100 for step in range(1, 21)],   # 0.01 ... 0.20
    "gamma": [step / 100 for step in range(1, 11)],           # 0.01 ... 0.10
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
}

# Base estimator passed as `estimator=` to each RandomizedSearchCV in this
# notebook; squared-error objective with random_state pinned to 0.
model = XGBRegressor(
    objective="reg:squarederror",
    random_state=0,
)

In [19]:
# --- Predict insurance costs for all charges ---

# Read the train and test CSVs and split each into feature columns and the
# "charges" target column.
trainInsurance = pd.read_csv("../../data/sets/insurance_train.csv")
trainX = trainInsurance.drop(columns=["charges"])
trainY = trainInsurance["charges"]

testInsurance = pd.read_csv("../../data/sets/insurance_test.csv")
testX = testInsurance.drop(columns=["charges"])
testY = testInsurance["charges"]

# Randomized hyperparameter search: 300 candidates drawn from `params`, with a
# fixed random_state. No `cv` or `scoring` is passed, so the scikit-learn
# defaults apply.
randomSearch = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=300,
    random_state=0,
)
randomSearch.fit(trainX, trainY)

# Predict on the test features through the fitted search object.
predictions = randomSearch.predict(testX)

# NOTE(review): scikit-learn documents best_score_ as the mean cross-validated
# score of the best candidate, not a score on the full training set, so the
# "Training" wording in the first label is loose - consider relabelling.
# `!s` forces plain str() rendering of each value.
print(f"Best Training Score: {randomSearch.best_score_!s}")
print(f"Best Params: {randomSearch.best_params_!s}")
print(f"R2 score: {r2_score(testY, predictions)!s}")
print(f"MSE: {mean_squared_error(testY, predictions)!s}")

Best Training Score: 0.8580885588377456
Best Params: {'subsample': 0.9, 'n_estimators': 135, 'max_depth': 2, 'learning_rate': 0.05, 'gamma': 0.06, 'colsample_bytree': 0.9}
R2 score: 0.8513412202920095
MSE: 0.17064900383629192


In [20]:
# --- Predict insurance costs for high charges ---

# Read the high-charges train and test CSVs and split each into feature
# columns and the "charges" target column.
highTrain = pd.read_csv("../../data/sets/insurance_high_train.csv")
highTrainX = highTrain.drop(columns=["charges"])
highTrainY = highTrain["charges"]

highTest = pd.read_csv("../../data/sets/insurance_high_test.csv")
highTestX = highTest.drop(columns=["charges"])
highTestY = highTest["charges"]

# Randomized hyperparameter search: 300 candidates drawn from `params`, with a
# fixed random_state. No `cv` or `scoring` is passed, so the scikit-learn
# defaults apply. Rebinding `randomSearch` discards any earlier search object.
randomSearch = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=300,
    random_state=0,
)
randomSearch.fit(highTrainX, highTrainY)

# Predict on the test features through the fitted search object.
predictions = randomSearch.predict(highTestX)

# NOTE(review): scikit-learn documents best_score_ as the mean cross-validated
# score of the best candidate, not a score on the full training set, so the
# "Training" wording in the first label is loose - consider relabelling.
# `!s` forces plain str() rendering of each value.
print(f"Best Training Score: {randomSearch.best_score_!s}")
print(f"Best Params: {randomSearch.best_params_!s}")
print(f"R2 score: {r2_score(highTestY, predictions)!s}")
print(f"MSE: {mean_squared_error(highTestY, predictions)!s}")

Best Training Score: 0.8182477206951925
Best Params: {'subsample': 1, 'n_estimators': 166, 'max_depth': 3, 'learning_rate': 0.06, 'gamma': 0.1, 'colsample_bytree': 0.9}
R2 score: 0.8476563001555733
MSE: 0.12878579737860602


In [21]:
# --- Predict insurance costs for low charges ---

# Read the low-charges train and test CSVs and split each into feature
# columns and the "charges" target column.
lowTrain = pd.read_csv("../../data/sets/insurance_low_train.csv")
lowTrainX = lowTrain.drop(columns=["charges"])
lowTrainY = lowTrain["charges"]

lowTest = pd.read_csv("../../data/sets/insurance_low_test.csv")
lowTestX = lowTest.drop(columns=["charges"])
lowTestY = lowTest["charges"]

# Randomized hyperparameter search: 300 candidates drawn from `params`, with a
# fixed random_state. No `cv` or `scoring` is passed, so the scikit-learn
# defaults apply. Rebinding `randomSearch` discards any earlier search object.
randomSearch = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=300,
    random_state=0,
)
randomSearch.fit(lowTrainX, lowTrainY)

# Predict on the test features through the fitted search object.
predictions = randomSearch.predict(lowTestX)

# NOTE(review): scikit-learn documents best_score_ as the mean cross-validated
# score of the best candidate, not a score on the full training set, so the
# "Training" wording in the first label is loose - consider relabelling.
# `!s` forces plain str() rendering of each value.
print(f"Best Training Score: {randomSearch.best_score_!s}")
print(f"Best Params: {randomSearch.best_params_!s}")
print(f"R2 score: {r2_score(lowTestY, predictions)!s}")
print(f"MSE: {mean_squared_error(lowTestY, predictions)!s}")

Best Training Score: 0.9096155752149357
Best Params: {'subsample': 1, 'n_estimators': 195, 'max_depth': 2, 'learning_rate': 0.07, 'gamma': 0.01, 'colsample_bytree': 0.6}
R2 score: 0.8943069162358996
MSE: 0.11259403942349125
