In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd

# Hyper-parameter search space handed to RandomizedSearchCV as
# `param_distributions`. Every entry is a finite list of candidate values.
params = dict(
    n_estimators=list(range(100, 201)),
    # 1.0 (use every feature) replaces the former "auto" entry: for
    # RandomForestRegressor "auto" meant the same thing, but it was deprecated
    # in scikit-learn 1.1 and removed in 1.3, where it raises an error.
    max_features=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    max_depth=list(range(2, 11)),
    min_samples_split=list(range(2, 11)),
    min_samples_leaf=list(range(1, 11)),
)

# Base forest passed to each hyper-parameter search in this notebook; the
# fixed random_state keeps its fits repeatable between runs.
estimator = RandomForestRegressor(random_state=0)

In [3]:
### Predict insurance costs for all charges ###

# Load the pre-split train/test CSVs; the "charges" column is the target and
# every remaining column is used as a feature.
trainInsurance = pd.read_csv("../../data/sets/insurance_train.csv")
testInsurance = pd.read_csv("../../data/sets/insurance_test.csv")

trainX = trainInsurance.drop(columns=["charges"])
trainY = trainInsurance["charges"]

testX = testInsurance.drop(columns=["charges"])
testY = testInsurance["charges"]

# 300 sampled candidates, each cross-validated, means a large number of forest
# fits. n_jobs=-1 runs them on all available CPU cores; the seeds fixed on the
# search and on the estimator keep the outcome deterministic.
randomSearch = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=params,
    n_iter=300,
    random_state=0,
    n_jobs=-1,
)

# With the default refit=True the best candidate is refit on the whole
# training set, so predict() below uses the tuned model.
randomSearch.fit(trainX, trainY)
predictions = randomSearch.predict(testX)

# NOTE: best_score_ is the mean cross-validated score of the best candidate
# (the estimator's default score, R^2 for a regressor), not a score computed
# on the training data. The printed label is left unchanged so the output
# stays comparable with earlier runs.
print(f"Best Training Score: {randomSearch.best_score_!s}")
print(f"Best Params: {randomSearch.best_params_!s}")
# Held-out test-set metrics.
print(f"R2 score: {r2_score(testY, predictions)!s}")
print(f"MSE: {mean_squared_error(testY, predictions)!s}")

Best Training Score: 0.8593153001811272
Best Params: {'n_estimators': 116, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 0.8, 'max_depth': 6}
R2 score: 0.8499593222371465
MSE: 0.17223531799095473


In [4]:
### Predict insurance costs for high charges ###

# Load the pre-split high-charge train/test CSVs; the "charges" column is the
# target and every remaining column is used as a feature.
highTrain = pd.read_csv("../../data/sets/insurance_high_train.csv")
highTest = pd.read_csv("../../data/sets/insurance_high_test.csv")

highTrainX = highTrain.drop(columns=["charges"])
highTrainY = highTrain["charges"]

highTestX = highTest.drop(columns=["charges"])
highTestY = highTest["charges"]

# 300 sampled candidates, each cross-validated, means a large number of forest
# fits. n_jobs=-1 runs them on all available CPU cores; the seeds fixed on the
# search and on the estimator keep the outcome deterministic.
randomSearch = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=params,
    n_iter=300,
    random_state=0,
    n_jobs=-1,
)

# With the default refit=True the best candidate is refit on the whole
# training set, so predict() below uses the tuned model.
randomSearch.fit(highTrainX, highTrainY)
predictions = randomSearch.predict(highTestX)

# NOTE: best_score_ is the mean cross-validated score of the best candidate
# (the estimator's default score, R^2 for a regressor), not a score computed
# on the training data. The printed label is left unchanged so the output
# stays comparable with earlier runs.
print(f"Best Training Score: {randomSearch.best_score_!s}")
print(f"Best Params: {randomSearch.best_params_!s}")
# Held-out test-set metrics.
print(f"R2 score: {r2_score(highTestY, predictions)!s}")
print(f"MSE: {mean_squared_error(highTestY, predictions)!s}")

Best Training Score: 0.8340333620008297
Best Params: {'n_estimators': 139, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 0.8, 'max_depth': 10}
R2 score: 0.8366161395403259
MSE: 0.1381187457675087


In [5]:
### Predict insurance costs for low-to-medium charges ###

# Load the pre-split low-to-medium-charge train/test CSVs; the "charges"
# column is the target and every remaining column is used as a feature.
lowTrain = pd.read_csv("../../data/sets/insurance_low_train.csv")
lowTest = pd.read_csv("../../data/sets/insurance_low_test.csv")

lowTrainX = lowTrain.drop(columns=["charges"])
lowTrainY = lowTrain["charges"]

lowTestX = lowTest.drop(columns=["charges"])
lowTestY = lowTest["charges"]

# 300 sampled candidates, each cross-validated, means a large number of forest
# fits. n_jobs=-1 runs them on all available CPU cores; the seeds fixed on the
# search and on the estimator keep the outcome deterministic.
randomSearch = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=params,
    n_iter=300,
    random_state=0,
    n_jobs=-1,
)

# With the default refit=True the best candidate is refit on the whole
# training set, so predict() below uses the tuned model.
randomSearch.fit(lowTrainX, lowTrainY)
predictions = randomSearch.predict(lowTestX)

# NOTE: best_score_ is the mean cross-validated score of the best candidate
# (the estimator's default score, R^2 for a regressor), not a score computed
# on the training data. The printed label is left unchanged so the output
# stays comparable with earlier runs.
print(f"Best Training Score: {randomSearch.best_score_!s}")
print(f"Best Params: {randomSearch.best_params_!s}")
# Held-out test-set metrics.
print(f"R2 score: {r2_score(lowTestY, predictions)!s}")
print(f"MSE: {mean_squared_error(lowTestY, predictions)!s}")

Best Training Score: 0.9029538149922793
Best Params: {'n_estimators': 176, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 0.7, 'max_depth': 9}
R2 score: 0.8803258565871266
MSE: 0.12748795608496002
