In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_parquet('data_cleaned.parquet.gzip')
df.head()

Unnamed: 0_level_0,age,sex,bmi,children,smoker,region,charges
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,19,female,27.9,0,yes,southwest,16884.923828
1,18,male,33.77,1,no,southeast,1725.552246
2,28,male,33.0,3,no,southeast,4449.461914
3,33,male,22.705,0,no,northwest,21984.470703
4,32,male,28.879999,0,no,northwest,3866.855225


In [3]:
numeric_features = ["age", "bmi", "children"]
numeric_transformer = StandardScaler()

categorical_features = ["sex", "smoker", "region"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [4]:
features = df.drop('charges', axis=1)
target = df['charges']

In [5]:
preprocessor = ColumnTransformer(transformers=[
  ("num", numeric_transformer, numeric_features),
  ("cat", categorical_transformer, categorical_features)
])

In [6]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
kf = KFold(n_splits=10, shuffle=True)

In [7]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
pipeline = make_pipeline(preprocessor, poly, regression)
cv_results = cross_val_score(pipeline, features, target, cv=kf, scoring="r2", n_jobs=-1)
cv_results.mean()

0.8339142959544521

In [8]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
pipeline = make_pipeline(preprocessor, regressor)
cv_results = cross_val_score(pipeline, features, target, cv=kf, scoring="r2", n_jobs=-1)
cv_results.mean()

0.7171489026210185

In [9]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
pipeline = make_pipeline(preprocessor, regressor)
cv_results = cross_val_score(pipeline, features, target, cv=kf, scoring="r2", n_jobs=-1)
cv_results.mean()

0.8229397763460259

In [10]:
from xgboost import XGBRegressor
regressor = XGBRegressor()
pipeline = make_pipeline(preprocessor, regressor)
cv_results = cross_val_score(pipeline, features, target, cv=kf, scoring="r2", n_jobs=-1)
cv_results.mean()

0.8157234303543

In [19]:
from sklearn.ensemble import GradientBoostingRegressor
regressor = GradientBoostingRegressor(n_estimators= 100, max_depth=3, min_samples_split=2, learning_rate=0.1, loss="squared_error")
pipeline = make_pipeline(preprocessor, regressor)
cv_results = cross_val_score(pipeline, features, target, cv=kf, scoring="r2", n_jobs=-1)
cv_results.mean()

0.855998697637584