In [1]:
import matplotlib.pyplot as plt
from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score,mean_squared_error,explained_variance_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Insurance Estimation

In [2]:
insurance_df = pd.read_csv("insurance.csv")
insurance_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
X = insurance_df.iloc[:,:-1]
y = insurance_df.iloc[:,-1]

In [4]:
X["region"].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [5]:
X["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
x_test

Unnamed: 0,age,sex,bmi,children,smoker,region
578,52,male,30.200,1,no,southwest
610,47,female,29.370,1,no,southeast
569,48,male,40.565,2,yes,northwest
1034,61,male,38.380,0,no,northwest
198,51,female,18.050,0,no,northwest
...,...,...,...,...,...,...
1084,62,female,30.495,2,no,northwest
726,41,male,28.405,1,no,northwest
1132,57,male,40.280,0,no,northeast
725,30,female,39.050,3,yes,southeast


In [8]:
categorical_features = X.select_dtypes(include=[object]).columns.values.tolist()

In [9]:
print(categorical_features)

['sex', 'smoker', 'region']


In [10]:
categorical_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features)
    ], remainder='passthrough')

In [12]:
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

In [13]:
clf_trees = {
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'GradientBoosting': GradientBoostingRegressor(verbose=0),
    'CatBoost': CatBoostRegressor(verbose=0),
    'XGBoost': XGBRegressor()  
}

In [14]:
results = []
for name, model in clf_trees.items():
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
   
    r2_score = metrics.r2_score(prediction, y_test)
    mae = metrics.mean_absolute_error(prediction, y_test)
    results.append({
        'ModelName': name,
        'R2 Score': r2_score,
        'MAE': mae
    })

In [15]:
results = pd.DataFrame(results)

In [16]:
results

Unnamed: 0,ModelName,R2 Score,MAE
0,Decision Tree,0.70425,3726.850201
1,Random Forest,0.871744,2598.109315
2,AdaBoost,0.855189,3653.27818
3,GradientBoosting,0.885559,2434.077769
4,CatBoost,0.872664,2578.749332
5,XGBoost,0.845206,2918.580883
