In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Change the default figure size (width, height in inches) used when plotting axes.
# Numeric values are used instead of strings so the setting reads as what it is.
plt.rcParams['figure.figsize'] = (10, 6)

import warnings
# Suppress FutureWarning messages only; every other warning category is still shown
warnings.simplefilter(action='ignore', category=FutureWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Getting Data and First Look

In [None]:
# Read the insurance CSV from the Kaggle input directory and preview the first rows
INSURANCE_CSV = "/kaggle/input/insurance/insurance.csv"
data = pd.read_csv(INSURANCE_CSV)
data.head()

In [None]:
# Getting information from data
# len() gives the row count for any index type; `data.index.stop` only exists
# on a default RangeIndex and would fail after filtering or re-indexing.
print("Samples in data: {}\n".format(len(data)))  # Show the number of samples in the data
data.info()  # Column dtypes and non-null counts (the author's run reported 1338 samples and no nulls)

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns,
# transposed so that each feature is shown on its own row
data.describe().transpose()

# Distribution and Relations of Samples

In [None]:
# Histogram of BMI with a kernel density estimate drawn on top
sns.set(style='whitegrid')
sns.histplot(
    x="bmi",
    data=data,
    kde=True,
)

In [None]:
# Box plot of charges for smokers versus non-smokers
sns.catplot(
    data=data,
    x="smoker",
    y="charges",
    kind="box",
    palette="Paired",
)

In [None]:
# Number of rows for every (region, smoker) combination, returned as a tidy table
(
    data
    .groupby(["region", "smoker"])
    .size()
    .reset_index(name="count")
)

In [None]:
# Count of smokers and non-smokers within each region
sns.countplot(
    data=data,
    x="region",
    hue="smoker",
    palette="Paired",
)

In [None]:
# BMI as a function of age, with one line per sex
sns.lineplot(
    data=data,
    x="age",
    y="bmi",
    hue="sex",
    palette="Paired",
)

In [None]:
# Sum of the "children" column within each region
children_count = data["children"].groupby(data["region"]).sum()
children_count

In [None]:
# One bar per region showing the summed number of children
sns.barplot(
    x=list(children_count.index),
    y=children_count,
    palette="Paired",
)

In [None]:
# Relationship between "age" and "bmi" over the whole dataset (not split by sex)
# No `palette` argument here: seaborn only applies a palette when a `hue`
# variable is assigned, so passing one had no effect besides a warning.
sns.lineplot(x="age", y="bmi", data=data)

In [None]:
# Summary statistics of BMI for each value of "children"
bmi_by_children = data.groupby("children")["bmi"]
bmi_by_children.describe()

In [None]:
# BMI plotted against the number of children
sns.lineplot(
    data=data,
    x="children",
    y="bmi",
)

In [None]:
# Scatter of charges against BMI, coloured by sex
sns.scatterplot(
    data=data,
    x="bmi",
    y="charges",
    hue="sex",
    palette="Paired",
)

In [None]:
# BMI per region, with separate bars for smokers and non-smokers
sns.barplot(
    data=data,
    x="region",
    y="bmi",
    hue="smoker",
    palette="Paired",
)

In [None]:
# Correlation heatmap of the numeric columns.
# The string columns ("sex", "smoker", "region") are excluded explicitly:
# on pandas >= 2.0, DataFrame.corr() raises when it meets non-numeric data.
numeric_corr = data.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True)

# Encoding Categorical Variables

We need to encode categorical variables before including them in the training data.
The usual rule of thumb is:
- If there is a natural order (hierarchy) among the categories, the variable is ordinal. (Use an ordinal encoding, e.g. LabelEncoder / OrdinalEncoder)
- If there is no order among the categories, the variable is nominal. (Use OneHotEncoder)

There is no hierarchical order in categorical variables which are "sex", "smoker" and "region". 

Therefore I will use OneHotEncoding.

In [None]:
# One-hot encode the nominal columns with pandas' get_dummies, which is simpler
# here than setting up scikit-learn's OneHotEncoder
encoded_data = pd.get_dummies(
    data,
    columns=["sex", "smoker", "region"],
)
encoded_data.head()

# BMI Outlier Detection with IQR

One last touch will be the outlier detection of BMI before training.

The concept of the Interquartile Range (IQR) is used to build the boxplot graphs. IQR is a concept in statistics that is used to measure the statistical dispersion and data variability by dividing the dataset into quartiles.
In simple words, the observations are sorted and divided into four intervals based on their values and how they compare to the rest of the dataset. The quartiles (Q1, Q2, Q3) are the three cut points that split the data into these four intervals.

The IQR is the difference between the third quartile and the first quartile (IQR = Q3 − Q1). Outliers are defined here as the observations that lie below Q1 − 1.5 × IQR (the lower whisker of the boxplot) or above Q3 + 1.5 × IQR (the upper whisker of the boxplot). This can be represented visually with a box plot.

In [None]:
# BMI column in ascending order (original row labels are kept)
sorted_bmi = encoded_data["bmi"].sort_values(ascending=True)

In [None]:
# Calculate Q1, Q2, Q3 and IQR.
# Series.quantile is used instead of np.percentile(..., interpolation=...):
# NumPy has deprecated that keyword in favour of `method`, while pandas keeps
# a stable `interpolation` argument. 'midpoint' averages the two nearest
# observations when the quantile falls between them.
Q1 = sorted_bmi.quantile(0.25, interpolation='midpoint')
Q2 = sorted_bmi.quantile(0.50, interpolation='midpoint')
Q3 = sorted_bmi.quantile(0.75, interpolation='midpoint')

# Each label now names the quartile it reports (all three used to say "Q1")
print('Q1 25 percentile of the BMI values is, ', Q1)
print('Q2 50 percentile of the BMI values is, ', Q2)
print('Q3 75 percentile of the BMI values is, ', Q3)

# Interquartile range: spread of the middle 50% of the values
IQR = Q3 - Q1
print('Interquartile range is', IQR)

In [None]:
# Outlier fences: 1.5 IQR below the first quartile and 1.5 IQR above the third
whisker = 1.5 * IQR
low_lim = Q1 - whisker
up_lim = Q3 + whisker

print('low_limit is', low_lim)
print('up_limit is', up_lim)

In [None]:
# Collect every BMI value that lies strictly outside the [low_lim, up_lim] range
outliers = [bmi for bmi in sorted_bmi if bmi > up_lim or bmi < low_lim]
print(' Outliers in the dataset is', outliers)

In [None]:
# Paint the outlier areas (values beyond the IQR limits) red on the boxplot.
# `x=` is passed explicitly so the box is horizontal on every seaborn version,
# which is what the vertical spans below assume.
ax = sns.boxplot(x=sorted_bmi, color=sns.color_palette("Paired")[0])

bmi_min, bmi_max = sorted_bmi.min(), sorted_bmi.max()

# Shade only where data actually lies outside a limit. Shading from low_lim up
# to the minimum would mark perfectly valid values as outliers whenever the
# minimum is above low_lim.
if bmi_min < low_lim:
    ax.axvspan(xmin=bmi_min, xmax=low_lim, alpha=0.3, color='red')
if bmi_max > up_lim:
    ax.axvspan(xmin=up_lim, xmax=bmi_max, alpha=0.3, color='red')

In [None]:
# Dropping outliers from data
# Keep the rows whose BMI lies within [low_lim, up_lim]. Using the computed
# limits (rather than a hand-copied threshold) keeps this cell correct if the
# data changes, and also removes low-side outliers should any exist.
clean_data = encoded_data[encoded_data["bmi"].between(low_lim, up_lim)]
clean_data

In [None]:
# Checking the outliers with a graph: after cleaning, no BMI value should lie
# beyond the IQR limits, so no red area is expected.
clean_bmi = clean_data["bmi"]
ax = sns.boxplot(x=clean_bmi, color=sns.color_palette("Paired")[0])

# Dashed lines mark the limits themselves
ax.axvline(low_lim, color='red', linestyle='--')
ax.axvline(up_lim, color='red', linestyle='--')

# Shade only data that is still outside a limit. The spans used to run from
# the limit to the data min/max, which paints valid regions red once the
# outliers are gone.
if clean_bmi.min() < low_lim:
    ax.axvspan(xmin=clean_bmi.min(), xmax=low_lim, alpha=0.3, color='red')
if clean_bmi.max() > up_lim:
    ax.axvspan(xmin=up_lim, xmax=clean_bmi.max(), alpha=0.3, color='red')

# Splitting data into train and test

To do this, I'll use sklearn.model_selection.train_test_split

In [None]:
from sklearn.model_selection import train_test_split

# Target is "charges"; every remaining column is used as a feature
y = clean_data["charges"]
X = clean_data.drop(columns=["charges"])

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [None]:
# Checking the split data: number of dimensions and Python type of each part
split_parts = (X_train, X_test, y_train, y_test)
print(*(part.ndim for part in split_parts))
print("")
print(*(type(part) for part in split_parts))

In [None]:
# Converting pandas Series to DataFrames
# pd.DataFrame(...) accepts a Series (giving a one-column frame named after it)
# as well as a DataFrame, so this cell can be re-run safely. Calling
# .to_frame() a second time would fail because a DataFrame has no such method.
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

# Make data points closer by scaling

The most widely used types of normalization techniques in machine learning are:
- Min-max (MinMaxScaler)
- Z-score (StandardScaler)
- Log scaling (FunctionTransformer)

In order to implement the above techniques, the following functions are used to achieve functionality:

fit(data): Computes the statistics needed for scaling from the given data (for StandardScaler, the mean and standard deviation of each feature).

transform(data): Scales the data using the statistics previously computed by fit().

fit_transform(data): Performs fit and then transform on the same data in a single call.

# Scaling Data

In [None]:
# Scaling data for better modelling performance
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit on the training features only and reuse those statistics for the test
# features, so nothing about the test split leaks into preprocessing.
# The scaled values are assigned back (as DataFrames with the original column
# names and row labels): calling fit_transform without keeping its return
# value leaves the data unscaled.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# The targets (y_train / y_test) are deliberately left unscaled so that error
# metrics stay in the original units of "charges".

# Per-feature means learned from the training split
print(scaler.mean_)

# Training and Model Comparison

For the modeling part I will compare 7 known algorithms for Regression:

1: Linear Regression

2: k-Nearest Neighbors Regressor

3: DecisionTreeRegressor

4: RandomForestRegressor

5: AdaBoostRegressor

6: GradientBoostingRegressor

7: XGBoost

In [None]:
# Training and comparing models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# Report function
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Creating models with relevant class names
# A fixed seed is passed to every estimator that uses randomness so that the
# comparison gives the same numbers after "Restart & Run All".
MODEL_SEED = 7

linear_regression_model = LinearRegression()

knn_model = KNeighborsRegressor(n_neighbors=7)

tree_model = DecisionTreeRegressor(max_depth=3, random_state=MODEL_SEED)

random_forest_model = RandomForestRegressor(max_depth=3, n_estimators=500, random_state=MODEL_SEED)

ada_model = AdaBoostRegressor(n_estimators=50, learning_rate=.01, random_state=MODEL_SEED)

gradient_boosting_model = GradientBoostingRegressor(max_depth=2, n_estimators=100, learning_rate=.2,
                                                    random_state=MODEL_SEED)

xgb_model = XGBRegressor(max_depth=3, n_estimators=50, learning_rate=.2, objective="reg:squarederror",
                         random_state=MODEL_SEED)

In [None]:
# (display name, estimator) pairs, in the order in which they are evaluated
regressors = list({
    'Linear Regression': linear_regression_model,
    'k Nearest Neighbours': knn_model,
    'Decision Tree': tree_model,
    'Random Forest': random_forest_model,
    'AdaBoost': ada_model,
    'Gradient Boosting Regressor': gradient_boosting_model,
    'XGBoost': xgb_model,
}.items())

In [None]:
# Compare the models by their mean cross-validated R² on the training split
N_FOLDS = 5  # single source of truth for the number of CV folds
r2_scores = []  # mean R² (in percent) per model, in the order of `regressors`

for regressor_name, model in regressors:

    # One R² value per fold; ravel() turns the one-column target frame into the
    # 1-D array that scikit-learn expects
    scores = cross_val_score(model, X_train, y_train.values.ravel(), cv=N_FOLDS, scoring="r2")

    # The mean is taken over however many folds were run, not a hard-coded 5
    mean_r2_pct = scores.mean() * 100
    r2_scores.append(mean_r2_pct)

    print("{} r2 Score:".format(regressor_name), str(round(mean_r2_pct, 3)) + "%\n")


# Hyperparameter Optimization



In [None]:
from sklearn.model_selection import GridSearchCV

# Estimator whose hyperparameters are searched
xgb = XGBRegressor()

# Search space: 3 x 3 x 4 = 36 combinations
parameters = {
    "eta": [0.03, 0.07, 0.1],  # (eta or shrinkage)
    "min_child_weight": [0.1, 0.5, 1],
    "max_depth": [4, 5, 6, 7],
}

# Exhaustive search with 5-fold cross-validation, using all available cores
xgb_grid = GridSearchCV(xgb, parameters, cv=5, n_jobs=-1)
xgb_grid.fit(X_train, y_train)

# Best mean cross-validated score and the parameter combination that achieved it
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)
         

# Model Evaluation

In [None]:
# Calculation of RMSE, MAE and r2_score for the tuned XGBoost model

# Evaluate one and the same model for all three metrics: the best estimator
# found by the grid search (cross_val_score clones and refits it on each fold).
# The metrics used to be computed with differing hand-typed learning rates.
best_xgb = xgb_grid.best_estimator_
y_train_1d = np.ravel(y_train)  # 1-D target, whether y_train is a Series or a one-column frame

# scikit-learn reports errors as negative scores ("higher is better"), so the
# sign is flipped back before reporting.
neg_mse = cross_val_score(best_xgb, X_train, y_train_1d, cv=5, scoring="neg_mean_squared_error")
rmse = np.sqrt(-neg_mse.mean())
print("Root Mean Squared Error of XGBoost:", round(rmse, 3))

neg_mae = cross_val_score(best_xgb, X_train, y_train_1d, cv=5, scoring="neg_mean_absolute_error")
rmae = -neg_mae.mean()  # mean absolute error; the variable name is kept from the earlier version
print("Mean Absolute Error of XGBoost:", round(rmae, 3))

r2 = cross_val_score(best_xgb, X_train, y_train_1d, cv=5, scoring="r2").mean()  # Generally used to determine how good the model is
print("r2 Score of XGBoost:", str(round(r2*100, 4)) + "%\n")