In [27]:
import pandas as pd
import numpy as np

# Load the insurance dataset from the notebook's working directory.
df = pd.read_csv("insurance.csv")

# Text preview of the first five rows, followed by the (rows, columns) shape.
print(df.head(n=5))
print(f"Shape of data: {df.shape}")

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
Shape of data: (1338, 7)


# Label Encoding

In [28]:
#label encoder
from sklearn.preprocessing import LabelEncoder

# Collect the object-typed (string) columns first, then replace each one with
# integer codes from its own freshly fitted LabelEncoder.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    raw_values = df[name].values
    df[name] = LabelEncoder().fit(list(raw_values)).transform(raw_values)

df.head(10)
# Resulting codes, as seen in the output below:
#   sex:    female=0, male=1
#   smoker: no=0, yes=1
#   region: northeast=0, northwest=1, southeast=2, southwest=3


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552
5,31,0,25.74,0,0,2,3756.6216
6,46,0,33.44,1,0,2,8240.5896
7,37,0,27.74,3,0,1,7281.5056
8,37,1,29.83,2,0,0,6406.4107
9,60,0,25.84,0,0,1,28923.13692


# Splitting and scaling

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 'charges' is the regression target; every other column is a predictor.
y = df['charges']
X = df.drop(columns=['charges'])

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Model training & Testing

In [30]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression
#from lightgbm import LGBMRegressor

In [31]:
# Seed passed to the scikit-learn estimators below that accept `random_state`,
# so that re-running the notebook gives repeatable scores for them.
SEED = 42

# NOTE: instances use lowercase names. Binding an instance to the class name
# (e.g. `SVR = SVR(...)`) shadows the imported class, so running this cell a
# second time would fail with "'SVR' object is not callable".
lr = LinearRegression()

knn = KNeighborsRegressor(n_neighbors=10)

dt = DecisionTreeRegressor(max_depth=3, random_state=SEED)

rf = RandomForestRegressor(max_depth=3, n_estimators=500, random_state=SEED)

ada = AdaBoostRegressor(n_estimators=50, learning_rate=.01, random_state=SEED)

gbr = GradientBoostingRegressor(max_depth=2, n_estimators=100, learning_rate=.2,
                                random_state=SEED)

xgb = XGBRegressor(max_depth=3, n_estimators=50, learning_rate=.2)

cb = CatBoostRegressor(learning_rate=.01, max_depth=5, verbose=0)

svr = SVR(kernel='rbf', C=1e3, gamma=0.1)

ridge = Ridge(alpha=.5)

lasso = Lasso(alpha=.5)

mlp = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=SEED)

En = ElasticNet(alpha=.5, l1_ratio=.5)

Kr = KernelRidge(alpha=.5, kernel='rbf')

# Disabled experiments, kept for reference:
#lgr = LogisticRegression(C = 1e3, random_state = 42)
#lgbmr = LGBMRegressor(n_estimators = 100, learning_rate =.2)

# (display name, estimator) pairs, evaluated in this order.
regressors = [('Linear Regression', lr), ('K Nearest Neighbours', knn),
              ('Decision Tree', dt), ('Random Forest', rf), ('AdaBoost', ada),
              ('Gradient Boosting Regressor', gbr), ('XGBoost', xgb), ('catboost', cb),
              ('SVR', svr), ('Ridge', ridge), ('Lasso', lasso), ('MLP', mlp),
              ('ElasticNet', En), ('Kernel Ridge', Kr),
              #('LightGBM', lgbmr)
              ]

# Predictor names recorded alongside each model's scores.
features = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']


In [32]:
from sklearn.metrics import r2_score , mean_squared_error , mean_absolute_error
   
# Module-level accumulator; `regression()` refills it on every call.
scores = []
def regression():
    """Fit every estimator in `regressors` and score it on the test split.

    Each estimator is fitted on (X_train, y_train) and evaluated on
    (X_test, y_test). One record per estimator is stored in the module-level
    `scores` list with the keys:

    - 'model':    display name taken from `regressors`
    - 'r2':       coefficient of determination
    - 'mse':      ROOT mean squared error, rounded to 4 decimals (the key name
                  is kept for compatibility with the cells that read it)
    - 'mae':      mean absolute error, rounded to 4 decimals
    - 'features': the module-level `features` list

    Returns:
        The `scores` list (the same object as the module-level name).
    """
    # Empty the shared list in place so that calling this function again
    # (e.g. re-running the cell) does not append a second copy of every row.
    scores.clear()

    for regressor_name, regressor in regressors:
        # Fit regressor to the training set
        regressor.fit(X_train, y_train)
        # Predict
        y_pred = regressor.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        # Take the square root explicitly instead of passing `squared=False`,
        # which recent scikit-learn releases deprecate.
        rmse = round(mean_squared_error(y_test, y_pred) ** 0.5, 4)
        mae = round(mean_absolute_error(y_test, y_pred), 4)
        scores.append({
            'model': regressor_name,
            'r2': r2,
            'mse': rmse,
            'mae': mae,
            'features': features

        })

    return scores

In [33]:

# Evaluate every model and tabulate the records it returns
# (regression() returns the module-level `scores` list).
dataframe = pd.DataFrame(
    regression(),
    columns=["model", "r2", "mse", "mae", "features"],
)
dataframe





Unnamed: 0,model,r2,mse,mae,features
0,Linear Regression,0.783346,5799.5871,4186.5089,"[age, sex, bmi, children, smoker, region]"
1,K Nearest Neighbours,0.838228,5011.4695,3103.6022,"[age, sex, bmi, children, smoker, region]"
2,Decision Tree,0.853057,4776.2611,2865.6378,"[age, sex, bmi, children, smoker, region]"
3,Random Forest,0.866961,4544.6809,2736.7095,"[age, sex, bmi, children, smoker, region]"
4,AdaBoost,0.854823,4747.4846,3029.7623,"[age, sex, bmi, children, smoker, region]"
5,Gradient Boosting Regressor,0.875355,4398.9693,2388.3039,"[age, sex, bmi, children, smoker, region]"
6,XGBoost,0.878295,4346.7924,2443.7868,"[age, sex, bmi, children, smoker, region]"
7,catboost,0.881499,4289.1929,2398.818,"[age, sex, bmi, children, smoker, region]"
8,SVR,0.804005,5516.1601,2434.7733,"[age, sex, bmi, children, smoker, region]"
9,Ridge,0.783325,5799.8698,4187.2407,"[age, sex, bmi, children, smoker, region]"


In [34]:
# Five models with the highest R² (higher is better).
dataframe.sort_values('r2', ascending=False).head(5)

Unnamed: 0,model,r2,mse,mae,features
7,catboost,0.881499,4289.1929,2398.818,"[age, sex, bmi, children, smoker, region]"
6,XGBoost,0.878295,4346.7924,2443.7868,"[age, sex, bmi, children, smoker, region]"
5,Gradient Boosting Regressor,0.875355,4398.9693,2388.3039,"[age, sex, bmi, children, smoker, region]"
3,Random Forest,0.866961,4544.6809,2736.7095,"[age, sex, bmi, children, smoker, region]"
4,AdaBoost,0.854823,4747.4846,3029.7623,"[age, sex, bmi, children, smoker, region]"


In [35]:
# Five models with the lowest mean absolute error (lower is better).
dataframe.sort_values('mae').head(5)

Unnamed: 0,model,r2,mse,mae,features
5,Gradient Boosting Regressor,0.875355,4398.9693,2388.3039,"[age, sex, bmi, children, smoker, region]"
7,catboost,0.881499,4289.1929,2398.818,"[age, sex, bmi, children, smoker, region]"
8,SVR,0.804005,5516.1601,2434.7733,"[age, sex, bmi, children, smoker, region]"
6,XGBoost,0.878295,4346.7924,2443.7868,"[age, sex, bmi, children, smoker, region]"
3,Random Forest,0.866961,4544.6809,2736.7095,"[age, sex, bmi, children, smoker, region]"


In [36]:
# Five models with the lowest value in the 'mse' column (lower is better).
dataframe.sort_values('mse').head(5)

Unnamed: 0,model,r2,mse,mae,features
7,catboost,0.881499,4289.1929,2398.818,"[age, sex, bmi, children, smoker, region]"
6,XGBoost,0.878295,4346.7924,2443.7868,"[age, sex, bmi, children, smoker, region]"
5,Gradient Boosting Regressor,0.875355,4398.9693,2388.3039,"[age, sex, bmi, children, smoker, region]"
3,Random Forest,0.866961,4544.6809,2736.7095,"[age, sex, bmi, children, smoker, region]"
4,AdaBoost,0.854823,4747.4846,3029.7623,"[age, sex, bmi, children, smoker, region]"


In [37]:
# Five models with the lowest value in the 'mse' column (lower is better).
# NOTE(review): this repeats the ranking of the previous cell; possibly a
# different metric was intended here — confirm.
dataframe.sort_values('mse').head(5)

Unnamed: 0,model,r2,mse,mae,features
7,catboost,0.881499,4289.1929,2398.818,"[age, sex, bmi, children, smoker, region]"
6,XGBoost,0.878295,4346.7924,2443.7868,"[age, sex, bmi, children, smoker, region]"
5,Gradient Boosting Regressor,0.875355,4398.9693,2388.3039,"[age, sex, bmi, children, smoker, region]"
3,Random Forest,0.866961,4544.6809,2736.7095,"[age, sex, bmi, children, smoker, region]"
4,AdaBoost,0.854823,4747.4846,3029.7623,"[age, sex, bmi, children, smoker, region]"
