In [None]:
import pandas as pd
import numpy as np

# Read the insurance dataset, then print a five-row preview followed by its shape.
df = pd.read_csv("insurance.csv")
print(df.head(), f"Shape of data: {df.shape}", sep="\n")

# Label Encoding

In [None]:
# Replace every object-dtype (text) column of `df` with integer category codes.
from sklearn.preprocessing import LabelEncoder

text_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in text_columns:
    # A fresh encoder per column: learn its categories and map them in one step.
    df[col] = LabelEncoder().fit_transform(df[col].values)

df.head(10)
# Resulting codes as noted by the original author:
#   female=0 male=1
#   yes=1 no=0
#   northeast=0 northwest=1 southeast=2 southwest=3


# Splitting and scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the 'charges' column; every remaining column is a predictor.
y = df['charges']
X = df.drop(columns=['charges'])

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics on the training rows only, then apply the same
# transformation to both partitions (both become NumPy arrays from here on).
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Model training & Testing

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMRegressor

In [None]:
# Seed for the stochastic scikit-learn estimators below, so that a fresh
# Restart & Run All yields the same scores (same value as the train/test split).
RANDOM_STATE = 42

lr = LinearRegression()

knn = KNeighborsRegressor(n_neighbors=10)

dt = DecisionTreeRegressor(max_depth=3, random_state=RANDOM_STATE)

rf = RandomForestRegressor(max_depth=3, n_estimators=500, random_state=RANDOM_STATE)

ada = AdaBoostRegressor(n_estimators=50, learning_rate=.01, random_state=RANDOM_STATE)

gbr = GradientBoostingRegressor(max_depth=2, n_estimators=100, learning_rate=.2,
                                random_state=RANDOM_STATE)

# NOTE(review): the xgboost / catboost / lightgbm estimators keep their own
# default seeding; pass an explicit seed there too if run-to-run drift shows up.
xgb = XGBRegressor(max_depth=3, n_estimators=50, learning_rate=.2)

cb = CatBoostRegressor(learning_rate=.01, max_depth=5, verbose=0)

# Instances use lowercase names so they do not rebind the imported classes
# (SVR, Ridge, Lasso). Rebinding made this cell fail with
# "TypeError: ... object is not callable" when it was executed a second time.
svr = SVR(kernel='rbf', C=1e3, gamma=0.1)

ridge = Ridge(alpha=.5)

lasso = Lasso(alpha=.5)

mlp = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=RANDOM_STATE)

En = ElasticNet(alpha=.5, l1_ratio=.5)

Kr = KernelRidge(alpha=.5, kernel='rbf')

# LogisticRegression is a classifier, so it is deliberately not part of this
# regression comparison.

lgbmr = LGBMRegressor(n_estimators=100, learning_rate=.2)

# Small subset of (display name, estimator) pairs; not used by the cells shown here.
regressor3 = [
    ('K Nearest Neighbours', knn),
    ('Decision Tree', dt),
    ('Random Forest', rf),
]

# Every (display name, estimator) pair that the evaluation loops iterate over.
regressors = [
    ('Linear Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Decision Tree', dt),
    ('Random Forest', rf),
    ('AdaBoost', ada),
    ('Gradient Boosting Regressor', gbr),
    ('XGBoost', xgb),
    ('catboost', cb),
    ('SVR', svr),
    ('Ridge', ridge),
    ('Lasso', lasso),
    ('MLP', mlp),
    ('ElasticNet', En),
    ('Kernel Ridge', Kr),
    ('LightGBM', lgbmr),
]

# Feature names used to label results and to index columns of the scaled arrays.
# NOTE(review): assumed to match the column order of X — confirm against the CSV.
features = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']


In [None]:
from sklearn.metrics import r2_score , mean_squared_error , mean_absolute_error
   
# Accumulates one result record per (model, feature set) across the notebook.
scores = []


def regression():
    """Fit and score every model in `regressors` on the full feature set.

    Each estimator is fitted on the module-level `X_train` / `y_train` and
    evaluated on `X_test` / `y_test`. One record per model is appended to the
    module-level `scores` list, so calling this function again appends a
    second set of records rather than replacing the first.

    Returns:
        list[dict]: the same module-level `scores` list, whose records have
        the keys 'model', 'r2', 'mse', 'mae' and 'features'. The 'mse' entry
        holds the root-mean-squared error, rounded to 4 decimals; the key name
        is kept because later cells sort on it.
    """
    for regressor_name, regressor in regressors:
        # Fit on the training partition, predict the held-out partition.
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)

        # Take the square root explicitly instead of passing `squared=False`:
        # that argument was deprecated in scikit-learn 1.4 and removed in 1.6,
        # where it raises TypeError.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        scores.append({
            'model': regressor_name,
            'r2': r2_score(y_test, y_pred),
            'mse': round(rmse, 4),
            'mae': round(mean_absolute_error(y_test, y_pred), 4),
            'features': features,
        })

    return scores

In [None]:

# Score all models on the complete feature set and tabulate the returned records.
dataframe = pd.DataFrame(regression(), columns=["model", "r2", "mse", "mae", "features"])
dataframe



In [None]:
# Single best row by r2 (largest first).
dataframe.sort_values('r2', ascending=False).iloc[:1]

In [None]:
# Single best row by mean absolute error (smallest first).
dataframe.sort_values('mae').iloc[:1]

In [None]:
# Single best row by the 'mse' column (smallest first); that column stores
# the root-mean-squared error.
dataframe.sort_values('mse').iloc[:1]

In [None]:
# Same query as the preceding cell: smallest value in the 'mse' column.
dataframe.sort_values('mse').iloc[:1]

In [None]:
# Feature selection, 1 column: refit every model on each single feature and
# append the scores to the shared `scores` list.
for i in range(len(features)):
    # Indexing with a list keeps the slice two-dimensional: (n_samples, 1).
    X_train_fs = X_train[:, [i]]
    X_test_fs = X_test[:, [i]]

    for regressor_name, regressor in regressors:
        # Fit on the training partition, predict the held-out partition.
        regressor.fit(X_train_fs, y_train)
        y_pred = regressor.predict(X_test_fs)

        # Explicit square root instead of `squared=False`, which was deprecated
        # in scikit-learn 1.4 and removed in 1.6 (raises TypeError there).
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        scores.append({
            'model': regressor_name,
            'r2': r2_score(y_test, y_pred),
            'mse': round(rmse, 4),  # root-mean-squared error; key kept for the sort cells
            'mae': round(mean_absolute_error(y_test, y_pred), 4),
            'features': features[i],
        })

# Rebuild the results table from everything accumulated so far.
dataframe = pd.DataFrame(scores, columns=["model", "r2", "mse", "mae", "features"])
dataframe






In [None]:
# Single best row by r2 (largest first).
dataframe.sort_values('r2', ascending=False).iloc[:1]

In [None]:
# Single best row by mean absolute error (smallest first).
dataframe.sort_values('mae').iloc[:1]

In [None]:
# Single best row by the 'mse' column (smallest first); that column stores
# the root-mean-squared error.
dataframe.sort_values('mse').iloc[:1]

In [None]:
# Feature selection, 2 columns: refit every model on each distinct pair of
# features and append the scores to the shared `scores` list.
from itertools import combinations

# combinations() yields each unordered pair once (i < j). The previous nested
# loops visited both (i, j) and (j, i), fitting every model twice on the same
# pair of columns.
for i, j in combinations(range(len(features)), 2):
    X_train_fs = X_train[:, [i, j]]
    X_test_fs = X_test[:, [i, j]]

    for regressor_name, regressor in regressors:
        # Fit on the training partition, predict the held-out partition.
        regressor.fit(X_train_fs, y_train)
        y_pred = regressor.predict(X_test_fs)

        # Explicit square root instead of `squared=False`, which was deprecated
        # in scikit-learn 1.4 and removed in 1.6 (raises TypeError there).
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        scores.append({
            'model': regressor_name,
            'r2': r2_score(y_test, y_pred),
            'mse': round(rmse, 4),  # root-mean-squared error; key kept for the sort cells
            'mae': round(mean_absolute_error(y_test, y_pred), 4),
            'features': features[i] + " and " + features[j],
        })

# Rebuild the results table from everything accumulated so far.
dataframe = pd.DataFrame(scores, columns=["model", "r2", "mse", "mae", "features"])
dataframe
                   

In [None]:
# Single best row by r2 (largest first).
dataframe.sort_values('r2', ascending=False).iloc[:1]

In [None]:
# Single best row by mean absolute error (smallest first).
dataframe.sort_values('mae').iloc[:1]

In [None]:
# Single best row by the 'mse' column (smallest first); that column stores
# the root-mean-squared error.
dataframe.sort_values('mse').iloc[:1]

In [None]:
# Feature selection, 4 columns: refit every model on each distinct set of four
# features and append the scores to the shared `scores` list.
from itertools import combinations

# combinations() yields each 4-feature subset once, in ascending index order.
# The previous four nested loops visited every ordering of every subset (24
# orderings each), refitting all models on the same columns each time.
for column_idx in combinations(range(len(features)), 4):
    column_idx = list(column_idx)  # a list triggers NumPy column selection
    X_train_fs = X_train[:, column_idx]
    X_test_fs = X_test[:, column_idx]
    subset_label = " , ".join(features[c] for c in column_idx)

    for regressor_name, regressor in regressors:
        # Fit on the training partition, predict the held-out partition.
        regressor.fit(X_train_fs, y_train)
        y_pred = regressor.predict(X_test_fs)

        # Explicit square root instead of `squared=False`, which was deprecated
        # in scikit-learn 1.4 and removed in 1.6 (raises TypeError there).
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        scores.append({
            'model': regressor_name,
            'r2': r2_score(y_test, y_pred),
            'mse': round(rmse, 4),  # root-mean-squared error; key kept for the sort cells
            'mae': round(mean_absolute_error(y_test, y_pred), 4),
            'features': subset_label,
        })

# Rebuild the results table from everything accumulated so far.
dataframe = pd.DataFrame(scores, columns=["model", "r2", "mse", "mae", "features"])
dataframe

In [None]:
# Single best row by r2 (largest first).
dataframe.sort_values('r2', ascending=False).iloc[:1]

In [None]:
# Single best row by mean absolute error (smallest first).
dataframe.sort_values('mae').iloc[:1]

In [None]:
# Single best row by the 'mse' column (smallest first); that column stores
# the root-mean-squared error.
dataframe.sort_values('mse').iloc[:1]