In [1]:
import pandas as pd 
import numpy as np 

In [6]:
df = pd.read_csv(r'D:\Data Science\iNeuron\Internship\Insurance Premium Prediction\data\insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [7]:
## Independent and dependent features
X = df.drop(labels=['expenses'],axis=1)
y = df[['expenses']]

In [8]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns
numerical_cols

Index(['age', 'bmi', 'children'], dtype='object')

In [9]:
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder # One Hot Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
sex_cat = ['male', 'female']
smoker_cat = ['no', 'yes']
region_cat = ['southeast', 'southwest', 'northwest', 'northeast']

In [11]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    # ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]
)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    # ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[sex_cat, smoker_cat, region_cat])),
    ('scaler',StandardScaler())
    ]
)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [17]:
preprocessor

In [9]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [10]:
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
1077,21,male,26.0,0,no,northeast
61,25,male,33.7,4,no,southeast
796,30,male,44.2,2,no,southeast
1061,57,male,27.9,1,no,southeast
69,28,male,24.0,3,yes,southeast


In [11]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [12]:
X_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__bmi,num_pipeline__children,cat_pipeline__sex,cat_pipeline__smoker,cat_pipeline__region
0,-1.261206,-0.760093,-0.925046,-1.006431,-0.511327,1.405078
1,-0.978336,0.495641,2.437452,-1.006431,-0.511327,-1.26438
2,-0.624748,2.208006,0.756203,-1.006431,-0.511327,-1.26438
3,1.284628,-0.450236,-0.084422,-1.006431,-0.511327,-1.26438
4,-0.766183,-1.086257,1.596828,-1.006431,1.955695,-1.26438


In [13]:
X_train['cat_pipeline__smoker'].value_counts()

cat_pipeline__smoker
-0.511327    742
 1.955695    194
Name: count, dtype: int64

In [14]:
X_train.shape

(936, 6)

In [15]:
# from graphviz import Source

# # Create a dot file representing the pipeline structure
# dot_data = "digraph preprocessor_graph { rankdir=LR; " + preprocessor.named_steps['preprocessing'].get_feature_names_out().replace('\n', '; ') + "; }"

# # Visualize the dot data using graphviz
# src = Source(dot_data, format="png")
# src.render("preprocessor_structure")

In [16]:
# from graphviz import Digraph
# from sklearn import set_config

# # Set sklearn to use the 'diagram' display option
# set_config(display='diagram')

# # Create a Digraph (graph visualization) object
# dot = Digraph(comment='ColumnTransformer Structure')

# # Add a node for the ColumnTransformer
# dot.node('ct', 'ColumnTransformer', shape='box')

# # Loop through the transformers in the ColumnTransformer
# for name, transformer, columns in preprocessor.transformers_:
#     # Check if the transformer has 'named_steps' attribute (for Pipelines)
#     if hasattr(transformer, 'named_steps'):
#         # Add a node for the transformer
#         dot.node(name, name, shape='box')
#         # Add an edge from ColumnTransformer to the transformer
#         dot.edge('ct', name)
#         # Loop through the steps in the transformer (sub-pipeline)
#         for step_name, step in transformer.named_steps.items():
#             # Add a node for the step
#             dot.node(step_name, step_name)
#             # Add an edge from the transformer to the step
#             dot.edge(name, step_name)
#     else:
#         # Add a node for the transformer
#         dot.node(name, name)
#         # Add an edge from ColumnTransformer to the transformer
#         dot.edge('ct', name)

# # Render the visualization (output as PDF)
# dot.render('column_transformer_structure', format='pdf')


In [17]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [18]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [32]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    # 'NaiveBias':MultinomialNB(),
    'KNNR':KNeighborsRegressor(n_neighbors=5),
    'DecisionTree':DecisionTreeRegressor(random_state=42),
    'SVR linear':SVR(kernel='linear'),
    'SVR rbf':SVR(kernel='rbf'),
    'RandomForest':RandomForestRegressor(random_state=42),
    'AdaBoost':AdaBoostRegressor(),
    'Gradient Boosting':GradientBoostingRegressor(),
    'XGB':xgb.XGBRegressor(),
    'BaggingSVR':BaggingRegressor()
}

trained_model_list = {'Model_Name':[], 'Model': [],'Train_Acc': [], 'R2_Score': [], 'RMSE': [], 'MAE': []}

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    # Training Accuracy
    train_acc = model.score(X_train, y_train)

    #Make Predictions on test data
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])

    print('Model Training Performance')
    print('Train_Acc:', train_acc)
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    trained_model_list['Model_Name'].append(list(models.keys())[i])
    trained_model_list['Model'].append(model)
    trained_model_list['Train_Acc'].append(train_acc)
    trained_model_list['R2_Score'].append(r2_square*100)
    trained_model_list['RMSE'].append(rmse)
    trained_model_list['MAE'].append(mae)


    # trained_model_list.update({'Model_Name': list(models.keys())[i], 'Model': model, 'R2_Score': r2_square*100})

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
Train_Acc: 0.7448696022447795
RMSE: 5856.215912002978
MAE: 4032.775826140364
R2 score 76.33640323948327


Lasso
Model Training Performance
Train_Acc: 0.7448695576483975
RMSE: 5856.332118633082
MAE: 4032.7085325372304
R2 score 76.33546410262076


Ridge
Model Training Performance
Train_Acc: 0.7448687322838119
RMSE: 5856.468027738752
MAE: 4034.024213604533
R2 score 76.33436571435843


Elasticnet
Model Training Performance
Train_Acc: 0.6611489736567657
RMSE: 6873.853442883536
MAE: 4966.959591486746
R2 score 67.39778311658738


KNNR
Model Training Performance
Train_Acc: 0.8744835060905698
RMSE: 5432.785822046603
MAE: 3159.283850746269
R2 score 79.63465537332019


DecisionTree
Model Training Performance
Train_Acc: 0.9981033529989862
RMSE: 6860.293225969959
MAE: 3235.506542288557
R2 score 67.52628660376976


SVR linear
Model Training Performance
Train_Acc: -0.024905714543626223
RMSE: 12295.008356494869
MAE: 7755.428049500913
R2 score -4.304789762033

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


SVR rbf
Model Training Performance
Train_Acc: -0.0981509447866229
RMSE: 12719.642966311027
MAE: 8199.88553419539
R2 score -11.633987478669216




  return fit_method(estimator, *args, **kwargs)


RandomForest
Model Training Performance
Train_Acc: 0.9775109905245781
RMSE: 5110.42369844157
MAE: 2873.908251086235
R2 score 81.979766573041


AdaBoost
Model Training Performance
Train_Acc: 0.8391942031208657
RMSE: 5354.2828176371495
MAE: 4167.039788005943
R2 score 80.21895595119585


Gradient Boosting
Model Training Performance
Train_Acc: 0.9141380088896844
RMSE: 4827.922103005303
MAE: 2680.4304917064596
R2 score 83.91699843884012




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGB
Model Training Performance
Train_Acc: 0.9959133943834275
RMSE: 5818.572804920652
MAE: 3340.223923722452
R2 score 76.63963946933356


BaggingSVR
Model Training Performance
Train_Acc: 0.971258409140544
RMSE: 5263.11298444194
MAE: 2963.050099087894
R2 score 80.88686259472787




  return column_or_1d(y, warn=True)


In [33]:
pd.DataFrame(trained_model_list)

Unnamed: 0,Model_Name,Model,Train_Acc,R2_Score,RMSE,MAE
0,LinearRegression,LinearRegression(),0.74487,76.336403,5856.215912,4032.775826
1,Lasso,Lasso(),0.74487,76.335464,5856.332119,4032.708533
2,Ridge,Ridge(),0.744869,76.334366,5856.468028,4034.024214
3,Elasticnet,ElasticNet(),0.661149,67.397783,6873.853443,4966.959591
4,KNNR,KNeighborsRegressor(),0.874484,79.634655,5432.785822,3159.283851
5,DecisionTree,DecisionTreeRegressor(random_state=42),0.998103,67.526287,6860.293226,3235.506542
6,SVR linear,SVR(kernel='linear'),-0.024906,-4.30479,12295.008356,7755.42805
7,SVR rbf,SVR(),-0.098151,-11.633987,12719.642966,8199.885534
8,RandomForest,"(DecisionTreeRegressor(max_features=1.0, rando...",0.977511,81.979767,5110.423698,2873.908251
9,AdaBoost,"(DecisionTreeRegressor(max_depth=3, random_sta...",0.839194,80.218956,5354.282818,4167.039788


In [25]:
modelnow = trained_model_list['Model'][10]
modelnow

In [29]:
modelnow.score(X_train, y_train)

0.9141380088896844