In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('data/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [4]:
## Independent and dependent features
X = df.drop(labels=['expenses'],axis=1)
Y = df[['expenses']]

In [5]:
Y

Unnamed: 0,expenses
0,16884.92
1,1725.55
2,4449.46
3,21984.47
4,3866.86
...,...
1333,10600.55
1334,2205.98
1335,1629.83
1336,2007.95


In [6]:
# Segregating numerical and categorical variables
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [7]:
categorical_cols

Index(['sex', 'smoker', 'region'], dtype='object')

In [9]:
sex_categories = ['male', 'female']
smoker_categories = ['no', 'yes']
region_categories = ['southeast', 'southwest', 'northwest', 'northeast']

In [11]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OneHotEncoder # onehot Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [17]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)
##categorical pipeline
categorical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehotencoder', OneHotEncoder(categories=[sex_categories, smoker_categories, region_categories])),
        ('scaler', StandardScaler(with_mean=False))
    ]
)


preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',categorical_pipeline,categorical_cols)
])

In [18]:
## Train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [19]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [20]:
X_train

Unnamed: 0,num_pipeline__age,num_pipeline__bmi,num_pipeline__children,cat_pipeline__sex_male,cat_pipeline__sex_female,cat_pipeline__smoker_no,cat_pipeline__smoker_yes,cat_pipeline__region_southeast,cat_pipeline__region_southwest,cat_pipeline__region_northwest,cat_pipeline__region_northeast
0,-1.261206,-0.760093,-0.925046,2.000041,0.000000,2.467022,0.000000,0.000000,0.000000,0.000000,2.381225
1,-0.978336,0.495641,2.437452,2.000041,0.000000,2.467022,0.000000,2.224797,0.000000,0.000000,0.000000
2,-0.624748,2.208006,0.756203,2.000041,0.000000,2.467022,0.000000,2.224797,0.000000,0.000000,0.000000
3,1.284628,-0.450236,-0.084422,2.000041,0.000000,2.467022,0.000000,2.224797,0.000000,0.000000,0.000000
4,-0.766183,-1.086257,1.596828,2.000041,0.000000,0.000000,2.467022,2.224797,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
931,-0.271160,0.577183,-0.084422,0.000000,2.000041,2.467022,0.000000,2.224797,0.000000,0.000000,0.000000
932,0.153146,-0.385003,-0.084422,0.000000,2.000041,2.467022,0.000000,0.000000,0.000000,2.326207,0.000000
933,-0.695465,0.609799,-0.925046,2.000041,0.000000,0.000000,2.467022,0.000000,2.322788,0.000000,0.000000
934,0.789604,2.517863,-0.084422,0.000000,2.000041,2.467022,0.000000,2.224797,0.000000,0.000000,0.000000


In [21]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [22]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [23]:
regression.coef_

array([[3.76965520e+03, 1.93070637e+03, 4.79824260e+02, 5.29520353e+16,
        5.29520353e+16, 3.81688278e+16, 3.81688278e+16, 2.54766673e+17,
        2.44018908e+17, 2.43660235e+17, 2.38030547e+17]])

In [24]:
regression.intercept_

array([-7.66873785e+17])

In [25]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [26]:
## Train multiple models
## Model Ecaluation
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 5860.673339842195
MAE: 4020.3712686567164
R2 score 76.30036668657321


Lasso
Model Training Performance
RMSE: 5855.079466251751
MAE: 4028.6166995685417
R2 score 76.34558657051434


Ridge
Model Training Performance
RMSE: 5854.951527799284
MAE: 4028.7131234209096
R2 score 76.34662029721333


Elasticnet
Model Training Performance
RMSE: 6337.585698814458
MAE: 4523.894764595956
R2 score 72.28631440843687




In [27]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']

In [28]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# ... (Previous code)

models = {
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'SVR': SVR()
}

trained_model_list = []
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square * 100)

    r2_list.append(r2_square)

    print('=' * 35)
    print('\n')


DecisionTreeRegressor
Model Training Performance
RMSE: 6731.631475995211
MAE: 3144.085621890548
R2 score 68.73292463534192




  return fit_method(estimator, *args, **kwargs)


RandomForestRegressor
Model Training Performance
RMSE: 5092.904056401521
MAE: 2831.2559679021556
R2 score 82.1031093328053




  y = column_or_1d(y, warn=True)


GradientBoostingRegressor
Model Training Performance
RMSE: 4836.570370120989
MAE: 2668.147165367973
R2 score 83.85932780259428


SVR
Model Training Performance
RMSE: 12718.475506518787
MAE: 8207.39560537504
R2 score -11.613495989851174




  y = column_or_1d(y, warn=True)


In [29]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their potential values
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create the RandomForestRegressor model
rf_model = RandomForestRegressor()

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           scoring='r2', cv=3, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_

# Print the best parameters
print("Best Hyperparameters:", best_params)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions using the best model
y_pred_best = best_model.predict(X_test)

# Evaluate the performance of the best model
mae_best, rmse_best, r2_square_best = evaluate_model(y_test, y_pred_best)

print("Best Model Performance:")
print("RMSE:", rmse_best)
print("MAE:", mae_best)
print("R2 score:", r2_square_best * 100)

  return fit_method(estimator, *args, **kwargs)


Best Hyperparameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Best Model Performance:
RMSE: 4787.068003295957
MAE: 2662.1373492630523
R2 score: 84.18803699912318


In [30]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their potential values
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [10,15,20],
    'min_samples_leaf': [4,6,8],
    'bootstrap': [True, False]
}

# Create the RandomForestRegressor model
rf_model = RandomForestRegressor()

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           scoring='r2', cv=3, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_

# Print the best parameters
print("Best Hyperparameters:", best_params)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions using the best model
y_pred_best = best_model.predict(X_test)

# Evaluate the performance of the best model
mae_best, rmse_best, r2_square_best = evaluate_model(y_test, y_pred_best)

print("Best Model Performance:")
print("RMSE:", rmse_best)
print("MAE:", mae_best)
print("R2 score:", r2_square_best * 100)


  return fit_method(estimator, *args, **kwargs)


Best Hyperparameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 6, 'min_samples_split': 10, 'n_estimators': 50}
Best Model Performance:
RMSE: 4794.452991851897
MAE: 2640.4400147676806
R2 score: 84.13921327783595


In [None]:
pip install dataclasses

: 