In [1]:
import os
os.chdir("../")

In [2]:
from dataclasses import dataclass
import pandas as pd
import numpy as np
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, median_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from xgboost import XGBRegressor
import joblib as jl
from src.logger import logger
from src.utils import evaluate_models
from sklearn.pipeline import make_pipeline

In [3]:
train_df = pd.read_csv("artifacts/train_data.csv")

In [4]:
X_train, X_test, y_train, y_test = train_test_split(train_df.iloc[:, :-1], train_df.iloc[:, -1], test_size=0.3, shuffle=True, random_state=42)
pre_processor = jl.load("artifacts/preprocessor.joblib")

In [6]:
LR = LinearRegression()
LR_pipe = make_pipeline(pre_processor, LR)
LR_cv = cross_val_score(LR_pipe,X_train,y_train,cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
np.mean(LR_cv)

np.float64(-1357.828667291603)

In [36]:
RFR = RandomForestRegressor()
RF_pipe = make_pipeline(pre_processor, RFR)
RFR_CV = cross_val_score(RF_pipe, X_train, y_train, scoring='neg_root_mean_squared_error', n_jobs=-1)
np.mean(RFR_CV)

np.float64(-1451.1467257560416)

In [38]:
RF_pipe.get_params(deep=True)

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('Numerical_Pipeline',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('scaler', StandardScaler())]),
                                    ['age', 'bmi', 'children']),
                                   ('Categorical_Pipeline',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('OneHotEncoder',
                                                     OneHotEncoder()),
                                                    ('scaler',
                                                     StandardScaler(with_mean=False))]),
                                    ['gender', 'smoker', 'region',
            

In [15]:
XGBR = XGBRegressor()
XG_pipe = make_pipeline(pre_processor, XGBR)
XGBR_CV = cross_val_score(XG_pipe, X_train, y_train, scoring='neg_root_mean_squared_error', n_jobs=-1)
np.mean(XGBR_CV)

np.float64(-1372.5402395076815)

In [22]:
# Fit the pipeline on the training data
XG_pipe.fit(X_train, y_train)

# Access the trained XGBRegressor inside the pipeline
xgb_model = XG_pipe.named_steps['xgbregressor']

# Extract feature importances
feature_importances = xgb_model.feature_importances_

# Display feature importances
for feature, importance in zip(X_train.columns, feature_importances):
    print(f"{feature}: {importance}")


age: 0.0009113559499382973
gender: 0.0017294995486736298
bmi: 0.0023405873216688633
children: 0.00984949991106987
smoker: 0.0
region: 0.34131765365600586
medical_history: 0.0
family_medical_history: 0.0058792270720005035
exercise_frequency: 0.0006577105959877372
occupation: 0.0006766581209376454
coverage_level: 0.001049300655722618


In [39]:
# Fit the pipeline on the training data
RF_pipe.fit(X_train, y_train)

# Access the trained XGBRegressor inside the pipeline
rfr_model = RF_pipe.named_steps['randomforestregressor']

# Extract feature importances
feature_importances = rfr_model.feature_importances_

# Display feature importances
for feature, importance in zip(X_train.columns, feature_importances):
    print(f"{feature}: {importance}")


age: 0.05071217597394721
gender: 0.08125266449371514
bmi: 0.02327363697741817
children: 0.00628069996200737
smoker: 0.006316322202888095
region: 0.1574060349612039
medical_history: 0.163700788115566
family_medical_history: 0.007061480603053384
exercise_frequency: 0.0045846516846908296
occupation: 0.0045040722888541425
coverage_level: 0.0049074548910508385


In [13]:
LR_cv

array([-1848.55050968,  -637.95423392, -1846.37850886, -1838.21929538,
        -618.04078862])

In [8]:
np.mean(LR_cv)

np.float64(0.8873002074478338)

In [14]:
GBR = GradientBoostingRegressor()
GBR_pipe = make_pipeline(pre_processor, GBR)
GBR_CV = cross_val_score(GBR_pipe, X_train, y_train,n_jobs=-1, scoring='neg_root_mean_squared_error')
np.mean(GBR_CV)

np.float64(-1405.5360377863922)

In [None]:
XG_pipe.get_params(deep=True)

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('Numerical_Pipeline',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('scaler', StandardScaler())]),
                                    ['age', 'bmi', 'children']),
                                   ('Categorical_Pipeline',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('OneHotEncoder',
                                                     OneHotEncoder()),
                                                    ('scaler',
                                                     StandardScaler(with_mean=False))]),
                                    ['gender', 'smoker', 'region',
            

In [None]:

def Model_Tuning(X_train, y_train, hyperparameters, pre_processor):
    # Instantiate the model
    model = XGBRegressor()
    
    # Create a pipeline with the preprocessor and model
    pipe = make_pipeline(pre_processor, model)
    
    # Set up GridSearchCV
    grid = GridSearchCV(pipe, param_grid=hyperparameters, n_jobs=-1, cv=5, return_train_score=True, scoring="neg_root_mean_squared_error")
    
    # Fit the model
    grid.fit(X_train, y_train)
    
    # Extract the mean test and train scores
    mean_test_score = grid.cv_results_['mean_test_score']
    mean_train_score = grid.cv_results_['mean_train_score']
    
    return mean_test_score, mean_train_score


In [18]:
params = {
    "xgbregressor__tree_method":["approx"],
    "xgbregressor__objective" : ["reg:absoluteerror"],
    "xgbregressor__learning_rate" : [0.3],
    "xgbregressor__n_estimators" : [500],
    "xgbregressor__min_child_weight": np.arange(1,26, 2),
    "xgbregressor__max_depth" : np.arange(3,13, 2),
    "xgbregressor__colsample_bynode" : np.arange(0.5, 1.0, 0.1),
    "xgbregressor__eval_metric": ["mae"],
}

Model_Tuning(X_train,y_train,params,pre_processor)

KeyboardInterrupt: 

In [None]:
y_train.median()

In [8]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [11]:

def Model_Tuning(X_train, y_train, hyperparameters, pre_processor):
    # Instantiate the model
    model = RandomForestRegressor()
    
    # Create a pipeline with the preprocessor and model
    pipe = make_pipeline(pre_processor, model)
    
    # Set up GridSearchCV
    grid = BayesSearchCV(pipe, search_spaces=hyperparameters, n_jobs=-1, cv=5, return_train_score=True, scoring="neg_root_mean_squared_error")
    
    # Fit the model
    grid.fit(X_train, y_train)
    
    # Extract the mean test and train scores
    
    return grid


In [12]:
params = {
    "randomforestregressor__n_estimators" : Integer(200, 501, 'uniform'),
    "randomforestregressor__max_depth" : Integer(2,6,"uniform"),
    "randomforestregressor__min_samples_split" : Integer(10, 50, 'uniform')
}

RF_grid = Model_Tuning(X_train,y_train,params,pre_processor)



In [16]:
RF_best = RF_grid.best_estimator_

In [17]:
RF_best.fit(X_train,y_train)

In [25]:
# Fit the pipeline on the training data

# Access the trained XGBRegressor inside the pipeline
rfr_model = RF_best.named_steps['randomforestregressor']

# Extract feature importances
feature_importances = rfr_model.feature_importances_

# Display feature importances
for feature, importance in zip(X_train.columns, feature_importances):
    print(f"{feature}: {importance}")


age: 0.0
gender: 7.608384047659358e-06
bmi: 4.0079835430535855e-07
children: 3.926360718658506e-05
smoker: 3.64866334164031e-05
region: 0.18660210621578532
medical_history: 0.24584356622722206
family_medical_history: 0.0
exercise_frequency: 0.0
occupation: 0.0
coverage_level: 0.0


In [31]:
RF_grid.cv_results_['mean_train_score']

array([-2520.20779445, -2522.65095408, -1940.28989406, -2064.55011797,
       -2519.63288743, -3112.38160102, -3112.38153076, -2064.46250133,
       -2520.52045577, -3112.38155816, -2217.66561331, -1940.78848092,
       -1940.69976451, -1940.94151479, -1940.6356434 , -1940.66786978,
       -1940.77221472, -1940.7108214 , -2522.95636617, -2064.63729549,
       -2064.65511903, -2064.82955025, -2217.59059441, -2217.63733797,
       -3112.38155835, -1940.45896339, -2217.62982914, -2217.63599994,
       -3112.38156479, -3112.38156995, -1940.49306247, -2064.67219599,
       -2515.98165845, -2217.73502292, -2064.58227973, -3112.38155231,
       -1940.85595146, -2064.41870931, -2217.70295007, -2217.72432254,
       -2514.32331169, -2217.69993402, -2217.68338684, -2217.60687275,
       -2217.65893583, -2064.42419036, -2064.51612519, -2064.68575807,
       -2516.21332655, -2217.61831553])