In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
DATA_PATH = Path(r"C:\Users\tariq\Desktop\Data_EDA_project\data\EDA_files_CSVs\insurance.csv")
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

X = df.drop(columns='charges')
y = df['charges']

numeric = ['age','bmi','children']
cat = ['sex','smoker','region']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric),
    ('cat', OneHotEncoder(drop='first', sparse_output=False)
, cat)
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score

pipe_lr = Pipeline([('pre', preprocessor), ('lr', LinearRegression())])
pipe_lr.fit(X_train, y_train)
pred = pipe_lr.predict(X_test)

print("MAE:", mean_absolute_error(y_test, pred))
print("RMSE:", root_mean_squared_error(y_test, pred))
print("R2:", r2_score(y_test, pred))




MAE: 4181.194473753652
RMSE: 5796.284659276275
R2: 0.7835929767120722


In [25]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# XGBoost
pipe_xgb = Pipeline([('pre', preprocessor), ('xgb', XGBRegressor(random_state=42, eval_metric='rmse'))])
g_xgb = GridSearchCV(pipe_xgb, {
    'xgb__n_estimators':[100,200],
    'xgb__learning_rate':[0.05,0.1,0.2],
    'xgb__max_depth':[3,5,7]
}, cv=4, scoring='neg_root_mean_squared_error', n_jobs=-1)
g_xgb.fit(X_train,y_train)
print("XGBoost best:", g_xgb.best_params_)

# LightGBM
pipe_lgb = Pipeline([('pre', preprocessor), ('lgb', LGBMRegressor(random_state=42))])
g_lgb = GridSearchCV(pipe_lgb, {
    'lgb__n_estimators':[100,200],
    'lgb__learning_rate':[0.05,0.1,0.2],
    'lgb__max_depth':[3,5,7]
}, cv=4, scoring='neg_root_mean_squared_error', n_jobs=-1)
g_lgb.fit(X_train,y_train)
print("LightGBM best:", g_lgb.best_params_)

XGBoost best: {'xgb__learning_rate': 0.05, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}


KeyboardInterrupt: 

In [26]:
# Evaluate all models
models.update({'XGBoost': g_xgb, 'LightGBM': g_lgb})
for name, model in models.items():
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{name} RMSE: {rmse:.2f}")

NameError: name 'models' is not defined

In [22]:
import joblib

# Load the model back
loaded_model = joblib.load("best_insurance_model.joblib")

# Test prediction on one sample (from test data)
sample = X_test.iloc[[0]]  # pick one row
predicted_charge = loaded_model.predict(sample)
print("Predicted insurance charge:", predicted_charge[0])
print("Actual insurance charge:", y_test.iloc[0])


Predicted insurance charge: 9885.67902400627
Actual insurance charge: 9095.06825
