In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder ,MinMaxScaler,StandardScaler,RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error,r2_score,accuracy_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
import seaborn as sns
import warnings
# Install a global "ignore" filter: no warning of any category is shown from here on.
# NOTE(review): this also hides library warnings (e.g. deprecation or solver
# convergence messages) that might be relevant to the models fitted below.
warnings.filterwarnings('ignore')

In [None]:
# Read the insurance dataset from its hosted CSV into a DataFrame.
INSURANCE_CSV_URL = "https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/insurance.csv"
df = pd.read_csv(INSURANCE_CSV_URL)

In [None]:
# Replace each categorical column of `df` with integer codes, in place.
# A single encoder object is re-fitted for every column, so once the loop ends
# `le` only holds the classes of 'region', the last column processed.
le = LabelEncoder()
for categorical_column in ('sex', 'smoker', 'region'):
    df[categorical_column] = le.fit_transform(df[categorical_column])

# Feature matrix (six predictors) and the regression target.
feature_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
X = df[feature_columns]
y = df['expenses']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# A seeded decision tree, fitted on the training rows only, supplies the
# importance scores used to rank the features.
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Pair every column with its score and order the table from most to least important.
feature_importance = regressor.feature_importances_
feature_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importance}
).sort_values(by="Importance", ascending=False)
print(feature_df)

# Keep the features whose importance exceeds the 0.02 cut-off and preview them.
# NOTE(review): X_selected is built from the full `df`, not from the training split.
selected_features = feature_df.loc[feature_df['Importance'] > 0.02, 'Feature'].tolist()
X_selected = df[selected_features]
X_selected.head()

    Feature  Importance
4    smoker    0.606824
2       bmi    0.216922
0       age    0.133178
3  children    0.023706
5    region    0.013198
1       sex    0.006171


Unnamed: 0,smoker,bmi,age,children
0,1,27.9,19,0
1,0,33.8,18,1
2,0,33.0,28,3
3,0,22.7,33,0
4,0,28.9,32,0


In [None]:
# Benchmark grid: every scaler below is paired with every regressor below; each
# pair is fitted on the training split and scored on the held-out test split.
scalers = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler()
}

# Define regression models
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(alpha=0.1),
    "Ridge": Ridge(alpha=1.0),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "KNNRegressor": KNeighborsRegressor(n_neighbors=5),
    # Fixed seed: without it the tree may break ties between equally good
    # splits differently from run to run, making the reported scores unstable.
    "DecisionTreeRegressor": DecisionTreeRegressor(max_depth=5, random_state=42),
}

# Store results, keyed "<model> with <scaler>" -> {"MSE": ..., "R2 Score": ...}
results = {}

for scaler_name, scaler in scalers.items():
    for model_name, model in models.items():
        # The scaler is a pipeline step, so it is fitted on the training rows
        # only and merely applied to the test rows at prediction time.
        # The scaler/model objects from the dicts are reused on every pass and
        # re-fitted here, which is why the metrics are computed immediately.
        pipeline = Pipeline([
            ("scaler", scaler),
            ("regressor", model)
        ])

        # Train model
        pipeline.fit(X_train, y_train)

        # Predict
        y_pred = pipeline.predict(X_test)

        # Evaluate performance on the held-out split
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store results
        results[f"{model_name} with {scaler_name}"] = {"MSE": mse, "R2 Score": r2}


In [None]:
# Report each model/scaler combination followed by its test-set MSE and R2.
for model, metrics in results.items():
    mse_value = metrics['MSE']
    r2_value = metrics['R2 Score']
    print(f"{model}: ")
    print(f"MSE = {mse_value:.4f}, R2 Score = {r2_value:.4f}")

LinearRegression with StandardScaler: 
MSE = 33802383.5549, R2 Score = 0.7695
Lasso with StandardScaler: 
MSE = 33802324.1918, R2 Score = 0.7695
Ridge with StandardScaler: 
MSE = 33805133.8765, R2 Score = 0.7694
ElasticNet with StandardScaler: 
MSE = 34178067.0258, R2 Score = 0.7669
KNNRegressor with StandardScaler: 
MSE = 25275990.0444, R2 Score = 0.8276
DecisionTreeRegressor with StandardScaler: 
MSE = 20869582.3142, R2 Score = 0.8577
LinearRegression with MinMaxScaler: 
MSE = 33802383.5549, R2 Score = 0.7695
Lasso with MinMaxScaler: 
MSE = 33802131.5286, R2 Score = 0.7695
Ridge with MinMaxScaler: 
MSE = 33791848.7582, R2 Score = 0.7695
ElasticNet with MinMaxScaler: 
MSE = 42886139.3394, R2 Score = 0.7075
KNNRegressor with MinMaxScaler: 
MSE = 29563314.4081, R2 Score = 0.7984
DecisionTreeRegressor with MinMaxScaler: 
MSE = 20869582.3142, R2 Score = 0.8577
LinearRegression with RobustScaler: 
MSE = 33802383.5549, R2 Score = 0.7695
Lasso with RobustScaler: 
MSE = 33802310.0332, R2 Scor

In [None]:
# One-hot encode the three categorical columns: each distinct value becomes its
# own indicator column. Then rebuild the feature matrix and target from the
# expanded frame (this rebinds the X and y used by the cells that follow).
categorical_columns = ['sex', 'smoker', 'region']
df_encoded = pd.get_dummies(df, columns=categorical_columns)
y = df_encoded['expenses']
X = df_encoded.drop(columns=['expenses'])

In [None]:
# Split the current X / y 70/30 with the same seed, then fit a seeded decision
# tree on the training part to obtain per-feature importance scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Tabulate the scores next to the column names, highest score first.
feature_importance = regressor.feature_importances_
feature_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})
feature_df = feature_df.sort_values(by="Importance", ascending=False)
print(feature_df)

# Features scoring above the 0.01 cut-off, pulled from the encoded frame for a preview.
is_important = feature_df['Importance'] > 0.01
selected_features = feature_df[is_important]['Feature'].tolist()
X_selected = df_encoded[selected_features]
X_selected.head()

     Feature  Importance
6   smoker_1    0.606824
1        bmi    0.207776
0        age    0.132428
2   children    0.019703
9   region_2    0.007402
4      sex_1    0.006111
7   region_0    0.005921
8   region_1    0.005468
10  region_3    0.004467
3      sex_0    0.003898
5   smoker_0    0.000000


Unnamed: 0,smoker_1,bmi,age,children
0,True,27.9,19,0
1,False,33.8,18,1
2,False,33.0,28,3
3,False,22.7,33,0
4,False,28.9,32,0


In [None]:
# Second benchmark grid: the same scalers and regressors as before, evaluated on
# whatever X_train / X_test the most recent split produced, with the scores
# collected separately in `results2`.
scalers = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler()
}

# Define regression models
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(alpha=0.1),
    "Ridge": Ridge(alpha=1.0),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "KNNRegressor": KNeighborsRegressor(n_neighbors=5),
    # Fixed seed: without it the tree may break ties between equally good
    # splits differently from run to run, making the reported scores unstable.
    "DecisionTreeRegressor": DecisionTreeRegressor(max_depth=5, random_state=42),
}

# Store results, keyed "<model> with <scaler>" -> {"MSE": ..., "R2 Score": ...}
results2 = {}

for scaler_name, scaler in scalers.items():
    for model_name, model in models.items():
        # The scaler is a pipeline step, so it is fitted on the training rows
        # only and merely applied to the test rows at prediction time.
        # The scaler/model objects from the dicts are reused on every pass and
        # re-fitted here, which is why the metrics are computed immediately.
        pipeline = Pipeline([
            ("scaler", scaler),
            ("regressor", model)
        ])

        # Train model
        pipeline.fit(X_train, y_train)

        # Predict
        y_pred = pipeline.predict(X_test)

        # Evaluate performance on the held-out split
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store results
        results2[f"{model_name} with {scaler_name}"] = {"MSE": mse, "R2 Score": r2}

In [None]:
# Side-by-side comparison of the two benchmark runs for each model/scaler pair.
# `results` is indexed with the keys of `results2`, so both dicts must share the
# same "<model> with <scaler>" key scheme. Each metrics line is labelled with
# the encoding that produced it; otherwise the two lines are indistinguishable.
# NOTE(review): the labels assume the notebook ran top to bottom, i.e. `results`
# came from the label-encoded features and `results2` from the one-hot features.
for model, metrics in results2.items():
  baseline = results[model]
  print(f"{model}: ")
  print(f"  label-encoded: MSE = {baseline['MSE']:.4f}, R2 Score = {baseline['R2 Score']:.4f}")
  print(f"  one-hot:       MSE = {metrics['MSE']:.4f}, R2 Score = {metrics['R2 Score']:.4f}")

LinearRegression with StandardScaler: 
MSE = 33802383.5549, R2 Score = 0.7695
MSE = 33777093.1008, R2 Score = 0.7696
Lasso with StandardScaler: 
MSE = 33802324.1918, R2 Score = 0.7695
MSE = 33777097.0443, R2 Score = 0.7696
Ridge with StandardScaler: 
MSE = 33805133.8765, R2 Score = 0.7694
MSE = 33777655.8082, R2 Score = 0.7696
ElasticNet with StandardScaler: 
MSE = 34178067.0258, R2 Score = 0.7669
MSE = 33903045.5991, R2 Score = 0.7688
KNNRegressor with StandardScaler: 
MSE = 25275990.0444, R2 Score = 0.8276
MSE = 29402812.2203, R2 Score = 0.7995
DecisionTreeRegressor with StandardScaler: 
MSE = 20869582.3142, R2 Score = 0.8577
MSE = 20957331.4924, R2 Score = 0.8571
LinearRegression with MinMaxScaler: 
MSE = 33802383.5549, R2 Score = 0.7695
MSE = 33777093.1008, R2 Score = 0.7696
Lasso with MinMaxScaler: 
MSE = 33802131.5286, R2 Score = 0.7695
MSE = 33777001.6757, R2 Score = 0.7696
Ridge with MinMaxScaler: 
MSE = 33791848.7582, R2 Score = 0.7695
MSE = 33756421.3007, R2 Score = 0.7698
El