In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif, f_regression, mutual_info_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


In [None]:

# Read both CSV datasets from the local ./data directory.
admission_data = pd.read_csv("./data/admission_data.csv")
insurance_data = pd.read_csv("./data/insurance.csv")

# Show the first rows of each frame under its heading
# (sep="\n" puts the heading and the preview on separate lines).
print("Admission Data:", admission_data.head(), sep="\n")
print("\nInsurance Data:", insurance_data.head(), sep="\n")


In [None]:

# Admission Data Overview
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
admission_data.info()
print(admission_data.describe())

# Insurance Data Overview
insurance_data.info()
print(insurance_data.describe())


In [None]:

# Encode categorical data in Insurance Dataset
# drop='first' removes one dummy column per category to avoid redundant
# (perfectly collinear) indicator columns.
categorical_cols = ['sex', 'smoker', 'region']
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded = encoder.fit_transform(insurance_data[categorical_cols])

# Reuse the source frame's index: pd.concat(axis=1) aligns on index labels, so
# a fresh RangeIndex would misalign rows (and introduce NaNs) if
# insurance_data were ever filtered or re-indexed before this cell.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(),
    index=insurance_data.index,
)

insurance_data_encoded = pd.concat(
    [insurance_data.drop(columns=categorical_cols), encoded_df], axis=1
)

# Encoding must neither add nor drop rows.
assert len(insurance_data_encoded) == len(insurance_data)

print("Encoded Insurance Data:")
print(insurance_data_encoded.head())


In [None]:

# Split Admission Data
# Features are every column except the last; the target is the last column.
# NOTE(review): if the first column is a row identifier (e.g. a serial number),
# it is being used as a feature here - confirm against the CSV and drop it if so.
X_admission = admission_data.iloc[:, :-1]
# Select the target as a 1-D Series (iloc[:, -1]) rather than a one-column
# DataFrame (iloc[:, -1:]); scikit-learn estimators expect a 1-D target and
# otherwise emit a DataConversionWarning that the global warnings filter hides.
y_admission = admission_data.iloc[:, -1]
X_train_admission, X_test_admission, y_train_admission, y_test_admission = train_test_split(
    X_admission, y_admission, test_size=0.2, random_state=42)

# Split Insurance Data
# 'expenses' is the regression target; all remaining (encoded) columns are features.
X_insurance = insurance_data_encoded.drop('expenses', axis=1)
y_insurance = insurance_data_encoded['expenses']
X_train_insurance, X_test_insurance, y_train_insurance, y_test_insurance = train_test_split(
    X_insurance, y_insurance, test_size=0.2, random_state=42)


In [None]:

# Filter Method using Mutual Information for Admission Data
# mutual_info_regression is a randomized estimator, so it is wrapped to pin
# random_state; without a seed the selected features (and the reported R2) can
# change between runs. np.ravel passes the target as 1-D regardless of how the
# split cell shaped it.
selector = SelectKBest(
    score_func=lambda X, y: mutual_info_regression(X, np.ravel(y), random_state=42),
    k=min(5, X_train_admission.shape[1]),  # never ask for more features than exist
)
X_train_selected = selector.fit_transform(X_train_admission, y_train_admission)
X_test_selected = selector.transform(X_test_admission)

# Fit a linear model on the selected features only and score it on the test split.
model = LinearRegression()
model.fit(X_train_selected, y_train_admission)
y_pred = model.predict(X_test_selected)

filter_score_admission = r2_score(y_test_admission, y_pred)
print("R2 Score (Filter Method - Admission):", filter_score_admission)


In [None]:

# Filter Method using the univariate F-test for Insurance Data
# The target ('expenses') is continuous, so the regression F-test
# (f_regression) is the appropriate score function. f_classif is the
# classification ANOVA test and would treat each distinct expense value as a
# separate class label.
selector = SelectKBest(
    score_func=f_regression,
    k=min(5, X_train_insurance.shape[1]),  # never ask for more features than exist
)
X_train_selected = selector.fit_transform(X_train_insurance, y_train_insurance)
X_test_selected = selector.transform(X_test_insurance)

# Fit a linear model on the selected features only and score it on the test split.
model = LinearRegression()
model.fit(X_train_selected, y_train_insurance)
y_pred = model.predict(X_test_selected)

filter_score_insurance = r2_score(y_test_insurance, y_pred)
print("R2 Score (Filter Method - Insurance):", filter_score_insurance)


In [None]:

# Wrapper approach (recursive feature elimination) on the admission data:
# RFE repeatedly fits the linear model and prunes features until 5 remain.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=5)
rfe.fit(X_train_admission, y_train_admission)

# Project both splits onto the features RFE kept.
X_train_selected = rfe.transform(X_train_admission)
X_test_selected = rfe.transform(X_test_admission)

# Refit the linear model on the reduced feature set and evaluate on the test split.
y_pred = model.fit(X_train_selected, y_train_admission).predict(X_test_selected)

wrapper_score_admission = r2_score(y_test_admission, y_pred)
print("R2 Score (Wrapper Method - Admission):", wrapper_score_admission)


In [None]:

# Wrapper approach (recursive feature elimination) on the insurance data:
# RFE repeatedly fits the linear model and prunes features until 5 remain.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=5)
rfe.fit(X_train_insurance, y_train_insurance)

# Project both splits onto the features RFE kept.
X_train_selected = rfe.transform(X_train_insurance)
X_test_selected = rfe.transform(X_test_insurance)

# Refit the linear model on the reduced feature set and evaluate on the test split.
y_pred = model.fit(X_train_selected, y_train_insurance).predict(X_test_selected)

wrapper_score_insurance = r2_score(y_test_insurance, y_pred)
print("R2 Score (Wrapper Method - Insurance):", wrapper_score_insurance)

In [None]:

# Embedded Method using Random Forest for Admission Data
# An embedded method takes its feature ranking from the model's own training:
# here the forest's impurity-based feature_importances_. The top-ranked
# features are kept and a fresh forest is refit on them, so the reported R2
# reflects the selected subset rather than all columns.
# random_state is pinned so the ranking and the score are reproducible.
y_train_1d = np.ravel(y_train_admission)  # forests expect a 1-D target
n_selected = min(5, X_train_admission.shape[1])  # same budget as the other methods

ranker = RandomForestRegressor(random_state=42)
ranker.fit(X_train_admission, y_train_1d)
top_idx = np.argsort(ranker.feature_importances_)[::-1][:n_selected]
embedded_features_admission = X_train_admission.columns[top_idx]
print("Selected features (Embedded - Admission):", list(embedded_features_admission))

model = RandomForestRegressor(random_state=42)
model.fit(X_train_admission[embedded_features_admission], y_train_1d)
y_pred = model.predict(X_test_admission[embedded_features_admission])

embedded_score_admission = r2_score(y_test_admission, y_pred)
print("R2 Score (Embedded Method - Admission):", embedded_score_admission)


In [None]:

# Embedded Method using Random Forest for Insurance Data
# An embedded method takes its feature ranking from the model's own training:
# here the forest's impurity-based feature_importances_. The top-ranked
# features are kept and a fresh forest is refit on them, so the reported R2
# reflects the selected subset rather than all columns.
# random_state is pinned so the ranking and the score are reproducible.
y_train_1d = np.ravel(y_train_insurance)  # forests expect a 1-D target
n_selected = min(5, X_train_insurance.shape[1])  # same budget as the other methods

ranker = RandomForestRegressor(random_state=42)
ranker.fit(X_train_insurance, y_train_1d)
top_idx = np.argsort(ranker.feature_importances_)[::-1][:n_selected]
embedded_features_insurance = X_train_insurance.columns[top_idx]
print("Selected features (Embedded - Insurance):", list(embedded_features_insurance))

model = RandomForestRegressor(random_state=42)
model.fit(X_train_insurance[embedded_features_insurance], y_train_1d)
y_pred = model.predict(X_test_insurance[embedded_features_insurance])

embedded_score_insurance = r2_score(y_test_insurance, y_pred)
print("R2 Score (Embedded Method - Insurance):", embedded_score_insurance)

In [None]:
# Permutation Importance for Admission Data
# Fit a forest on all features, then measure how much its score drops when each
# feature column is shuffled (permutation_importance). The features with the
# largest mean drop are kept and a fresh forest is refit on them.
# Importances are computed on the training split so the test split plays no
# part in feature selection; random_state is pinned for reproducibility.
y_train_1d = np.ravel(y_train_admission)  # forests expect a 1-D target
n_selected = min(5, X_train_admission.shape[1])  # same budget as the other methods

ranker = RandomForestRegressor(random_state=42)
ranker.fit(X_train_admission, y_train_1d)
perm_result = permutation_importance(
    ranker, X_train_admission, y_train_1d, n_repeats=10, random_state=42
)
top_idx = np.argsort(perm_result.importances_mean)[::-1][:n_selected]
permutation_features_admission = X_train_admission.columns[top_idx]
print("Selected features (Permutation - Admission):", list(permutation_features_admission))

model = RandomForestRegressor(random_state=42)
model.fit(X_train_admission[permutation_features_admission], y_train_1d)
y_pred = model.predict(X_test_admission[permutation_features_admission])

permutation_score_admission = r2_score(y_test_admission, y_pred)
print("R2 Score (Permutation Importance - Admission):", permutation_score_admission)


In [None]:

# Permutation Importance for Insurance Data
# Fit a forest on all features, then measure how much its score drops when each
# feature column is shuffled (permutation_importance). The features with the
# largest mean drop are kept and a fresh forest is refit on them.
# Importances are computed on the training split so the test split plays no
# part in feature selection; random_state is pinned for reproducibility.
y_train_1d = np.ravel(y_train_insurance)  # forests expect a 1-D target
n_selected = min(5, X_train_insurance.shape[1])  # same budget as the other methods

ranker = RandomForestRegressor(random_state=42)
ranker.fit(X_train_insurance, y_train_1d)
perm_result = permutation_importance(
    ranker, X_train_insurance, y_train_1d, n_repeats=10, random_state=42
)
top_idx = np.argsort(perm_result.importances_mean)[::-1][:n_selected]
permutation_features_insurance = X_train_insurance.columns[top_idx]
print("Selected features (Permutation - Insurance):", list(permutation_features_insurance))

model = RandomForestRegressor(random_state=42)
model.fit(X_train_insurance[permutation_features_insurance], y_train_1d)
y_pred = model.predict(X_test_insurance[permutation_features_insurance])

permutation_score_insurance = r2_score(y_test_insurance, y_pred)
print("R2 Score (Permutation Importance - Insurance):", permutation_score_insurance)


In [None]:
# Collect every (label, R2) pair in one place, admission rows first and
# insurance rows second, then pivot them into the two-column summary frame.
summary_rows = [
    ("Filter (Admission)", filter_score_admission),
    ("Wrapper (Admission)", wrapper_score_admission),
    ("Embedded (Admission)", embedded_score_admission),
    ("Permutation (Admission)", permutation_score_admission),
    ("Filter (Insurance)", filter_score_insurance),
    ("Wrapper (Insurance)", wrapper_score_insurance),
    ("Embedded (Insurance)", embedded_score_insurance),
    ("Permutation (Insurance)", permutation_score_insurance),
]
results = {
    "Method": [label for label, _ in summary_rows],
    "R2 Score": [score for _, score in summary_rows],
}

results_df = pd.DataFrame(results)
print(results_df)


## ✅ Summary of Results
| Method | Expected behavior |
|--------|----------|
| Filter (Admission) | High if correlation is strong |
| Filter (Insurance) | ANOVA may depend on normality |
| Wrapper (Admission) | Usually higher, but computationally expensive |
| Embedded (Admission) | Tree-based models often perform well |
| Permutation (Insurance) | Measures true impact of features |

**✅ Goal:** Select features that improve model performance and interpretability.
