<a href="https://colab.research.google.com/github/muajnstu/Implementation-of-Active-Learning-Method-in-Regression-Model/blob/main/Active_Learning_Implementation_code_on_US_Health_Insurance_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install lightgbm catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# importing basic libraries
import pandas as pd
import numpy as np

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# preprocessing libraries
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

# feature selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif, mutual_info_regression, f_regression, RFE, SequentialFeatureSelector # Removed PermutationImportance from here
from sklearn.inspection import permutation_importance

# models
import shap
import xgboost as xgb
#from lime.lime_tabular import LimeTabularExplainer
from sklearn.pipeline import make_pipeline
from collections import defaultdict
# machine learning models
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge,  Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, ExtraTreesRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.linear_model import Lasso, ElasticNet, HuberRegressor, PassiveAggressiveRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# model tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# evaluation metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, mean_absolute_percentage_error
from scipy.stats import zscore

# warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the pre-encoded US health insurance dataset directly from the project's GitHub
# repository (requires network access).
# NOTE(review): the preview printed below lists an "Unnamed: 0" column; if that is a saved
# row index inside the CSV it will be treated as a feature downstream — confirm.
df = pd.read_csv('https://raw.githubusercontent.com/muajnstu/Implementation-of-Active-Learning-Method-in-Regression-Model/refs/heads/main/encoded_US_health_insurance_dataset.csv')

In [None]:
# Display the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,False,False,True
1,18,1,33.77,1,0,1725.5523,False,True,False
2,28,1,33.0,3,0,4449.462,False,True,False
3,33,1,22.705,0,0,21984.47061,True,False,False
4,32,1,28.88,0,0,3866.8552,True,False,False


In [None]:
# Separate the feature matrix (every column except the target) from the target vector.
X=df.drop('charges',axis=1)
# 'charges' is the regression target used by every experiment in this notebook.
y=df['charges']

In [None]:
def mutual_info_feature_selection(X, y, k=5, random_state=42):
    """Select the ``k`` features that share the most mutual information with ``y``.

    Args:
        X: Feature DataFrame (column names are used to report the selection).
        y: Regression target aligned with the rows of ``X``.
        k: Number of features to keep.
        random_state: Seed forwarded to ``mutual_info_regression`` so that the
            scores, and therefore the selected columns, are reproducible.

    Returns:
        A tuple ``(selected_features, X_selected)``: the list of selected column
        names and the output of ``SelectKBest.fit_transform`` for those columns.
    """
    # Local import keeps this block self-contained; functools is standard library.
    from functools import partial

    # mutual_info_regression draws random numbers internally; without a fixed
    # seed the ranking near the cut-off can differ from run to run.
    score_func = partial(mutual_info_regression, random_state=random_state)

    selector = SelectKBest(score_func=score_func, k=k)
    X_selected = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()].tolist()

    print(f"Selected features using mutual_info_regression: {selected_features}")
    return selected_features, X_selected


def sequential_feature_selection(X, y, k=5, direction='forward'):
    """Pick ``k`` features by sequential (greedy) selection around a random forest.

    Candidate subsets are scored with 5-fold cross-validated R^2, evaluated in
    parallel on all available cores (``n_jobs=-1``).

    Args:
        X: Feature DataFrame.
        y: Regression target.
        k: Number of features to select.
        direction: Passed to ``SequentialFeatureSelector`` (``'forward'`` by default).

    Returns:
        ``(selected_features, X_selected)``: selected column names as a list and
        the transformed feature matrix restricted to those columns.
    """
    sfs = SequentialFeatureSelector(
        estimator=RandomForestRegressor(random_state=42),
        n_features_to_select=k,
        direction=direction,
        scoring='r2',
        cv=5,
        n_jobs=-1,
    )
    # fit() returns the fitted selector, so the transform can be chained.
    X_selected = sfs.fit(X, y).transform(X)
    chosen = X.columns[sfs.get_support()].tolist()

    print(f"Selected features using SFS ({direction}): {chosen}")
    return chosen, X_selected


def rfe_feature_selection(X, y, k=7, estimator=None):
    """Select ``k`` features with recursive feature elimination (RFE).

    Args:
        X: Feature DataFrame.
        y: Regression target.
        k: Number of features to retain.
        estimator: Model used by RFE to rank features; when omitted a
            ``RandomForestRegressor(random_state=42)`` is used.

    Returns:
        ``(selected_features, X_selected)``: retained column names as a list and
        the feature matrix reduced to those columns.
    """
    ranking_model = RandomForestRegressor(random_state=42) if estimator is None else estimator

    rfe = RFE(ranking_model, n_features_to_select=k)
    rfe.fit(X, y)

    chosen = X.columns[rfe.get_support()].tolist()
    X_selected = rfe.transform(X)

    print(f"Selected features using RFE: {chosen}")
    return chosen, X_selected


def permutation_importance_selection(X, y, k=3, estimator=None, scoring='r2', n_repeats=10, random_state=42):
    """Keep the ``k`` features with the largest mean permutation importance.

    The estimator is fitted on ``(X, y)`` and the importances are measured on
    that same data.

    Args:
        X: Feature DataFrame.
        y: Regression target.
        k: Number of top-ranked features to return.
        estimator: Model to fit and inspect; defaults to a random forest seeded
            with ``random_state``. A supplied estimator is fitted in place.
        scoring: Metric passed to ``permutation_importance``.
        n_repeats: Number of shuffles per feature.
        random_state: Seed for the default estimator and for the permutations.

    Returns:
        ``(selected_features, X_selected)``: the top-``k`` column names (most
        important first) and the corresponding values as a NumPy array.
    """
    fitted = estimator if estimator is not None else RandomForestRegressor(random_state=random_state)
    fitted.fit(X, y)

    perm = permutation_importance(
        fitted, X, y, scoring=scoring, n_repeats=n_repeats, random_state=random_state
    )

    ranking = pd.DataFrame({
        'feature': X.columns,
        'importance_mean': perm.importances_mean
    })
    ranking = ranking.sort_values(by='importance_mean', ascending=False)

    top_features = ranking['feature'].head(k).tolist()
    X_selected = X[top_features].values

    print(f"Selected features using permutation importance: {top_features}")
    return top_features, X_selected


 # Feature Transformation

def minmax_scaler_func(df):
    """Return a new DataFrame with every column of ``df`` rescaled by a freshly fitted MinMaxScaler.

    The original index and column labels are preserved.
    """
    rescaled = MinMaxScaler().fit_transform(df)
    return pd.DataFrame(rescaled, index=df.index, columns=df.columns)

def standard_scaler_func(df):
    """Return a new DataFrame with every column of ``df`` standardised by a freshly fitted StandardScaler.

    The original index and column labels are preserved.
    """
    standardised = StandardScaler().fit_transform(df)
    return pd.DataFrame(standardised, index=df.index, columns=df.columns)

def log_transformer_func(df):
    """Apply ``log(1 + x)`` element-wise to ``df`` and return the result as a DataFrame.

    The original index and column labels are preserved.
    """
    logged = np.log1p(df)
    return pd.DataFrame(logged, index=df.index, columns=df.columns)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent, ignoring zero targets.

    This definition replaces the same-named function imported from
    ``sklearn.metrics`` earlier in the file; unlike a plain ratio it is
    multiplied by 100 and skips rows whose true value is exactly 0.

    Args:
        y_true: Ground-truth values (any array-like, including column vectors).
        y_pred: Predicted values with the same number of elements as ``y_true``.

    Returns:
        The MAPE as a float percentage, or ``nan`` when every true value is 0.

    Raises:
        ValueError: If the two inputs do not contain the same number of elements.
    """
    # Flatten both inputs: mixing a (n,) target with a (n, 1) prediction would
    # otherwise broadcast to an (m, m) matrix and silently give a wrong score.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {actual.size} and {predicted.size}"
        )

    nonzero = actual != 0
    if not nonzero.any():
        # No valid denominator; make the undefined result explicit instead of
        # relying on the mean of an empty slice.
        return np.nan

    relative_error = np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])
    return np.mean(relative_error) * 100

In [None]:
def summarize_and_export_metrics(
    iteration_detail_rows,
    summary_csv="active_learning_results.csv",
    iteration_csv="active_learning_iterations.csv"
):
    """Write per-iteration metrics and their per-model averages to CSV files.

    Args:
        iteration_detail_rows: Sequence of dicts, one per (model, iteration),
            each containing at least a ``"Model"`` key plus numeric metrics.
        summary_csv: Destination path for the per-model averages.
        iteration_csv: Destination path for the raw per-iteration rows.

    Returns:
        ``(summary_df, iteration_detail_df)``: the averaged metrics (one row per
        model, all numeric columns averaged) and the raw rows as DataFrames.
    """
    # Raw rows first, exactly as collected.
    details = pd.DataFrame(iteration_detail_rows)
    details.to_csv(iteration_csv, index=False)
    print(f"Per-iteration results saved to {iteration_csv}")

    # Average every numeric column within each model.
    per_model_means = details.groupby("Model").mean(numeric_only=True)
    averages = per_model_means.reset_index()
    averages.to_csv(summary_csv, index=False)
    print(f"Average results saved to {summary_csv}")

    return averages, details

In [None]:
# Define models
# Registry of regressors benchmarked by the active-learning loop, keyed by display name.
# Each key must be unique: a repeated key in a dict literal silently replaces the earlier
# entry, so the second "VotingRegressor" definition that used to follow "AdaBoost" was
# redundant and has been removed (the surviving entry keeps its original position).
# Stochastic estimators are seeded so repeated runs give the same results.
models = {
    "Linear Regression": LinearRegression(),
    "ANN Regressor": MLPRegressor(
        hidden_layer_sizes=(32, 16),
        activation='relu',
        solver='adam',
        max_iter=1500,
        early_stopping=True,
        random_state=42,
    ),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "HistGBR": HistGradientBoostingRegressor(random_state=42),
    "Lasso": Lasso(alpha=0.1, random_state=42),
    "Extra Trees Regressor": ExtraTreesRegressor(n_estimators=100, random_state=42),
    "SVR": SVR(kernel='linear'),
    "KNN Regressor": KNeighborsRegressor(
        n_neighbors=15,
        p=1,
        weights='distance'
    ),
    "XGBoost Regressor": XGBRegressor(
        n_estimators=100, random_state=42, verbosity=0
    ),
    "Bagging Regressor": BaggingRegressor(n_estimators=10, random_state=42),
    "Ridge Regression": Ridge(alpha=1.0),
    # Seeded for consistency with the other ensemble entries.
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Stacked Model": StackingRegressor(
        estimators=[
            ("ridge", Ridge()),
            ("rf", RandomForestRegressor(random_state=42)),
        ],
        final_estimator=GradientBoostingRegressor(random_state=42),
    ),
    "VotingRegressor": VotingRegressor(estimators=[
        ("rf", RandomForestRegressor(random_state=42)),
        ("xgb", XGBRegressor(random_state=42, verbosity=0)),
        ("ridge", Ridge())
    ]),
    "LGBM": LGBMRegressor(max_depth=6, num_leaves=20, min_data_in_leaf=10, random_state=42),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42),
    "CART Regressor": DecisionTreeRegressor(
        max_depth=None,
        max_features=None,
        min_samples_leaf=2,
        min_samples_split=20,
        random_state=42
    ),
    "Huber": HuberRegressor(),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "Polynomial Ridge (deg 2)": make_pipeline(PolynomialFeatures(2), Ridge()),
}


In [None]:
def _prediction_uncertainty(model, X_pool, predictions):
    """Score every pool row; higher scores are queried first.

    Models exposing ``estimators_`` are scored by the variance of their
    sub-estimators' predictions. Every other model (and any ensemble whose
    members cannot be queried individually) falls back to the absolute value of
    the model's own prediction.
    """
    # NOTE(review): |prediction| is a magnitude heuristic, not a variance
    # estimate; it favours rows with the largest predicted target.
    if not hasattr(model, "estimators_"):
        return np.abs(predictions)

    try:
        if isinstance(model, GradientBoostingRegressor):
            # Each entry of estimators_ is indexed to reach the actual tree.
            members = [stage[0] for stage in model.estimators_]
        else:
            members = list(model.estimators_)
        member_predictions = np.array([member.predict(X_pool) for member in members])
        return np.var(member_predictions, axis=0)
    except Exception:
        # Deliberate best-effort: unsupported ensembles use the fallback score.
        return np.abs(predictions)


def run_active_learning_iterations(df, models, target_col="charges", initial_samples=200, query_size=20, iterations=20):
    """Run pool-based active learning for every model and collect test metrics.

    The data is split 80/20 into train/test with ``random_state=42``. The
    shuffled training set is divided into an initial labeled set and an
    unlabeled pool. In each iteration the model is fitted on the labeled set,
    the ``query_size`` highest-scoring pool rows are moved into the labeled
    set, and the fitted model (trained before that move) is evaluated on the
    held-out test set.

    Args:
        df: DataFrame containing the features and the target column.
        models: Mapping of display name to an estimator with ``fit``/``predict``.
        target_col: Name of the target column in ``df``.
        initial_samples: Number of training rows labeled before the first iteration.
        query_size: Number of pool rows labeled per iteration.
        iterations: Maximum number of iterations per model.

    Returns:
        A list of dicts, one per (model, iteration), with the keys ``Model``,
        ``Iteration``, ``MAE``, ``RMSE``, ``R2``, ``Adj_R2``, ``EVS`` and ``MAPE``.
    """
    X = df.drop(columns=target_col)
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    n, k = X_test.shape
    X_test_np = X_test.values
    # Adjusted R^2 is undefined when the test set has no spare degrees of freedom.
    adj_r2_denominator = n - k - 1

    # The shuffle is seeded, so it yields the same order for every model; do it once.
    X_train_shuffled, y_train_shuffled = shuffle(
        X_train, y_train, random_state=42
    )

    iteration_detail_rows = []

    for name, model in models.items():
        # Explicit positional slicing; every model starts from the same split.
        X_labeled = X_train_shuffled.iloc[:initial_samples]
        y_labeled = y_train_shuffled.iloc[:initial_samples]
        X_unlabeled = X_train_shuffled.iloc[initial_samples:]
        y_unlabeled = y_train_shuffled.iloc[initial_samples:]

        for i in range(iterations):
            model.fit(X_labeled.values, y_labeled.values)

            # Query step; skipped when the pool is already empty (e.g. when
            # initial_samples covers the whole training set) because
            # predicting on zero rows would fail.
            if not X_unlabeled.empty:
                X_unlabeled_np = X_unlabeled.values
                predictions = model.predict(X_unlabeled_np)
                uncertainty = _prediction_uncertainty(model, X_unlabeled_np, predictions)

                uncertain_idx = np.argsort(uncertainty)[-query_size:]
                X_new = X_unlabeled.iloc[uncertain_idx]
                y_new = y_unlabeled.iloc[uncertain_idx]
                X_labeled = pd.concat([X_labeled, X_new])
                y_labeled = pd.concat([y_labeled, y_new])
                X_unlabeled = X_unlabeled.drop(X_new.index)
                y_unlabeled = y_unlabeled.drop(y_new.index)

            # Evaluate the model fitted at the top of this iteration.
            y_pred = model.predict(X_test_np)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            mape = mean_absolute_percentage_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            if adj_r2_denominator > 0:
                adjusted_r2 = 1 - ((1 - r2) * (n - 1) / adj_r2_denominator)
            else:
                adjusted_r2 = np.nan
            evs = explained_variance_score(y_test, y_pred)

            iteration_detail_rows.append({
                "Model": name,
                "Iteration": i + 1,
                "MAE": mae,
                "RMSE": rmse,
                "R2": r2,
                "Adj_R2": adjusted_r2,
                "EVS": evs,
                "MAPE": mape
            })

            # Nothing left to query: further iterations would repeat this fit.
            if X_unlabeled.empty:
                break

    return iteration_detail_rows

In [None]:
# Build four reduced datasets, one per feature-selection method. Each df_fsN holds the
# selected feature columns followed by the target column y.
# NOTE(review): selection is performed on the full X/y, i.e. before
# run_active_learning_iterations makes its train/test split — confirm this is acceptable.

# fs1: top 5 features by mutual information.
selected_features_fs1, _ = mutual_info_feature_selection(X, y, k=5)
df_fs1 = pd.concat([X[selected_features_fs1], y], axis=1)

# fs2: 3 features chosen by forward sequential selection.
selected_features_fs2, _ = sequential_feature_selection(X, y, k=3, direction='forward')
df_fs2 = pd.concat([X[selected_features_fs2], y], axis=1)

# fs3: 5 features kept by recursive feature elimination.
selected_features_fs3, _ = rfe_feature_selection(X, y, k=5)
df_fs3 = pd.concat([X[selected_features_fs3], y], axis=1)

# fs4: top 5 features by permutation importance.
selected_features_fs4, _ = permutation_importance_selection(X, y, k=5)
df_fs4 = pd.concat([X[selected_features_fs4], y], axis=1)

Selected features using mutual_info_regression: ['age', 'sex', 'bmi', 'children', 'smoker']
Selected features using SFS (forward): ['age', 'bmi', 'smoker']
Selected features using RFE: ['age', 'bmi', 'children', 'smoker', 'region_northwest']
Selected features using permutation importance: ['smoker', 'bmi', 'age', 'children', 'sex']


In [None]:
# Transformed copies of the complete DataFrame (all features plus the target).
# NOTE(review): the whole frame is passed in, so the 'charges' target is transformed
# together with the features, and the scalers are fitted on all rows before
# run_active_learning_iterations performs its train/test split — confirm both are intended.

df_scaled = standard_scaler_func(df)
df_minmax = minmax_scaler_func(df)
df_log = log_transformer_func(df)

In [None]:
# Apply each transformation to each feature-selected dataset (df_fs1 .. df_fs4).
# NOTE(review): every df_fsN includes the target column, so the target is transformed too.

# Min-max scaled variants
df_fs1_minmax = minmax_scaler_func(df_fs1)
df_fs2_minmax = minmax_scaler_func(df_fs2)
df_fs3_minmax = minmax_scaler_func(df_fs3)
df_fs4_minmax = minmax_scaler_func(df_fs4)

# Standardised variants
df_fs1_scaled = standard_scaler_func(df_fs1)
df_fs2_scaled = standard_scaler_func(df_fs2)
df_fs3_scaled = standard_scaler_func(df_fs3)
df_fs4_scaled = standard_scaler_func(df_fs4)

# log1p-transformed variants

df_fs1_log= log_transformer_func(df_fs1)
df_fs2_log= log_transformer_func(df_fs2)
df_fs3_log= log_transformer_func(df_fs3)
df_fs4_log= log_transformer_func(df_fs4)


In [None]:
# Registry of every dataset variant to benchmark. The key is used as the prefix of the
# CSV files written by the experiment loop, so keys should stay unique and filename-safe.
df_dict = {
    # Full feature set: raw and transformed
    'df': df,
    'df_log': df_log,
    'df_minmax': df_minmax,
    'df_scaled': df_scaled,
    # fs1: mutual-information selection
    'df_fs1': df_fs1,
    'df_fs1_minmax': df_fs1_minmax,
    'df_fs1_scaled': df_fs1_scaled,
    'df_fs1_log': df_fs1_log,
    # fs2: sequential feature selection
    'df_fs2': df_fs2,
    'df_fs2_minmax': df_fs2_minmax,
    'df_fs2_scaled': df_fs2_scaled,
    'df_fs2_log': df_fs2_log,
    # fs3: recursive feature elimination
    'df_fs3': df_fs3,
    'df_fs3_minmax': df_fs3_minmax,
    'df_fs3_scaled': df_fs3_scaled,
    'df_fs3_log': df_fs3_log,
    # fs4: permutation importance
    'df_fs4': df_fs4,
    'df_fs4_minmax': df_fs4_minmax,
    'df_fs4_scaled': df_fs4_scaled,
    'df_fs4_log': df_fs4_log
}

# Implementing Active Learning

In [None]:
# Run the active-learning benchmark on every dataset variant and export its metrics to
# "<name>_active_learning_results.csv" and "<name>_active_learning_iterations.csv".
# The loop variable is deliberately not named `df`: reusing that name would rebind the
# module-level raw dataset to the last variant and corrupt any cell re-run afterwards.
for name, dataset in df_dict.items():
    print(f"\n========== Running for {name} ==========")
    iteration_detail_rows = run_active_learning_iterations(dataset, models)
    summarize_and_export_metrics(
        iteration_detail_rows,
        summary_csv=f"{name}_active_learning_results.csv",
        iteration_csv=f"{name}_active_learning_iterations.csv"
    )


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 112
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 5
[LightGBM] [Info] Start training from score 0.211228
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 220, number of used features: 5
[LightGBM] [Info] Start training from score 0.258819
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you 

# Explainable AI (XAI)

In [None]:
def calculate_shap_importance(X, y):
    """Rank features by mean absolute SHAP value under a random-forest model.

    Args:
        X: Feature DataFrame; it is cast to float before fitting
            (presumably so boolean/integer columns are accepted — TODO confirm).
        y: Regression target aligned with the rows of ``X``.

    Returns:
        ``(shap_importance, shap_values)``: a DataFrame with the columns
        ``feature`` and ``importance`` sorted from most to least important, and
        the SHAP explanation object computed on ``X``. If anything fails, the
        error is printed and ``(None, None)`` is returned, so callers that
        unpack two values do not hit an unrelated ``TypeError``.
    """
    try:
        X = X.astype(float)
        model = RandomForestRegressor(random_state=42)
        model.fit(X, y)

        explainer = shap.Explainer(model, X)
        shap_values = explainer(X)

        # Global importance = mean |SHAP| over all rows, per feature.
        shap_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': np.abs(shap_values.values).mean(axis=0)
        }).sort_values(by='importance', ascending=False)

        return shap_importance, shap_values
    except Exception as e:
        # Best-effort by design: report the problem but keep the 2-tuple shape.
        print(f"An error occurred: {e}")
        return None, None


In [None]:
# Compute SHAP-based global feature importance on the full feature matrix and target.
shap_importance, shap_values = calculate_shap_importance(X, y)

In [None]:
# Show the ranked importance table.
print(shap_importance)

In [None]:
# Plot the SHAP summary for all rows of X.
shap.summary_plot(shap_values, X)

In [None]:
# Heatmap of SHAP values, limited to the first 100 explained rows.
shap.plots.heatmap(shap_values[:100])

In [None]:
!pip install lime

In [None]:
def lime_feature_selection(X, y, sample_size=500):
    """Aggregate LIME explanation weights over a random sample of rows.

    A random forest is fitted on ``(X, y)``; LIME then explains up to
    ``sample_size`` randomly chosen rows and the absolute weights of every
    explanation term are summed.

    Args:
        X: Feature DataFrame, or a 2-D array (columns are then named
            ``feature_0``, ``feature_1``, ...).
        y: Regression target aligned with the rows of ``X``.
        sample_size: Maximum number of rows to explain.

    Returns:
        ``(sorted_features, explainer, model)``: a list of ``(label, total
        absolute weight)`` pairs sorted in descending order, the fitted
        ``LimeTabularExplainer`` and the fitted random forest.
    """
    # The module-level import of LimeTabularExplainer is commented out, so the
    # name is imported here to avoid a NameError when the function runs.
    from lime.lime_tabular import LimeTabularExplainer

    if isinstance(X, pd.DataFrame):
        feature_names = X.columns.tolist()
    else:
        feature_names = [f"feature_{i}" for i in range(X.shape[1])]
        X = pd.DataFrame(X, columns=feature_names)

    # Mixed int/float/bool columns would make X.values an object array; casting
    # keeps the data handed to LIME purely numeric.
    X = X.astype(float)

    # Train the model
    model = RandomForestRegressor(random_state=42)
    model.fit(X, y)

    # LIME explainer
    explainer = LimeTabularExplainer(
        training_data=X.values,
        feature_names=feature_names,
        mode='regression',
        discretize_continuous=True,
        verbose=False
    )

    # Random sample of rows. A private generator with the same seed draws the
    # same indices as seeding the global NumPy RNG, without the side effect.
    rng = np.random.RandomState(42)
    n_rows = X.shape[0]
    indices = rng.choice(n_rows, size=min(sample_size, n_rows), replace=False)

    # The model was fitted with column names, so predictions on LIME's raw
    # perturbation arrays are wrapped back into a DataFrame. Built once.
    def predict_fn(samples):
        return model.predict(pd.DataFrame(samples, columns=feature_names))

    # Collect importance scores
    # NOTE(review): with discretize_continuous=True the labels from as_list()
    # appear to be bin conditions rather than bare feature names, so totals may
    # be split across several labels per feature — confirm this is intended.
    importance_scores = defaultdict(float)
    n_features = X.shape[1]
    for idx in indices:
        instance = X.iloc[idx].values
        exp = explainer.explain_instance(instance, predict_fn, num_features=n_features)
        for feature, weight in exp.as_list():
            importance_scores[feature] += abs(weight)

    # Sort feature importances (all features)
    sorted_features = sorted(importance_scores.items(), key=lambda item: item[1], reverse=True)

    return sorted_features, explainer, model

In [None]:
# Explain 400 randomly sampled rows with LIME and keep the explainer and fitted model
# for the single-instance explanation in the next cell.
sorted_features, explainer, model = lime_feature_selection(X, y,sample_size=400)
# The list holds every explanation label with its summed absolute weight, largest first.
print("Top LIME-selected features:", sorted_features)

In [None]:
# Explain one row (position 1) with the LIME explainer, limited to the 5 strongest terms,
# and render the explanation inline in the notebook.
# NOTE(review): model.predict is passed directly here, whereas lime_feature_selection wraps
# LIME's arrays in a DataFrame with column names before predicting — confirm the
# difference is intended.
idx= 1
exp = explainer.explain_instance(X.iloc[idx].values, model.predict, num_features=5)
exp.show_in_notebook()
