<a href="https://colab.research.google.com/github/muajnstu/Implementation-of-Active-Learning-Method-in-Regression-Model/blob/main/Implementing_Active_Learning_on_primary_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
# importing basic libraries
import pandas as pd
import numpy as np

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# preprocessing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

# feature selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif, mutual_info_regression, f_regression, RFE
import shap
import xgboost as xgb
# machine learning models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, ExtraTreesRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

# model tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# evaluation metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


## Load The Data

In [None]:
# load the data
# Reads the insurance CSV straight from GitHub, so this cell needs network access.
df = pd.read_csv('https://raw.githubusercontent.com/muajnstu/ML-Datasets/main/insurance.csv')

In [None]:
# Preview the first rows of the raw data
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Column dtypes and non-null counts (used to check for missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
# Summary statistics for the numeric columns
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [None]:
# Number of distinct values per column (identifies binary vs. multi-class categoricals)
df.nunique()

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

## Data Encoding

In [None]:
# initialize encoder (one LabelEncoder instance is re-fit for each column below)
le = LabelEncoder()

# encode binary categorical features as integers
# (in the printed preview: female -> 0 / male -> 1 and no -> 0 / yes -> 1)
binary_features = ['sex', 'smoker']
for feature in binary_features:
    # fit_transform re-fits `le` on every pass, so after the loop it only
    # holds the classes of the last column ('smoker')
    df[feature] = le.fit_transform(df[feature])

# encode categorical features with more than two categories
# drop_first=True drops one dummy level, leaving three region columns for the four regions
multiclass_features = ['region']
df = pd.get_dummies(df, columns=multiclass_features, drop_first=True)

# display the transformed data
# NOTE(review): this cell overwrites `df` in place; re-running it without first
# re-running the load cell is expected to fail because 'region' no longer exists.
print("\nTransformed DataFrame after Encoding:")
print(df.head())


Transformed DataFrame after Encoding:
   age  sex     bmi  children  smoker      charges  region_northwest  \
0   19    0  27.900         0       1  16884.92400             False   
1   18    1  33.770         1       0   1725.55230             False   
2   28    1  33.000         3       0   4449.46200             False   
3   33    1  22.705         0       0  21984.47061              True   
4   32    1  28.880         0       0   3866.85520              True   

   region_southeast  region_southwest  
0             False              True  
1              True             False  
2              True             False  
3             False             False  
4             False             False  


In [None]:
# Rich-display preview of the encoded frame
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,False,False,True
1,18,1,33.77,1,0,1725.5523,False,True,False
2,28,1,33.0,3,0,4449.462,False,True,False
3,33,1,22.705,0,0,21984.47061,True,False,False
4,32,1,28.88,0,0,3866.8552,True,False,False


In [None]:
# (rows, columns) after encoding
df.shape

(1338, 9)

# Implementing Active Learning

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) as a percentage.

    Samples whose true value is exactly zero are excluded, because the
    relative error is undefined for them.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; must contain the same number of elements as
        ``y_true``. Both inputs are flattened, so a column vector of
        predictions is compared element-wise instead of being broadcast.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / |y_true|)`` over the non-zero
        targets, or ``nan`` when there is no non-zero target to average over.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Flatten so that (n,) vs (n, 1) inputs cannot silently broadcast to (n, n).
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )

    non_zero_idx = y_true != 0
    if not non_zero_idx.any():
        # Nothing to average: return NaN explicitly rather than letting
        # np.mean emit a RuntimeWarning on an empty slice.
        return float("nan")

    relative_error = (y_true[non_zero_idx] - y_pred[non_zero_idx]) / y_true[non_zero_idx]
    return np.mean(np.abs(relative_error)) * 100

In [None]:
def _build_models(random_state=42):
    """Return a fresh ``{display name: unfitted regressor}`` mapping.

    Every estimator that accepts a seed receives ``random_state`` so repeated
    runs produce the same numbers.
    """
    return {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "KNN Regressor": KNeighborsRegressor(n_neighbors=5),
        "Extra Trees Regressor": ExtraTreesRegressor(n_estimators=100, random_state=random_state),
        "XGBoost Regressor": XGBRegressor(n_estimators=100, random_state=random_state, verbosity=0),
        "Bagging Regressor": BaggingRegressor(n_estimators=10, random_state=random_state),
        "Ridge Regression": Ridge(alpha=1.0),
        "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
        "Random Forest": RandomForestRegressor(random_state=random_state),
        "Support Vector Regression": SVR(kernel='rbf', C=100, gamma='scale'),
        "Stacked Model": StackingRegressor(
            estimators=[
                ('ridge', Ridge()),
                ('rf', RandomForestRegressor(random_state=random_state))
            ],
            final_estimator=GradientBoostingRegressor(random_state=random_state)
        )
    }


def _regression_metrics(y_true, y_pred, n_features):
    """Compute MAE, RMSE, R², adjusted R² and MAPE for one set of predictions.

    ``n_features`` is the number of predictors used for the adjusted R²
    correction; adjusted R² is ``nan`` when ``n - n_features - 1 <= 0``.
    """
    n = len(y_true)
    r2 = r2_score(y_true, y_pred)
    dof = n - n_features - 1
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / dof) if dof > 0 else float("nan")
    return {
        "MAE": mean_absolute_error(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "R²": r2,
        "Adjusted R²": adjusted_r2,
        "MAPE": mean_absolute_percentage_error(y_true, y_pred),
    }


def run_active_learning(df, target_col='charges', initial_samples=100, query_size=50, iterations=20,
                        test_size=0.2, random_state=42):
    """Run a pool-based active-learning loop for several regressors and compare them.

    The data is split into a training pool and a held-out test set. For each
    model, the first ``initial_samples`` training rows form the labeled set and
    the remainder form the pool. In every iteration the model is fit on the
    labeled set, evaluated on the test set, and the ``query_size`` pool rows
    with the largest absolute prediction error are moved into the labeled set.
    After the loop the model is refit on the final labeled set, so the last
    acquired batch contributes to the reported final metrics.

    NOTE: the selection step ranks pool rows by ``|y_true - prediction|``, so
    it reads the pool's true targets. This is an oracle (error-based) selection
    rule, not a label-free uncertainty estimate.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully numeric/encoded data containing ``target_col``.
    target_col : str
        Name of the regression target column.
    initial_samples : int
        Number of training rows labeled before the first iteration (>= 1).
    query_size : int
        Number of pool rows acquired per iteration (>= 1).
    iterations : int
        Maximum number of fit/evaluate/query rounds (>= 0). The loop stops
        early once the pool is empty.
    test_size : float
        Fraction of rows held out for evaluation.
    random_state : int
        Seed for the train/test split and for every seedable model.

    Returns
    -------
    dict
        ``{model name: {"Final MAE", "Final RMSE", "Final R²",
        "Final Adjusted R²", "Final MAPE"}}``.

    Raises
    ------
    ValueError
        If ``initial_samples`` or ``query_size`` is below 1, or ``iterations``
        is negative.
    """
    if initial_samples < 1:
        raise ValueError("initial_samples must be at least 1")
    if query_size < 1:
        # A slice of [-0:] would select the whole pool, so reject 0 explicitly.
        raise ValueError("query_size must be at least 1")
    if iterations < 0:
        raise ValueError("iterations must be non-negative")

    # Split into features and target
    X = df.drop(columns=target_col)
    y = df[target_col]

    # Split into training and testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    n_features = X_test.shape[1]

    results = {}

    for name, model in _build_models(random_state).items():
        print(f"\nRunning Active Learning for {name}")
        # Initial labeled and unlabeled split
        X_labeled = X_train.iloc[:initial_samples].copy()
        y_labeled = y_train.iloc[:initial_samples].copy()
        X_unlabeled = X_train.iloc[initial_samples:].copy()
        y_unlabeled = y_train.iloc[initial_samples:].copy()

        # True whenever the labeled set has changed since the last fit.
        needs_refit = True

        for i in range(iterations):
            model.fit(X_labeled, y_labeled)
            needs_refit = False

            # Evaluate the model that was just fit on the current labeled set
            metrics = _regression_metrics(y_test, model.predict(X_test), n_features)
            print(f"Iteration {i + 1}: MAE = {metrics['MAE']:.3f}, RMSE = {metrics['RMSE']:.3f}, "
                  f"R² = {metrics['R²']:.3f}, Adjusted R² = {metrics['Adjusted R²']:.3f}, "
                  f"MAPE = {metrics['MAPE']:.2f}%")

            if len(X_unlabeled) == 0:
                # Nothing left to acquire; predicting on an empty pool would fail.
                print("Unlabeled pool exhausted; stopping early.")
                break

            # Error-based sampling: pick the pool rows the model predicts worst
            predictions = model.predict(X_unlabeled)
            uncertainty = np.abs(y_unlabeled.values - predictions)
            uncertain_idx = np.argsort(uncertainty)[-query_size:]

            # Move the selected samples from unlabeled to labeled
            X_new = X_unlabeled.iloc[uncertain_idx]
            y_new = y_unlabeled.iloc[uncertain_idx]

            X_labeled = pd.concat([X_labeled, X_new])
            y_labeled = pd.concat([y_labeled, y_new])
            X_unlabeled = X_unlabeled.drop(X_new.index)
            y_unlabeled = y_unlabeled.drop(y_new.index)
            needs_refit = True

        # Final evaluation: refit so the most recently acquired batch is used
        # (also covers iterations=0, where the model has not been fit yet).
        if needs_refit:
            model.fit(X_labeled, y_labeled)
        final = _regression_metrics(y_test, model.predict(X_test), n_features)

        results[name] = {
            "Final MAE": final["MAE"],
            "Final RMSE": final["RMSE"],
            "Final R²": final["R²"],
            "Final Adjusted R²": final["Adjusted R²"],
            "Final MAPE": final["MAPE"]
        }

    # Final summary
    print("\nModel Comparison After Active Learning:")
    for name, metrics in results.items():
        print(f"{name}: Final MAE = {metrics['Final MAE']:.3f}, Final RMSE = {metrics['Final RMSE']:.3f}, "
              f"Final R² = {metrics['Final R²']:.3f}, Final Adjusted R² = {metrics['Final Adjusted R²']:.3f}, Final MAPE = {metrics['Final MAPE']:.2f}%")

    return results


In [None]:
# Run the active-learning comparison on the encoded data with default settings
result = run_active_learning(df)


Running Active Learning for Linear Regression
Iteration 1: MAE = 4412.096, RMSE = 5905.437, R² = 0.775, Adjusted R² = 0.768, MAPE = 55.86%
Iteration 2: MAE = 7770.541, RMSE = 8603.072, R² = 0.523, Adjusted R² = 0.509, MAPE = 136.38%
Iteration 3: MAE = 7897.894, RMSE = 8811.893, R² = 0.500, Adjusted R² = 0.484, MAPE = 145.35%
Iteration 4: MAE = 6464.990, RMSE = 7335.034, R² = 0.653, Adjusted R² = 0.643, MAPE = 92.39%
Iteration 5: MAE = 5492.799, RMSE = 6481.571, R² = 0.729, Adjusted R² = 0.721, MAPE = 78.20%
Iteration 6: MAE = 5554.520, RMSE = 6508.193, R² = 0.727, Adjusted R² = 0.719, MAPE = 81.44%
Iteration 7: MAE = 5133.678, RMSE = 6227.173, R² = 0.750, Adjusted R² = 0.743, MAPE = 72.47%
Iteration 8: MAE = 4890.775, RMSE = 6057.436, R² = 0.764, Adjusted R² = 0.756, MAPE = 65.98%
Iteration 9: MAE = 4602.319, RMSE = 5913.396, R² = 0.775, Adjusted R² = 0.768, MAPE = 57.91%
Iteration 10: MAE = 4445.416, RMSE = 5855.895, R² = 0.779, Adjusted R² = 0.772, MAPE = 54.60%
Iteration 11: MAE = 