# Query-by-Committee (QBC) sampling strategy

In [8]:
import os

# --- Data locations ---------------------------------------------------------
TRAIN_PATH = "../data/randomsplit/train"
TEST_PATH = "../data/randomsplit/test"

# Float identifiers; the selected one picks the per-float CSV files and the
# results sub-directory.
float_numbers = [
    '4903217',
    '4903218',
    '4903220',
    '4903052',
    '4903054',
]
float_number = float_numbers[1]

# --- Active-learning settings -----------------------------------------------
split_method = 'random'
n_initial = 1000  # Part of the initial/unlabeled CSV file names
k = 1  # Number of samples to query at each iteration
budget = 100  # Number of queried samples desired

# Name of the query strategy; it is also the last component of RESULT_PATH.
# QUERY_STRATEGY = 'random'
QUERY_STRATEGY = 'consensus-entropy'

# --- Output location --------------------------------------------------------
RESULT_PATH = f"../results/randomsplit/{float_number}/{QUERY_STRATEGY}"
os.makedirs(RESULT_PATH, exist_ok=True)

In [9]:
# Echo the results directory resolved for the current run.
RESULT_PATH

'../results/randomsplit/4903218/consensus-entropy'

In [10]:
def comp_ratio(dataset):
    """Compute the anomaly ratio of a labelled dataset.

    Args:
        dataset: Table supporting ``dataset['Label']`` and boolean-mask row
            selection (e.g. a pandas DataFrame). Rows whose ``Label`` equals
            1 are counted as anomalies.

    Returns:
        A ``(ratio, count)`` tuple: the percentage of anomalous rows rounded
        to two decimals, and the number of anomalous rows. An empty dataset
        yields ``(0.0, 0)`` rather than raising ``ZeroDivisionError``.
    """
    total = len(dataset)
    if total == 0:
        # Nothing to divide by: report "no anomalies" instead of crashing.
        return 0.0, 0

    n_anomalies = len(dataset[dataset['Label'] == 1])
    return round(n_anomalies / total * 100, 2), n_anomalies

In [11]:
import random

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from modAL.uncertainty import uncertainty_sampling
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

def create_model(model_name):
    """Instantiate an unfitted committee member by name.

    Args:
        model_name: One of ``'KNN'``, ``'LR'``, ``'RF'``, ``'XGBoost'``,
            ``'CatBoost'`` or ``'LightGBM'``.

    Returns:
        A new classifier instance built with the fixed hyper-parameters
        listed below.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Zero-argument factories: only the requested estimator is constructed,
    # and estimator classes are looked up at call time.
    factories = {
        'KNN': lambda: KNeighborsClassifier(n_neighbors=5, leaf_size=30),
        'LR': lambda: LogisticRegression(penalty='l2', random_state=42),
        'RF': lambda: RandomForestClassifier(n_estimators=20, random_state=42),
        'XGBoost': lambda: XGBClassifier(max_depth=6),
        'CatBoost': lambda: CatBoostClassifier(depth=2, iterations=20, silent=True),
        'LightGBM': lambda: LGBMClassifier(max_depth=2, n_estimators=50),
    }

    try:
        factory = factories[model_name]
    except (KeyError, TypeError):
        # TypeError covers unhashable names, which are just as invalid.
        raise ValueError(f"Invalid model name: {model_name}") from None

    return factory()

def fit_model(model, labeled_data):
    """Fit ``model`` on the labelled samples and return it.

    Args:
        model: Estimator exposing ``fit(X, y)``; it is fitted in place.
        labeled_data: DataFrame with ``ID`` and ``Label`` columns. Every
            other column is used as a feature.

    Returns:
        The same ``model`` object that was passed in, after fitting.
    """
    targets = labeled_data['Label']
    # Everything except the identifier and the target is a feature.
    features = labeled_data.drop(columns=['ID', 'Label']).values

    model.fit(features, targets)
    return model

def evaluate_committee(models, test_data):
    """Score the committee's averaged-probability predictions on the test set.

    Each model's ``predict_proba`` output is averaged over the committee and
    the column with the highest mean probability becomes the prediction,
    which is then compared with ``test_data['Label']``.

    Args:
        models: Fitted classifiers exposing ``predict_proba``.
        test_data: DataFrame with ``ID`` and ``Label`` columns. Every other
            column is used as a feature.

    Returns:
        Tuple ``(precision, recall, f1, kappa)``.
    """
    features = test_data.drop(columns=['ID', 'Label']).values
    truth = test_data['Label'].values

    # Soft vote: mean of the per-model class-probability matrices.
    committee_proba = np.mean(
        [member.predict_proba(features) for member in models], axis=0
    )
    # NOTE(review): argmax returns a column index, so this presumably relies
    # on the labels being encoded as 0/1 -- confirm against the data.
    predicted = np.argmax(committee_proba, axis=1)

    scorers = (precision_score, recall_score, f1_score, cohen_kappa_score)
    return tuple(scorer(truth, predicted) for scorer in scorers)

def query_strategy(strategy_name, models, X_unlabeled, k):
    """Select the positions of the next ``k`` unlabeled samples to query.

    Supported strategies:

    * ``'consensus-entropy'`` -- select the instances with the maximum
      entropy of the committee's mean probability:
        1. Get predicted class probabilities from each classifier;
        2. Calculate the mean probability of the committee;
        3. Calculate the entropy of mean probabilities;
        4. Select instances with highest entropy.
      Ref: https://modal-python.readthedocs.io/en/latest/content/query_strategies/Disagreement-sampling.html
    * ``'random'`` -- draw ``k`` distinct rows uniformly at random.
      ``'max-disagreement'`` is kept as a legacy alias for the same random
      draw; no disagreement measure is computed for it.

    Args:
        strategy_name: Name of the strategy (see above).
        models: Fitted classifiers exposing ``predict_proba``; only used by
            ``'consensus-entropy'``.
        X_unlabeled: Feature matrix of the unlabeled pool.
        k: Number of rows to select.

    Returns:
        Row positions into ``X_unlabeled``. For ``'consensus-entropy'`` a
        NumPy array ordered by increasing entropy, holding at most ``k``
        entries (empty when ``k`` is 0); for the random strategy a list of
        ``k`` distinct positions.

    Raises:
        ValueError: If ``strategy_name`` is unknown, or, for the random
            strategy, if the pool is empty or ``k`` exceeds its size.
    """
    if strategy_name == 'consensus-entropy':
        # Mean class probabilities over the committee.
        mean_probabilities = np.mean(
            [model.predict_proba(X_unlabeled) for model in models], axis=0
        )

        # Evaluate log2 only where p > 0. Zero-probability terms contribute 0
        # to the entropy, and skipping them avoids the divide-by-zero and
        # invalid-value warnings that log2(0) would emit.
        log_probabilities = np.log2(
            mean_probabilities,
            out=np.zeros_like(mean_probabilities),
            where=mean_probabilities > 0,
        )
        consensus_entropy = -np.sum(mean_probabilities * log_probabilities, axis=1)

        ranked = np.argsort(consensus_entropy)
        # Explicit start index: a plain ``[-k:]`` would return every row
        # when k == 0.
        start = max(len(ranked) - k, 0)
        return ranked[start:]

    if strategy_name in ('random', 'max-disagreement'):
        num_rows = len(X_unlabeled)
        if num_rows == 0:
            raise ValueError("The matrix is empty.")
        if k > num_rows:
            raise ValueError("The number of rows to select is greater than the number of rows in the matrix.")
        return random.sample(range(num_rows), k)

    raise ValueError(f"Invalid query strategy: {strategy_name}")

def qbc(model_names, initial_data, unlabeled_data, test_data, k, budget, strategy_name=None):
    """Run a query-by-committee active-learning loop and record test metrics.

    Each round refits every committee member on the current labeled set,
    evaluates the committee on ``test_data``, then moves the rows chosen by
    ``query_strategy`` from the unlabeled pool into the labeled set.

    Args:
        model_names: Names accepted by ``create_model``; one committee member
            is created per name.
        initial_data: DataFrame with the initially labeled samples (``ID``
            and ``Label`` columns plus features). It is copied, not modified.
        unlabeled_data: DataFrame with the candidate pool, same columns. The
            caller's object is not modified.
        test_data: DataFrame used for evaluation, same columns.
        k: Number of samples to query per round (must be >= 1).
        budget: Maximum number of queried samples to evaluate at.
        strategy_name: Query strategy passed to ``query_strategy``. Defaults
            to the module-level ``QUERY_STRATEGY``.

    Returns:
        Dict of equally long lists, one entry per evaluation round:
        ``'num_samples'`` (samples queried so far), ``'query_ids'`` (IDs
        added by the preceding round, empty for the first), and the
        ``'Precision'``, ``'Recall'``, ``'F1-score'`` and ``'Kappa'`` scores.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # A non-positive batch size could never make progress towards the budget.
        raise ValueError(f"k must be a positive integer, got {k}")
    if strategy_name is None:
        strategy_name = QUERY_STRATEGY

    models = [create_model(model_name=name) for name in model_names]

    # Work on a copy so the caller's initial set is left untouched.
    labeled_data = initial_data.copy()

    queried_samples = 0
    query_ids = []

    metrics = {
        'num_samples': [],
        'query_ids': [],
        'Precision': [],
        'Recall': [],
        'F1-score': [],
        'Kappa': [],
    }

    while queried_samples <= budget:
        # Refit every committee member on the current labeled set.
        trained_models = [fit_model(model, labeled_data) for model in models]

        # Evaluate the committee and record this round.
        precision, recall, f1, kappa = evaluate_committee(trained_models, test_data)
        metrics['num_samples'].append(queried_samples)
        metrics['query_ids'].append(query_ids)
        metrics['Precision'].append(precision)
        metrics['Recall'].append(recall)
        metrics['F1-score'].append(f1)
        metrics['Kappa'].append(kappa)

        # Stop when the pool is exhausted, or when the next batch would go
        # past the budget and therefore never be evaluated.
        remaining = len(unlabeled_data)
        if remaining == 0 or queried_samples + min(k, remaining) > budget:
            break

        # Pick the next samples from the remaining unlabeled pool.
        X_unlabeled = unlabeled_data.drop(['ID', 'Label'], axis=1).values
        query_indices = query_strategy(strategy_name, trained_models, X_unlabeled, k)

        # Move the queried rows from the pool into the labeled set.
        queried_rows = unlabeled_data.iloc[query_indices]
        labeled_data = pd.concat([labeled_data, queried_rows])
        query_ids = queried_rows['ID'].to_list()

        print(f"ID: {query_ids}; Label: {queried_rows['Label'].to_list()}")

        unlabeled_data = unlabeled_data.drop(unlabeled_data.index[query_indices])
        queried_samples += len(query_indices)

    return metrics

In [12]:
# %%capture
# Example usage
import os

# Locate the CSV files that belong to the selected float.
train_file = os.path.join(TRAIN_PATH, f'PR_PF_{float_number}.csv')
test_file = os.path.join(TEST_PATH, f'PR_PF_{float_number}.csv')
initial_file = os.path.join(TRAIN_PATH, f'{split_method}_PR_PF_{float_number}_{n_initial}_initial.csv')
unlabeled_file = os.path.join(TRAIN_PATH, f'{split_method}_PR_PF_{float_number}_{n_initial}_unlabeled.csv')

# Load each dataset and discard its 'Date' column.
train_data = pd.read_csv(train_file).drop(columns='Date')
test_data = pd.read_csv(test_file).drop(columns='Date')
initial_data = pd.read_csv(initial_file).drop(columns='Date')
unlabeled_data = pd.read_csv(unlabeled_file).drop(columns='Date')

In [13]:
def generate_subsets(input_list, n_elements):
    """Return every subset of ``input_list`` that has exactly ``n_elements`` items.

    Args:
        input_list: Sequence to draw from. Items are distinguished by
            position, so repeated values are treated as distinct.
        n_elements: Required subset size.

    Returns:
        A list of lists. Each subset keeps the items in their original
        relative order. Subsets are ordered by the integer whose set bits
        mark the chosen positions (bit ``j`` for ``input_list[j]``), which is
        the order a plain 0..2**n - 1 bitmask scan produces. Sizes outside
        ``0..len(input_list)`` yield an empty list.
    """
    # Imported locally as a module: this notebook later rebinds the global
    # name ``combinations`` to a list of committees.
    import itertools

    n = len(input_list)
    if not 0 <= n_elements <= n:
        return []

    # Build only the C(n, k) index tuples of the right size instead of
    # scanning all 2**n bitmasks, then sort them into bitmask order.
    index_sets = sorted(
        itertools.combinations(range(n), n_elements),
        key=lambda positions: sum(1 << j for j in positions),
    )
    return [[input_list[j] for j in positions] for positions in index_sets]

# Candidate committee members.
elements = ['KNN', 'XGBoost', 'CatBoost', 'LightGBM']

# All committees of size 2, 3 and 4, generated in that order.
combinations_2, combinations_3, combinations_4 = (
    generate_subsets(elements, size) for size in (2, 3, 4)
)

# Show the full-size committee(s).
for item in combinations_4:
    print(item)



['KNN', 'XGBoost', 'CatBoost', 'LightGBM']


In [14]:
# Notes from earlier committee runs. The trailing Y/N flags are kept as
# recorded; their meaning is not documented in this notebook.
# model_names = ['XGBoost'] Y

# model_names = ['KNN', 'XGBoost', 'CatBoost', 'LightGBM'] Y

# model_names = ['KNN', 'XGBoost', 'CatBoost']
# model_names = ['KNN', 'XGBoost', 'LightGBM'] N
# model_names = ['KNN', 'CatBoost', 'LightGBM'] N
# model_names = ['XGBoost', 'CatBoost', 'LightGBM'] Y

# model_names = ['XGBoost', 'CatBoost', 'LightGBM'] Y

# Committees to run; uncomment entries to include them.
combinations = [
                ['KNN', 'CatBoost'], 
                # ['XGBoost', 'CatBoost'], 
                # ['KNN', 'XGBoost', 'CatBoost'], 
                # ['KNN', 'XGBoost', 'CatBoost', 'LightGBM'],
    ]

for model_names in combinations: 
    # Active learning loop. qbc returns the metrics dict for this committee,
    # so no pre-initialised placeholder is needed.
    metrics = qbc(model_names, initial_data, unlabeled_data, test_data, k, budget)

    # One CSV per committee, named after its members and the run settings.
    df_metrics = pd.DataFrame(metrics)
    filename = f"{RESULT_PATH}/{'+'.join(model_names)}_{split_method}_{n_initial}_initial_{k}_k.csv"
    df_metrics.to_csv(filename, index=False)
    print(f"Save to {filename}")


ID: [107561]; Label: [1]
ID: [106445]; Label: [1]
ID: [106522]; Label: [1]
ID: [106436]; Label: [1]
ID: [106427]; Label: [1]
ID: [107762]; Label: [1]
ID: [106309]; Label: [1]
ID: [107738]; Label: [1]
ID: [107291]; Label: [1]
ID: [107741]; Label: [1]
ID: [106622]; Label: [1]
ID: [107887]; Label: [1]
ID: [106649]; Label: [1]
ID: [107895]; Label: [1]
ID: [107920]; Label: [1]
ID: [106417]; Label: [1]
ID: [106442]; Label: [1]
ID: [108005]; Label: [1]
ID: [107994]; Label: [1]
ID: [106366]; Label: [1]
ID: [106347]; Label: [1]
ID: [106388]; Label: [1]
ID: [106393]; Label: [1]
ID: [93653]; Label: [0]
ID: [106392]; Label: [1]
ID: [104449]; Label: [0]
ID: [106413]; Label: [1]
ID: [106665]; Label: [1]
ID: [106409]; Label: [1]
ID: [104488]; Label: [0]
ID: [106339]; Label: [1]
ID: [106692]; Label: [1]
ID: [106742]; Label: [0]
ID: [106340]; Label: [1]
ID: [106731]; Label: [0]
ID: [106718]; Label: [1]
ID: [106764]; Label: [0]
ID: [106716]; Label: [1]
ID: [106740]; Label: [0]
ID: [106719]; Label: [1]
I