# Pool-based AL
This notebook runs multiple classifiers in a pool-based active learning (AL) loop. You can specify the query strategy and the float number to use.

In [79]:
import os
import random

# Directories holding the train / test CSV files.
TRAIN_PATH = "../data/randomsplit/train"
TEST_PATH = "../data/randomsplit/test"

# Float numbers that can be selected for a run.
float_numbers = [
    '4903052',
    '4903054',
    '4903058',
    '4903215',
    '4903217',
    '4903218',
    '4903220'
]

# Float processed by this run (index into the list above).
float_number = float_numbers[6]

# Strategy name handed to query_strategy(): 'uncertainty' or 'random'.
# QUERY_STRATEGY = 'random'
QUERY_STRATEGY = 'uncertainty'
RESULT_PATH = f"../results/randomsplit/{float_number}/{QUERY_STRATEGY}"

# Create the output directory up front so that saving results later cannot
# fail on a missing folder.
os.makedirs(RESULT_PATH, exist_ok=True)

# Seed Python's global RNG so the 'random' query strategy (random.sample)
# selects the same samples on every Restart & Run All.
SEED = 42
random.seed(SEED)

# Tag used in the initial/unlabeled CSV file names and in the result file name
# (presumably the size of the initial labeled set -- confirm against the split script).
n_initial = 630
k = 1  # Number of samples to query at each iteration
budget = 100  # Number of queried samples desired

# Prefix of the initial/unlabeled split files to load.
split_method = 'random'
# split_method = 'ocsvm'
# split_method = 'lof'

In [80]:
# Display the resolved output directory for this run as a sanity check.
RESULT_PATH

'../results/randomsplit/4903220/uncertainty'

In [81]:
def comp_ratio(dataset):
    """Compute the anomaly ratio of a labelled dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a 'Label' column; rows whose label equals 1 are counted
        as anomalies.

    Returns
    -------
    tuple
        ``(ratio, count)`` where ``ratio`` is the percentage of rows with
        ``Label == 1`` rounded to two decimals and ``count`` is the number
        of such rows. An empty dataset yields ``(0.0, 0)`` instead of
        raising ``ZeroDivisionError``.
    """
    total = len(dataset)
    # Count matching rows directly instead of materialising a filtered copy.
    n_anomalies = int((dataset['Label'] == 1).sum())
    if total == 0:
        # Nothing to divide by: report a zero ratio rather than crashing.
        return 0.0, 0
    rate = n_anomalies / total * 100
    return round(rate, 2), n_anomalies

### Define model architectures

In [82]:
import random
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score

from modAL.uncertainty import uncertainty_sampling

def create_model(model_name):
    """Build a fresh classifier with default hyper-parameters.

    Parameters
    ----------
    model_name : str
        One of 'KNN', 'XGBoost', 'CatBoost' or 'LightGBM'.

    Returns
    -------
    The newly constructed, unfitted classifier instance.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    # Guard-clause style: return as soon as the requested name matches.
    if model_name == 'KNN':
        return KNeighborsClassifier()
    if model_name == 'XGBoost':
        return XGBClassifier()
    if model_name == 'CatBoost':
        return CatBoostClassifier()
    if model_name == 'LightGBM':
        return LGBMClassifier()
    raise ValueError(f"Invalid model name: {model_name}")


def fit_model(model, labeled_data, model_name):
    """Train ``model`` on the labeled set and hand it back.

    Parameters
    ----------
    model : object exposing ``fit(X, y)``
    labeled_data : pandas.DataFrame
        Must contain 'ID' and 'Label'; every other column is used as a feature.
    model_name : str
        Accepted for signature parity with the other helpers; not used here.

    Returns
    -------
    The same ``model`` object after fitting.
    """
    # Everything except the identifier and the target is a feature.
    feature_matrix = labeled_data.drop(columns=['ID', 'Label']).values
    targets = labeled_data['Label']

    model.fit(feature_matrix, targets)
    return model

def evaluate_model(model, test_data, model_name):
    """Score a fitted model on the held-out test set.

    Parameters
    ----------
    model : fitted object exposing ``predict(X)``
    test_data : pandas.DataFrame
        Must contain 'ID' and 'Label'; every other column is used as a feature.
    model_name : str
        Accepted for signature parity with the other helpers; not used here.

    Returns
    -------
    tuple
        ``(precision, recall, f1, kappa)``. Precision, recall and F1 are
        computed with ``zero_division=0``.
    """
    features = test_data.drop(columns=['ID', 'Label']).values
    truth = test_data['Label'].values
    predicted = model.predict(features)

    return (
        precision_score(truth, predicted, zero_division=0),
        recall_score(truth, predicted, zero_division=0),
        f1_score(truth, predicted, zero_division=0),
        cohen_kappa_score(truth, predicted),
    )


### Define AL pipeline

In [83]:
def query_strategy(strategy_name, model, X_unlabeled, k, model_name):
    """Choose which rows of the unlabeled pool to label next.

    Parameters
    ----------
    strategy_name : str
        'uncertainty' (least-confidence sampling) or 'random'.
    model : fitted object exposing ``predict_proba(X)``
        Only used by the 'uncertainty' strategy.
    X_unlabeled : array-like, one row per unlabeled sample
    k : int
        Number of rows to select.
    model_name : str
        Accepted for signature parity with the other helpers; not used here.

    Returns
    -------
    Positional row indices into ``X_unlabeled``: a NumPy array for
    'uncertainty', a list for 'random'.

    Raises
    ------
    ValueError
        If ``strategy_name`` is unknown, or (for 'random') if the pool is
        empty or holds fewer than ``k`` rows.
    """
    if strategy_name == 'uncertainty':
        probabilities = model.predict_proba(X_unlabeled)
        # Least-confidence score: 1 minus the probability of the top class.
        uncertainty_scores = 1 - (probabilities.max(axis=1))
        # Negate so argsort yields descending uncertainty, then keep the top k.
        query_indices = (-uncertainty_scores).argsort()[:k]
    elif strategy_name == 'random':
        num_rows = len(X_unlabeled)
        if num_rows == 0:
            raise ValueError("The matrix is empty.")
        if k > num_rows:
            raise ValueError("The number of rows to select is greater than the number of rows in the matrix.")
        query_indices = random.sample(range(num_rows), k)
    else:
        # Previously an unknown name fell through to the return statement and
        # surfaced as an UnboundLocalError; fail with a clear message instead.
        raise ValueError(f"Invalid query strategy: {strategy_name}")

    return query_indices

def pool_based_active_learning(model_name, initial_data, unlabeled_data, test_data, k, budget,
                               strategy_name=None):
    """Run a pool-based active-learning loop for one classifier.

    Each round refits the model on the current labeled set, evaluates it on
    ``test_data``, records the metrics, and then moves ``k`` samples chosen by
    the query strategy from the unlabeled pool into the labeled set.

    Parameters
    ----------
    model_name : str
        Name understood by ``create_model``.
    initial_data : pandas.DataFrame
        Initial labeled set (needs 'ID' and 'Label' columns). Not modified.
    unlabeled_data : pandas.DataFrame
        Pool to query from (needs 'ID' and 'Label' columns). The caller's
        frame is not modified; the pool is rebound locally each round.
    test_data : pandas.DataFrame
        Held-out set used for evaluation after every refit.
    k : int
        Number of samples to query per round; must be at least 1.
    budget : int
        Stop once this many samples have been queried.
    strategy_name : str, optional
        Query strategy passed to ``query_strategy``. Defaults to the
        notebook-level ``QUERY_STRATEGY`` so existing calls keep working.

    Returns
    -------
    dict
        Lists with one entry per evaluation: 'num_samples' (samples queried
        so far), 'query_ids' (IDs queried in the preceding round, empty for
        the first evaluation), 'Precision', 'Recall', 'F1-score', 'Kappa'.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 (the loop could never make progress).
    """
    if k < 1:
        raise ValueError("k must be at least 1.")
    if strategy_name is None:
        # Fall back to the notebook-wide setting for backward compatibility.
        strategy_name = QUERY_STRATEGY

    model = create_model(model_name=model_name)
    labeled_data = initial_data.copy()  # Start from the initial labeled set

    queried_samples = 0
    query_ids = []

    metrics = {
        'num_samples': [],
        'query_ids': [],
        'Precision': [],
        'Recall': [],
        'F1-score': [],
        'Kappa': []
        }
    while queried_samples <= budget:
        # Refit on everything labeled so far, then score on the test set.
        model = fit_model(model, labeled_data, model_name)
        precision, recall, f1, kappa = evaluate_model(model, test_data, model_name)

        # Record the state reached after the previous round's queries.
        metrics['num_samples'].append(queried_samples)
        metrics['query_ids'].append(query_ids)
        metrics['Precision'].append(precision)
        metrics['Recall'].append(recall)
        metrics['F1-score'].append(f1)
        metrics['Kappa'].append(kappa)

        # Once the budget is spent, a further query would never be evaluated;
        # an empty pool has nothing left to query. Stop in both cases.
        if queried_samples >= budget or len(unlabeled_data) == 0:
            break

        # Ask the strategy which pool rows (by position) to label next.
        X_unlabeled = unlabeled_data.drop(['ID', 'Label'], axis=1).values
        query_indices = query_strategy(strategy_name, model, X_unlabeled, k, model_name)

        # Move the queried rows into the labeled set.
        queried_rows = unlabeled_data.iloc[query_indices]
        labeled_data = pd.concat([labeled_data, queried_rows])
        query_ids = queried_rows['ID'].to_list()

        print(f"# samples: {queried_samples}; ID: {query_ids}; Label: {queried_rows['Label'].to_list()}")

        # Remove the queried rows from the pool (positions mapped to index labels).
        unlabeled_data = unlabeled_data.drop(unlabeled_data.index[query_indices])

        queried_samples += len(query_indices)

    return metrics

### Load data

In [84]:
# Resolve the CSV locations for the selected float and split method.
import os
train_file = os.path.join(TRAIN_PATH, f'PR_PF_{float_number}.csv')
test_file = os.path.join(TEST_PATH, f'PR_PF_{float_number}.csv')
initial_file = os.path.join(TRAIN_PATH, f'{split_method}_PR_PF_{float_number}_{n_initial}_initial.csv')
unlabeled_file = os.path.join(TRAIN_PATH, f'{split_method}_PR_PF_{float_number}_{n_initial}_unlabeled.csv')

# Read each CSV and discard its 'Date' column in one chained step, so no
# frame is ever bound to a name while still carrying the unused column.
train_data = pd.read_csv(train_file).drop(columns='Date')
test_data = pd.read_csv(test_file).drop(columns='Date')
initial_data = pd.read_csv(initial_file).drop(columns='Date')
unlabeled_data = pd.read_csv(unlabeled_file).drop(columns='Date')


In [85]:
# Report the row count and anomaly percentage of each split.
print(f'------- {float_number} ------')
for split_label, split_frame in (('Train', train_data), ('Test', test_data)):
    print(f'{split_label}: {split_frame.shape[0]}; {comp_ratio(split_frame)[0]}%')

------- 4903220 ------
Train: 181009; 0.16%
Test: 60337; 0.16%


### Start AL pipeline

In [86]:
# Classifiers to run; each name must be one that create_model() accepts.
# Alternative selections kept for quick switching:
# model_names = ['KNN', 'XGBoost', 'CatBoost', 'LightGBM']
model_names = ['KNN', 'LightGBM']
# model_names = ['KNN']
# model_names = ['XGBoost']
# model_names = ['LightGBM']
# model_names = ['CatBoost']

# Starts empty and is rebound on every iteration below, so after the loop it
# holds only the LAST model's metrics; per-model results persist via the CSVs.
metrics = {}

# Run the active-learning loop once per classifier and save its metrics.
for model_name in model_names:
    metrics = pool_based_active_learning(model_name, initial_data, unlabeled_data, test_data, k, budget)
    # One row per evaluation round (the metric lists become columns).
    df_metrics = pd.DataFrame(metrics)
    # File name encodes model, split method, n_initial and batch size k.
    filename = f"{RESULT_PATH}/{model_name}_{split_method}_{n_initial}_initial_{k}_k.csv"
    df_metrics.to_csv(filename, index=False)
    print(f"Save to {filename}")


# samples: 0; ID: [246301]; Label: [0]
# samples: 1; ID: [248188]; Label: [0]
# samples: 2; ID: [250243]; Label: [0]
# samples: 3; ID: [249981]; Label: [0]
# samples: 4; ID: [246461]; Label: [0]
# samples: 5; ID: [249966]; Label: [0]
# samples: 6; ID: [250163]; Label: [0]
# samples: 7; ID: [248359]; Label: [0]
# samples: 8; ID: [248164]; Label: [0]
# samples: 9; ID: [250066]; Label: [0]
# samples: 10; ID: [248304]; Label: [0]
# samples: 11; ID: [250014]; Label: [0]
# samples: 12; ID: [246508]; Label: [0]
# samples: 13; ID: [248273]; Label: [0]
# samples: 14; ID: [248244]; Label: [0]
# samples: 15; ID: [250060]; Label: [0]
# samples: 16; ID: [248180]; Label: [0]
# samples: 17; ID: [246452]; Label: [0]
# samples: 18; ID: [248161]; Label: [0]
# samples: 19; ID: [248202]; Label: [0]
# samples: 20; ID: [246445]; Label: [0]
# samples: 21; ID: [248224]; Label: [0]
# samples: 22; ID: [248179]; Label: [0]
# samples: 23; ID: [248176]; Label: [0]
# samples: 24; ID: [248158]; Label: [0]
# samples:

KeyboardInterrupt: 

In [None]:
# (anomaly percentage, anomaly count) for the train and test sets.
comp_ratio(train_data), comp_ratio(test_data)

((0.16, 290), (0.16, 97))

In [None]:
# Inspect the initial labeled set the active-learning loop starts from.
initial_data

Unnamed: 0,ID,Normalized_date,Latitude,Longitude,Pressure,Salinity,Temperature,Label
0,266464,1.318609,0.327703,-0.018813,-0.782253,1.016900,0.824227,0
1,30698,-1.376204,-1.000929,-2.142091,1.589323,-1.268391,-1.385611,0
2,31104,-1.376204,-1.000929,-2.142091,-0.855291,1.257146,1.532527,0
3,213338,0.728986,1.248950,0.328180,-0.408800,-0.010053,0.128183,0
4,14812,-1.565650,-1.857312,-2.204795,-0.767847,1.078931,0.977065,0
...,...,...,...,...,...,...,...,...
625,24665,-1.440062,-1.882308,-1.879867,0.264507,-1.027160,-0.780387,0
626,301584,1.718787,0.776962,0.512307,-0.754818,0.625023,0.650921,0
627,105610,-0.514119,-0.338618,0.456070,-0.433855,-0.010053,0.113721,0
628,78895,-0.829153,-0.192840,0.226571,-0.858986,0.866254,0.839670,0
