This notebook analyzes the instances sampled during each active learning iteration, focusing on the aggregation method. It compares least confidence sampling with minimum confidence score aggregation and entropy-based sampling with maximum entropy aggregation. The goal is to understand why the final mAP shows no statistically significant difference between a model using least confidence sampling with minimum confidence score aggregation and one using entropy-based sampling with maximum entropy aggregation.

To create the text files required to run this notebook, go to the logs and copy every line that looks like this:
"2026-01-07 08:59:04,811 - INFO - Moved 7426_F2_f000073.jpg and 7426_F2_f000073.txt to labeled image/labels directories."
and paste it into the text file corresponding to a sampling iteration of each seed. Each log file for a seed will have 3 sets of 227 lines that look like the one pasted above. Each set should be pasted into a separate file corresponding to the initially labeled random seed and training iteration it belongs to, and it should follow the directory structure shown in the sanity checks (you can also change the directories there if necessary).

The first step is to find the ratios of common to uncommon instances for each iteration in each seed for both samplers to see if mostly the same instances get sampled or not.

In [None]:
import re
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import os
# All paths in this notebook are relative to the notebook's parent directory.
# Only step up when `experiments/` is not already visible, so re-running this
# cell does not climb one directory higher each time and break the paths below.
if not Path('experiments').is_dir():
    os.chdir(str(Path.cwd().parent))

In [None]:
def get_sampler_instances(file_path):
    """Return the image stems listed in a sampler breakdown file.

    Each non-blank line is expected to be a copied log line such as
    "... - INFO - Moved 7426_F2_f000073.jpg and 7426_F2_f000073.txt to ...".
    The first run of non-whitespace characters followed by ``.jpg`` is taken
    as the image name and its stem (name without extension) is collected.

    Args:
        file_path: Path to the text file holding the copied log lines.

    Returns:
        list[str]: Image stems in file order (duplicates are kept).

    Raises:
        ValueError: If a non-blank line does not contain a ``.jpg`` name.
    """
    sampler_file_list = []
    with open(file_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            # Hand-pasted files frequently contain empty lines (e.g. a trailing
            # newline); skip them instead of crashing on a failed regex match.
            if not line.strip():
                continue

            match = re.search(r'(\S+\.jpg)', line)
            if match is None:
                raise ValueError(
                    f'{file_path}:{line_number}: no .jpg file name found in line: {line.strip()!r}'
                )

            sampler_file_list.append(Path(match.group(1)).stem)

    return sampler_file_list

In [None]:
def txt_in_log(log_file, txt_file):
    """Check that every line of ``txt_file`` occurs somewhere in ``log_file``.

    Lines from ``txt_file`` are stripped of surrounding whitespace and then
    looked up as substrings of the full log text.

    Args:
        log_file: Path to the training log.
        txt_file: Path to the breakdown file with lines copied from the log.

    Returns:
        bool: True if all stripped lines are found in the log, else False.
    """
    with open(log_file, 'r') as log_handle:
        log_text = log_handle.read()

    with open(txt_file, 'r') as txt_handle:
        expected_lines = txt_handle.readlines()

    # Substring lookup against the whole log; stops at the first missing line.
    return all(expected.strip() in log_text for expected in expected_lines)

Sanity Check

In [None]:
# Seed 1 (logs under seed1_42): check that every line of each iteration's
# breakdown file appears in the corresponding sampler log.
for iteration in (1, 2, 3):
    for sampler in ('ent_max', 'lc_min'):
        log_path = f'experiments/experiments_yolov8/seed1_42/{sampler}/{sampler}.log'
        txt_path = f'experiments/breakdowns/seed1/iter{iteration}/{sampler}.txt'
        print(txt_in_log(log_path, txt_path))

In [None]:
# Seed 2 (logs under seed2_21): check that every line of each iteration's
# breakdown file appears in the corresponding sampler log.
for iteration in (1, 2, 3):
    for sampler in ('ent_max', 'lc_min'):
        log_path = f'experiments/experiments_yolov8/seed2_21/{sampler}/{sampler}.log'
        txt_path = f'experiments/breakdowns/seed2/iter{iteration}/{sampler}.txt'
        print(txt_in_log(log_path, txt_path))

In [None]:
# Seed 3 (logs under seed3_69): check that every line of each iteration's
# breakdown file appears in the corresponding sampler log.
for iteration in (1, 2, 3):
    for sampler in ('ent_max', 'lc_min'):
        log_path = f'experiments/experiments_yolov8/seed3_69/{sampler}/{sampler}.log'
        txt_path = f'experiments/breakdowns/seed3/iter{iteration}/{sampler}.txt'
        print(txt_in_log(log_path, txt_path))

In [None]:
# Seed 4 (logs under seed4_10): check that every line of each iteration's
# breakdown file appears in the corresponding sampler log.
for iteration in (1, 2, 3):
    for sampler in ('ent_max', 'lc_min'):
        log_path = f'experiments/experiments_yolov8/seed4_10/{sampler}/{sampler}.log'
        txt_path = f'experiments/breakdowns/seed4/iter{iteration}/{sampler}.txt'
        print(txt_in_log(log_path, txt_path))

In [None]:
# Seed 5 (logs under seed5_99): check that every line of each iteration's
# breakdown file appears in the corresponding sampler log.
for iteration in (1, 2, 3):
    for sampler in ('ent_max', 'lc_min'):
        log_path = f'experiments/experiments_yolov8/seed5_99/{sampler}/{sampler}.log'
        txt_path = f'experiments/breakdowns/seed5/iter{iteration}/{sampler}.txt'
        print(txt_in_log(log_path, txt_path))

Seed 1

Iteration 1

In [None]:
# Seed 1, iteration 1: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed1/iter1/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed1/iter1/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

Iteration 2

In [None]:
# Seed 1, iteration 2: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed1/iter2/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed1/iter2/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

Iteration 3

In [None]:
# Seed 1, iteration 3: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed1/iter3/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed1/iter3/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

In [None]:
# Recompute the seed-1 overlap from the breakdown files instead of hard-coding
# counts transcribed from earlier cell outputs, so the figure cannot go stale.
seed1_common = 0
seed1_sampled = 0
for iteration in (1, 2, 3):
    ent_max_stems = get_sampler_instances(f'experiments/breakdowns/seed1/iter{iteration}/ent_max.txt')
    lc_min_stems = get_sampler_instances(f'experiments/breakdowns/seed1/iter{iteration}/lc_min.txt')
    seed1_common += len(set(ent_max_stems) & set(lc_min_stems))
    # Same denominator as the per-iteration ratios: number of ent_max selections.
    seed1_sampled += len(ent_max_stems)
print(f'Total Percentage of common instances across 3 iterations: {seed1_common/seed1_sampled*100:.2f}%')

Each model was trained on a total of 908 instances, of which 227 were the initial training instances shared with the other model. In addition, across the 3 active learning iterations, there were 95 common instances. Overall, about 35% ((227 + 95) / 908) of the instances each model was trained on were common with the other in the first seed of training.

Seed 2

Iteration 1

In [None]:
# Seed 2, iteration 1: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed2/iter1/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed2/iter1/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

In [None]:
# Seed 2, iteration 2: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed2/iter2/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed2/iter2/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

In [None]:
# Seed 2, iteration 3: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed2/iter3/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed2/iter3/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

In [None]:
# Recompute the seed-2 overlap from the breakdown files instead of hard-coding
# counts transcribed from earlier cell outputs, so the figure cannot go stale.
seed2_common = 0
seed2_sampled = 0
for iteration in (1, 2, 3):
    ent_max_stems = get_sampler_instances(f'experiments/breakdowns/seed2/iter{iteration}/ent_max.txt')
    lc_min_stems = get_sampler_instances(f'experiments/breakdowns/seed2/iter{iteration}/lc_min.txt')
    seed2_common += len(set(ent_max_stems) & set(lc_min_stems))
    # Same denominator as the per-iteration ratios: number of ent_max selections.
    seed2_sampled += len(ent_max_stems)
print(f'Total Percentage of common instances across 3 iterations of active learning: {seed2_common/seed2_sampled*100:.2f}%')

Seed 3

Iteration 1

In [None]:
# Seed 3, iteration 1: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed3/iter1/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed3/iter1/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

Iteration 2

In [None]:
# Seed 3, iteration 2: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed3/iter2/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed3/iter2/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

Iteration 3

In [None]:
# Seed 3, iteration 3: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed3/iter3/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed3/iter3/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

In [None]:
# Recompute the seed-3 overlap from the breakdown files instead of hard-coding
# counts transcribed from earlier cell outputs, so the figure cannot go stale.
seed3_common = 0
seed3_sampled = 0
for iteration in (1, 2, 3):
    ent_max_stems = get_sampler_instances(f'experiments/breakdowns/seed3/iter{iteration}/ent_max.txt')
    lc_min_stems = get_sampler_instances(f'experiments/breakdowns/seed3/iter{iteration}/lc_min.txt')
    seed3_common += len(set(ent_max_stems) & set(lc_min_stems))
    # Same denominator as the per-iteration ratios: number of ent_max selections.
    seed3_sampled += len(ent_max_stems)
print(f'Total Percentage of common instances across 3 iterations of active learning: {seed3_common/seed3_sampled*100:.2f}%')

Seed 4

Iteration 1

In [None]:
# Seed 4, iteration 1: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed4/iter1/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed4/iter1/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

Iteration 2

In [None]:
# Seed 4, iteration 2: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed4/iter2/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed4/iter2/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

Iteration 3

In [None]:
# Seed 4, iteration 3: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed4/iter3/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed4/iter3/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

In [None]:
# Recompute the seed-4 overlap from the breakdown files instead of hard-coding
# counts transcribed from earlier cell outputs, so the figure cannot go stale.
seed4_common = 0
seed4_sampled = 0
for iteration in (1, 2, 3):
    ent_max_stems = get_sampler_instances(f'experiments/breakdowns/seed4/iter{iteration}/ent_max.txt')
    lc_min_stems = get_sampler_instances(f'experiments/breakdowns/seed4/iter{iteration}/lc_min.txt')
    seed4_common += len(set(ent_max_stems) & set(lc_min_stems))
    # Same denominator as the per-iteration ratios: number of ent_max selections.
    seed4_sampled += len(ent_max_stems)
print(f'Total Percentage of common instances across 3 iterations of active learning: {seed4_common/seed4_sampled*100:.2f}%')

Seed 5

Iteration 1

In [None]:
# Seed 5, iteration 1: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed5/iter1/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed5/iter1/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

Iteration 2

In [None]:
# Seed 5, iteration 2: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed5/iter2/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed5/iter2/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

Iteration 3

In [None]:
# Seed 5, iteration 3: images selected by both samplers.
ent_max_list = get_sampler_instances('experiments/breakdowns/seed5/iter3/ent_max.txt')
lc_min_list = get_sampler_instances('experiments/breakdowns/seed5/iter3/lc_min.txt')
intersection = list(set(ent_max_list) & set(lc_min_list))
print(f'Number of common instances between ent_max and lc_min: {len(intersection)}')
# The ratio is taken relative to the number of ent_max selections.
print(f'Ratio of common instances: {len(intersection)/len(ent_max_list)*100:.2f}%')

In [None]:
# Recompute the seed-5 overlap from the breakdown files instead of hard-coding
# counts transcribed from earlier cell outputs, so the figure cannot go stale.
seed5_common = 0
seed5_sampled = 0
for iteration in (1, 2, 3):
    ent_max_stems = get_sampler_instances(f'experiments/breakdowns/seed5/iter{iteration}/ent_max.txt')
    lc_min_stems = get_sampler_instances(f'experiments/breakdowns/seed5/iter{iteration}/lc_min.txt')
    seed5_common += len(set(ent_max_stems) & set(lc_min_stems))
    # Same denominator as the per-iteration ratios: number of ent_max selections.
    seed5_sampled += len(ent_max_stems)
print(f'Total Percentage of common instances across 3 iterations of active learning: {seed5_common/seed5_sampled*100:.2f}%')

In [None]:
# Per-seed overlap percentages, recomputed from the breakdown files rather than
# from counts copied by hand out of earlier cell outputs.
overlap_pcts_for_mean = []
for seed in range(1, 6):
    common_count = 0
    sampled_count = 0
    for iteration in range(1, 4):
        ent_max_stems = get_sampler_instances(f'experiments/breakdowns/seed{seed}/iter{iteration}/ent_max.txt')
        lc_min_stems = get_sampler_instances(f'experiments/breakdowns/seed{seed}/iter{iteration}/lc_min.txt')
        common_count += len(set(ent_max_stems) & set(lc_min_stems))
        sampled_count += len(ent_max_stems)
    overlap_pcts_for_mean.append(common_count / sampled_count * 100)
print(f'Mean Percentage of common instances across 3 iterations of active learning across all 5 seeds: {np.mean(overlap_pcts_for_mean).item()}%')

In [None]:
# Per-seed overlap percentages, recomputed from the breakdown files rather than
# from counts copied by hand out of earlier cell outputs.
overlap_pcts_for_std = []
for seed in range(1, 6):
    common_count = 0
    sampled_count = 0
    for iteration in range(1, 4):
        ent_max_stems = get_sampler_instances(f'experiments/breakdowns/seed{seed}/iter{iteration}/ent_max.txt')
        lc_min_stems = get_sampler_instances(f'experiments/breakdowns/seed{seed}/iter{iteration}/lc_min.txt')
        common_count += len(set(ent_max_stems) & set(lc_min_stems))
        sampled_count += len(ent_max_stems)
    overlap_pcts_for_std.append(common_count / sampled_count * 100)
# NOTE(review): np.std defaults to the population standard deviation (ddof=0);
# with only 5 seeds, consider ddof=1 if a sample estimate is intended.
print(f'Standard Deviation of common instances across 3 iterations of active learning across all 5 seeds: {np.std(overlap_pcts_for_std).item()}%')

However, with ~88% of the sampled instances being different, it is still possible that the two samplers select similar, though not identical, instances. That could explain the lack of a statistically significant difference between the 2 sampling+aggregation methods.

The DeepFish dataset is also split on the basis of habitat. It is possible that, even though different instances are being sent to the models, they come from the same habitats and are similar enough to result in similar performance.

In [None]:
# Maps the numeric prefix of an image file name (the digits before the first
# underscore) to the name of the habitat it belongs to.
habitat_dict = {
    '7482': 'Low complexity reef',
    '7398': 'Sandy mangrove prop roots',
    '7426': 'Complex reef',
    '7463': 'Seagrass bed',
    '7434': 'Low algal bed',
    '7623': 'Reef trench',
    '7490': 'Boulders',
    '7585': 'Mixed substratum mangrove - prop roots',
    '7117': 'Rocky Mangrove - prop roots',
    '7393': 'Upper Mangrove – medium Rhizophora',
    '9907': 'Rock shelf',
    '9894': 'Mangrove - mixed pneumatophore prop root',
    '7268': 'Sparse algal bed',
    '9866': 'Muddy mangrove - pneumatophores and trunk',
    '9908': 'Large boulder and pneumatophores',
    '9898': 'Rocky mangrove - large boulder and trunk',
    '9892': 'Bare substratum',
    '9852': 'Upper mangrove - tall rhizophora',
    '9862': 'Large boulder',
    '9870': 'Muddy mangrove – pneumatophores',
}

In [None]:
def get_habitats(file_path):
    """Return the leading numeric ID of every image listed in a breakdown file.

    Each non-blank line is expected to contain ``Moved <digits>_...``; the
    digits between ``Moved`` and the first underscore are collected as strings.

    Args:
        file_path: Path to the text file holding the copied log lines.

    Returns:
        list[str]: One ID per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line does not match ``Moved <digits>_``.
    """
    habitat_list = []
    with open(file_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            # Hand-pasted files frequently contain empty lines (e.g. a trailing
            # newline); skip them instead of crashing on a failed regex match.
            if not line.strip():
                continue

            match = re.search(r"Moved\s+(\d+)_", line)
            if match is None:
                raise ValueError(
                    f'{file_path}:{line_number}: no "Moved <id>_" pattern found in line: {line.strip()!r}'
                )

            habitat_list.append(match.group(1))

    return habitat_list

In [None]:
def get_habitat_count(habitat_list):
    """Count how often each habitat occurs in a list of habitat IDs.

    IDs are translated to names through the module-level ``habitat_dict``;
    IDs that are not keys of ``habitat_dict`` are ignored.

    Args:
        habitat_list: Iterable of habitat ID strings.

    Returns:
        list[tuple[str, int]]: ``(habitat name, count)`` pairs ordered from
        most to least frequent; ties keep first-seen order.
    """
    tally = {}
    for raw_id in habitat_list:
        if raw_id not in habitat_dict:
            continue
        label = habitat_dict[raw_id]
        tally[label] = tally.get(label, 0) + 1

    # Stable sort on the negated count: descending, ties stay in first-seen order.
    return sorted(tally.items(), key=lambda pair: -pair[1])

Seed 1

Iteration 1

In [None]:
# Habitat counts for the ent_max selections in seed 1, iteration 1 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed1/iter1/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 1, iteration 1 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed1/iter1/lc_min.txt')
get_habitat_count(habitat_ids)

Iteration 2

In [None]:
# Habitat counts for the ent_max selections in seed 1, iteration 2 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed1/iter2/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 1, iteration 2 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed1/iter2/lc_min.txt')
get_habitat_count(habitat_ids)

Iteration 3

In [None]:
# Habitat counts for the ent_max selections in seed 1, iteration 3 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed1/iter3/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 1, iteration 3 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed1/iter3/lc_min.txt')
get_habitat_count(habitat_ids)

Seed 2

Iteration 1

In [None]:
# Habitat counts for the ent_max selections in seed 2, iteration 1 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed2/iter1/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 2, iteration 1 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed2/iter1/lc_min.txt')
get_habitat_count(habitat_ids)

Iteration 2

In [None]:
# Habitat counts for the ent_max selections in seed 2, iteration 2 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed2/iter2/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 2, iteration 2 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed2/iter2/lc_min.txt')
get_habitat_count(habitat_ids)

Iteration 3

In [None]:
# Habitat counts for the ent_max selections in seed 2, iteration 3 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed2/iter3/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 2, iteration 3 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed2/iter3/lc_min.txt')
get_habitat_count(habitat_ids)

Seed 3

Iteration 1

In [None]:
# Habitat counts for the ent_max selections in seed 3, iteration 1 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed3/iter1/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 3, iteration 1 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed3/iter1/lc_min.txt')
get_habitat_count(habitat_ids)

Iteration 2

In [None]:
# Habitat counts for the ent_max selections in seed 3, iteration 2 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed3/iter2/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 3, iteration 2 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed3/iter2/lc_min.txt')
get_habitat_count(habitat_ids)

Iteration 3

In [None]:
# Habitat counts for the ent_max selections in seed 3, iteration 3 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed3/iter3/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 3, iteration 3 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed3/iter3/lc_min.txt')
get_habitat_count(habitat_ids)

Seed 4

Iteration 1

In [None]:
# Habitat counts for the ent_max selections in seed 4, iteration 1 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed4/iter1/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 4, iteration 1 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed4/iter1/lc_min.txt')
get_habitat_count(habitat_ids)

Iteration 2

In [None]:
# Habitat counts for the ent_max selections in seed 4, iteration 2 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed4/iter2/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 4, iteration 2 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed4/iter2/lc_min.txt')
get_habitat_count(habitat_ids)

Iteration 3

In [None]:
# Habitat counts for the ent_max selections in seed 4, iteration 3 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed4/iter3/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 4, iteration 3 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed4/iter3/lc_min.txt')
get_habitat_count(habitat_ids)

Seed 5

Iteration 1

In [None]:
# Habitat counts for the ent_max selections in seed 5, iteration 1 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed5/iter1/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 5, iteration 1 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed5/iter1/lc_min.txt')
get_habitat_count(habitat_ids)

Iteration 2

In [None]:
# Habitat counts for the ent_max selections in seed 5, iteration 2 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed5/iter2/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 5, iteration 2 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed5/iter2/lc_min.txt')
get_habitat_count(habitat_ids)

Iteration 3

In [None]:
# Habitat counts for the ent_max selections in seed 5, iteration 3 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed5/iter3/ent_max.txt')
get_habitat_count(habitat_ids)

In [None]:
# Habitat counts for the lc_min selections in seed 5, iteration 3 (most frequent first).
habitat_ids = get_habitats('experiments/breakdowns/seed5/iter3/lc_min.txt')
get_habitat_count(habitat_ids)