In [175]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import warnings
import sys
import os
from datetime import datetime
import copy
warnings.filterwarnings('ignore')

# Repository root used to build data paths.
# Set the ACTIVE_LEARNING_HOME environment variable to run this notebook on
# another machine; the original absolute path is kept as the default so
# existing runs are unaffected.
HOME_DIR = os.environ.get(
    'ACTIVE_LEARNING_HOME',
    '/Users/lucasbraga/Documents/GitHub/active-learning',
)

In [176]:
from iteration_by_iteration_analysis import *

In [177]:
# Location of the credit card fraud CSV, built from HOME_DIR.
# NOTE(review): HOME_DIR already ends in "active-learning" and this appends a
# second "active-learning/" segment -- presumably a nested checkout directory;
# confirm before changing HOME_DIR.
fraud_data_path = f'{HOME_DIR}/active-learning/data/european-credit-card-dataset/creditcard.csv'

In [178]:
# Load the fraud dataset and obtain train/test features and labels.
# NOTE(review): load_and_split_data is presumably provided by the star-import
# of iteration_by_iteration_analysis; judging by its printed log it also
# cleans/feature-engineers the data before splitting -- confirm in that module.
X_train, X_test, y_train, y_test = load_and_split_data(fraud_data_path)

Loading and splitting credit card fraud data...
Dataset shape: (284807, 31)
🧹 Cleaning and preprocessing credit card fraud dataset...
  🎯 Target distribution: {0: 284315, 1: 492}
  ⚠️  Fraud percentage: 0.173%
  📊 Final dataset shape: (284807, 37)
Train set: 227845 samples
Test set: 56962 samples


In [179]:
# Sum of the labels in each split, i.e. the number of fraud (label 1) rows
# in the train and test sets respectively.
y_train.sum(), y_test.sum()

(394, 98)

### detailed_iteration_analysis

In [180]:
# Seed shared by the validation split, the initial pool and the samplers.
random_seed = 42

# Experiment settings read by every iteration cell below.
# strategy_sequence[k - 1] is the query strategy used in iteration k; batch
# selection is guarded by `iteration < n_iterations`, so the final iteration
# only evaluates and its strategy entry is not used for selection.
config = dict(
    initial_samples=300,
    batch_size=68,
    n_iterations=6,  # Stop at iteration 6 as requested
    model_type='logistic',
    # First 6 from champion config
    strategy_sequence=['uncertainty'] * 4 + ['diversity', 'uncertainty'],
)

In [181]:
# Create IDENTICAL validation splits for both approaches:
# a single seeded, stratified 80/20 split of the training data. The same
# positional indices are reused by the active and the passive arm.
sss = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=random_seed,
)
fold_iter = sss.split(X_train, y_train)
train_idx, val_idx = next(fold_iter)  # n_splits=1 -> take the only fold

In [182]:
# Create IDENTICAL base data for both approaches.
# train_idx / val_idx are positional, hence .iloc.
X_train_val, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
y_train_val, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

# One-shot report of the split sizes, fraud counts and the planned strategies.
print(
    f"🔍 VALIDATION SPLIT:",
    f"  Training pool: {len(X_train_val)} samples",
    f"  Validation set: {len(X_val)} samples",
    f"  Fraud in training pool: {(y_train_val == 1).sum()}",
    f"  Fraud in validation: {(y_val == 1).sum()}",
    f"🎯 STRATEGY SEQUENCE: {config['strategy_sequence']}",
    sep="\n",
)

🔍 VALIDATION SPLIT:
  Training pool: 182276 samples
  Validation set: 45569 samples
  Fraud in training pool: 315
  Fraud in validation: 79
🎯 STRATEGY SEQUENCE: ['uncertainty', 'uncertainty', 'uncertainty', 'uncertainty', 'diversity', 'uncertainty']


In [183]:
# Create IDENTICAL initial labeled pools for both approaches.
# The returned indices are used below as index *labels* (.loc / .drop(index=...))
# on X_train_val / y_train_val.
# NOTE(review): stratified_initial_split is a project helper; per its printed
# log it draws a seeded mix of fraud and non-fraud rows -- confirm details there.
initial_indices = stratified_initial_split(X_train_val, y_train_val, config['initial_samples'], random_seed)

  🎯 Creating stratified initial split with 300 samples...
    Available fraud samples: 315
    Available non-fraud samples: 181961
    ✓ Selected 10 fraud + 290 non-fraud samples
    ✓ Initial fraud percentage: 3.33%


In [184]:
# ACTIVE LEARNING SETUP - Gets its own independent copy
# Labeled pool = the initial indices; unlabeled pool = everything else in the
# training pool. Copies keep this arm's frames independent of the passive arm.
X_labeled_active, y_labeled_active = (
    X_train_val.loc[initial_indices].copy(),
    y_train_val.loc[initial_indices].copy(),
)
X_unlabeled_active, y_unlabeled_active = (
    X_train_val.drop(index=initial_indices).copy(),
    y_train_val.drop(index=initial_indices).copy(),
)

In [185]:
# PASSIVE LEARNING SETUP - Gets its own independent copy
# Built from the same initial_indices as the active arm, so both arms start
# from an IDENTICAL initial pool and an IDENTICAL unlabeled pool.
X_labeled_passive, y_labeled_passive = (
    X_train_val.loc[initial_indices].copy(),
    y_train_val.loc[initial_indices].copy(),
)
X_unlabeled_passive, y_unlabeled_passive = (
    X_train_val.drop(index=initial_indices).copy(),
    y_train_val.drop(index=initial_indices).copy(),
)

In [186]:
# Inspect the initial active-learning labeled pool (bare expression -> rich display).
X_labeled_active

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280454,-0.177321,0.401354,-0.252446,-0.391362,0.871508,-0.481571,0.637525,-0.001588,-0.263959,-0.511834,...,0.108163,0.050379,1.551298,-1.518497,True,False,False,False,False,False
257702,0.969499,-0.129577,-0.871865,0.297182,-0.024439,-0.895459,0.308341,-0.376659,0.412099,-0.063894,...,-0.123933,-0.959641,0.884676,0.793000,False,False,True,False,False,False
192292,-0.875332,-0.924951,0.917352,-1.459346,0.873086,-0.304408,-0.135306,0.105184,-0.923538,-0.518500,...,0.274085,0.421721,-1.197973,1.095829,False,False,False,True,False,False
208774,1.098189,-0.042680,-1.696275,-0.239223,0.725972,-0.448028,0.404946,-0.168672,0.233288,0.211473,...,-0.287417,-0.417440,-0.986932,-1.564953,True,False,False,False,False,False


In [187]:
# Inspect the initial passive labeled pool; it was selected with the same
# initial_indices as the active pool, so the two displays should match.
X_labeled_passive

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280454,-0.177321,0.401354,-0.252446,-0.391362,0.871508,-0.481571,0.637525,-0.001588,-0.263959,-0.511834,...,0.108163,0.050379,1.551298,-1.518497,True,False,False,False,False,False
257702,0.969499,-0.129577,-0.871865,0.297182,-0.024439,-0.895459,0.308341,-0.376659,0.412099,-0.063894,...,-0.123933,-0.959641,0.884676,0.793000,False,False,True,False,False,False
192292,-0.875332,-0.924951,0.917352,-1.459346,0.873086,-0.304408,-0.135306,0.105184,-0.923538,-0.518500,...,0.274085,0.421721,-1.197973,1.095829,False,False,False,True,False,False
208774,1.098189,-0.042680,-1.696275,-0.239223,0.725972,-0.448028,0.404946,-0.168672,0.233288,0.211473,...,-0.287417,-0.417440,-0.986932,-1.564953,True,False,False,False,False,False


In [188]:
# Report the shared starting point. Only the active frames are read here; the
# passive frames were built from the same indices.
initial_fraud_labeled = (y_labeled_active == 1).sum()
initial_fraud_unlabeled = (y_unlabeled_active == 1).sum()
print(
    f"\n🚀 INITIAL STATE (IDENTICAL FOR BOTH):",
    f"  Labeled pool: {len(X_labeled_active)} samples",
    f"  Fraud in labeled: {initial_fraud_labeled} ({initial_fraud_labeled/len(y_labeled_active)*100:.2f}%)",
    f"  Unlabeled pool: {len(X_unlabeled_active)} samples",
    f"  Fraud in unlabeled: {initial_fraud_unlabeled} ({initial_fraud_unlabeled/len(y_unlabeled_active)*100:.3f}%)",
    sep="\n",
)


🚀 INITIAL STATE (IDENTICAL FOR BOTH):
  Labeled pool: 300 samples
  Fraud in labeled: 10 (3.33%)
  Unlabeled pool: 181976 samples
  Fraud in unlabeled: 305 (0.168%)


In [189]:
# Storage for iteration progression: the per-iteration summary cells append
# one dict of metrics and pool statistics to this list.
iteration_results = []

In [190]:
# PARALLEL ITERATION LOOP
# NOTE(review): this loop only prints the iteration numbers 1..n_iterations.
# The per-iteration work is done in the manually unrolled "ITERATION = k"
# cells, each of which assigns `iteration` explicitly before use. After this
# loop `iteration` is left bound to the last value printed.
for iteration in range(1, config['n_iterations'] + 1):
    print(iteration)

1
2
3
4
5
6


### ITERATION = 1

In [191]:
# Manually unrolled loop: the active, passive and summary cells read `iteration`.
iteration = 1

In [192]:
# ===================
# ACTIVE LEARNING
# ===================
# Evaluate a model built from the current active labeled pool, then (on every
# iteration except the last) query one batch from the unlabeled pool and move
# it into the labeled pool. Running this cell twice moves two batches.
labeled_fraud_active = (y_labeled_active == 1).sum()
print(
    f"\n🔴 ACTIVE LEARNING - Iteration {iteration}",
    f"  📊 Current labeled pool: {len(X_labeled_active)} samples",
    f"  🎯 Fraud in labeled: {labeled_fraud_active} samples ({labeled_fraud_active/len(y_labeled_active)*100:.2f}%)",
    sep="\n",
)

# train_and_evaluate is a project helper returning (model, metrics dict);
# presumably it fits on the labeled pool and scores on X_val / y_val.
active_model, active_metrics = train_and_evaluate(
    X_labeled_active,
    y_labeled_active,
    X_val,
    y_val,
    config['model_type'],
)

print(
    f"  📈 Validation Performance: F1={active_metrics['f1']:.4f}, "
    f"Acc={active_metrics['accuracy']:.4f}, "
    f"Prec={active_metrics['precision']:.4f}, "
    f"Rec={active_metrics['recall']:.4f}"
)

# Select new samples using active learning strategy.
# These defaults are what the passive and summary cells see when no batch is
# selected (final iteration or exhausted pool).
active_new_samples = None
active_fraud_count = active_non_fraud_count = 0

if iteration < config['n_iterations'] and len(X_unlabeled_active) > 0:
    # Strategy for this specific iteration; past the end of the configured
    # sequence the default is uncertainty sampling.
    if iteration <= len(config['strategy_sequence']):
        current_strategy = config['strategy_sequence'][iteration - 1]
    else:
        current_strategy = 'uncertainty'
    print(f"  🎯 Selecting {config['batch_size']} new samples using {current_strategy} sampling...")

    if current_strategy == 'diversity':
        active_new_samples = diversity_sampling(X_unlabeled_active, config['batch_size'])
    elif current_strategy == 'uncertainty':
        active_new_samples = uncertainty_sampling(active_model, X_unlabeled_active, config['batch_size'])
    else:
        # Any other strategy name falls back to seeded random sampling.
        active_new_samples = X_unlabeled_active.sample(n=config['batch_size'], random_state=random_seed + iteration)

    # Reveal the labels of the queried rows and count fraud/non-fraud.
    active_new_labels = y_unlabeled_active.loc[active_new_samples.index]
    active_fraud_count = (active_new_labels == 1).sum()
    active_non_fraud_count = (active_new_labels == 0).sum()

    print(
        f"  ✅ Active Learning found: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud = {len(active_new_samples)} total",
        f"  📊 Active Learning fraud rate in batch: {active_fraud_count/len(active_new_samples)*100:.2f}%",
        sep="\n",
    )

    # Move the batch: append to the labeled pool, remove from the unlabeled pool.
    X_labeled_active = pd.concat((X_labeled_active, active_new_samples))
    y_labeled_active = pd.concat((y_labeled_active, active_new_labels))
    X_unlabeled_active = X_unlabeled_active.drop(index=active_new_samples.index)
    y_unlabeled_active = y_unlabeled_active.drop(index=active_new_samples.index)


🔴 ACTIVE LEARNING - Iteration 1
  📊 Current labeled pool: 300 samples
  🎯 Fraud in labeled: 10 samples (3.33%)
  📈 Validation Performance: F1=0.6936, Acc=0.9988, Prec=0.6383, Rec=0.7595
  🎯 Selecting 68 new samples using uncertainty sampling...
  ✅ Active Learning found: 0 fraud + 68 non-fraud = 68 total
  📊 Active Learning fraud rate in batch: 0.00%


In [193]:
# Inspect the active labeled pool after this iteration's batch was appended.
X_labeled_active

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127244,-0.345886,1.076449,0.683442,1.662303,0.164737,-0.651629,0.686320,0.128872,-1.863006,0.204218,...,0.204678,-0.463598,1.353170,-0.866083,True,False,False,False,False,False
51670,-3.635334,-3.567519,0.586897,2.326277,3.598085,-2.711976,-2.329101,0.500073,-0.324441,-0.631061,...,-5.855942,0.209127,-1.185131,-0.605292,True,False,False,False,False,False
251119,-0.304432,0.258023,0.980422,-0.314449,0.518519,-0.019149,0.251173,0.054679,0.033561,-0.856806,...,0.492100,-1.099546,0.598588,-0.094743,False,True,False,False,False,False
96331,0.602365,-0.106213,0.399823,0.925081,-0.308533,0.422527,-0.438591,0.275222,0.843644,0.008708,...,0.010353,-1.163725,0.289516,-0.715347,True,False,False,False,False,False


In [194]:
# Inspect the active unlabeled pool after the queried batch was removed.
X_unlabeled_active

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
1458,-0.225640,0.654553,0.930534,0.068293,0.097969,-0.530334,0.538452,0.011392,-0.462076,-0.526294,...,0.304609,0.555525,1.585677,-0.636328,True,False,False,False,False,False
119883,0.626228,0.102507,0.215307,0.348311,-0.089246,-0.235341,-0.007871,-0.040686,0.053203,-0.105051,...,0.040865,-0.699444,1.183404,-0.455862,True,False,False,False,False,False
177219,0.872106,-0.423046,-0.636374,0.335275,-0.188945,0.178914,-0.476561,0.046681,1.060740,-0.591898,...,0.069050,1.150794,-1.044542,1.206248,False,False,False,True,False,False
191980,1.070430,0.056710,-1.423928,0.675074,0.775867,-0.048140,0.349212,-0.103700,0.176043,0.425025,...,-0.271746,0.438277,-1.197925,-1.484349,True,False,False,False,False,False
252575,-0.290734,-0.579083,1.323932,-1.123243,-0.135298,-0.632335,-0.563976,-0.004633,1.776121,-1.092948,...,-0.024037,-1.076325,0.660735,-0.031447,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24156,0.008268,0.506874,0.931599,1.202478,-0.447450,0.195929,-0.433090,-1.275878,0.136690,-0.075396,...,0.725106,1.492411,-0.837970,-1.426817,True,False,False,False,False,False
258612,-1.728628,1.967535,-1.481608,0.701573,-0.528670,0.711485,-2.279238,-3.178181,-1.781841,-1.632174,...,-0.234764,-0.934264,0.922417,0.303863,False,True,False,False,False,False
19453,-0.886862,-0.539047,2.256032,0.279483,-0.048984,-0.251840,-0.174346,0.253323,1.368394,-1.084568,...,-0.398225,1.708885,-0.627859,0.746128,False,False,True,False,False,False
129088,-0.325680,0.692435,1.330881,-0.040989,0.104750,-0.485390,0.787091,-0.249193,-0.444161,-0.101846,...,-0.134209,-0.396711,1.390471,-0.936098,True,False,False,False,False,False


In [195]:
# ===================
# PASSIVE LEARNING - MATCHED QUANTITIES
# ===================
# Baseline arm: evaluate a model built from the passive labeled pool, then add
# a randomly drawn batch whose fraud / non-fraud counts are requested to equal
# what the active arm just selected. Must run after the active cell of the
# same iteration, because it reads active_new_samples and the active counts.
labeled_fraud_passive = (y_labeled_passive == 1).sum()
print(
    f"\n🔵 PASSIVE LEARNING - Iteration {iteration}",
    f"  📊 Current labeled pool: {len(X_labeled_passive)} samples",
    f"  🎯 Fraud in labeled: {labeled_fraud_passive} samples ({labeled_fraud_passive/len(y_labeled_passive)*100:.2f}%)",
    sep="\n",
)

# train_and_evaluate is a project helper returning (model, metrics dict);
# presumably it fits on the labeled pool and scores on X_val / y_val.
passive_model, passive_metrics = train_and_evaluate(
    X_labeled_passive,
    y_labeled_passive,
    X_val,
    y_val,
    config['model_type'],
)

print(
    f"  📈 Validation Performance: F1={passive_metrics['f1']:.4f}, "
    f"Acc={passive_metrics['accuracy']:.4f}, "
    f"Prec={passive_metrics['precision']:.4f}, "
    f"Rec={passive_metrics['recall']:.4f}"
)

# Select new samples using MATCHED QUANTITIES from active learning.
# Skipped whenever the active arm selected nothing in this iteration.
if iteration < config['n_iterations'] and len(X_unlabeled_passive) > 0 and active_new_samples is not None:
    print(f"  🎯 Matching Active Learning's selection: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud...")

    # Match the exact composition that active learning found
    passive_seed = random_seed + iteration + 1000  # Different seed than active learning
    passive_new_samples = matched_quantity_random_sampling(
        X_unlabeled_passive,
        y_unlabeled_passive,
        active_fraud_count,
        active_non_fraud_count,
        passive_seed,
    )

    # Look the batch labels up once and reuse them for reporting and pooling.
    passive_new_labels = y_unlabeled_passive.loc[passive_new_samples.index]
    passive_batch_fraud = (passive_new_labels == 1).sum()
    passive_batch_non_fraud = (passive_new_labels == 0).sum()
    print(
        f"  ✅ Passive Learning selected: {passive_batch_fraud} fraud + {passive_batch_non_fraud} non-fraud = {len(passive_new_samples)} total",
        f"  📊 Passive Learning fraud rate in batch: {passive_batch_fraud/len(passive_new_samples)*100:.2f}%",
        sep="\n",
    )

    # Move the batch: append to the labeled pool, remove from the unlabeled pool.
    X_labeled_passive = pd.concat((X_labeled_passive, passive_new_samples))
    y_labeled_passive = pd.concat((y_labeled_passive, passive_new_labels))
    X_unlabeled_passive = X_unlabeled_passive.drop(index=passive_new_samples.index)
    y_unlabeled_passive = y_unlabeled_passive.drop(index=passive_new_samples.index)


🔵 PASSIVE LEARNING - Iteration 1
  📊 Current labeled pool: 300 samples
  🎯 Fraud in labeled: 10 samples (3.33%)
  📈 Validation Performance: F1=0.6936, Acc=0.9988, Prec=0.6383, Rec=0.7595
  🎯 Matching Active Learning's selection: 0 fraud + 68 non-fraud...
  ✅ Passive Learning selected: 0 fraud + 68 non-fraud = 68 total
  📊 Passive Learning fraud rate in batch: 0.00%


In [196]:
# Calculate differences
# NOTE: active_metrics / passive_metrics come from models evaluated before any
# batch selected in this iteration was appended, whereas the pool sizes and
# fraud counts read below reflect the pools after that batch was appended.
f1_difference = active_metrics['f1'] - passive_metrics['f1']
# Relative gain over the passive baseline; reported as 0 when the baseline F1 is 0.
f1_improvement_pct = (f1_difference / passive_metrics['f1'] * 100) if passive_metrics['f1'] > 0 else 0

# Pool statistics, computed once and reused for both the report and the record.
active_pool_fraud = (y_labeled_active == 1).sum()
passive_pool_fraud = (y_labeled_passive == 1).sum()
active_pool_fraud_rate = active_pool_fraud/len(y_labeled_active)*100
passive_pool_fraud_rate = passive_pool_fraud/len(y_labeled_passive)*100

print(f"\n📊 ITERATION {iteration} SUMMARY:")
print(f"  🔴 Active Learning F1:  {active_metrics['f1']:.4f}")
print(f"  🔵 Passive Learning F1: {passive_metrics['f1']:.4f}")
print(f"  📈 Difference: {f1_difference:+.4f} ({f1_improvement_pct:+.1f}%)")
print(f"  🎯 Both labeled pools now have: {len(X_labeled_active)} samples")
print(f"  🔴 Active fraud in pool: {active_pool_fraud} ({active_pool_fraud_rate:.2f}%)")
print(f"  🔵 Passive fraud in pool: {passive_pool_fraud} ({passive_pool_fraud_rate:.2f}%)")

# Store results
current_strategy = config['strategy_sequence'][iteration - 1] if iteration <= len(config['strategy_sequence']) else 'uncertainty'

# Re-running this cell must not leave two records for the same iteration:
# drop any earlier record for it first (in place, so other references to the
# list stay valid), append the fresh one, and keep the list ordered.
iteration_results[:] = [rec for rec in iteration_results if rec['iteration'] != iteration]
iteration_results.append({
    'iteration': iteration,
    'strategy_used': current_strategy,
    'active_f1': active_metrics['f1'],
    'passive_f1': passive_metrics['f1'],
    'active_accuracy': active_metrics['accuracy'],
    'passive_accuracy': passive_metrics['accuracy'],
    'active_precision': active_metrics['precision'],
    'passive_precision': passive_metrics['precision'],
    'active_recall': active_metrics['recall'],
    'passive_recall': passive_metrics['recall'],
    'f1_difference': f1_difference,
    'f1_improvement_pct': f1_improvement_pct,
    'active_labeled_count': len(X_labeled_active),
    'passive_labeled_count': len(X_labeled_passive),
    'active_fraud_count': active_pool_fraud,
    'passive_fraud_count': passive_pool_fraud,
    'active_fraud_rate': active_pool_fraud_rate,
    'passive_fraud_rate': passive_pool_fraud_rate,
    # Composition of the batch the active arm selected in this iteration
    # (0/0 when no batch was selected).
    'batch_fraud_count': active_fraud_count if active_new_samples is not None else 0,
    'batch_non_fraud_count': active_non_fraud_count if active_new_samples is not None else 0
})
iteration_results.sort(key=lambda rec: rec['iteration'])


📊 ITERATION 1 SUMMARY:
  🔴 Active Learning F1:  0.6936
  🔵 Passive Learning F1: 0.6936
  📈 Difference: +0.0000 (+0.0%)
  🎯 Both labeled pools now have: 368 samples
  🔴 Active fraud in pool: 10 (2.72%)
  🔵 Passive fraud in pool: 10 (2.72%)


### ITERATION = 2

In [197]:
# Manually unrolled loop: the active, passive and summary cells read `iteration`.
iteration = 2

In [198]:
# ===================
# ACTIVE LEARNING
# ===================
# Evaluate a model built from the current active labeled pool, then (on every
# iteration except the last) query one batch from the unlabeled pool and move
# it into the labeled pool. Running this cell twice moves two batches.
labeled_fraud_active = (y_labeled_active == 1).sum()
print(
    f"\n🔴 ACTIVE LEARNING - Iteration {iteration}",
    f"  📊 Current labeled pool: {len(X_labeled_active)} samples",
    f"  🎯 Fraud in labeled: {labeled_fraud_active} samples ({labeled_fraud_active/len(y_labeled_active)*100:.2f}%)",
    sep="\n",
)

# train_and_evaluate is a project helper returning (model, metrics dict);
# presumably it fits on the labeled pool and scores on X_val / y_val.
active_model, active_metrics = train_and_evaluate(
    X_labeled_active,
    y_labeled_active,
    X_val,
    y_val,
    config['model_type'],
)

print(
    f"  📈 Validation Performance: F1={active_metrics['f1']:.4f}, "
    f"Acc={active_metrics['accuracy']:.4f}, "
    f"Prec={active_metrics['precision']:.4f}, "
    f"Rec={active_metrics['recall']:.4f}"
)

# Select new samples using active learning strategy.
# These defaults are what the passive and summary cells see when no batch is
# selected (final iteration or exhausted pool).
active_new_samples = None
active_fraud_count = active_non_fraud_count = 0

if iteration < config['n_iterations'] and len(X_unlabeled_active) > 0:
    # Strategy for this specific iteration; past the end of the configured
    # sequence the default is uncertainty sampling.
    if iteration <= len(config['strategy_sequence']):
        current_strategy = config['strategy_sequence'][iteration - 1]
    else:
        current_strategy = 'uncertainty'
    print(f"  🎯 Selecting {config['batch_size']} new samples using {current_strategy} sampling...")

    if current_strategy == 'diversity':
        active_new_samples = diversity_sampling(X_unlabeled_active, config['batch_size'])
    elif current_strategy == 'uncertainty':
        active_new_samples = uncertainty_sampling(active_model, X_unlabeled_active, config['batch_size'])
    else:
        # Any other strategy name falls back to seeded random sampling.
        active_new_samples = X_unlabeled_active.sample(n=config['batch_size'], random_state=random_seed + iteration)

    # Reveal the labels of the queried rows and count fraud/non-fraud.
    active_new_labels = y_unlabeled_active.loc[active_new_samples.index]
    active_fraud_count = (active_new_labels == 1).sum()
    active_non_fraud_count = (active_new_labels == 0).sum()

    print(
        f"  ✅ Active Learning found: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud = {len(active_new_samples)} total",
        f"  📊 Active Learning fraud rate in batch: {active_fraud_count/len(active_new_samples)*100:.2f}%",
        sep="\n",
    )

    # Move the batch: append to the labeled pool, remove from the unlabeled pool.
    X_labeled_active = pd.concat((X_labeled_active, active_new_samples))
    y_labeled_active = pd.concat((y_labeled_active, active_new_labels))
    X_unlabeled_active = X_unlabeled_active.drop(index=active_new_samples.index)
    y_unlabeled_active = y_unlabeled_active.drop(index=active_new_samples.index)


🔴 ACTIVE LEARNING - Iteration 2
  📊 Current labeled pool: 368 samples
  🎯 Fraud in labeled: 10 samples (2.72%)
  📈 Validation Performance: F1=0.7947, Acc=0.9993, Prec=0.8333, Rec=0.7595
  🎯 Selecting 68 new samples using uncertainty sampling...
  ✅ Active Learning found: 0 fraud + 68 non-fraud = 68 total
  📊 Active Learning fraud rate in batch: 0.00%


In [199]:
# Inspect the active labeled pool after this iteration's batch was appended.
X_labeled_active

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138959,0.538125,-0.191224,0.288621,0.085109,0.243442,1.575633,-0.716042,0.679334,0.328667,-0.060995,...,-0.029936,0.028026,1.546412,-0.691347,True,False,False,False,False,False
199529,0.974572,0.232869,-1.532120,0.990490,0.752714,-0.478289,0.523500,-0.185264,-0.182257,-0.204073,...,-0.073613,0.034420,-1.155426,0.433262,False,True,False,False,False,False
54792,0.642585,-0.044987,0.204289,-0.077026,-0.377797,-0.532180,-0.137820,-0.033325,0.364625,-0.187451,...,-0.006624,0.032737,-1.155052,-1.558093,True,False,False,False,False,False
116477,0.712518,-0.943477,1.143595,-0.544553,-1.772230,0.226512,-1.574063,0.161404,-0.269233,0.906783,...,0.103658,-0.802978,1.083798,-0.301179,False,True,False,False,False,False


In [200]:
# Inspect the active unlabeled pool after the queried batch was removed.
X_unlabeled_active

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
1458,-0.225640,0.654553,0.930534,0.068293,0.097969,-0.530334,0.538452,0.011392,-0.462076,-0.526294,...,0.304609,0.555525,1.585677,-0.636328,True,False,False,False,False,False
119883,0.626228,0.102507,0.215307,0.348311,-0.089246,-0.235341,-0.007871,-0.040686,0.053203,-0.105051,...,0.040865,-0.699444,1.183404,-0.455862,True,False,False,False,False,False
177219,0.872106,-0.423046,-0.636374,0.335275,-0.188945,0.178914,-0.476561,0.046681,1.060740,-0.591898,...,0.069050,1.150794,-1.044542,1.206248,False,False,False,True,False,False
191980,1.070430,0.056710,-1.423928,0.675074,0.775867,-0.048140,0.349212,-0.103700,0.176043,0.425025,...,-0.271746,0.438277,-1.197925,-1.484349,True,False,False,False,False,False
252575,-0.290734,-0.579083,1.323932,-1.123243,-0.135298,-0.632335,-0.563976,-0.004633,1.776121,-1.092948,...,-0.024037,-1.076325,0.660735,-0.031447,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24156,0.008268,0.506874,0.931599,1.202478,-0.447450,0.195929,-0.433090,-1.275878,0.136690,-0.075396,...,0.725106,1.492411,-0.837970,-1.426817,True,False,False,False,False,False
258612,-1.728628,1.967535,-1.481608,0.701573,-0.528670,0.711485,-2.279238,-3.178181,-1.781841,-1.632174,...,-0.234764,-0.934264,0.922417,0.303863,False,True,False,False,False,False
19453,-0.886862,-0.539047,2.256032,0.279483,-0.048984,-0.251840,-0.174346,0.253323,1.368394,-1.084568,...,-0.398225,1.708885,-0.627859,0.746128,False,False,True,False,False,False
129088,-0.325680,0.692435,1.330881,-0.040989,0.104750,-0.485390,0.787091,-0.249193,-0.444161,-0.101846,...,-0.134209,-0.396711,1.390471,-0.936098,True,False,False,False,False,False


In [201]:
# ===================
# PASSIVE LEARNING - MATCHED QUANTITIES
# ===================
# Baseline arm: evaluate a model built from the passive labeled pool, then add
# a randomly drawn batch whose fraud / non-fraud counts are requested to equal
# what the active arm just selected. Must run after the active cell of the
# same iteration, because it reads active_new_samples and the active counts.
labeled_fraud_passive = (y_labeled_passive == 1).sum()
print(
    f"\n🔵 PASSIVE LEARNING - Iteration {iteration}",
    f"  📊 Current labeled pool: {len(X_labeled_passive)} samples",
    f"  🎯 Fraud in labeled: {labeled_fraud_passive} samples ({labeled_fraud_passive/len(y_labeled_passive)*100:.2f}%)",
    sep="\n",
)

# train_and_evaluate is a project helper returning (model, metrics dict);
# presumably it fits on the labeled pool and scores on X_val / y_val.
passive_model, passive_metrics = train_and_evaluate(
    X_labeled_passive,
    y_labeled_passive,
    X_val,
    y_val,
    config['model_type'],
)

print(
    f"  📈 Validation Performance: F1={passive_metrics['f1']:.4f}, "
    f"Acc={passive_metrics['accuracy']:.4f}, "
    f"Prec={passive_metrics['precision']:.4f}, "
    f"Rec={passive_metrics['recall']:.4f}"
)

# Select new samples using MATCHED QUANTITIES from active learning.
# Skipped whenever the active arm selected nothing in this iteration.
if iteration < config['n_iterations'] and len(X_unlabeled_passive) > 0 and active_new_samples is not None:
    print(f"  🎯 Matching Active Learning's selection: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud...")

    # Match the exact composition that active learning found
    passive_seed = random_seed + iteration + 1000  # Different seed than active learning
    passive_new_samples = matched_quantity_random_sampling(
        X_unlabeled_passive,
        y_unlabeled_passive,
        active_fraud_count,
        active_non_fraud_count,
        passive_seed,
    )

    # Look the batch labels up once and reuse them for reporting and pooling.
    passive_new_labels = y_unlabeled_passive.loc[passive_new_samples.index]
    passive_batch_fraud = (passive_new_labels == 1).sum()
    passive_batch_non_fraud = (passive_new_labels == 0).sum()
    print(
        f"  ✅ Passive Learning selected: {passive_batch_fraud} fraud + {passive_batch_non_fraud} non-fraud = {len(passive_new_samples)} total",
        f"  📊 Passive Learning fraud rate in batch: {passive_batch_fraud/len(passive_new_samples)*100:.2f}%",
        sep="\n",
    )

    # Move the batch: append to the labeled pool, remove from the unlabeled pool.
    X_labeled_passive = pd.concat((X_labeled_passive, passive_new_samples))
    y_labeled_passive = pd.concat((y_labeled_passive, passive_new_labels))
    X_unlabeled_passive = X_unlabeled_passive.drop(index=passive_new_samples.index)
    y_unlabeled_passive = y_unlabeled_passive.drop(index=passive_new_samples.index)


🔵 PASSIVE LEARNING - Iteration 2
  📊 Current labeled pool: 368 samples
  🎯 Fraud in labeled: 10 samples (2.72%)
  📈 Validation Performance: F1=0.6857, Acc=0.9988, Prec=0.6250, Rec=0.7595
  🎯 Matching Active Learning's selection: 0 fraud + 68 non-fraud...
  ✅ Passive Learning selected: 0 fraud + 68 non-fraud = 68 total
  📊 Passive Learning fraud rate in batch: 0.00%


In [202]:
# Inspect the passive labeled pool after this iteration's matched batch was appended.
X_labeled_passive

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181566,1.050320,-0.018308,-1.202557,0.341585,0.389721,-0.590089,0.385360,-0.281193,0.314915,0.100963,...,-0.198942,0.944248,-1.121655,0.005724,False,True,False,False,False,False
240654,0.918555,-0.340087,-0.809525,-0.037056,0.350733,0.733917,-0.210651,0.266094,0.739510,-0.190498,...,-0.219428,-1.166221,0.145018,0.443995,False,True,False,False,False,False
110828,-0.223391,0.662190,1.132326,0.028607,-0.122757,-0.810701,0.596546,-0.098460,-0.455225,-0.299669,...,0.380221,-0.954524,0.892508,-1.518497,True,False,False,False,False,False
153036,1.088526,0.203487,-1.248412,0.186384,0.434923,-1.324975,0.656545,-0.649596,1.184380,-0.354041,...,-0.248822,1.598808,1.137804,-0.768253,True,False,False,False,False,False


In [203]:
# Calculate differences
# NOTE: active_metrics / passive_metrics come from models evaluated before any
# batch selected in this iteration was appended, whereas the pool sizes and
# fraud counts read below reflect the pools after that batch was appended.
f1_difference = active_metrics['f1'] - passive_metrics['f1']
# Relative gain over the passive baseline; reported as 0 when the baseline F1 is 0.
f1_improvement_pct = (f1_difference / passive_metrics['f1'] * 100) if passive_metrics['f1'] > 0 else 0

# Pool statistics, computed once and reused for both the report and the record.
active_pool_fraud = (y_labeled_active == 1).sum()
passive_pool_fraud = (y_labeled_passive == 1).sum()
active_pool_fraud_rate = active_pool_fraud/len(y_labeled_active)*100
passive_pool_fraud_rate = passive_pool_fraud/len(y_labeled_passive)*100

print(f"\n📊 ITERATION {iteration} SUMMARY:")
print(f"  🔴 Active Learning F1:  {active_metrics['f1']:.4f}")
print(f"  🔵 Passive Learning F1: {passive_metrics['f1']:.4f}")
print(f"  📈 Difference: {f1_difference:+.4f} ({f1_improvement_pct:+.1f}%)")
print(f"  🎯 Both labeled pools now have: {len(X_labeled_active)} samples")
print(f"  🔴 Active fraud in pool: {active_pool_fraud} ({active_pool_fraud_rate:.2f}%)")
print(f"  🔵 Passive fraud in pool: {passive_pool_fraud} ({passive_pool_fraud_rate:.2f}%)")

# Store results
current_strategy = config['strategy_sequence'][iteration - 1] if iteration <= len(config['strategy_sequence']) else 'uncertainty'

# Re-running this cell must not leave two records for the same iteration:
# drop any earlier record for it first (in place, so other references to the
# list stay valid), append the fresh one, and keep the list ordered.
iteration_results[:] = [rec for rec in iteration_results if rec['iteration'] != iteration]
iteration_results.append({
    'iteration': iteration,
    'strategy_used': current_strategy,
    'active_f1': active_metrics['f1'],
    'passive_f1': passive_metrics['f1'],
    'active_accuracy': active_metrics['accuracy'],
    'passive_accuracy': passive_metrics['accuracy'],
    'active_precision': active_metrics['precision'],
    'passive_precision': passive_metrics['precision'],
    'active_recall': active_metrics['recall'],
    'passive_recall': passive_metrics['recall'],
    'f1_difference': f1_difference,
    'f1_improvement_pct': f1_improvement_pct,
    'active_labeled_count': len(X_labeled_active),
    'passive_labeled_count': len(X_labeled_passive),
    'active_fraud_count': active_pool_fraud,
    'passive_fraud_count': passive_pool_fraud,
    'active_fraud_rate': active_pool_fraud_rate,
    'passive_fraud_rate': passive_pool_fraud_rate,
    # Composition of the batch the active arm selected in this iteration
    # (0/0 when no batch was selected).
    'batch_fraud_count': active_fraud_count if active_new_samples is not None else 0,
    'batch_non_fraud_count': active_non_fraud_count if active_new_samples is not None else 0
})
iteration_results.sort(key=lambda rec: rec['iteration'])


📊 ITERATION 2 SUMMARY:
  🔴 Active Learning F1:  0.7947
  🔵 Passive Learning F1: 0.6857
  📈 Difference: +0.1090 (+15.9%)
  🎯 Both labeled pools now have: 436 samples
  🔴 Active fraud in pool: 10 (2.29%)
  🔵 Passive fraud in pool: 10 (2.29%)


### ITERATION = 3

In [204]:
# Iteration counter read by the cells below: it drives the 1-based strategy lookup,
# the sampling seeds (random_seed + iteration) and the `iteration < n_iterations` check.
iteration = 3

In [205]:
# ---------------------------------------------------------------
# ACTIVE LEARNING: evaluate on the current labeled pool, then query a batch
# ---------------------------------------------------------------
print(f"\n🔴 ACTIVE LEARNING - Iteration {iteration}")
print(f"  📊 Current labeled pool: {len(X_labeled_active)} samples")
al_pool_fraud = (y_labeled_active == 1).sum()
al_pool_fraud_pct = al_pool_fraud / len(y_labeled_active) * 100
print(f"  🎯 Fraud in labeled: {al_pool_fraud} samples ({al_pool_fraud_pct:.2f}%)")

# train_and_evaluate receives the labeled pool plus the validation split; the
# metrics it returns are reported below as validation performance.
active_model, active_metrics = train_and_evaluate(X_labeled_active, y_labeled_active, X_val, y_val, config['model_type'])

print(
    "  📈 Validation Performance: "
    f"F1={active_metrics['f1']:.4f}, Acc={active_metrics['accuracy']:.4f}, "
    f"Prec={active_metrics['precision']:.4f}, Rec={active_metrics['recall']:.4f}"
)

# Defaults mean "nothing was queried"; the passive and summary cells read these.
active_new_samples, active_fraud_count, active_non_fraud_count = None, 0, 0

# Query only before the final iteration and while unlabeled rows remain.
# NOTE(review): re-running this cell moves another batch between the pools.
al_may_query = iteration < config['n_iterations'] and len(X_unlabeled_active) > 0
if al_may_query:
    # 1-based lookup into the schedule; past its end fall back to 'uncertainty'.
    al_schedule = config['strategy_sequence']
    if iteration <= len(al_schedule):
        current_strategy = al_schedule[iteration - 1]
    else:
        current_strategy = 'uncertainty'
    al_batch_size = config['batch_size']
    print(f"  🎯 Selecting {al_batch_size} new samples using {current_strategy} sampling...")

    # Dispatch on the strategy name; any unrecognised name gets a seeded random draw.
    al_samplers = {
        'uncertainty': lambda: uncertainty_sampling(active_model, X_unlabeled_active, al_batch_size),
        'diversity': lambda: diversity_sampling(X_unlabeled_active, al_batch_size),
    }
    al_random_draw = lambda: X_unlabeled_active.sample(al_batch_size, random_state=random_seed + iteration)
    active_new_samples = al_samplers.get(current_strategy, al_random_draw)()

    # Look up the labels of the queried rows and count each class.
    al_queried_idx = active_new_samples.index
    active_new_labels = y_unlabeled_active.loc[al_queried_idx]
    active_fraud_count = (active_new_labels == 1).sum()
    active_non_fraud_count = (active_new_labels == 0).sum()
    al_batch_total = len(active_new_samples)

    print(f"  ✅ Active Learning found: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud = {al_batch_total} total")
    print(f"  📊 Active Learning fraud rate in batch: {active_fraud_count/al_batch_total*100:.2f}%")

    # Append the queried rows to the labeled pool ...
    X_labeled_active = pd.concat([X_labeled_active, active_new_samples])
    y_labeled_active = pd.concat([y_labeled_active, active_new_labels])

    # ... and drop them from the unlabeled pool.
    X_unlabeled_active = X_unlabeled_active.drop(index=al_queried_idx)
    y_unlabeled_active = y_unlabeled_active.drop(index=al_queried_idx)


🔴 ACTIVE LEARNING - Iteration 3
  📊 Current labeled pool: 436 samples
  🎯 Fraud in labeled: 10 samples (2.29%)
  📈 Validation Performance: F1=0.7947, Acc=0.9993, Prec=0.8333, Rec=0.7595
  🎯 Selecting 68 new samples using uncertainty sampling...
  ✅ Active Learning found: 1 fraud + 67 non-fraud = 68 total
  📊 Active Learning fraud rate in batch: 1.47%


In [206]:
# Display the active labeled pool; the preceding cell appends the newly queried
# rows with pd.concat, so they appear at the tail of the frame.
X_labeled_active

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88342,0.622449,0.327235,-0.317385,0.575696,-0.013072,-0.956018,0.198502,-0.107506,-0.266033,-0.473207,...,0.098935,-1.136659,-0.075983,-1.561513,True,False,False,False,False,False
8020,0.662444,-0.016001,0.372750,0.110176,-0.353065,-0.316291,-0.371796,-0.124951,1.641387,-0.473707,...,-0.005049,1.566474,1.167819,-0.194321,False,True,False,False,False,False
186756,-0.658890,0.663510,-0.321903,0.361763,1.215428,-0.543463,1.389960,-0.304082,-0.613545,-0.801647,...,-0.061374,0.696393,-1.177527,1.030164,False,False,False,True,False,False
98299,-3.003501,2.799350,-0.693141,-1.699526,-1.202165,-0.627010,-0.372286,1.097064,2.942529,4.696931,...,3.860294,-1.153598,0.378687,-0.977657,True,False,False,False,False,False


In [207]:
# Display the active unlabeled pool that remains after the queried rows were dropped.
X_unlabeled_active

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
1458,-0.225640,0.654553,0.930534,0.068293,0.097969,-0.530334,0.538452,0.011392,-0.462076,-0.526294,...,0.304609,0.555525,1.585677,-0.636328,True,False,False,False,False,False
119883,0.626228,0.102507,0.215307,0.348311,-0.089246,-0.235341,-0.007871,-0.040686,0.053203,-0.105051,...,0.040865,-0.699444,1.183404,-0.455862,True,False,False,False,False,False
177219,0.872106,-0.423046,-0.636374,0.335275,-0.188945,0.178914,-0.476561,0.046681,1.060740,-0.591898,...,0.069050,1.150794,-1.044542,1.206248,False,False,False,True,False,False
191980,1.070430,0.056710,-1.423928,0.675074,0.775867,-0.048140,0.349212,-0.103700,0.176043,0.425025,...,-0.271746,0.438277,-1.197925,-1.484349,True,False,False,False,False,False
252575,-0.290734,-0.579083,1.323932,-1.123243,-0.135298,-0.632335,-0.563976,-0.004633,1.776121,-1.092948,...,-0.024037,-1.076325,0.660735,-0.031447,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24156,0.008268,0.506874,0.931599,1.202478,-0.447450,0.195929,-0.433090,-1.275878,0.136690,-0.075396,...,0.725106,1.492411,-0.837970,-1.426817,True,False,False,False,False,False
258612,-1.728628,1.967535,-1.481608,0.701573,-0.528670,0.711485,-2.279238,-3.178181,-1.781841,-1.632174,...,-0.234764,-0.934264,0.922417,0.303863,False,True,False,False,False,False
19453,-0.886862,-0.539047,2.256032,0.279483,-0.048984,-0.251840,-0.174346,0.253323,1.368394,-1.084568,...,-0.398225,1.708885,-0.627859,0.746128,False,False,True,False,False,False
129088,-0.325680,0.692435,1.330881,-0.040989,0.104750,-0.485390,0.787091,-0.249193,-0.444161,-0.101846,...,-0.134209,-0.396711,1.390471,-0.936098,True,False,False,False,False,False


In [208]:
# ---------------------------------------------------------------
# PASSIVE LEARNING - MATCHED QUANTITIES
# Random baseline that receives the same fraud / non-fraud counts per batch
# as the active learner selected.
# ---------------------------------------------------------------
print(f"\n🔵 PASSIVE LEARNING - Iteration {iteration}")
print(f"  📊 Current labeled pool: {len(X_labeled_passive)} samples")
pl_pool_fraud = (y_labeled_passive == 1).sum()
pl_pool_fraud_pct = pl_pool_fraud / len(y_labeled_passive) * 100
print(f"  🎯 Fraud in labeled: {pl_pool_fraud} samples ({pl_pool_fraud_pct:.2f}%)")

# Same call as the active cell, but on the passive labeled pool.
passive_model, passive_metrics = train_and_evaluate(X_labeled_passive, y_labeled_passive, X_val, y_val, config['model_type'])

print(
    "  📈 Validation Performance: "
    f"F1={passive_metrics['f1']:.4f}, Acc={passive_metrics['accuracy']:.4f}, "
    f"Prec={passive_metrics['precision']:.4f}, Rec={passive_metrics['recall']:.4f}"
)

# Sample only when the active cell actually queried a batch this iteration.
# NOTE(review): re-running this cell moves another batch between the pools.
pl_may_sample = (
    iteration < config['n_iterations']
    and len(X_unlabeled_passive) > 0
    and active_new_samples is not None
)
if pl_may_sample:
    print(f"  🎯 Matching Active Learning's selection: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud...")

    # The +1000 offset keeps this seed apart from the active cell's random fallback seed.
    pl_seed = random_seed + iteration + 1000
    passive_new_samples = matched_quantity_random_sampling(
        X_unlabeled_passive, y_unlabeled_passive,
        active_fraud_count, active_non_fraud_count,
        pl_seed
    )

    # Resolve the sampled rows' labels once and reuse them for reporting and pooling.
    pl_sampled_idx = passive_new_samples.index
    passive_new_labels = y_unlabeled_passive.loc[pl_sampled_idx]
    pl_batch_fraud = (passive_new_labels == 1).sum()
    pl_batch_non_fraud = (passive_new_labels == 0).sum()
    pl_batch_total = len(passive_new_samples)

    print(f"  ✅ Passive Learning selected: {pl_batch_fraud} fraud + {pl_batch_non_fraud} non-fraud = {pl_batch_total} total")
    print(f"  📊 Passive Learning fraud rate in batch: {pl_batch_fraud/pl_batch_total*100:.2f}%")

    # Append the sampled rows to the passive labeled pool ...
    X_labeled_passive = pd.concat([X_labeled_passive, passive_new_samples])
    y_labeled_passive = pd.concat([y_labeled_passive, passive_new_labels])

    # ... and drop them from the passive unlabeled pool.
    X_unlabeled_passive = X_unlabeled_passive.drop(index=pl_sampled_idx)
    y_unlabeled_passive = y_unlabeled_passive.drop(index=pl_sampled_idx)


🔵 PASSIVE LEARNING - Iteration 3
  📊 Current labeled pool: 436 samples
  🎯 Fraud in labeled: 10 samples (2.29%)
  📈 Validation Performance: F1=0.7018, Acc=0.9989, Prec=0.6522, Rec=0.7595
  🎯 Matching Active Learning's selection: 1 fraud + 67 non-fraud...
  ✅ Passive Learning selected: 1 fraud + 67 non-fraud = 68 total
  📊 Passive Learning fraud rate in batch: 1.47%


In [209]:
# Display the passive labeled pool; the preceding cell appends the matched random
# batch with pd.concat, so the new rows appear at the tail of the frame.
X_labeled_passive

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75660,-0.185196,0.298919,0.927410,-1.071679,0.408459,-0.127878,0.712870,-0.279768,0.102709,-0.042598,...,-1.336653,-0.864876,-0.621303,-1.558093,True,False,False,False,False,False
96269,-0.735924,0.185935,1.388694,-1.284496,-0.497819,0.743699,-1.424756,-1.891966,-1.102778,-0.464209,...,0.299889,-1.163938,0.286683,0.516191,False,False,True,False,False,False
68154,0.527899,-0.834224,0.954079,-0.190806,-1.319927,0.491140,-1.182618,0.353932,0.236088,0.361339,...,0.094325,-0.603738,-0.868009,0.900738,False,False,False,True,False,False
78654,0.432294,-0.635052,0.939687,0.457010,-1.058595,0.473560,-0.756137,0.262525,1.494663,-0.578572,...,0.121234,-0.955967,-0.497896,1.128696,False,False,False,True,False,False


In [210]:
# Active-minus-passive F1 gap: absolute, and relative to the passive score.
f1_difference = active_metrics['f1'] - passive_metrics['f1']
if passive_metrics['f1'] > 0:
    f1_improvement_pct = f1_difference / passive_metrics['f1'] * 100
else:
    # A passive F1 of zero would make the ratio undefined; report 0 instead.
    f1_improvement_pct = 0

print(f"\n📊 ITERATION {iteration} SUMMARY:")
print(f"  🔴 Active Learning F1:  {active_metrics['f1']:.4f}")
print(f"  🔵 Passive Learning F1: {passive_metrics['f1']:.4f}")
print(f"  📈 Difference: {f1_difference:+.4f} ({f1_improvement_pct:+.1f}%)")
# Only the active pool's length is printed on this line; the passive pool's
# length is stored separately in the record below.
print(f"  🎯 Both labeled pools now have: {len(X_labeled_active)} samples")

summary_active_fraud = (y_labeled_active == 1).sum()
summary_active_rate = summary_active_fraud / len(y_labeled_active) * 100
print(f"  🔴 Active fraud in pool: {summary_active_fraud} ({summary_active_rate:.2f}%)")

summary_passive_fraud = (y_labeled_passive == 1).sum()
summary_passive_rate = summary_passive_fraud / len(y_labeled_passive) * 100
print(f"  🔵 Passive fraud in pool: {summary_passive_fraud} ({summary_passive_rate:.2f}%)")

# Strategy label for this iteration (1-based lookup, 'uncertainty' past the end).
if iteration <= len(config['strategy_sequence']):
    current_strategy = config['strategy_sequence'][iteration - 1]
else:
    current_strategy = 'uncertainty'

# NOTE(review): when the cells run in order, the pool sizes / fraud counts stored
# here are read after this iteration's batch was appended (L610-611, L689-690),
# while the metrics come from models evaluated before that batch was added.
# Re-running this cell appends a second record for the same iteration.
summary_batch_was_queried = active_new_samples is not None
summary_record = {
    'iteration': iteration,
    'strategy_used': current_strategy,
    'active_f1': active_metrics['f1'],
    'passive_f1': passive_metrics['f1'],
    'active_accuracy': active_metrics['accuracy'],
    'passive_accuracy': passive_metrics['accuracy'],
    'active_precision': active_metrics['precision'],
    'passive_precision': passive_metrics['precision'],
    'active_recall': active_metrics['recall'],
    'passive_recall': passive_metrics['recall'],
    'f1_difference': f1_difference,
    'f1_improvement_pct': f1_improvement_pct,
    'active_labeled_count': len(X_labeled_active),
    'passive_labeled_count': len(X_labeled_passive),
    'active_fraud_count': summary_active_fraud,
    'passive_fraud_count': summary_passive_fraud,
    'active_fraud_rate': summary_active_rate,
    'passive_fraud_rate': summary_passive_rate,
    'batch_fraud_count': active_fraud_count if summary_batch_was_queried else 0,
    'batch_non_fraud_count': active_non_fraud_count if summary_batch_was_queried else 0,
}
iteration_results.append(summary_record)


📊 ITERATION 3 SUMMARY:
  🔴 Active Learning F1:  0.7947
  🔵 Passive Learning F1: 0.7018
  📈 Difference: +0.0929 (+13.2%)
  🎯 Both labeled pools now have: 504 samples
  🔴 Active fraud in pool: 11 (2.18%)
  🔵 Passive fraud in pool: 11 (2.18%)


### ITERATION = 4

In [211]:
# Iteration counter read by the cells below: it drives the 1-based strategy lookup,
# the sampling seeds (random_seed + iteration) and the `iteration < n_iterations` check.
iteration = 4

In [212]:
# ---------------------------------------------------------------
# ACTIVE LEARNING: evaluate on the current labeled pool, then query a batch
# ---------------------------------------------------------------
print(f"\n🔴 ACTIVE LEARNING - Iteration {iteration}")
print(f"  📊 Current labeled pool: {len(X_labeled_active)} samples")
al_pool_fraud = (y_labeled_active == 1).sum()
al_pool_fraud_pct = al_pool_fraud / len(y_labeled_active) * 100
print(f"  🎯 Fraud in labeled: {al_pool_fraud} samples ({al_pool_fraud_pct:.2f}%)")

# train_and_evaluate receives the labeled pool plus the validation split; the
# metrics it returns are reported below as validation performance.
active_model, active_metrics = train_and_evaluate(X_labeled_active, y_labeled_active, X_val, y_val, config['model_type'])

print(
    "  📈 Validation Performance: "
    f"F1={active_metrics['f1']:.4f}, Acc={active_metrics['accuracy']:.4f}, "
    f"Prec={active_metrics['precision']:.4f}, Rec={active_metrics['recall']:.4f}"
)

# Defaults mean "nothing was queried"; the passive and summary cells read these.
active_new_samples, active_fraud_count, active_non_fraud_count = None, 0, 0

# Query only before the final iteration and while unlabeled rows remain.
# NOTE(review): re-running this cell moves another batch between the pools.
al_may_query = iteration < config['n_iterations'] and len(X_unlabeled_active) > 0
if al_may_query:
    # 1-based lookup into the schedule; past its end fall back to 'uncertainty'.
    al_schedule = config['strategy_sequence']
    if iteration <= len(al_schedule):
        current_strategy = al_schedule[iteration - 1]
    else:
        current_strategy = 'uncertainty'
    al_batch_size = config['batch_size']
    print(f"  🎯 Selecting {al_batch_size} new samples using {current_strategy} sampling...")

    # Dispatch on the strategy name; any unrecognised name gets a seeded random draw.
    al_samplers = {
        'uncertainty': lambda: uncertainty_sampling(active_model, X_unlabeled_active, al_batch_size),
        'diversity': lambda: diversity_sampling(X_unlabeled_active, al_batch_size),
    }
    al_random_draw = lambda: X_unlabeled_active.sample(al_batch_size, random_state=random_seed + iteration)
    active_new_samples = al_samplers.get(current_strategy, al_random_draw)()

    # Look up the labels of the queried rows and count each class.
    al_queried_idx = active_new_samples.index
    active_new_labels = y_unlabeled_active.loc[al_queried_idx]
    active_fraud_count = (active_new_labels == 1).sum()
    active_non_fraud_count = (active_new_labels == 0).sum()
    al_batch_total = len(active_new_samples)

    print(f"  ✅ Active Learning found: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud = {al_batch_total} total")
    print(f"  📊 Active Learning fraud rate in batch: {active_fraud_count/al_batch_total*100:.2f}%")

    # Append the queried rows to the labeled pool ...
    X_labeled_active = pd.concat([X_labeled_active, active_new_samples])
    y_labeled_active = pd.concat([y_labeled_active, active_new_labels])

    # ... and drop them from the unlabeled pool.
    X_unlabeled_active = X_unlabeled_active.drop(index=al_queried_idx)
    y_unlabeled_active = y_unlabeled_active.drop(index=al_queried_idx)


🔴 ACTIVE LEARNING - Iteration 4
  📊 Current labeled pool: 504 samples
  🎯 Fraud in labeled: 11 samples (2.18%)
  📈 Validation Performance: F1=0.2367, Acc=0.9905, Prec=0.1376, Rec=0.8481
  🎯 Selecting 68 new samples using uncertainty sampling...
  ✅ Active Learning found: 0 fraud + 68 non-fraud = 68 total
  📊 Active Learning fraud rate in batch: 0.00%


In [213]:
# Display the active labeled pool; the preceding cell appends the newly queried
# rows with pd.concat, so they appear at the tail of the frame.
X_labeled_active

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154390,0.958514,0.103895,-0.732169,2.011628,1.695788,3.488902,-0.960148,0.918330,0.594681,0.996394,...,-0.133494,1.836072,0.841318,-1.561513,True,False,False,False,False,False
13480,-0.697601,0.307216,1.602611,1.843568,0.832488,0.758381,-0.262311,0.374358,0.220836,0.068673,...,0.020848,1.995322,-0.031799,-0.603887,True,False,False,False,False,False
44234,-1.197862,0.298620,-1.809636,1.531771,0.770342,-1.636075,0.557557,-0.030013,0.234765,-0.119226,...,-2.029493,0.580201,-1.191311,0.820068,False,False,True,False,False,False
13314,0.293355,-0.608541,0.700450,1.405108,-0.899606,0.140709,-0.265864,-0.010741,2.300695,-0.745552,...,0.223721,2.002042,0.007095,1.549740,False,False,False,True,False,False


In [214]:
# Display the validation features passed to train_and_evaluate; none of the
# iteration cells shown here reassign X_val.
X_val

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
131037,0.413776,-0.272350,0.502477,0.864394,-0.434353,0.304336,-0.208801,0.154426,0.204434,-0.070863,...,0.131172,-0.342436,1.417841,1.156986,False,False,False,True,False,False
86287,-0.803151,0.999960,0.352067,1.173051,0.323342,-0.390637,0.203734,0.224163,-0.809853,-0.459535,...,-1.431986,-1.111963,-0.168158,-1.484349,True,False,False,False,False,False
106692,-0.827662,0.647472,0.264245,-0.101676,0.345140,1.339951,-0.305697,1.224059,-0.235401,-0.926992,...,-0.103894,-1.051456,0.718289,-0.163768,False,True,False,False,False,False
263003,0.903859,-0.595063,-1.235551,-0.348055,1.209799,2.881719,-0.855052,0.827306,0.781591,0.059239,...,-0.093929,-0.798322,1.088715,0.982066,False,False,False,True,False,False
151514,-0.786965,0.556142,0.455014,-0.147336,0.581776,-0.058680,0.494172,0.014961,2.105721,-0.165235,...,0.823314,1.414430,1.288252,-1.896747,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248721,-0.021501,0.643496,-0.703290,-0.192692,0.365217,-0.593289,0.501638,0.244761,-0.391814,-0.938692,...,0.076270,-1.132865,0.484586,0.312845,False,True,False,False,False,False
272603,0.801469,-1.552412,-2.228371,-0.981991,-0.304439,-0.729208,0.401542,-0.504420,-1.650827,1.460564,...,-0.036080,-0.413605,1.381433,1.820141,False,False,False,True,False,False
90858,-0.711083,1.037614,0.582867,-0.062166,0.006756,-0.715608,0.268302,-0.715732,0.190409,0.460707,...,0.883142,-1.156610,0.034957,-0.592759,True,False,False,False,False,False
194445,-0.833234,0.672831,-0.043365,-0.322130,-1.045517,-0.693127,-0.358149,1.005568,-0.630802,-0.725263,...,-0.443147,0.310790,-1.194400,0.711405,False,False,True,False,False,False


In [215]:
# ---------------------------------------------------------------
# PASSIVE LEARNING - MATCHED QUANTITIES
# Random baseline that receives the same fraud / non-fraud counts per batch
# as the active learner selected.
# ---------------------------------------------------------------
print(f"\n🔵 PASSIVE LEARNING - Iteration {iteration}")
print(f"  📊 Current labeled pool: {len(X_labeled_passive)} samples")
pl_pool_fraud = (y_labeled_passive == 1).sum()
pl_pool_fraud_pct = pl_pool_fraud / len(y_labeled_passive) * 100
print(f"  🎯 Fraud in labeled: {pl_pool_fraud} samples ({pl_pool_fraud_pct:.2f}%)")

# Same call as the active cell, but on the passive labeled pool.
passive_model, passive_metrics = train_and_evaluate(X_labeled_passive, y_labeled_passive, X_val, y_val, config['model_type'])

print(
    "  📈 Validation Performance: "
    f"F1={passive_metrics['f1']:.4f}, Acc={passive_metrics['accuracy']:.4f}, "
    f"Prec={passive_metrics['precision']:.4f}, Rec={passive_metrics['recall']:.4f}"
)

# Sample only when the active cell actually queried a batch this iteration.
# NOTE(review): re-running this cell moves another batch between the pools.
pl_may_sample = (
    iteration < config['n_iterations']
    and len(X_unlabeled_passive) > 0
    and active_new_samples is not None
)
if pl_may_sample:
    print(f"  🎯 Matching Active Learning's selection: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud...")

    # The +1000 offset keeps this seed apart from the active cell's random fallback seed.
    pl_seed = random_seed + iteration + 1000
    passive_new_samples = matched_quantity_random_sampling(
        X_unlabeled_passive, y_unlabeled_passive,
        active_fraud_count, active_non_fraud_count,
        pl_seed
    )

    # Resolve the sampled rows' labels once and reuse them for reporting and pooling.
    pl_sampled_idx = passive_new_samples.index
    passive_new_labels = y_unlabeled_passive.loc[pl_sampled_idx]
    pl_batch_fraud = (passive_new_labels == 1).sum()
    pl_batch_non_fraud = (passive_new_labels == 0).sum()
    pl_batch_total = len(passive_new_samples)

    print(f"  ✅ Passive Learning selected: {pl_batch_fraud} fraud + {pl_batch_non_fraud} non-fraud = {pl_batch_total} total")
    print(f"  📊 Passive Learning fraud rate in batch: {pl_batch_fraud/pl_batch_total*100:.2f}%")

    # Append the sampled rows to the passive labeled pool ...
    X_labeled_passive = pd.concat([X_labeled_passive, passive_new_samples])
    y_labeled_passive = pd.concat([y_labeled_passive, passive_new_labels])

    # ... and drop them from the passive unlabeled pool.
    X_unlabeled_passive = X_unlabeled_passive.drop(index=pl_sampled_idx)
    y_unlabeled_passive = y_unlabeled_passive.drop(index=pl_sampled_idx)


🔵 PASSIVE LEARNING - Iteration 4
  📊 Current labeled pool: 504 samples
  🎯 Fraud in labeled: 11 samples (2.18%)
  📈 Validation Performance: F1=0.7229, Acc=0.9990, Prec=0.6897, Rec=0.7595
  🎯 Matching Active Learning's selection: 0 fraud + 68 non-fraud...
  ✅ Passive Learning selected: 0 fraud + 68 non-fraud = 68 total
  📊 Passive Learning fraud rate in batch: 0.00%


In [216]:
# Display the passive labeled pool; the preceding cell appends the matched random
# batch with pd.concat, so the new rows appear at the tail of the frame.
X_labeled_passive

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210088,-1.575887,0.328395,0.366679,3.248627,-0.157881,1.471685,-0.136240,0.282282,-1.867417,1.142298,...,-1.006722,-0.472799,-0.955296,1.001139,False,False,False,True,False,False
218014,-1.335078,-0.421065,-0.698039,0.169028,-0.712856,0.154786,0.055791,0.753740,0.994122,-1.189955,...,-1.039848,-0.759381,-0.735703,1.471793,False,False,False,True,False,False
1122,0.498921,-0.002863,1.331967,2.153574,-0.777468,0.582621,-0.671427,0.313747,0.527224,0.224617,...,0.123256,0.525504,1.587594,-0.222758,False,True,False,False,False,False
124302,-0.468834,0.285240,1.316983,-0.748065,-0.508785,0.045705,0.150973,-0.032894,0.998404,-1.076897,...,0.450135,-0.558243,1.292900,0.470612,False,True,False,False,False,False


In [217]:
# Active-minus-passive F1 gap: absolute, and relative to the passive score.
f1_difference = active_metrics['f1'] - passive_metrics['f1']
if passive_metrics['f1'] > 0:
    f1_improvement_pct = f1_difference / passive_metrics['f1'] * 100
else:
    # A passive F1 of zero would make the ratio undefined; report 0 instead.
    f1_improvement_pct = 0

print(f"\n📊 ITERATION {iteration} SUMMARY:")
print(f"  🔴 Active Learning F1:  {active_metrics['f1']:.4f}")
print(f"  🔵 Passive Learning F1: {passive_metrics['f1']:.4f}")
print(f"  📈 Difference: {f1_difference:+.4f} ({f1_improvement_pct:+.1f}%)")
# Only the active pool's length is printed on this line; the passive pool's
# length is stored separately in the record below.
print(f"  🎯 Both labeled pools now have: {len(X_labeled_active)} samples")

summary_active_fraud = (y_labeled_active == 1).sum()
summary_active_rate = summary_active_fraud / len(y_labeled_active) * 100
print(f"  🔴 Active fraud in pool: {summary_active_fraud} ({summary_active_rate:.2f}%)")

summary_passive_fraud = (y_labeled_passive == 1).sum()
summary_passive_rate = summary_passive_fraud / len(y_labeled_passive) * 100
print(f"  🔵 Passive fraud in pool: {summary_passive_fraud} ({summary_passive_rate:.2f}%)")

# Strategy label for this iteration (1-based lookup, 'uncertainty' past the end).
if iteration <= len(config['strategy_sequence']):
    current_strategy = config['strategy_sequence'][iteration - 1]
else:
    current_strategy = 'uncertainty'

# NOTE(review): when the cells run in order, the pool sizes / fraud counts stored
# here are read after this iteration's batch was appended (L815-816, L894-895),
# while the metrics come from models evaluated before that batch was added.
# Re-running this cell appends a second record for the same iteration.
summary_batch_was_queried = active_new_samples is not None
summary_record = {
    'iteration': iteration,
    'strategy_used': current_strategy,
    'active_f1': active_metrics['f1'],
    'passive_f1': passive_metrics['f1'],
    'active_accuracy': active_metrics['accuracy'],
    'passive_accuracy': passive_metrics['accuracy'],
    'active_precision': active_metrics['precision'],
    'passive_precision': passive_metrics['precision'],
    'active_recall': active_metrics['recall'],
    'passive_recall': passive_metrics['recall'],
    'f1_difference': f1_difference,
    'f1_improvement_pct': f1_improvement_pct,
    'active_labeled_count': len(X_labeled_active),
    'passive_labeled_count': len(X_labeled_passive),
    'active_fraud_count': summary_active_fraud,
    'passive_fraud_count': summary_passive_fraud,
    'active_fraud_rate': summary_active_rate,
    'passive_fraud_rate': summary_passive_rate,
    'batch_fraud_count': active_fraud_count if summary_batch_was_queried else 0,
    'batch_non_fraud_count': active_non_fraud_count if summary_batch_was_queried else 0,
}
iteration_results.append(summary_record)


📊 ITERATION 4 SUMMARY:
  🔴 Active Learning F1:  0.2367
  🔵 Passive Learning F1: 0.7229
  📈 Difference: -0.4861 (-67.2%)
  🎯 Both labeled pools now have: 572 samples
  🔴 Active fraud in pool: 11 (1.92%)
  🔵 Passive fraud in pool: 11 (1.92%)


### ITERATION = 5

In [218]:
# Iteration counter read by the cells below: it drives the 1-based strategy lookup,
# the sampling seeds (random_seed + iteration) and the `iteration < n_iterations` check.
iteration = 5

In [219]:
# ---------------------------------------------------------------
# ACTIVE LEARNING: evaluate on the current labeled pool, then query a batch
# ---------------------------------------------------------------
print(f"\n🔴 ACTIVE LEARNING - Iteration {iteration}")
print(f"  📊 Current labeled pool: {len(X_labeled_active)} samples")
al_pool_fraud = (y_labeled_active == 1).sum()
al_pool_fraud_pct = al_pool_fraud / len(y_labeled_active) * 100
print(f"  🎯 Fraud in labeled: {al_pool_fraud} samples ({al_pool_fraud_pct:.2f}%)")

# train_and_evaluate receives the labeled pool plus the validation split; the
# metrics it returns are reported below as validation performance.
active_model, active_metrics = train_and_evaluate(X_labeled_active, y_labeled_active, X_val, y_val, config['model_type'])

print(
    "  📈 Validation Performance: "
    f"F1={active_metrics['f1']:.4f}, Acc={active_metrics['accuracy']:.4f}, "
    f"Prec={active_metrics['precision']:.4f}, Rec={active_metrics['recall']:.4f}"
)

# Defaults mean "nothing was queried"; the passive and summary cells read these.
active_new_samples, active_fraud_count, active_non_fraud_count = None, 0, 0

# Query only before the final iteration and while unlabeled rows remain.
# NOTE(review): re-running this cell moves another batch between the pools.
al_may_query = iteration < config['n_iterations'] and len(X_unlabeled_active) > 0
if al_may_query:
    # 1-based lookup into the schedule; past its end fall back to 'uncertainty'.
    al_schedule = config['strategy_sequence']
    if iteration <= len(al_schedule):
        current_strategy = al_schedule[iteration - 1]
    else:
        current_strategy = 'uncertainty'
    al_batch_size = config['batch_size']
    print(f"  🎯 Selecting {al_batch_size} new samples using {current_strategy} sampling...")

    # Dispatch on the strategy name; any unrecognised name gets a seeded random draw.
    al_samplers = {
        'uncertainty': lambda: uncertainty_sampling(active_model, X_unlabeled_active, al_batch_size),
        'diversity': lambda: diversity_sampling(X_unlabeled_active, al_batch_size),
    }
    al_random_draw = lambda: X_unlabeled_active.sample(al_batch_size, random_state=random_seed + iteration)
    active_new_samples = al_samplers.get(current_strategy, al_random_draw)()

    # Look up the labels of the queried rows and count each class.
    al_queried_idx = active_new_samples.index
    active_new_labels = y_unlabeled_active.loc[al_queried_idx]
    active_fraud_count = (active_new_labels == 1).sum()
    active_non_fraud_count = (active_new_labels == 0).sum()
    al_batch_total = len(active_new_samples)

    print(f"  ✅ Active Learning found: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud = {al_batch_total} total")
    print(f"  📊 Active Learning fraud rate in batch: {active_fraud_count/al_batch_total*100:.2f}%")

    # Append the queried rows to the labeled pool ...
    X_labeled_active = pd.concat([X_labeled_active, active_new_samples])
    y_labeled_active = pd.concat([y_labeled_active, active_new_labels])

    # ... and drop them from the unlabeled pool.
    X_unlabeled_active = X_unlabeled_active.drop(index=al_queried_idx)
    y_unlabeled_active = y_unlabeled_active.drop(index=al_queried_idx)


🔴 ACTIVE LEARNING - Iteration 5
  📊 Current labeled pool: 572 samples
  🎯 Fraud in labeled: 11 samples (1.92%)
  📈 Validation Performance: F1=0.6220, Acc=0.9983, Prec=0.5000, Rec=0.8228
  🎯 Selecting 68 new samples using diversity sampling...
  ✅ Active Learning found: 9 fraud + 59 non-fraud = 68 total
  📊 Active Learning fraud rate in batch: 13.24%


In [220]:
# Display the active labeled pool; the preceding cell appends the newly queried
# rows with pd.concat, so they appear at the tail of the frame.
X_labeled_active

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283139,-3.603817,2.733453,-0.134628,-0.826866,0.020105,2.910298,-3.951393,-17.173362,2.577026,2.167931,...,-9.942436,0.262465,1.583129,0.063929,False,True,False,False,False,False
188560,-6.583583,-12.409652,-8.438324,3.313854,-12.689839,11.685948,22.690176,-1.799868,-3.956346,-6.188139,...,1.516743,0.606449,-1.188863,3.504388,False,False,False,False,False,True
209292,-13.788790,-14.487686,-6.274862,3.439887,-1.773577,1.421034,2.606124,-5.759431,1.993797,1.215655,...,-3.242084,-0.439822,-0.974486,1.798196,False,False,False,True,False,False
244693,-10.238943,-13.550041,-2.076706,7.259328,9.064981,-3.811749,-3.623480,-1.266440,1.292375,5.379504,...,13.962264,-1.160731,0.322667,0.244049,False,True,False,False,False,False


In [221]:
# Inspect the validation features that are passed to train_and_evaluate in the surrounding cells.
X_val

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
131037,0.413776,-0.272350,0.502477,0.864394,-0.434353,0.304336,-0.208801,0.154426,0.204434,-0.070863,...,0.131172,-0.342436,1.417841,1.156986,False,False,False,True,False,False
86287,-0.803151,0.999960,0.352067,1.173051,0.323342,-0.390637,0.203734,0.224163,-0.809853,-0.459535,...,-1.431986,-1.111963,-0.168158,-1.484349,True,False,False,False,False,False
106692,-0.827662,0.647472,0.264245,-0.101676,0.345140,1.339951,-0.305697,1.224059,-0.235401,-0.926992,...,-0.103894,-1.051456,0.718289,-0.163768,False,True,False,False,False,False
263003,0.903859,-0.595063,-1.235551,-0.348055,1.209799,2.881719,-0.855052,0.827306,0.781591,0.059239,...,-0.093929,-0.798322,1.088715,0.982066,False,False,False,True,False,False
151514,-0.786965,0.556142,0.455014,-0.147336,0.581776,-0.058680,0.494172,0.014961,2.105721,-0.165235,...,0.823314,1.414430,1.288252,-1.896747,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248721,-0.021501,0.643496,-0.703290,-0.192692,0.365217,-0.593289,0.501638,0.244761,-0.391814,-0.938692,...,0.076270,-1.132865,0.484586,0.312845,False,True,False,False,False,False
272603,0.801469,-1.552412,-2.228371,-0.981991,-0.304439,-0.729208,0.401542,-0.504420,-1.650827,1.460564,...,-0.036080,-0.413605,1.381433,1.820141,False,False,False,True,False,False
90858,-0.711083,1.037614,0.582867,-0.062166,0.006756,-0.715608,0.268302,-0.715732,0.190409,0.460707,...,0.883142,-1.156610,0.034957,-0.592759,True,False,False,False,False,False
194445,-0.833234,0.672831,-0.043365,-0.322130,-1.045517,-0.693127,-0.358149,1.005568,-0.630802,-0.725263,...,-0.443147,0.310790,-1.194400,0.711405,False,False,True,False,False,False


In [222]:
# ===================
# PASSIVE LEARNING - MATCHED QUANTITIES
# ===================
# Baseline arm of the comparison. The passive pool is passed to train_and_evaluate
# together with X_val / y_val, and then (when the active cell selected a batch)
# a random batch with the same fraud / non-fraud counts is moved from the
# passive unlabeled pool into the passive labeled pool.
passive_pool_fraud = (y_labeled_passive == 1).sum()
passive_pool_fraud_pct = passive_pool_fraud / len(y_labeled_passive) * 100

print(f"\n🔵 PASSIVE LEARNING - Iteration {iteration}")
print(f"  📊 Current labeled pool: {len(X_labeled_passive)} samples")
print(f"  🎯 Fraud in labeled: {passive_pool_fraud} samples ({passive_pool_fraud_pct:.2f}%)")

# train_and_evaluate returns the fitted model and a metrics dict
# (keys read below: 'f1', 'accuracy', 'precision', 'recall').
passive_model, passive_metrics = train_and_evaluate(
    X_labeled_passive, y_labeled_passive, X_val, y_val, config['model_type']
)

print(f"  📈 Validation Performance: F1={passive_metrics['f1']:.4f}, Acc={passive_metrics['accuracy']:.4f}, Prec={passive_metrics['precision']:.4f}, Rec={passive_metrics['recall']:.4f}")

# Only draw a batch when this is not the final iteration, the pool still has
# rows, and the active-learning cell actually selected something to match.
passive_can_select = (
    iteration < config['n_iterations']
    and len(X_unlabeled_passive) > 0
    and active_new_samples is not None
)

if passive_can_select:
    print(f"  🎯 Matching Active Learning's selection: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud...")

    # Seed is offset by 1000 so it differs from the active-learning seed
    passive_seed = random_seed + iteration + 1000
    passive_new_samples = matched_quantity_random_sampling(
        X_unlabeled_passive, y_unlabeled_passive,
        active_fraud_count, active_non_fraud_count,
        passive_seed
    )

    # Look the labels up once and reuse them for reporting and for the pool update
    passive_picked_index = passive_new_samples.index
    passive_new_labels = y_unlabeled_passive.loc[passive_picked_index]
    passive_batch_fraud = (passive_new_labels == 1).sum()
    passive_batch_non_fraud = (passive_new_labels == 0).sum()

    print(f"  ✅ Passive Learning selected: {passive_batch_fraud} fraud + {passive_batch_non_fraud} non-fraud = {len(passive_new_samples)} total")
    print(f"  📊 Passive Learning fraud rate in batch: {passive_batch_fraud/len(passive_new_samples)*100:.2f}%")

    # Grow the labeled pool ...
    X_labeled_passive = pd.concat([X_labeled_passive, passive_new_samples])
    y_labeled_passive = pd.concat([y_labeled_passive, passive_new_labels])

    # ... and shrink the unlabeled pool by the same rows
    X_unlabeled_passive = X_unlabeled_passive.drop(index=passive_picked_index)
    y_unlabeled_passive = y_unlabeled_passive.drop(index=passive_picked_index)


🔵 PASSIVE LEARNING - Iteration 5
  📊 Current labeled pool: 572 samples
  🎯 Fraud in labeled: 11 samples (1.92%)
  📈 Validation Performance: F1=0.7643, Acc=0.9992, Prec=0.7692, Rec=0.7595
  🎯 Matching Active Learning's selection: 9 fraud + 59 non-fraud...
  ✅ Passive Learning selected: 9 fraud + 59 non-fraud = 68 total
  📊 Passive Learning fraud rate in batch: 13.24%


In [223]:
# Inspect the passive-learning labeled feature pool; the bare expression is rendered as the cell output.
X_labeled_passive

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240222,0.945983,0.647726,-1.171376,3.261191,0.558171,-0.300885,-0.033118,0.074945,-0.198161,-0.343416,...,0.122917,-1.165196,0.124659,-1.484349,True,False,False,False,False,False
113400,-0.425823,0.664648,0.447071,0.624740,0.037366,-0.552561,0.304211,0.292190,-0.585062,-0.226501,...,0.366165,-0.889197,0.983456,0.129845,False,True,False,False,False,False
29540,-0.412381,0.837613,1.217058,1.936435,-0.442994,0.223530,-0.264962,0.647993,-1.268883,0.759313,...,0.356323,1.268670,-0.985965,-0.420658,False,True,False,False,False,False
59569,0.628468,-0.005344,-0.799820,0.062465,1.569275,2.547559,-0.330689,0.662830,-0.136987,0.102610,...,0.062276,-0.216558,-1.079863,0.252771,False,True,False,False,False,False


In [224]:
# Calculate differences
# Positive values mean the active-learning model scored higher on validation F1.
f1_difference = active_metrics['f1'] - passive_metrics['f1']
if passive_metrics['f1'] > 0:
    f1_improvement_pct = f1_difference / passive_metrics['f1'] * 100
else:
    # A relative change against a zero baseline is undefined; record 0 instead
    f1_improvement_pct = 0

# Composition of both labeled pools as they stand when this cell runs
# (i.e. including any batch added by the two cells above).
summary_active_fraud = (y_labeled_active == 1).sum()
summary_passive_fraud = (y_labeled_passive == 1).sum()
summary_active_fraud_rate = summary_active_fraud / len(y_labeled_active) * 100
summary_passive_fraud_rate = summary_passive_fraud / len(y_labeled_passive) * 100

print(f"\n📊 ITERATION {iteration} SUMMARY:")
print(f"  🔴 Active Learning F1:  {active_metrics['f1']:.4f}")
print(f"  🔵 Passive Learning F1: {passive_metrics['f1']:.4f}")
print(f"  📈 Difference: {f1_difference:+.4f} ({f1_improvement_pct:+.1f}%)")
print(f"  🎯 Both labeled pools now have: {len(X_labeled_active)} samples")
print(f"  🔴 Active fraud in pool: {summary_active_fraud} ({summary_active_fraud_rate:.2f}%)")
print(f"  🔵 Passive fraud in pool: {summary_passive_fraud} ({summary_passive_fraud_rate:.2f}%)")

# Store results
# Strategy configured for this iteration; falls back to 'uncertainty' when the
# configured sequence is shorter than the iteration number.
if iteration <= len(config['strategy_sequence']):
    current_strategy = config['strategy_sequence'][iteration - 1]
else:
    current_strategy = 'uncertainty'

# Batch counts are only meaningful when the active cell selected a batch
if active_new_samples is not None:
    summary_batch_fraud = active_fraud_count
    summary_batch_non_fraud = active_non_fraud_count
else:
    summary_batch_fraud = 0
    summary_batch_non_fraud = 0

iteration_record = {
    'iteration': iteration,
    'strategy_used': current_strategy,
    'active_f1': active_metrics['f1'],
    'passive_f1': passive_metrics['f1'],
    'active_accuracy': active_metrics['accuracy'],
    'passive_accuracy': passive_metrics['accuracy'],
    'active_precision': active_metrics['precision'],
    'passive_precision': passive_metrics['precision'],
    'active_recall': active_metrics['recall'],
    'passive_recall': passive_metrics['recall'],
    'f1_difference': f1_difference,
    'f1_improvement_pct': f1_improvement_pct,
    'active_labeled_count': len(X_labeled_active),
    'passive_labeled_count': len(X_labeled_passive),
    'active_fraud_count': summary_active_fraud,
    'passive_fraud_count': summary_passive_fraud,
    'active_fraud_rate': summary_active_fraud_rate,
    'passive_fraud_rate': summary_passive_fraud_rate,
    'batch_fraud_count': summary_batch_fraud,
    'batch_non_fraud_count': summary_batch_non_fraud
}
iteration_results.append(iteration_record)


📊 ITERATION 5 SUMMARY:
  🔴 Active Learning F1:  0.6220
  🔵 Passive Learning F1: 0.7643
  📈 Difference: -0.1423 (-18.6%)
  🎯 Both labeled pools now have: 640 samples
  🔴 Active fraud in pool: 20 (3.12%)
  🔵 Passive fraud in pool: 20 (3.12%)


### ITERATION = 6

In [226]:
# Advance the manual iteration counter read by the cells below (logging, strategy lookup, seeds).
# With config['n_iterations'] set to 6, the `iteration < config['n_iterations']` guards in those
# cells are false, so this final iteration only evaluates and selects no new batch.
iteration = 6

In [227]:
# ===================
# ACTIVE LEARNING
# ===================
# Query arm of the comparison. The active pool is passed to train_and_evaluate
# together with X_val / y_val; then, unless this is the final iteration or the
# unlabeled pool is empty, the strategy configured for this iteration picks a
# batch that is moved from the unlabeled pool into the labeled pool.
active_pool_fraud = (y_labeled_active == 1).sum()
active_pool_fraud_pct = active_pool_fraud / len(y_labeled_active) * 100

print(f"\n🔴 ACTIVE LEARNING - Iteration {iteration}")
print(f"  📊 Current labeled pool: {len(X_labeled_active)} samples")
print(f"  🎯 Fraud in labeled: {active_pool_fraud} samples ({active_pool_fraud_pct:.2f}%)")

# train_and_evaluate returns the fitted model and a metrics dict
# (keys read below: 'f1', 'accuracy', 'precision', 'recall').
active_model, active_metrics = train_and_evaluate(
    X_labeled_active, y_labeled_active, X_val, y_val, config['model_type']
)

print(f"  📈 Validation Performance: F1={active_metrics['f1']:.4f}, Acc={active_metrics['accuracy']:.4f}, Prec={active_metrics['precision']:.4f}, Rec={active_metrics['recall']:.4f}")

# Select new samples using active learning strategy
# Reset the batch bookkeeping first: the passive and summary cells read these
# names even when no selection happens in this iteration.
active_new_samples = None
active_fraud_count = 0
active_non_fraud_count = 0

active_can_select = iteration < config['n_iterations'] and len(X_unlabeled_active) > 0

if active_can_select:
    # Strategy for this iteration; 'uncertainty' when the sequence is too short
    active_strategy_plan = config['strategy_sequence']
    if iteration <= len(active_strategy_plan):
        current_strategy = active_strategy_plan[iteration - 1]
    else:
        current_strategy = 'uncertainty'
    print(f"  🎯 Selecting {config['batch_size']} new samples using {current_strategy} sampling...")

    active_batch_size = config['batch_size']
    if current_strategy == 'uncertainty':
        active_new_samples = uncertainty_sampling(active_model, X_unlabeled_active, active_batch_size)
    elif current_strategy == 'diversity':
        active_new_samples = diversity_sampling(X_unlabeled_active, active_batch_size)
    else:
        # Any other strategy name falls back to seeded random sampling
        active_new_samples = X_unlabeled_active.sample(active_batch_size, random_state=random_seed + iteration)

    # Class composition of the selected batch (the passive cell matches these counts)
    active_picked_index = active_new_samples.index
    active_new_labels = y_unlabeled_active.loc[active_picked_index]
    active_fraud_count = (active_new_labels == 1).sum()
    active_non_fraud_count = (active_new_labels == 0).sum()

    print(f"  ✅ Active Learning found: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud = {len(active_new_samples)} total")
    print(f"  📊 Active Learning fraud rate in batch: {active_fraud_count/len(active_new_samples)*100:.2f}%")

    # Grow the labeled pool ...
    X_labeled_active = pd.concat([X_labeled_active, active_new_samples])
    y_labeled_active = pd.concat([y_labeled_active, active_new_labels])

    # ... and shrink the unlabeled pool by the same rows
    X_unlabeled_active = X_unlabeled_active.drop(index=active_picked_index)
    y_unlabeled_active = y_unlabeled_active.drop(index=active_picked_index)


🔴 ACTIVE LEARNING - Iteration 6
  📊 Current labeled pool: 640 samples
  🎯 Fraud in labeled: 20 samples (3.12%)
  📈 Validation Performance: F1=0.7711, Acc=0.9992, Prec=0.7356, Rec=0.8101


In [228]:
# Inspect the active-learning labeled feature pool; the bare expression is rendered as the cell output.
X_labeled_active

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283139,-3.603817,2.733453,-0.134628,-0.826866,0.020105,2.910298,-3.951393,-17.173362,2.577026,2.167931,...,-9.942436,0.262465,1.583129,0.063929,False,True,False,False,False,False
188560,-6.583583,-12.409652,-8.438324,3.313854,-12.689839,11.685948,22.690176,-1.799868,-3.956346,-6.188139,...,1.516743,0.606449,-1.188863,3.504388,False,False,False,False,False,True
209292,-13.788790,-14.487686,-6.274862,3.439887,-1.773577,1.421034,2.606124,-5.759431,1.993797,1.215655,...,-3.242084,-0.439822,-0.974486,1.798196,False,False,False,True,False,False
244693,-10.238943,-13.550041,-2.076706,7.259328,9.064981,-3.811749,-3.623480,-1.266440,1.292375,5.379504,...,13.962264,-1.160731,0.322667,0.244049,False,True,False,False,False,False


In [229]:
# Inspect the validation features that are passed to train_and_evaluate in the surrounding cells.
X_val

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
131037,0.413776,-0.272350,0.502477,0.864394,-0.434353,0.304336,-0.208801,0.154426,0.204434,-0.070863,...,0.131172,-0.342436,1.417841,1.156986,False,False,False,True,False,False
86287,-0.803151,0.999960,0.352067,1.173051,0.323342,-0.390637,0.203734,0.224163,-0.809853,-0.459535,...,-1.431986,-1.111963,-0.168158,-1.484349,True,False,False,False,False,False
106692,-0.827662,0.647472,0.264245,-0.101676,0.345140,1.339951,-0.305697,1.224059,-0.235401,-0.926992,...,-0.103894,-1.051456,0.718289,-0.163768,False,True,False,False,False,False
263003,0.903859,-0.595063,-1.235551,-0.348055,1.209799,2.881719,-0.855052,0.827306,0.781591,0.059239,...,-0.093929,-0.798322,1.088715,0.982066,False,False,False,True,False,False
151514,-0.786965,0.556142,0.455014,-0.147336,0.581776,-0.058680,0.494172,0.014961,2.105721,-0.165235,...,0.823314,1.414430,1.288252,-1.896747,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248721,-0.021501,0.643496,-0.703290,-0.192692,0.365217,-0.593289,0.501638,0.244761,-0.391814,-0.938692,...,0.076270,-1.132865,0.484586,0.312845,False,True,False,False,False,False
272603,0.801469,-1.552412,-2.228371,-0.981991,-0.304439,-0.729208,0.401542,-0.504420,-1.650827,1.460564,...,-0.036080,-0.413605,1.381433,1.820141,False,False,False,True,False,False
90858,-0.711083,1.037614,0.582867,-0.062166,0.006756,-0.715608,0.268302,-0.715732,0.190409,0.460707,...,0.883142,-1.156610,0.034957,-0.592759,True,False,False,False,False,False
194445,-0.833234,0.672831,-0.043365,-0.322130,-1.045517,-0.693127,-0.358149,1.005568,-0.630802,-0.725263,...,-0.443147,0.310790,-1.194400,0.711405,False,False,True,False,False,False


In [230]:
# ===================
# PASSIVE LEARNING - MATCHED QUANTITIES
# ===================
# Baseline arm of the comparison. The passive pool is passed to train_and_evaluate
# together with X_val / y_val, and then (when the active cell selected a batch)
# a random batch with the same fraud / non-fraud counts is moved from the
# passive unlabeled pool into the passive labeled pool.
passive_pool_fraud = (y_labeled_passive == 1).sum()
passive_pool_fraud_pct = passive_pool_fraud / len(y_labeled_passive) * 100

print(f"\n🔵 PASSIVE LEARNING - Iteration {iteration}")
print(f"  📊 Current labeled pool: {len(X_labeled_passive)} samples")
print(f"  🎯 Fraud in labeled: {passive_pool_fraud} samples ({passive_pool_fraud_pct:.2f}%)")

# train_and_evaluate returns the fitted model and a metrics dict
# (keys read below: 'f1', 'accuracy', 'precision', 'recall').
passive_model, passive_metrics = train_and_evaluate(
    X_labeled_passive, y_labeled_passive, X_val, y_val, config['model_type']
)

print(f"  📈 Validation Performance: F1={passive_metrics['f1']:.4f}, Acc={passive_metrics['accuracy']:.4f}, Prec={passive_metrics['precision']:.4f}, Rec={passive_metrics['recall']:.4f}")

# Only draw a batch when this is not the final iteration, the pool still has
# rows, and the active-learning cell actually selected something to match.
passive_can_select = (
    iteration < config['n_iterations']
    and len(X_unlabeled_passive) > 0
    and active_new_samples is not None
)

if passive_can_select:
    print(f"  🎯 Matching Active Learning's selection: {active_fraud_count} fraud + {active_non_fraud_count} non-fraud...")

    # Seed is offset by 1000 so it differs from the active-learning seed
    passive_seed = random_seed + iteration + 1000
    passive_new_samples = matched_quantity_random_sampling(
        X_unlabeled_passive, y_unlabeled_passive,
        active_fraud_count, active_non_fraud_count,
        passive_seed
    )

    # Look the labels up once and reuse them for reporting and for the pool update
    passive_picked_index = passive_new_samples.index
    passive_new_labels = y_unlabeled_passive.loc[passive_picked_index]
    passive_batch_fraud = (passive_new_labels == 1).sum()
    passive_batch_non_fraud = (passive_new_labels == 0).sum()

    print(f"  ✅ Passive Learning selected: {passive_batch_fraud} fraud + {passive_batch_non_fraud} non-fraud = {len(passive_new_samples)} total")
    print(f"  📊 Passive Learning fraud rate in batch: {passive_batch_fraud/len(passive_new_samples)*100:.2f}%")

    # Grow the labeled pool ...
    X_labeled_passive = pd.concat([X_labeled_passive, passive_new_samples])
    y_labeled_passive = pd.concat([y_labeled_passive, passive_new_labels])

    # ... and shrink the unlabeled pool by the same rows
    X_unlabeled_passive = X_unlabeled_passive.drop(index=passive_picked_index)
    y_unlabeled_passive = y_unlabeled_passive.drop(index=passive_picked_index)


🔵 PASSIVE LEARNING - Iteration 6
  📊 Current labeled pool: 640 samples
  🎯 Fraud in labeled: 20 samples (3.12%)
  📈 Validation Performance: F1=0.2614, Acc=0.9918, Prec=0.1549, Rec=0.8354


In [231]:
# Inspect the passive-learning labeled feature pool; the bare expression is rendered as the cell output.
X_labeled_passive

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,Time_hour_sin,Time_hour_cos,Amount_log,Amount_very_small,Amount_small,Amount_medium,Amount_large,Amount_very_large,Amount_extreme
106010,0.682070,-0.682406,-0.144220,-1.033447,-0.760468,-0.571584,-0.357477,-0.207845,-2.238134,1.445829,...,0.059564,-1.064028,0.690154,0.955679,False,False,False,True,False,False
59939,0.591546,0.125306,0.399343,0.350841,-0.196124,-0.222914,-0.051379,0.021068,-0.312604,0.033781,...,0.027961,-0.235260,-1.072529,-1.114639,True,False,False,False,False,False
275313,1.153450,-0.956763,-0.826917,-1.306220,-0.613648,0.110281,-0.995417,-0.026123,-1.162494,1.571756,...,-0.191480,-0.282310,1.445353,0.537751,False,False,True,False,False,False
278593,1.097762,-0.895010,-0.573975,-1.066476,-0.993461,-0.602941,-0.789431,-0.181738,-1.442861,1.571938,...,-0.152540,-0.085693,1.516791,0.645950,False,False,True,False,False,False
50199,0.305591,-0.438062,-0.126088,0.793291,-0.134750,0.088342,0.334355,0.008005,-0.232448,0.006627,...,0.167878,0.287246,-1.192767,1.554042,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240222,0.945983,0.647726,-1.171376,3.261191,0.558171,-0.300885,-0.033118,0.074945,-0.198161,-0.343416,...,0.122917,-1.165196,0.124659,-1.484349,True,False,False,False,False,False
113400,-0.425823,0.664648,0.447071,0.624740,0.037366,-0.552561,0.304211,0.292190,-0.585062,-0.226501,...,0.366165,-0.889197,0.983456,0.129845,False,True,False,False,False,False
29540,-0.412381,0.837613,1.217058,1.936435,-0.442994,0.223530,-0.264962,0.647993,-1.268883,0.759313,...,0.356323,1.268670,-0.985965,-0.420658,False,True,False,False,False,False
59569,0.628468,-0.005344,-0.799820,0.062465,1.569275,2.547559,-0.330689,0.662830,-0.136987,0.102610,...,0.062276,-0.216558,-1.079863,0.252771,False,True,False,False,False,False


In [232]:
# Calculate differences
# Positive values mean the active-learning model scored higher on validation F1.
f1_difference = active_metrics['f1'] - passive_metrics['f1']
if passive_metrics['f1'] > 0:
    f1_improvement_pct = f1_difference / passive_metrics['f1'] * 100
else:
    # A relative change against a zero baseline is undefined; record 0 instead
    f1_improvement_pct = 0

# Composition of both labeled pools as they stand when this cell runs
# (i.e. including any batch added by the two cells above).
summary_active_fraud = (y_labeled_active == 1).sum()
summary_passive_fraud = (y_labeled_passive == 1).sum()
summary_active_fraud_rate = summary_active_fraud / len(y_labeled_active) * 100
summary_passive_fraud_rate = summary_passive_fraud / len(y_labeled_passive) * 100

print(f"\n📊 ITERATION {iteration} SUMMARY:")
print(f"  🔴 Active Learning F1:  {active_metrics['f1']:.4f}")
print(f"  🔵 Passive Learning F1: {passive_metrics['f1']:.4f}")
print(f"  📈 Difference: {f1_difference:+.4f} ({f1_improvement_pct:+.1f}%)")
print(f"  🎯 Both labeled pools now have: {len(X_labeled_active)} samples")
print(f"  🔴 Active fraud in pool: {summary_active_fraud} ({summary_active_fraud_rate:.2f}%)")
print(f"  🔵 Passive fraud in pool: {summary_passive_fraud} ({summary_passive_fraud_rate:.2f}%)")

# Store results
# Strategy configured for this iteration; falls back to 'uncertainty' when the
# configured sequence is shorter than the iteration number.
if iteration <= len(config['strategy_sequence']):
    current_strategy = config['strategy_sequence'][iteration - 1]
else:
    current_strategy = 'uncertainty'

# Batch counts are only meaningful when the active cell selected a batch
if active_new_samples is not None:
    summary_batch_fraud = active_fraud_count
    summary_batch_non_fraud = active_non_fraud_count
else:
    summary_batch_fraud = 0
    summary_batch_non_fraud = 0

iteration_record = {
    'iteration': iteration,
    'strategy_used': current_strategy,
    'active_f1': active_metrics['f1'],
    'passive_f1': passive_metrics['f1'],
    'active_accuracy': active_metrics['accuracy'],
    'passive_accuracy': passive_metrics['accuracy'],
    'active_precision': active_metrics['precision'],
    'passive_precision': passive_metrics['precision'],
    'active_recall': active_metrics['recall'],
    'passive_recall': passive_metrics['recall'],
    'f1_difference': f1_difference,
    'f1_improvement_pct': f1_improvement_pct,
    'active_labeled_count': len(X_labeled_active),
    'passive_labeled_count': len(X_labeled_passive),
    'active_fraud_count': summary_active_fraud,
    'passive_fraud_count': summary_passive_fraud,
    'active_fraud_rate': summary_active_fraud_rate,
    'passive_fraud_rate': summary_passive_fraud_rate,
    'batch_fraud_count': summary_batch_fraud,
    'batch_non_fraud_count': summary_batch_non_fraud
}
iteration_results.append(iteration_record)


📊 ITERATION 6 SUMMARY:
  🔴 Active Learning F1:  0.7711
  🔵 Passive Learning F1: 0.2614
  📈 Difference: +0.5097 (+195.0%)
  🎯 Both labeled pools now have: 640 samples
  🔴 Active fraud in pool: 20 (3.12%)
  🔵 Passive fraud in pool: 20 (3.12%)


### Create summary table

In [233]:
# Create summary table
# Prints one row per entry of iteration_results, derives a few headline
# numbers from them, and writes the full record list to a CSV file.

# The insights section reads iteration_results[-1]; fail with a clear message
# instead of a bare IndexError when no iteration has been recorded yet.
if not iteration_results:
    raise ValueError("iteration_results is empty - run the iteration cells before building the summary table")

print(f"\n{'='*100}")
print(f"ITERATION-BY-ITERATION DIVERGENCE ANALYSIS")
print(f"{'='*100}")

print(f"{'Iter':>4} {'Strategy':>11} {'Active_F1':>10} {'Passive_F1':>11} {'Difference':>11} {'Improvement%':>12} {'AL_Fraud%':>10} {'PL_Fraud%':>10} {'Batch_Fraud':>11}")
print(f"{'-'*4} {'-'*11} {'-'*10} {'-'*11} {'-'*11} {'-'*12} {'-'*10} {'-'*10} {'-'*11}")

for result in iteration_results:
    print(f"{result['iteration']:4d} {result['strategy_used']:>11} {result['active_f1']:10.4f} {result['passive_f1']:11.4f} {result['f1_difference']:11.4f} {result['f1_improvement_pct']:11.1f}% {result['active_fraud_rate']:9.2f}% {result['passive_fraud_rate']:9.2f}% {result['batch_fraud_count']:11d}")

# Analysis insights
print(f"\n💡 KEY INSIGHTS:")

# Find when divergence starts: first iteration whose relative F1 gap exceeds 10% in either direction
significant_divergence = next((r for r in iteration_results if abs(r['f1_improvement_pct']) > 10), None)
if significant_divergence:
    print(f"  🎯 Significant divergence (>10%) starts at iteration {significant_divergence['iteration']}")

# Fraud accumulation analysis
final_result = iteration_results[-1]
# Label with the iteration actually recorded last, so the line stays correct
# when the notebook has only been run part-way through the configured iterations.
print(f"  📊 By iteration {final_result['iteration']}:")
print(f"    - Active Learning fraud rate: {final_result['active_fraud_rate']:.2f}%")
print(f"    - Passive Learning fraud rate: {final_result['passive_fraud_rate']:.2f}%")
print(f"    - Final F1 difference: {final_result['f1_difference']:.4f} ({final_result['f1_improvement_pct']:+.1f}%)")

# Batch analysis: pooled fraud rate over every batch active learning selected
total_batch_fraud = sum(r['batch_fraud_count'] for r in iteration_results)
total_batch_samples = sum(r['batch_fraud_count'] + r['batch_non_fraud_count'] for r in iteration_results)
if total_batch_samples > 0:
    avg_batch_fraud_rate = total_batch_fraud / total_batch_samples * 100
    print(f"  🎯 Active Learning's average batch fraud rate: {avg_batch_fraud_rate:.2f}%")
    natural_fraud_rate = (y_train_val == 1).sum() / len(y_train_val) * 100
    print(f"  📊 Natural fraud rate in dataset: {natural_fraud_rate:.3f}%")
    # The ratio is undefined when the training pool contains no fraud at all
    if natural_fraud_rate > 0:
        print(f"  🚀 Active Learning finds fraud at {avg_batch_fraud_rate/natural_fraud_rate:.1f}x the natural rate!")

# Save detailed results
results_df = pd.DataFrame(iteration_results)
results_filename = f'{HOME_DIR}/experimentation-fraud/data/iteration_by_iteration_analysis.csv'
# to_csv does not create missing parent directories, so make sure the target exists
os.makedirs(os.path.dirname(results_filename), exist_ok=True)
results_df.to_csv(results_filename, index=False)
print(f"\n💾 Detailed results saved to: {results_filename}")


ITERATION-BY-ITERATION DIVERGENCE ANALYSIS
Iter    Strategy  Active_F1  Passive_F1  Difference Improvement%  AL_Fraud%  PL_Fraud% Batch_Fraud
---- ----------- ---------- ----------- ----------- ------------ ---------- ---------- -----------
   1 uncertainty     0.6936      0.6936      0.0000         0.0%      2.72%      2.72%           0
   2 uncertainty     0.7947      0.6857      0.1090        15.9%      2.29%      2.29%           0
   3 uncertainty     0.7947      0.7018      0.0929        13.2%      2.18%      2.18%           1
   4 uncertainty     0.2367      0.7229     -0.4861       -67.2%      1.92%      1.92%           0
   5   diversity     0.6220      0.7643     -0.1423       -18.6%      3.12%      3.12%           9
   6 uncertainty     0.7711      0.2614      0.5097       195.0%      3.12%      3.12%           0

💡 KEY INSIGHTS:
  🎯 Significant divergence (>10%) starts at iteration 2
  📊 By iteration 6:
    - Active Learning fraud rate: 3.12%
    - Passive Learning fraud ra