# In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
import torch
import pandas as pd
from sklearn.metrics import classification_report
from torch.amp import GradScaler, autocast
from tqdm import tqdm
import numpy as np
import random
import transformers, sklearn, platform, sys
import matplotlib.pyplot as plt

seed = 677

# Seed every random number generator the script draws from (Python's
# `random`, NumPy's global generator, and PyTorch) with the same value.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed)

if torch.cuda.is_available():
    # Seed all visible GPUs, then request deterministic cuDNN kernels and
    # switch off the cuDNN benchmark mode.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# --- Experiment identity ---------------------------------------------------
# Tag echoed in the final results banner.
code = "AL"
# Hugging Face checkpoint used for both the tokenizer and the classifier.
# The script has learning-rate settings for "bert-base-uncased",
# "roberta-base" and "distilroberta-base".
tokenizer_model = "roberta-base"
# Which part of the training data to learn from: "whole", "males" or "females".
initial_training_group = "whole"

# --- Sampling budget (fractions of the selected training pool) -------------
# Share of the pool drawn at random for the initial training set.
portion_of_initial_samples = 0.05
# Share of the pool acquired in each active-learning round.
portion_of_AL_samples = 0.20

# --- Training schedule -----------------------------------------------------
# Passes over the initial training set.
epochs = 1
# Number of active-learning rounds.
al_epochs = 1
# Passes over the grown training set inside each active-learning round.
epochs_per_al_epoch = 1


def _features_and_labels(frame):
    """Split a data frame into (all columns except 'task_label', the 'task_label' column)."""
    return frame.drop(columns='task_label'), frame['task_label']


def _gender_subset(features, targets, gender):
    """Return the rows of `features` / `targets` whose 'gender' column equals `gender`."""
    keep = features['gender'] == gender
    return features[keep].copy(), targets[keep]


# Load the three pre-split PAN16 CSV files.
training_df = pd.read_csv("PAN16_training_df.csv")
validation_df = pd.read_csv("PAN16_validation_df.csv")
test_df = pd.read_csv("PAN16_test_df.csv")

# Separate features from the task label for every split.
X_train, y_train = _features_and_labels(training_df)
X_valid, y_valid = _features_and_labels(validation_df)
X_test, y_test = _features_and_labels(test_df)

# Sanity check: every split has as many feature rows as labels.
for split_name, features, targets in (
    ("train", X_train, y_train),
    ("valid", X_valid, y_valid),
    ("test", X_test, y_test),
):
    assert len(features) == len(targets), f"Mismatch in X_{split_name} and y_{split_name} lengths"
print("Initial splits: feature-label alignment confirmed")

# Per-gender views of every split.
X_train_males, y_train_males = _gender_subset(X_train, y_train, 'male')
X_train_females, y_train_females = _gender_subset(X_train, y_train, 'female')

X_valid_males, y_valid_males = _gender_subset(X_valid, y_valid, 'male')
X_valid_females, y_valid_females = _gender_subset(X_valid, y_valid, 'female')

X_test_males, y_test_males = _gender_subset(X_test, y_test, 'male')
X_test_females, y_test_females = _gender_subset(X_test, y_test, 'female')

# Sanity check: each gender subset holds exactly one gender, the expected one.
for subset, expected_gender in (
    (X_train_males, 'male'),
    (X_train_females, 'female'),
    (X_valid_males, 'male'),
    (X_valid_females, 'female'),
    (X_test_males, 'male'),
    (X_test_females, 'female'),
):
    observed = subset['gender']
    assert observed.nunique() == 1 and observed.iloc[0] == expected_gender
print("Gender splits: all subsets contain only the expected gender")

# Only keep text
X_train, X_train_males, X_train_females = (
    X_train['text'], X_train_males['text'], X_train_females['text']
)
X_valid, X_valid_males, X_valid_females = (
    X_valid['text'], X_valid_males['text'], X_valid_females['text']
)
X_test, X_test_males, X_test_females = (
    X_test['text'], X_test_males['text'], X_test_females['text']
)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)


def tokenize_function(texts):
    """Tokenize an iterable of texts with the module-level `tokenizer`.

    The texts are materialized into a list, then encoded with truncation and
    padding to a fixed length of 64 tokens; the result is requested as
    PyTorch tensors.
    """
    text_batch = list(texts)
    encoding_options = dict(
        padding="max_length",
        truncation=True,
        max_length=64,
        return_tensors="pt",
    )
    return tokenizer(text_batch, **encoding_options)

class PAN16DATASET(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-computed encodings with labels.

    `encodings` must support `encodings['input_ids'][idx]` and
    `encodings['attention_mask'][idx]`; `labels` must support `len()` and
    positional `.iloc[idx]` access (e.g. a pandas Series).
    """

    # Encoding fields copied verbatim into every sample.
    _ENCODING_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Labels are looked up by position, so the Series index is irrelevant.
        sample = {key: self.encodings[key][idx] for key in self._ENCODING_KEYS}
        sample['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return sample

# Select training group
# Pick the training pool and the matching validation split for the
# configured group. An unrecognized group is rejected immediately instead of
# surfacing later as a NameError on `training_texts`.
if initial_training_group == "females":
    training_texts = X_train_females
    training_labels = y_train_females
    X_valid_group, y_valid_group = X_valid_females, y_valid_females
elif initial_training_group == "males":
    training_texts = X_train_males
    training_labels = y_train_males
    X_valid_group, y_valid_group = X_valid_males, y_valid_males
elif initial_training_group == "whole":
    training_texts = X_train
    training_labels = y_train
    X_valid_group, y_valid_group = X_valid, y_valid
else:
    raise ValueError(
        f"Unknown initial_training_group {initial_training_group!r}; "
        "expected 'females', 'males' or 'whole'"
    )

print(f"[INFO] Training group: {initial_training_group} — Number of available training examples: {len(training_texts)}")


# Budgets, both expressed as a fraction of the selected training pool:
# the size of the random initial set and the number of samples acquired in
# each active-learning round.
initial_samples = int(portion_of_initial_samples * len(training_texts))
samples_per_epoch = int(portion_of_AL_samples * len(training_texts))

# Draw the initial labeled set uniformly at random (without replacement)
# from the pool's index labels.
train_indices = np.random.choice(training_texts.index, size=initial_samples, replace=False)
current_train_X = training_texts.loc[train_indices]
current_train_y = training_labels.loc[train_indices]

# Everything that was not drawn forms the "unlabeled" pool for active learning.
remaining_indices = list(set(training_texts.index) - set(train_indices))
unlabeled_X = training_texts.loc[remaining_indices]
unlabeled_y = training_labels.loc[remaining_indices]

# Model
# Per-checkpoint settings: (learning rate, tag used in the index file name).
_MODEL_SETTINGS = {
    "bert-base-uncased": (2e-5, "BERT_base"),
    "roberta-base": (2e-5, "RoBERTa_base"),
    "distilroberta-base": (5e-5, "Distil_RoBERTa"),
}

# Reject unsupported checkpoints before loading any weights; otherwise `lr`
# and `index_file` would be undefined and fail later with a NameError.
if tokenizer_model not in _MODEL_SETTINGS:
    raise ValueError(
        f"Unsupported tokenizer_model {tokenizer_model!r}; "
        f"expected one of {sorted(_MODEL_SETTINGS)}"
    )
lr, _model_tag = _MODEL_SETTINGS[tokenizer_model]
index_file = f"AL_indices_{initial_training_group}_{_model_tag}_{seed}.csv"

# One output unit per distinct label value found in the training pool.
num_labels = len(set(training_labels))
model = AutoModelForSequenceClassification.from_pretrained(tokenizer_model, num_labels=num_labels)

optimizer = AdamW(model.parameters(), lr=lr)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Gradient scaling belongs to CUDA mixed precision; disable it explicitly
# when running on CPU so the scaler becomes a pass-through.
scaler = GradScaler(enabled=device.type == 'cuda')
model.to(device)


# Loaders for the initial labeled set and the validation split of the
# selected training group.
train_data = PAN16DATASET(tokenize_function(current_train_X), current_train_y)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)
valid_data = PAN16DATASET(tokenize_function(X_valid_group), y_valid_group)
valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=32)

# Autocast follows the device the model actually runs on and is switched off
# on CPU, instead of always requesting CUDA autocast.
use_amp = device.type == 'cuda'

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Initial training epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        with autocast(device_type=device.type, enabled=use_amp):
            outputs = model(input_ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
        scaler.scale(loss).backward()
        # Unscale before clipping so the 1.0 threshold applies to true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    # max(..., 1) avoids a ZeroDivisionError when the loader has no batches.
    print(f"Initial epoch {epoch+1} - Loss: {total_loss / max(len(train_loader), 1):.4f}")

    # Validation
    model.eval()
    predictions, labels_all = [], []
    with torch.no_grad():
        for batch in tqdm(valid_loader, desc="Initial validation"):
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids, attention_mask=mask).logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())
    print(classification_report(labels_all, predictions, target_names=["No Mention (0)", "Mention (1)"]))

# AL
# Each round: score the unlabeled pool by predictive entropy, move the most
# uncertain samples into the training set, plot the entropy distribution,
# retrain on the grown training set, and validate.

# Autocast follows the device the model actually runs on and is switched off
# on CPU, instead of always requesting CUDA autocast.
amp_enabled = device.type == 'cuda'

for al_epoch in range(al_epochs):
    print(f"\n=== Active Learning Round {al_epoch+1}/{al_epochs} ===")

    # Entropy of the softmax output for every sample still in the pool.
    model.eval()
    uncertainties = []
    with torch.no_grad():
        for i in range(0, len(unlabeled_X), 32):
            batch = tokenize_function(unlabeled_X.iloc[i:i+32])
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            logits = model(input_ids, attention_mask=mask).logits
            probs = torch.nn.functional.softmax(logits, dim=1)
            # The 1e-9 offset keeps log() finite when a probability is zero.
            entropy = -torch.sum(probs * torch.log(probs + 1e-9), dim=1)
            uncertainties.extend(entropy.cpu().numpy())

    # Take the `n_select` highest-entropy samples. Slicing from an explicit
    # start position (rather than `[-samples_per_epoch:]`) keeps a budget of
    # zero from selecting the entire pool, since `[-0:]` equals `[0:]`.
    ranked = np.argsort(uncertainties)
    n_select = min(samples_per_epoch, len(ranked))
    top_idxs = ranked[len(ranked) - n_select:]
    new_indices = unlabeled_X.index[top_idxs]
    current_train_X = pd.concat([current_train_X, unlabeled_X.loc[new_indices]])
    print(f"INFO: Number of current training examples: {len(current_train_X)}")
    current_train_y = pd.concat([current_train_y, unlabeled_y.loc[new_indices]])
    unlabeled_X = unlabeled_X.drop(new_indices)
    unlabeled_y = unlabeled_y.drop(new_indices)

    # Visualization
    # Both histograms share the same bin edges so they are directly comparable.
    entropy_array = np.array(uncertainties)
    selected_entropy = entropy_array[top_idxs]
    shared_bins = np.histogram_bin_edges(entropy_array, bins=50)

    fig = plt.figure(figsize=(14, 12))
    plt.hist(entropy_array, bins=shared_bins, alpha=0.5, label='All Unlabeled Samples', color='blue')
    plt.hist(selected_entropy, bins=shared_bins, alpha=0.7, label='Selected Samples', color='red')
    plt.xlabel('Entropy (Uncertainty)')
    plt.ylabel('Frequency')
    # The title reflects the configured model and training group instead of a
    # hard-coded "RoBERTa - Female Data Points" label.
    plt.title(
        'Entropy Distribution of Unlabeled Samples (Active Learning)\n'
        f'{tokenizer_model} - {initial_training_group} - PAN16'
    )
    plt.legend()
    plt.grid(True)

    # plt.savefig(f"AL_{initial_training_group}_{tokenizer_model}_{seed}.png", dpi=300) # png
    # NOTE: the file name is constant, so every round overwrites the previous plot.
    plt.savefig("AL.pdf", bbox_inches='tight') # pdf
    # Close the figure so open figures do not accumulate across rounds.
    plt.close(fig)
    print("Saved uncertainty visualization")

    # Retrain on the grown training set.
    train_data = PAN16DATASET(tokenize_function(current_train_X), current_train_y)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)

    for epoch in range(epochs_per_al_epoch):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"AL epoch {al_epoch+1} - Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            with autocast(device_type=device.type, enabled=amp_enabled):
                outputs = model(input_ids, attention_mask=mask, labels=labels)
                loss = outputs.loss
            scaler.scale(loss).backward()
            # Unscale before clipping so the 1.0 threshold applies to true gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        # max(..., 1) avoids a ZeroDivisionError when the loader has no batches.
        print(f"AL {al_epoch+1} epoch {epoch+1} - Loss: {total_loss / max(len(train_loader), 1):.4f}")

    # Validate on the split chosen for the training group.
    model.eval()
    predictions, labels_all = [], []
    with torch.no_grad():
        for batch in tqdm(valid_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids, attention_mask=mask).logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())
    print(classification_report(labels_all, predictions, target_names=["No Mention (0)", "Mention (1)"]))


# Record which training-pool index labels were used: the random initial draw
# and the ones added afterwards by active learning.
initial_indices_sorted = sorted(train_indices)
al_indices_sorted = sorted(set(current_train_X.index).difference(initial_indices_sorted))

# One column per group, both stored with the "Int64" dtype.
index_columns = {
    "initial_indices": initial_indices_sorted,
    "al_indices": al_indices_sorted,
}
index_df = pd.DataFrame({
    column: pd.Series(values, dtype="Int64")
    for column, values in index_columns.items()
})

index_df.to_csv(index_file, index=False)
print(f"Saved index tracking file to {index_file}")


# Final evaluation: the full test split plus each gender subset.
test_sets = {
    "whole": (X_test, y_test),
    "males": (X_test_males, y_test_males),
    "females": (X_test_females, y_test_females)
}

print(f"\n*** Used code: {code}. Training group: {initial_training_group}. Model: {tokenizer_model}. Seed: {seed} ***")

for name, (group_texts, group_labels) in test_sets.items():
    banner = name.upper()
    print(f"\n--- Testing on {banner} ---")

    test_data = PAN16DATASET(tokenize_function(group_texts), group_labels)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=32)

    # Collect the arg-max predictions and the true labels batch by batch.
    model.eval()
    preds, labels_all = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Testing on {banner}"):
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            preds.extend(logits.argmax(dim=1).cpu().numpy())
            labels_all.extend(batch['labels'].cpu().numpy())

    print(classification_report(labels_all, preds, target_names=["No Mention (0)", "Mention (1)"]))
    print("#####################################################")
