In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
import torch
import pandas as pd
from sklearn.metrics import classification_report
from torch.amp import GradScaler, autocast
from tqdm import tqdm
import numpy as np
import random
import transformers, sklearn, platform, sys
import time

# Reproducibility: drive every random number generator used by this script
# (Python's `random`, NumPy and PyTorch) from one shared seed.
seed = 677
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)

if torch.cuda.is_available():
    # Seed all CUDA devices as well, and configure cuDNN with autotuning
    # (benchmark) off and deterministic mode on.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# --- Experiment configuration ---
# Hugging Face checkpoint name; the same name is used to load both the
# tokenizer and the classification model.
tokenizer_model = "roberta-base"
# Free-form tag that is echoed in the results banner printed before testing.
code = "base_model"
# Which subset to train and validate on: "males", "females" or "whole".
training_group = "whole"
# Number of training epochs, and whether mixed precision (AMP) is requested.
epochs, use_amp = 2, True

# Load datasets
training_df = pd.read_csv("PAN16_training_df.csv")
validation_df = pd.read_csv("PAN16_validation_df.csv")
test_df = pd.read_csv("PAN16_test_df.csv")


def _split_features_labels(df, split_name):
    """Separate the 'task_label' target from the remaining columns of a frame.

    Args:
        df: DataFrame read from one of the PAN16 CSV files.
        split_name: Short identifier of the split ("train", "valid", "test"),
            used only to build error messages.

    Returns:
        Tuple ``(X, y)``: ``df`` without the 'task_label' column, and the
        'task_label' column itself.

    Raises:
        ValueError: If a column this script relies on is missing, or if the
            features and labels end up with different lengths.
    """
    # 'task_label' and 'gender' are used right below; 'text' is what gets
    # tokenized later in the script. Fail early with a readable message
    # instead of a bare KeyError further down.
    required = ('task_label', 'gender', 'text')
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise ValueError(f"{split_name} split is missing required column(s): {missing}")

    features = df.drop(columns='task_label')
    labels = df['task_label']
    # Explicit raise rather than `assert`: assertions are removed when Python
    # runs with -O, which would silently disable the check.
    if len(features) != len(labels):
        raise ValueError(f"Mismatch in X_{split_name} and y_{split_name} lengths")
    return features, labels


def _select_gender(X, y, gender, name):
    """Return the rows of ``X`` and ``y`` whose 'gender' value equals ``gender``.

    Args:
        X: Feature frame containing a 'gender' column.
        y: Label Series sharing ``X``'s index.
        gender: Gender value to keep ('male' or 'female').
        name: Name of the resulting subset, used in the error message.

    Returns:
        Tuple ``(X_subset, y_subset)``; ``X_subset`` is an independent copy.

    Raises:
        ValueError: If the subset does not consist of exactly the requested
            gender (in practice: if no row matched and the subset is empty).
    """
    # One mask, applied to both X and y, keeps features and labels aligned.
    mask = X['gender'] == gender
    X_subset = X[mask].copy()
    y_subset = y[mask]
    if X_subset['gender'].nunique() != 1 or X_subset['gender'].iloc[0] != gender:
        raise ValueError(f"{name} has unexpected gender values")
    return X_subset, y_subset


X_train, y_train = _split_features_labels(training_df, "train")
X_valid, y_valid = _split_features_labels(validation_df, "valid")
X_test, y_test = _split_features_labels(test_df, "test")
print("Initial splits: feature-label alignment confirmed")

X_train_males, y_train_males = _select_gender(X_train, y_train, 'male', "X_train_males")
X_train_females, y_train_females = _select_gender(X_train, y_train, 'female', "X_train_females")

X_valid_males, y_valid_males = _select_gender(X_valid, y_valid, 'male', "X_valid_males")
X_valid_females, y_valid_females = _select_gender(X_valid, y_valid, 'female', "X_valid_females")

X_test_males, y_test_males = _select_gender(X_test, y_test, 'male', "X_test_males")
X_test_females, y_test_females = _select_gender(X_test, y_test, 'female', "X_test_females")
print("Gender splits: all subsets contain only the expected gender")

# Final sanity pass: every gender subset must have one label per row.
for name, X_split, y_split in [
    ("X_train_males", X_train_males, y_train_males),
    ("X_train_females", X_train_females, y_train_females),
    ("X_valid_males", X_valid_males, y_valid_males),
    ("X_valid_females", X_valid_females, y_valid_females),
    ("X_test_males", X_test_males, y_test_males),
    ("X_test_females", X_test_females, y_test_females),
]:
    if len(X_split) != len(y_split):
        raise ValueError(f"Mismatch in {name} and corresponding labels")
    print(f"{name}: {len(X_split)} samples")

print("All splits are valid and properly structured.")


# Select training group
# Reject unknown group names up front; everything below assumes a valid one.
if training_group not in ("females", "males", "whole"):
    raise ValueError(f"Invalid training_group '{training_group}'. Must be one of: 'females', 'males', 'whole'.")

if training_group == "whole":
    # Full data: both genders must be present in training and validation.
    both_genders = {'male', 'female'}
    X_train_group, X_valid_group = X_train['text'], X_valid['text']
    y_train_group, y_valid_group = y_train, y_valid
    assert set(X_train['gender'].unique()) == both_genders, "Training set does not contain both genders"
    assert set(X_valid['gender'].unique()) == both_genders, "Validation set does not contain both genders"
    print("Training and validation datasets contain males and females")
else:
    # Single-gender run: pick the matching pre-computed subsets.
    single_gender_subsets = {
        "females": (X_train_females, y_train_females, X_valid_females, y_valid_females),
        "males": (X_train_males, y_train_males, X_valid_males, y_valid_males),
    }
    train_frame, y_train_group, valid_frame, y_valid_group = single_gender_subsets[training_group]
    X_train_group = train_frame['text']
    X_valid_group = valid_frame['text']
    gender_word = training_group[:-1]  # "females" -> "female", "males" -> "male"
    assert train_frame['gender'].nunique() == 1, f"Training set contains non-{gender_word} entries"
    assert valid_frame['gender'].nunique() == 1, f"Validation set contains non-{gender_word} entries"
    print(f"Training and validation datasets contain only {training_group.upper()}")

n_training_examples = len(X_train_group)
print(f"[INFO] Training group: {training_group} — Number of training examples: {n_training_examples}")


# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

def tokenize_function(texts, max_length=64):
    """Tokenize a collection of texts into fixed-length PyTorch tensors.

    Args:
        texts: Iterable of raw text strings (e.g. a pandas Series). It is
            materialised as a list before being handed to the tokenizer.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 64, the value that used to be hard-coded here.

    Returns:
        The tokenizer's output for the whole batch, requested as PyTorch
        tensors (``return_tensors="pt"``). The dataset class in this script
        reads its 'input_ids' and 'attention_mask' entries.
    """
    return tokenizer(
        list(texts),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

class PAN16DATASET(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenized inputs with integer labels.

    Args:
        encodings: Mapping with 'input_ids' and 'attention_mask' entries,
            each indexable by sample position.
        labels: One integer class label per sample, in the same order as the
            encodings. Any sized iterable works (pandas Series, list, NumPy
            array); a Series is read by position, not by index label.

    Raises:
        ValueError: If the number of labels differs from the number of
            encoded samples.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        # Convert the labels to a tensor once, by position, instead of doing
        # a `.iloc` lookup and a tensor allocation on every __getitem__ call.
        # Note: this is a snapshot; later changes to `labels` are not seen.
        self._label_tensor = torch.tensor(list(labels), dtype=torch.long)

        n_encoded = len(encodings['input_ids'])
        n_labels = len(self._label_tensor)
        if n_encoded != n_labels:
            raise ValueError(
                f"Number of encoded samples ({n_encoded}) does not match "
                f"number of labels ({n_labels})"
            )

    def __getitem__(self, idx):
        """Return the model inputs and the label of the sample at ``idx``."""
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self._label_tensor[idx],
        }

    def __len__(self):
        """Return the number of samples."""
        return len(self._label_tensor)

# Learning rate per supported checkpoint. Resolve it before any expensive
# work so an unsupported checkpoint fails immediately with a clear message
# (previously `lr` was simply left undefined, producing a NameError when the
# optimizer was created).
learning_rates = {
    "bert-base-uncased": 2e-5,
    "roberta-base": 2e-5,
    "distilroberta-base": 5e-5,
}
if tokenizer_model not in learning_rates:
    raise ValueError(
        f"No learning rate configured for model '{tokenizer_model}'. "
        f"Supported models: {sorted(learning_rates)}"
    )
lr = learning_rates[tokenizer_model]

# Tokenize the selected training/validation texts once, up front.
train_dataset = PAN16DATASET(tokenize_function(X_train_group), y_train_group)
valid_dataset = PAN16DATASET(tokenize_function(X_valid_group), y_valid_group)

# Only the training loader shuffles; validation keeps the original order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=32)

# Model
# The classification head gets one output per distinct training label.
num_labels = len(set(y_train_group))
model = AutoModelForSequenceClassification.from_pretrained(tokenizer_model, num_labels=num_labels)

optimizer = AdamW(model.parameters(), lr=lr)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Gradient scaling is only meaningful for CUDA mixed precision. A disabled
# scaler turns scale/unscale_/update into no-ops and makes step() call
# optimizer.step() directly, so it stays safe to use on CPU or without AMP.
scaler = GradScaler(enabled=use_amp and device.type == 'cuda')
model.to(device)
print(f"\nUsing device: {device}")


# Mixed precision is used only when it was requested AND the model actually
# lives on a CUDA device; on CPU the plain full-precision path is taken
# instead of entering a CUDA autocast context that cannot apply.
amp_enabled = use_amp and device.type == 'cuda'

# Class ids and their display names for the reports. Passing the ids
# explicitly keeps classification_report from raising when one class is
# absent from both the true and the predicted labels.
report_labels = [0, 1]
report_names = ["No Mention (0)", "Mention (1)"]

start_time = time.time()
for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        if amp_enabled:
            with autocast(device_type='cuda'):
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            scaler.scale(loss).backward()
            # Unscale first so the clipping threshold applies to the real
            # gradient magnitudes rather than the scaled ones.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Average Training Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(valid_loader, desc=f"Validation Epoch {epoch + 1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit.
            preds = torch.argmax(outputs.logits, dim=-1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    print(f"\nValidation Report for Epoch {epoch + 1}:")
    print(classification_report(true_labels, predictions, labels=report_labels, target_names=report_names))
end_time = time.time()
# The measured time covers the per-epoch validation passes as well.
print(f"Training took {end_time - start_time:.2f} seconds")

def _collect_predictions(loader, description):
    """Run the model over ``loader`` and gather predicted and true class ids.

    Args:
        loader: DataLoader yielding batches with 'input_ids',
            'attention_mask' and 'labels' entries.
        description: Text shown next to the progress bar.

    Returns:
        Tuple ``(predicted, actual)`` of plain Python lists of class ids.
    """
    predicted, actual = [], []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(loader, desc=description):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit.
            preds = torch.argmax(outputs.logits, dim=-1)

            predicted.extend(preds.cpu().tolist())
            # Labels are only needed for the report, so they never have to
            # be moved to the model's device.
            actual.extend(batch['labels'].tolist())
    return predicted, actual


# The trained model is evaluated on the full test set and on each gender
# subset, whichever group it was trained on.
test_groups = {
    "whole": (X_test['text'], y_test),
    "males": (X_test_males['text'], y_test_males),
    "females": (X_test_females['text'], y_test_females)
}
print(f"\n*** Used code: {code}. Training group: {training_group}. Model: {tokenizer_model}. Seed: {seed} ***")

for group_name, (X_group, y_group) in test_groups.items():
    print(f"\nTesting on {group_name.upper()} group")
    test_dataset = PAN16DATASET(tokenize_function(X_group), y_group)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

    predictions, true_labels = _collect_predictions(test_loader, f"Testing {group_name}")

    print("\nClassification Report:")
    # Explicit `labels` keep the report working even if a subset (or the
    # model's predictions for it) happens to contain only one class;
    # otherwise the two target names would not match the classes found.
    print(classification_report(
        true_labels,
        predictions,
        labels=[0, 1],
        target_names=["No Mention (0)", "Mention (1)"],
    ))
    print("#####################################################")
