### Initial Multitask Model

In [1]:
import socket
print(f"Running on: {socket.gethostname()}")

Running on: landonia13.inf.ed.ac.uk


In [2]:
# check CUDA available
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

CUDA available: True
CUDA device count: 1
Current device: 0
Device name: NVIDIA GeForce GTX 1060 6GB


### Import Necessary Libraries

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration # huggingface T5 model + tokenizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, confusion_matrix, classification_report
import torchmetrics
from model import MultitaskModel, MultitaskDataset, train, evaluate, evaluate_with_sampling, create_weighted_sampler, train_model
from sklearn.model_selection import train_test_split
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
import nltk
import random

### Import Dataset

In [4]:
mwr_df_simple = pd.read_csv('mwr_simple.csv')

### Extract Features

In [5]:
print(mwr_df_simple.columns)

Index(['Examination ID', 'Conclusion', 'r:Th', 'Weight', 'Height',
       'Ambient temperature', 'r:AgeInYears', 'Mammary diameter', 'Cycle',
       'Day from the first day', 'Hormonal medications',
       'Cancer family history', 'Breast operations', 'Num of pregnancies',
       'R1 int', 'L1 int', 'R2 int', 'L2 int', 'R3 int', 'L3 int', 'R4 int',
       'L4 int', 'R5 int', 'L5 int', 'R6 int', 'L6 int', 'R7 int', 'L7 int',
       'R8 int', 'L8 int', 'R9 int', 'L9 int', 'T1 int', 'T2 int', 'R0 int',
       'L0 int', 'R1 sk', 'L1 sk', 'R2 sk', 'L2 sk', 'R3 sk', 'L3 sk', 'R4 sk',
       'L4 sk', 'R5 sk', 'L5 sk', 'R6 sk', 'L6 sk', 'R7 sk', 'L7 sk', 'R8 sk',
       'L8 sk', 'R9 sk', 'L9 sk', 'T1 sk', 'T2 sk', 'R0 sk', 'L0 sk',
       'Conclusion (Tr)', 'Synthetic_Conclusion', 'y_binary'],
      dtype='object')


In [6]:
# Select feature columns (temperature readings)
feature_cols = [col for col in mwr_df_simple.columns if col.endswith('int') or col.endswith('sk')]
mwr_df_simple['features'] = mwr_df_simple[feature_cols].values.tolist()

# Prepare labels and text targets (binary classification)
mwr_df_simple['class_label'] = mwr_df_simple['y_binary'].astype(int)
mwr_df_simple['synthetic_description'] = mwr_df_simple['Synthetic_Conclusion']

In [7]:
# check class distribution
print(mwr_df_simple['class_label'].value_counts(normalize=True))

class_label
1    0.75883
0    0.24117
Name: proportion, dtype: float64


### Split Data into Test and Train

In [8]:
# 70% training, 15% validation, 15% testing
train_simple_df, temp_simple_df = train_test_split(mwr_df_simple, test_size=0.3, random_state=42)
val_simple_df, test_simple_df = train_test_split(temp_simple_df, test_size=0.5, random_state=42)

### Model Initialisation

In [9]:
print("Loading tokenizer...")
tokenizer = T5Tokenizer.from_pretrained('./t5-small-local/', local_files_only=True)

print("Creating datasets...")
train_dataset = MultitaskDataset(train_simple_df, tokenizer)
val_dataset = MultitaskDataset(val_simple_df, tokenizer)
test_dataset = MultitaskDataset(test_simple_df, tokenizer)

# print("Creatinf Weighted Sampler")
# weighted_sampler = create_weighted_sampler(train_dataset)

print("Creating dataloaders...")
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) #sampler=weighted_sampler)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print("Setting up device...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Initializing model...")
model = MultitaskModel(num_classes=2, t5_model_name='./t5-small-local/')
print(model.generation_weight, model.classification_weight)

print("Moving model to device...")
model = model.to(device)

#print("Getting parameter groups")
#encoder_params, clf_params, gen_params = get_parameter_groups(model)
#print("✓ Parameter groups created")

print("Setting up optimizer...")
optimizer = optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.01)
#optimizer = optim.AdamW([
#    {'params': encoder_params, 'lr': 0.000025},     # Conservative for shared encoder
#    {'params': clf_params, 'lr': 0.00005},          # Higher for classification + projection
#    {'params': gen_params, 'lr': 0.0001}            # Higher for T5 decoder
#], weight_decay=0.01)
print(f"After .to(device) - CLF: {model.classification_weight}, GEN: {model.generation_weight}")
print("✓ Setup complete!")

Loading tokenizer...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


✓ Tokenizer loaded
Creating datasets...
✓ Datasets created
Creating weighted sampler for balanced training...
✓ Weighted sampler created
Creating dataloaders...
✓ Dataloaders created
Setting up device...
✓ Using device: cuda
Initializing model...
0.001 1.0
✓ Model initialized
Moving model to device...
✓ Model moved to device
After .to(device) - CLF: 1.0, GEN: 0.001
Setting up optimizer...
After .to(device) - CLF: 1.0, GEN: 0.001
✓ Setup complete!


In [None]:
tokenizer, train_loader, val_loader, test_loader, model, optimizer, device = setup_training_pipeline(
    df_train=train_simple_df,
    df_val=val_simple_df,
    df_test=test_simple_df,
    multitask_model_class=MultitaskModel,
    multitask_dataset_class=MultitaskDataset
)

### Train Model

In [None]:
best_acc, best_f1, best_auc = train_and_validate_model(
    model, train_loader, val_loader, optimizer, device, num_epochs=30
)

In [10]:
# Training loop, gen=0
num_epochs = 30
best_accuracy = 0
best_f1 = 0
best_auc = 0

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Train
    train_loss = train(model, train_loader, optimizer, device)
    print(f"Training loss: {train_loss:.4f}")
    
    # Validate
    val_loss, val_metrics = evaluate(model, val_loader, device)
    print(f"Validation loss: {val_loss:.4f}")
    print(f"Validation metrics:")
    print(f"  Accuracy     : {val_metrics['accuracy']:.4f}")
    print(f"  F1-Score     : {val_metrics['f1_score']:.4f}")
    print(f"  Sensitivity  : {val_metrics['sensitivity']:.4f}")
    print(f"  Specificity  : {val_metrics['specificity']:.4f}")
    print(f"  AUC-ROC      : {val_metrics['auc_roc']:.4f}" if val_metrics['auc_roc'] is not None else "  AUC-ROC      : N/A")
    print(f"  Confusion Matrix: {val_metrics['confusion_matrix']}")

    # Track best accuracy (or optionally best F1/AUC too)
    if val_metrics['accuracy'] > best_accuracy:
        best_accuracy = val_metrics['accuracy']
        print(f"✓ New best accuracy: {best_accuracy:.4f}")
    
    # Track best AUC and F1 if relevant
    if val_metrics['f1_score'] > best_f1:
        best_f1 = val_metrics['f1_score']
        print(f"✓ New best F1: {best_f1:.4f}")

    if val_metrics['auc_roc'] is not None and val_metrics['auc_roc'] > best_auc:
        best_auc = val_metrics['auc_roc']
        print(f"✓ New best AUC-ROC: {best_auc:.4f}")
    
    # Print the learned loss weights every few epochs
    if (epoch + 1) % 3 == 0:
        print(f"Current loss weights - CLF: {model.classification_weight:.3f}, GEN: {model.generation_weight:.3f}")


Epoch 1/30


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Training loss: 0.6356
Validation loss: 0.5216
Validation metrics:
  Accuracy     : 0.7494
  F1-Score     : 0.7314
  Sensitivity  : 0.8865
  Specificity  : 0.3247
  AUC-ROC      : 0.7126
  Confusion Matrix: [[288, 599], [312, 2436]]
✓ New best accuracy: 0.7494
✓ New best F1: 0.7314
✓ New best AUC-ROC: 0.7126

Epoch 2/30
Training loss: 0.5556
Validation loss: 0.5306
Validation metrics:
  Accuracy     : 0.7365
  F1-Score     : 0.7339
  Sensitivity  : 0.8344
  Specificity  : 0.4329
  AUC-ROC      : 0.7294
  Confusion Matrix: [[384, 503], [455, 2293]]
✓ New best F1: 0.7339
✓ New best AUC-ROC: 0.7294

Epoch 3/30
Training loss: 0.5283
Validation loss: 0.5307
Validation metrics:
  Accuracy     : 0.7227
  F1-Score     : 0.7285
  Sensitivity  : 0.7944
  Specificity  : 0.5006
  AUC-ROC      : 0.7429
  Confusion Matrix: [[444, 443], [565, 2183]]
✓ New best AUC-ROC: 0.7429
Current loss weights - CLF: 1.000, GEN: 0.000

Epoch 4/30
Training loss: 0.5217
Validation loss: 0.5108
Validation metrics:
  A

Training loss: 0.4465
Validation loss: 0.4997
Validation metrics:
  Accuracy     : 0.7367
  F1-Score     : 0.7508
  Sensitivity  : 0.7515
  Specificity  : 0.6911
  AUC-ROC      : 0.8020
  Confusion Matrix: [[613, 274], [683, 2065]]

Epoch 30/30
Training loss: 0.4485
Validation loss: 0.4987
Validation metrics:
  Accuracy     : 0.7400
  F1-Score     : 0.7524
  Sensitivity  : 0.7656
  Specificity  : 0.6607
  AUC-ROC      : 0.7962
  Confusion Matrix: [[586, 301], [644, 2104]]
Current loss weights - CLF: 1.000, GEN: 0.000


In [11]:
# test evaluation, gen=0
print(f"\nTraining completed!")
print(f"Best results from validation:")
print(f"  Best Accuracy: {best_accuracy:.4f}")
print(f"  Best F1: {best_f1:.4f}")
print(f"  Best AUC-ROC: {best_auc:.4f}")

print("\n" + "="*50)
print("FINAL TEST SET EVALUATION")
print("="*50)

# Quick classification metrics on full test set
print("\n1. Full Classification Performance:")
test_loss, test_metrics = evaluate(model, test_loader, device)
print(f"Test loss: {test_loss:.4f}")
print(f"Classification metrics (all {len(test_dataset)} samples):")
print(f"  Accuracy     : {test_metrics['accuracy']:.4f}")
print(f"  F1-Score     : {test_metrics['f1_score']:.4f}")
print(f"  Sensitivity  : {test_metrics['sensitivity']:.4f}")
print(f"  Specificity  : {test_metrics['specificity']:.4f}")
print(f"  AUC-ROC      : {test_metrics['auc_roc']:.4f}" if test_metrics['auc_roc'] is not None else "  AUC-ROC      : N/A")

# Sampled text generation metrics
print("\n2. Sampled Text Generation Performance:")
# Use 500 samples for text metrics (adjust as needed)
test_loss_sampled, test_metrics_sampled = evaluate_with_sampling(
    model, test_loader, device, tokenizer, text_sample_size=500
)

print(f"Text generation metrics ({test_metrics_sampled['text_samples_used']} samples):")
if test_metrics_sampled['avg_bertscore_f1'] is not None:
    print(f"  BERTScore F1 : {test_metrics_sampled['avg_bertscore_f1']:.4f}")
else:
    print(f"  BERTScore F1 : N/A")

if test_metrics_sampled['avg_meteor'] is not None:
    print(f"  METEOR       : {test_metrics_sampled['avg_meteor']:.4f}")
else:
    print(f"  METEOR       : N/A")

print(f"\nSample efficiency: {test_metrics_sampled['text_samples_used']}/{test_metrics_sampled['total_samples']} samples used for text metrics")


Training completed!
Best results from validation:
  Best Accuracy: 0.7838
  Best F1: 0.7776
  Best AUC-ROC: 0.8060

FINAL TEST SET EVALUATION

1. Full Classification Performance:


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Test loss: 0.4777
Classification metrics (all 3636 samples):
  Accuracy     : 0.7591
  F1-Score     : 0.7708
  Sensitivity  : 0.7881
  Specificity  : 0.6619
  AUC-ROC      : 0.8179

2. Sampled Text Generation Performance:
Sampling 500 out of 3636 samples for text generation metrics


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

Computing text metrics on 500 samples...
Error calculating BERT Score: '/home/s2080063/MWR-to-Text/models/roberta-large'
Text generation metrics (500 samples):
  BERTScore F1 : N/A
  METEOR       : 0.4531

Sample efficiency: 500/3636 samples used for text metrics


In [10]:
# Training loop, 0.1, gen weight=0.001
num_epochs = 30
best_accuracy = 0
best_f1 = 0
best_auc = 0

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Train
    train_loss = train(model, train_loader, optimizer, device)
    print(f"Training loss: {train_loss:.4f}")
    
    # Validate
    val_loss, val_metrics = evaluate(model, val_loader, device)
    print(f"Validation loss: {val_loss:.4f}")
    print(f"Validation metrics:")
    print(f"  Accuracy     : {val_metrics['accuracy']:.4f}")
    print(f"  F1-Score     : {val_metrics['f1_score']:.4f}")
    print(f"  Sensitivity  : {val_metrics['sensitivity']:.4f}")
    print(f"  Specificity  : {val_metrics['specificity']:.4f}")
    print(f"  AUC-ROC      : {val_metrics['auc_roc']:.4f}" if val_metrics['auc_roc'] is not None else "  AUC-ROC      : N/A")
    print(f"  Confusion Matrix: {val_metrics['confusion_matrix']}")

    # Track best accuracy (or optionally best F1/AUC too)
    if val_metrics['accuracy'] > best_accuracy:
        best_accuracy = val_metrics['accuracy']
        print(f"✓ New best accuracy: {best_accuracy:.4f}")
    
    # Track best AUC and F1 if relevant
    if val_metrics['f1_score'] > best_f1:
        best_f1 = val_metrics['f1_score']
        print(f"✓ New best F1: {best_f1:.4f}")

    if val_metrics['auc_roc'] is not None and val_metrics['auc_roc'] > best_auc:
        best_auc = val_metrics['auc_roc']
        print(f"✓ New best AUC-ROC: {best_auc:.4f}")
    
    # Print the learned loss weights every few epochs
    if (epoch + 1) % 3 == 0:
        print(f"Current loss weights - CLF: {model.classification_weight:.3f}, GEN: {model.generation_weight:.3f}")


Epoch 1/30


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Training loss: 0.7206
Validation loss: 0.5232
Validation metrics:
  Accuracy     : 0.7486
  F1-Score     : 0.7113
  Sensitivity  : 0.9221
  Specificity  : 0.2108
  AUC-ROC      : 0.6963
  Confusion Matrix: [[187, 700], [214, 2534]]
✓ New best accuracy: 0.7486
✓ New best F1: 0.7113
✓ New best AUC-ROC: 0.6963

Epoch 2/30
Training loss: 0.5599
Validation loss: 0.5088
Validation metrics:
  Accuracy     : 0.7629
  F1-Score     : 0.7282
  Sensitivity  : 0.9309
  Specificity  : 0.2424
  AUC-ROC      : 0.7228
  Confusion Matrix: [[215, 672], [190, 2558]]
✓ New best accuracy: 0.7629
✓ New best F1: 0.7282
✓ New best AUC-ROC: 0.7228

Epoch 3/30
Training loss: 0.5391
Validation loss: 0.5003
Validation metrics:
  Accuracy     : 0.7554
  F1-Score     : 0.7336
  Sensitivity  : 0.8999
  Specificity  : 0.3078
  AUC-ROC      : 0.7387
  Confusion Matrix: [[273, 614], [275, 2473]]
✓ New best F1: 0.7336
✓ New best AUC-ROC: 0.7387
Current loss weights - CLF: 1.000, GEN: 0.001

Epoch 4/30
Training loss: 0.52

Training loss: 0.4408
Validation loss: 0.4697
Validation metrics:
  Accuracy     : 0.7618
  F1-Score     : 0.7671
  Sensitivity  : 0.8184
  Specificity  : 0.5862
  AUC-ROC      : 0.8041
  Confusion Matrix: [[520, 367], [499, 2249]]

Epoch 30/30
Training loss: 0.4410
Validation loss: 0.4848
Validation metrics:
  Accuracy     : 0.7538
  F1-Score     : 0.7614
  Sensitivity  : 0.8020
  Specificity  : 0.6043
  AUC-ROC      : 0.7976
  Confusion Matrix: [[536, 351], [544, 2204]]
Current loss weights - CLF: 1.000, GEN: 0.001


In [11]:
# test evaluation, gen=0.001
print(f"\nTraining completed!")
print(f"Best results from validation:")
print(f"  Best Accuracy: {best_accuracy:.4f}")
print(f"  Best F1: {best_f1:.4f}")
print(f"  Best AUC-ROC: {best_auc:.4f}")

print("\n" + "="*50)
print("FINAL TEST SET EVALUATION")
print("="*50)

# Quick classification metrics on full test set
print("\n1. Full Classification Performance:")
test_loss, test_metrics = evaluate(model, test_loader, device)
print(f"Test loss: {test_loss:.4f}")
print(f"Classification metrics (all {len(test_dataset)} samples):")
print(f"  Accuracy     : {test_metrics['accuracy']:.4f}")
print(f"  F1-Score     : {test_metrics['f1_score']:.4f}")
print(f"  Sensitivity  : {test_metrics['sensitivity']:.4f}")
print(f"  Specificity  : {test_metrics['specificity']:.4f}")
print(f"  AUC-ROC      : {test_metrics['auc_roc']:.4f}" if test_metrics['auc_roc'] is not None else "  AUC-ROC      : N/A")

# Sampled text generation metrics
print("\n2. Sampled Text Generation Performance:")
# Use 500 samples for text metrics (adjust as needed)
test_loss_sampled, test_metrics_sampled = evaluate_with_sampling(
    model, test_loader, device, tokenizer, text_sample_size=500
)

print(f"Text generation metrics ({test_metrics_sampled['text_samples_used']} samples):")
if test_metrics_sampled['avg_bertscore_f1'] is not None:
    print(f"  BERTScore F1 : {test_metrics_sampled['avg_bertscore_f1']:.4f}")
else:
    print(f"  BERTScore F1 : N/A")

if test_metrics_sampled['avg_meteor'] is not None:
    print(f"  METEOR       : {test_metrics_sampled['avg_meteor']:.4f}")
else:
    print(f"  METEOR       : N/A")

print(f"\nSample efficiency: {test_metrics_sampled['text_samples_used']}/{test_metrics_sampled['total_samples']} samples used for text metrics")


Training completed!
Best results from validation:
  Best Accuracy: 0.7846
  Best F1: 0.7750
  Best AUC-ROC: 0.8084

FINAL TEST SET EVALUATION

1. Full Classification Performance:


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Test loss: 0.4716
Classification metrics (all 3636 samples):
  Accuracy     : 0.7701
  F1-Score     : 0.7773
  Sensitivity  : 0.8192
  Specificity  : 0.6057
  AUC-ROC      : 0.8068

2. Sampled Text Generation Performance:
Sampling 500 out of 3636 samples for text generation metrics


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

Computing text metrics on 500 samples...
Error calculating BERT Score: '/home/s2080063/MWR-to-Text/models/roberta-large'
Text generation metrics (500 samples):
  BERTScore F1 : N/A
  METEOR       : 0.9961

Sample efficiency: 500/3636 samples used for text metrics


In [12]:
def evaluate_with_sampling(model, dataloader, device, tokenizer, threshold=0.5, text_sample_size=500):
    """
    Evaluate classification metrics on the full dataset and text generation metrics on a sampled subset.
    
    Args:
        model: The model to evaluate.
        dataloader: DataLoader for the evaluation set.
        device: Device to run the model on.
        tokenizer: Tokenizer for decoding generated text.
        threshold: Threshold for converting probabilities to binary predictions.
        text_sample_size: Number of samples to use for generation and text metrics.
        
    Returns:
        Tuple containing:
            - average loss,
            - classification and generation metric dictionary,
            - list of generated texts,
            - list of reference texts
    """
    model.eval()
    clf_loss_fn = nn.CrossEntropyLoss()
    total_loss = 0
    all_probs = []
    all_labels = []
    
    generated_texts = []
    reference_texts = []
    sample_indices = set()
    total_samples = len(dataloader.dataset)
    
    # Randomly select indices for text generation sampling
    if text_sample_size < total_samples:
        sample_indices = set(random.sample(range(total_samples), text_sample_size))
        print(f"Sampling {text_sample_size} out of {total_samples} samples for text generation metrics")
    else:
        sample_indices = set(range(total_samples))
        print(f"Using all {total_samples} samples for text generation metrics")
    
    current_idx = 0

    with torch.no_grad():
        for features, labels, input_ids, attention_mask, target_ids in dataloader:
            features, labels = features.to(device), labels.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target_ids = target_ids.to(device)
            
            class_logits, gen_loss, _, clf_weight, gen_weight = model(
                features, input_ids, attention_mask, target_ids
            )
            
            clf_loss = clf_loss_fn(class_logits, labels)
            
            if gen_loss is not None:
                batch_loss = clf_weight * clf_loss + gen_weight * gen_loss
            else:
                batch_loss = clf_loss
            
            total_loss += batch_loss.item()
            
            # Classification prediction
            probs = torch.softmax(class_logits, dim=1)
            all_probs.extend(probs[:, 1].detach().cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            
            # Text generation for sampled indices
            batch_size = features.size(0)
            for i in range(batch_size):
                if current_idx + i in sample_indices:
                    sample_input_ids = input_ids[i:i+1]
                    sample_attention_mask = attention_mask[i:i+1]
                    sample_target_ids = target_ids[i:i+1]
                    
                    generated_ids = model.t5.generate(
                        input_ids=sample_input_ids,
                        attention_mask=sample_attention_mask,
                        max_length=64,
                        do_sample=False,
                        early_stopping=True
                    )
                    
                    gen_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
                    ref_text = tokenizer.decode(sample_target_ids[0], skip_special_tokens=True)
                    
                    generated_texts.append(gen_text)
                    reference_texts.append(ref_text)
            
            current_idx += batch_size

    # Compute classification metrics
    all_probs = np.array(all_probs)
    all_labels = np.array(all_labels)
    preds = (all_probs > threshold).astype(int)
    
    accuracy = accuracy_score(all_labels, preds)
    f1 = f1_score(all_labels, preds, average='weighted')
    sensitivity = recall_score(all_labels, preds, pos_label=1)
    specificity = recall_score(all_labels, preds, pos_label=0)
    try:
        auc = roc_auc_score(all_labels, all_probs)
    except ValueError:
        auc = None
    conf_matrix = confusion_matrix(all_labels, preds)

    # Compute text generation metrics
    bert_f1_score = None
    meteor_mean_score = None

    if generated_texts and reference_texts:
        print(f"Computing text metrics on {len(generated_texts)} samples...")

        # BERTScore
        try:
            from bert_score import score as bert_score
            P, R, F1 = bert_score(
                generated_texts,
                reference_texts,
                model_type="/home/s2080063/MWR-to-Text/models/roberta-large",  # Use your local model path
                lang="en",
                verbose=False
            )
            bert_f1_score = F1.mean().item()
        except Exception as e:
            print(f"Error calculating BERT Score: {e}")

        # METEOR
        try:
            import nltk
            from nltk.translate.meteor_score import meteor_score
            nltk.data.path.append('/home/s2080063/nltk_data')  # Adjust if needed
            meteor_scores = []
            for gen, ref in zip(generated_texts, reference_texts):
                gen_tokens = nltk.word_tokenize(gen.lower())
                ref_tokens = nltk.word_tokenize(ref.lower())
                meteor_scores.append(meteor_score([ref_tokens], gen_tokens))
            meteor_mean_score = np.mean(meteor_scores)
        except Exception as e:
            print(f"Error calculating METEOR Score: {e}")

    metrics = {
        'accuracy': accuracy,
        'f1_score': f1,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'auc_roc': auc,
        'confusion_matrix': conf_matrix.tolist(),
        'threshold': threshold,
        'avg_bertscore_f1': bert_f1_score,
        'avg_meteor': meteor_mean_score,
        'text_samples_used': len(generated_texts),
        'total_samples': total_samples
    }

    return total_loss / len(dataloader), metrics, generated_texts, reference_texts


In [13]:
test_loss_sampled, test_metrics_sampled, generated_texts, reference_texts = evaluate_with_sampling(
    model, test_loader, device, tokenizer, text_sample_size=500
)

# Show a few sample generations
print("\nSample Generated Texts:")
for i in range(min(5, len(generated_texts))):
    print(f"\nSample {i + 1}:")
    print(f"  Reference : {reference_texts[i]}")
    print(f"  Generated : {generated_texts[i]}")

Sampling 500 out of 3636 samples for text generation metrics


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

Computing text metrics on 500 samples...
Error calculating BERT Score: '/home/s2080063/MWR-to-Text/models/roberta-large'

Sample Generated Texts:

Sample 1:
  Reference : Increased temperature (surface and depth), partial asymmetry.
  Generated : Increased temperature (surface and depth), partial asymmetry.

Sample 2:
  Reference : Moderately elevated temperature (surface).
  Generated : Moderately elevated temperature (surface).

Sample 3:
  Reference : Slightly elevated temperature.
  Generated : Slightly elevated temperature.

Sample 4:
  Reference : Slightly elevated temperature.
  Generated : Slightly elevated temperature.

Sample 5:
  Reference : Slightly elevated temperature.
  Generated : Slightly elevated temperature.


In [None]:
# Training loop, 0.1, gen weight=0.0005
num_epochs = 30
best_accuracy = 0
best_f1 = 0
best_auc = 0

for epoch in range(num_epochs):
    print(f"Current loss weights - CLF: {model.classification_weight:.4f}, GEN: {model.generation_weight:.4f}")
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Train
    train_loss = train(model, train_loader, optimizer, device)
    print(f"Training loss: {train_loss:.4f}")
    
    # Validate
    val_loss, val_metrics = evaluate(model, val_loader, device)
    print(f"Validation loss: {val_loss:.4f}")
    print(f"Validation metrics:")
    print(f"  Accuracy     : {val_metrics['accuracy']:.4f}")
    print(f"  F1-Score     : {val_metrics['f1_score']:.4f}")
    print(f"  Sensitivity  : {val_metrics['sensitivity']:.4f}")
    print(f"  Specificity  : {val_metrics['specificity']:.4f}")
    print(f"  AUC-ROC      : {val_metrics['auc_roc']:.4f}" if val_metrics['auc_roc'] is not None else "  AUC-ROC      : N/A")
    print(f"  Confusion Matrix: {val_metrics['confusion_matrix']}")

    # Track best accuracy (or optionally best F1/AUC too)
    if val_metrics['accuracy'] > best_accuracy:
        best_accuracy = val_metrics['accuracy']
        print(f"✓ New best accuracy: {best_accuracy:.4f}")
    
    # Track best AUC and F1 if relevant
    if val_metrics['f1_score'] > best_f1:
        best_f1 = val_metrics['f1_score']
        print(f"✓ New best F1: {best_f1:.4f}")

    if val_metrics['auc_roc'] is not None and val_metrics['auc_roc'] > best_auc:
        best_auc = val_metrics['auc_roc']
        print(f"✓ New best AUC-ROC: {best_auc:.4f}")
    
    # Print the learned loss weights every few epochs
    if (epoch + 1) % 3 == 0:
        print(f"Current loss weights - CLF: {model.classification_weight:.4f}, GEN: {model.generation_weight:.4f}")

In [11]:
# test evaluation, gen=0.0005
print(f"\nTraining completed!")
print(f"Best results from validation:")
print(f"  Best Accuracy: {best_accuracy:.4f}")
print(f"  Best F1: {best_f1:.4f}")
print(f"  Best AUC-ROC: {best_auc:.4f}")

print("\n" + "="*50)
print("FINAL TEST SET EVALUATION")
print("="*50)

# Quick classification metrics on full test set
print("\n1. Full Classification Performance:")
test_loss, test_metrics = evaluate(model, test_loader, device)
print(f"Test loss: {test_loss:.4f}")
print(f"Classification metrics (all {len(test_dataset)} samples):")
print(f"  Accuracy     : {test_metrics['accuracy']:.4f}")
print(f"  F1-Score     : {test_metrics['f1_score']:.4f}")
print(f"  Sensitivity  : {test_metrics['sensitivity']:.4f}")
print(f"  Specificity  : {test_metrics['specificity']:.4f}")
print(f"  AUC-ROC      : {test_metrics['auc_roc']:.4f}" if test_metrics['auc_roc'] is not None else "  AUC-ROC      : N/A")

# Sampled text generation metrics
print("\n2. Sampled Text Generation Performance:")
# Use 500 samples for text metrics (adjust as needed)
test_loss_sampled, test_metrics_sampled = evaluate_with_sampling(
    model, test_loader, device, tokenizer, text_sample_size=500
)

print(f"Text generation metrics ({test_metrics_sampled['text_samples_used']} samples):")
if test_metrics_sampled['avg_bertscore_f1'] is not None:
    print(f"  BERTScore F1 : {test_metrics_sampled['avg_bertscore_f1']:.4f}")
else:
    print(f"  BERTScore F1 : N/A")

if test_metrics_sampled['avg_meteor'] is not None:
    print(f"  METEOR       : {test_metrics_sampled['avg_meteor']:.4f}")
else:
    print(f"  METEOR       : N/A")

print(f"\nSample efficiency: {test_metrics_sampled['text_samples_used']}/{test_metrics_sampled['total_samples']} samples used for text metrics")


Training completed!
Best results from validation:
  Best Accuracy: 0.7829
  Best F1: 0.7781
  Best AUC-ROC: 0.8121

FINAL TEST SET EVALUATION

1. Full Classification Performance:


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Test loss: 0.4296
Classification metrics (all 3636 samples):
  Accuracy     : 0.7948
  F1-Score     : 0.7943
  Sensitivity  : 0.8689
  Specificity  : 0.5472
  AUC-ROC      : 0.8264

2. Sampled Text Generation Performance:
Sampling 500 out of 3636 samples for text generation metrics


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

Computing text metrics on 500 samples...
Error calculating BERT Score: '/home/s2080063/MWR-to-Text/models/roberta-large'
Text generation metrics (500 samples):
  BERTScore F1 : N/A
  METEOR       : 0.9963

Sample efficiency: 500/3636 samples used for text metrics


In [10]:
# Training loop, 0.1, gen weight=0.0005, weighted sample
num_epochs = 30
best_accuracy = 0
best_f1 = 0
best_auc = 0

for epoch in range(num_epochs):
    print(f"Current loss weights - CLF: {model.classification_weight:.4f}, GEN: {model.generation_weight:.4f}")
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Train
    train_loss = train(model, train_loader, optimizer, device)
    print(f"Training loss: {train_loss:.4f}")
    
    # Validate
    val_loss, val_metrics = evaluate(model, val_loader, device)
    print(f"Validation loss: {val_loss:.4f}")
    print(f"Validation metrics:")
    print(f"  Accuracy     : {val_metrics['accuracy']:.4f}")
    print(f"  F1-Score     : {val_metrics['f1_score']:.4f}")
    print(f"  Sensitivity  : {val_metrics['sensitivity']:.4f}")
    print(f"  Specificity  : {val_metrics['specificity']:.4f}")
    print(f"  AUC-ROC      : {val_metrics['auc_roc']:.4f}" if val_metrics['auc_roc'] is not None else "  AUC-ROC      : N/A")
    print(f"  Confusion Matrix: {val_metrics['confusion_matrix']}")

    # Track best accuracy (or optionally best F1/AUC too)
    if val_metrics['accuracy'] > best_accuracy:
        best_accuracy = val_metrics['accuracy']
        print(f"✓ New best accuracy: {best_accuracy:.4f}")
    
    # Track best AUC and F1 if relevant
    if val_metrics['f1_score'] > best_f1:
        best_f1 = val_metrics['f1_score']
        print(f"✓ New best F1: {best_f1:.4f}")

    if val_metrics['auc_roc'] is not None and val_metrics['auc_roc'] > best_auc:
        best_auc = val_metrics['auc_roc']
        print(f"✓ New best AUC-ROC: {best_auc:.4f}")
    
    # Print the learned loss weights every few epochs
    if (epoch + 1) % 3 == 0:
        print(f"Current loss weights - CLF: {model.classification_weight:.4f}, GEN: {model.generation_weight:.4f}")

Current loss weights - CLF: 1.0000, GEN: 0.0005

Epoch 1/30


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Training loss: 0.7362
Validation loss: 0.8101
Validation metrics:
  Accuracy     : 0.5131
  F1-Score     : 0.5323
  Sensitivity  : 0.4014
  Specificity  : 0.8591
  AUC-ROC      : 0.7205
  Confusion Matrix: [[762, 125], [1645, 1103]]
✓ New best accuracy: 0.5131
✓ New best F1: 0.5323
✓ New best AUC-ROC: 0.7205
Current loss weights - CLF: 1.0000, GEN: 0.0005

Epoch 2/30
Training loss: 0.6683
Validation loss: 0.8392
Validation metrics:
  Accuracy     : 0.5059
  F1-Score     : 0.5220
  Sensitivity  : 0.3836
  Specificity  : 0.8850
  AUC-ROC      : 0.7339
  Confusion Matrix: [[785, 102], [1694, 1054]]
✓ New best AUC-ROC: 0.7339
Current loss weights - CLF: 1.0000, GEN: 0.0005

Epoch 3/30
Training loss: 0.6404
Validation loss: 0.8164
Validation metrics:
  Accuracy     : 0.5290
  F1-Score     : 0.5483
  Sensitivity  : 0.4145
  Specificity  : 0.8839
  AUC-ROC      : 0.7536
  Confusion Matrix: [[784, 103], [1609, 1139]]
✓ New best accuracy: 0.5290
✓ New best F1: 0.5483
✓ New best AUC-ROC: 0.7536


Training loss: 0.5336
Validation loss: 0.7293
Validation metrics:
  Accuracy     : 0.5989
  F1-Score     : 0.6217
  Sensitivity  : 0.4993
  Specificity  : 0.9076
  AUC-ROC      : 0.7958
  Confusion Matrix: [[805, 82], [1376, 1372]]
Current loss weights - CLF: 1.0000, GEN: 0.0005

Epoch 26/30
Training loss: 0.5304
Validation loss: 0.5773
Validation metrics:
  Accuracy     : 0.6828
  F1-Score     : 0.7052
  Sensitivity  : 0.6332
  Specificity  : 0.8365
  AUC-ROC      : 0.8136
  Confusion Matrix: [[742, 145], [1008, 1740]]
✓ New best accuracy: 0.6828
✓ New best F1: 0.7052
✓ New best AUC-ROC: 0.8136
Current loss weights - CLF: 1.0000, GEN: 0.0005

Epoch 27/30
Training loss: 0.5299
Validation loss: 0.6355
Validation metrics:
  Accuracy     : 0.6429
  F1-Score     : 0.6666
  Sensitivity  : 0.5633
  Specificity  : 0.8895
  AUC-ROC      : 0.8078
  Confusion Matrix: [[789, 98], [1200, 1548]]
Current loss weights - CLF: 1.0000, GEN: 0.0005
Current loss weights - CLF: 1.0000, GEN: 0.0005

Epoch 2

In [11]:
# test evaluation, gen=0.0005
print(f"\nTraining completed!")
print(f"Best results from validation:")
print(f"  Best Accuracy: {best_accuracy:.4f}")
print(f"  Best F1: {best_f1:.4f}")
print(f"  Best AUC-ROC: {best_auc:.4f}")

print("\n" + "="*50)
print("FINAL TEST SET EVALUATION")
print("="*50)

# Quick classification metrics on full test set
print("\n1. Full Classification Performance:")
test_loss, test_metrics = evaluate(model, test_loader, device)
print(f"Test loss: {test_loss:.4f}")
print(f"Classification metrics (all {len(test_dataset)} samples):")
print(f"  Accuracy     : {test_metrics['accuracy']:.4f}")
print(f"  F1-Score     : {test_metrics['f1_score']:.4f}")
print(f"  Sensitivity  : {test_metrics['sensitivity']:.4f}")
print(f"  Specificity  : {test_metrics['specificity']:.4f}")
print(f"  AUC-ROC      : {test_metrics['auc_roc']:.4f}" if test_metrics['auc_roc'] is not None else "  AUC-ROC      : N/A")

# Sampled text generation metrics
print("\n2. Sampled Text Generation Performance:")
# Use 500 samples for text metrics (adjust as needed)
test_loss_sampled, test_metrics_sampled = evaluate_with_sampling(
    model, test_loader, device, tokenizer, text_sample_size=500
)

print(f"Text generation metrics ({test_metrics_sampled['text_samples_used']} samples):")
if test_metrics_sampled['avg_bertscore_f1'] is not None:
    print(f"  BERTScore F1 : {test_metrics_sampled['avg_bertscore_f1']:.4f}")
else:
    print(f"  BERTScore F1 : N/A")

if test_metrics_sampled['avg_meteor'] is not None:
    print(f"  METEOR       : {test_metrics_sampled['avg_meteor']:.4f}")
else:
    print(f"  METEOR       : N/A")

print(f"\nSample efficiency: {test_metrics_sampled['text_samples_used']}/{test_metrics_sampled['total_samples']} samples used for text metrics")


Training completed!
Best results from validation:
  Best Accuracy: 0.6828
  Best F1: 0.7052
  Best AUC-ROC: 0.8155

FINAL TEST SET EVALUATION

1. Full Classification Performance:


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Test loss: 0.6091
Classification metrics (all 3636 samples):
  Accuracy     : 0.6543
  F1-Score     : 0.6809
  Sensitivity  : 0.5849
  Specificity  : 0.8865
  AUC-ROC      : 0.8295

2. Sampled Text Generation Performance:
Sampling 500 out of 3636 samples for text generation metrics


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not va

Computing text metrics on 500 samples...
Error calculating BERT Score: '/home/s2080063/MWR-to-Text/models/roberta-large'
Text generation metrics (500 samples):
  BERTScore F1 : N/A
  METEOR       : 0.9963

Sample efficiency: 500/3636 samples used for text metrics
