In [None]:
%pip install -U scikit-learn imbalanced-learn

In [21]:
import pandas as pd
import numpy as np
from transformers import DebertaTokenizer, DebertaModel
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    multilabel_confusion_matrix
)
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import argparse
import os
import json
from datetime import datetime
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# Suppress expected warnings
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn.metrics.cluster._supervised')
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn.feature_selection')

# Add GPU count check at the top level
def get_available_gpus():
    """Get the number of available GPUs and their IDs"""
    if not torch.cuda.is_available():
        return 0, []
    
    n_gpus = torch.cuda.device_count()
    gpu_ids = list(range(n_gpus))
    return n_gpus, gpu_ids

def reduce_tokens_simple_truncation(text, tokenizer, max_length=512):
    """
    Simply truncate text to the maximum allowed token length.
    
    Args:
        text (str): Input text
        tokenizer: Tokenizer to use
        max_length (int): Maximum token length
        
    Returns:
        str: Truncated text
    """
    tokens = tokenizer(text, truncation=True, max_length=max_length)
    return tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)

def reduce_tokens_smart_truncation(text, tokenizer, max_length=512):
    """
    Intelligently truncate text by keeping the beginning and end portions.
    
    Args:
        text (str): Input text
        tokenizer: Tokenizer to use
        max_length (int): Maximum token length
        
    Returns:
        str: Truncated text with beginning and end portions
    """
    tokens = tokenizer(text, truncation=False, return_tensors="pt")["input_ids"][0]
    
    if len(tokens) <= max_length:
        return text
    
    # Keep beginning and end portions (prioritize beginning slightly)
    beginning_length = max_length // 2 + 50  # Keep slightly more from beginning
    end_length = max_length - beginning_length - 1  # Reserve 1 for separator
    
    beginning_tokens = tokens[:beginning_length]
    end_tokens = tokens[-end_length:]
    
    # Combine with a separator token
    beginning_text = tokenizer.decode(beginning_tokens, skip_special_tokens=True)
    end_text = tokenizer.decode(end_tokens, skip_special_tokens=True)
    
    return f"{beginning_text} [...] {end_text}"

def reduce_tokens_extractive_summarization(text, tokenizer, max_length=512):
    """
    Reduce text length using extractive summarization techniques.
    
    Args:
        text (str): Input text
        tokenizer: Tokenizer to use
        max_length (int): Maximum token length
        
    Returns:
        str: Summarized text
    """
    tokens = tokenizer(text, truncation=False, return_tensors="pt")["input_ids"][0]
    
    if len(tokens) <= max_length:
        return text
    
    # Import NLTK for sentence tokenization
    try:
        import nltk
        from nltk.tokenize import sent_tokenize
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')
    except ImportError:
        # If NLTK is not available, fall back to smart truncation
        return reduce_tokens_smart_truncation(text, tokenizer, max_length)
    
    # Split text into sentences
    sentences = sent_tokenize(text)
    
    if len(sentences) <= 3:
        # Not enough sentences to summarize meaningfully, use smart truncation
        return reduce_tokens_smart_truncation(text, tokenizer, max_length)
    
    # Get sentence token lengths
    sentence_tokens = []
    for sentence in sentences:
        tokens = tokenizer(sentence, return_tensors="pt")["input_ids"][0]
        sentence_tokens.append((sentence, len(tokens)))
    
    # Calculate target ratio based on max length vs total length
    tokens = tokenizer(text, truncation=False, return_tensors="pt")["input_ids"][0]
    reduction_ratio = max_length / len(tokens)
    
    # Always keep first and last sentences
    first_sentence, first_len = sentence_tokens[0]
    last_sentence, last_len = sentence_tokens[-1]
    
    remaining_length = max_length - first_len - last_len - 10  # Reserve some tokens for separators
    
    # If can't even fit first and last sentences, use smart truncation
    if remaining_length <= 0:
        return reduce_tokens_smart_truncation(text, tokenizer, max_length)
    
    # Choose middle sentences based on importance (for now, just choose evenly distributed sentences)
    middle_sentences = sentence_tokens[1:-1]
    
    # Calculate how many middle sentences we can include
    middle_sentences_to_keep = []
    current_length = 0
    
    # Select sentences in a distributed manner
    if len(middle_sentences) > 0:
        # Fix: Add a check to prevent division by zero
        sentences_to_keep = int(reduction_ratio * len(middle_sentences))
        if sentences_to_keep <= 0:
            step = len(middle_sentences) + 1  # This will select only the first sentence if any
        else:
            step = max(1, len(middle_sentences) // sentences_to_keep)
            
        for i in range(0, len(middle_sentences), step):
            sentence, length = middle_sentences[i]
            if current_length + length <= remaining_length:
                middle_sentences_to_keep.append(sentence)
                current_length += length
            else:
                break
    
    # Combine sentences
    summarized_text = first_sentence
    
    if middle_sentences_to_keep:
        summarized_text += " " + " ".join(middle_sentences_to_keep)
    
    summarized_text += " " + last_sentence
    
    # Verify final length is within limit
    final_tokens = tokenizer(summarized_text, truncation=False, return_tensors="pt")["input_ids"][0]
    if len(final_tokens) > max_length:
        # Fall back to smart truncation if still too long
        return reduce_tokens_smart_truncation(summarized_text, tokenizer, max_length)
    
    return summarized_text

def reduce_tokens_hybrid(text, tokenizer, max_length=512):
    """
    Use a hybrid approach combining extractive summarization and smart truncation.
    
    Args:
        text (str): Input text
        tokenizer: Tokenizer to use
        max_length (int): Maximum token length
        
    Returns:
        str: Processed text
    """
    tokens = tokenizer(text, truncation=False, return_tensors="pt")["input_ids"][0]
    
    if len(tokens) <= max_length:
        return text
    
    # For very long documents, use extractive summarization first
    if len(tokens) > max_length * 2:
        summarized = reduce_tokens_extractive_summarization(text, tokenizer, max_length)
        summarized_tokens = tokenizer(summarized, truncation=False, return_tensors="pt")["input_ids"][0]
        
        # If still too long, apply smart truncation
        if len(summarized_tokens) > max_length:
            return reduce_tokens_smart_truncation(summarized, tokenizer, max_length)
        return summarized
    
    # For moderately long documents, use smart truncation directly
    return reduce_tokens_smart_truncation(text, tokenizer, max_length)

def process_with_token_reduction(texts, tokenizer, max_length=512, strategy="smart_truncation"):
    """
    Process a series of texts by applying token reduction where necessary.
    
    Args:
        texts (pd.Series): Series of input texts
        tokenizer: Tokenizer to use for tokenization
        max_length (int): Maximum token length (default: 512)
        strategy (str): Token reduction strategy, one of:
            - "simple": Simple truncation at max_length
            - "smart_truncation": Keep beginning and end portions
            - "extractive_summarization": Use extractive summarization
            - "hybrid": Combine summarization and smart truncation
            
    Returns:
        pd.Series: Series with processed texts
    """
    processed_texts = []
    token_lengths_before = []
    token_lengths_after = []
    
    for text in tqdm(texts, desc=f"Applying token reduction ({strategy})"):
        # Calculate original token length
        tokens = tokenizer(text, truncation=False, return_tensors="pt")["input_ids"][0]
        token_lengths_before.append(len(tokens))
        
        # Only process if longer than max_length
        if len(tokens) <= max_length:
            processed_texts.append(text)
            token_lengths_after.append(len(tokens))
            continue
        
        # Apply selected strategy
        if strategy == "simple":
            processed_text = reduce_tokens_simple_truncation(text, tokenizer, max_length)
        elif strategy == "smart_truncation":
            processed_text = reduce_tokens_smart_truncation(text, tokenizer, max_length)
        elif strategy == "extractive_summarization":
            processed_text = reduce_tokens_extractive_summarization(text, tokenizer, max_length)
        elif strategy == "hybrid":
            processed_text = reduce_tokens_hybrid(text, tokenizer, max_length)
        else:
            # Default to smart truncation
            processed_text = reduce_tokens_smart_truncation(text, tokenizer, max_length)
        
        processed_texts.append(processed_text)
        
        # Calculate new token length
        new_tokens = tokenizer(processed_text, truncation=False, return_tensors="pt")["input_ids"][0]
        token_lengths_after.append(len(new_tokens))
    
    # Print statistics
    print(f"\nToken reduction statistics using {strategy} strategy:")
    print(f"  Before:")
    print(f"    Mean length: {np.mean(token_lengths_before):.2f}")
    print(f"    Median length: {np.median(token_lengths_before):.2f}")
    print(f"    Max length: {max(token_lengths_before)}")
    print(f"    Docs exceeding {max_length} tokens: {sum(1 for l in token_lengths_before if l > max_length)} ({sum(1 for l in token_lengths_before if l > max_length)/len(token_lengths_before)*100:.2f}%)")
    
    print(f"  After:")
    print(f"    Mean length: {np.mean(token_lengths_after):.2f}")
    print(f"    Median length: {np.median(token_lengths_after):.2f}")
    print(f"    Max length: {max(token_lengths_after)}")
    print(f"    Docs exceeding {max_length} tokens: {sum(1 for l in token_lengths_after if l > max_length)} ({sum(1 for l in token_lengths_after if l > max_length)/len(token_lengths_after)*100:.2f}%)")
    
    # Optional: Create histogram plot
    try:
        plt.figure(figsize=(10, 6))
        plt.hist([token_lengths_before, token_lengths_after], bins=30, 
                 label=['Before reduction', 'After reduction'], alpha=0.7)
        plt.axvline(x=max_length, color='r', linestyle='--', label=f'Max length ({max_length})')
        plt.title(f'Token Length Distribution Before and After {strategy}')
        plt.xlabel('Number of Tokens')
        plt.ylabel('Frequency')
        plt.legend()
        plt.savefig(f'token_reduction_{strategy}.png')
        plt.close()
        print(f"  Distribution plot saved as token_reduction_{strategy}.png")
    except Exception as e:
        print(f"  Could not create distribution plot: {str(e)}")
    
    return pd.Series(processed_texts, index=texts.index)

def calculate_token_lengths(texts, tokenizer):
    """
    Calculate the token length for each text sample using the specified tokenizer.
    
    Args:
        texts (pd.Series): Series of input texts
        tokenizer: Tokenizer to use for tokenization
        
    Returns:
        pd.Series: Series containing the token length of each text
    """
    token_lengths = []
    for text in tqdm(texts, desc="Calculating token lengths"):
        tokens = tokenizer(str(text), truncation=False, return_tensors="pt")
        token_lengths.append(len(tokens['input_ids'][0]))
    
    return pd.Series(token_lengths, index=texts.index)

def filter_outliers_by_token_length(texts, token_lengths, std_threshold=3.0, min_token_threshold=None):
    """
    Filter out text samples with token lengths beyond a certain standard deviation threshold.
    
    Args:
        texts (pd.Series): Series of input texts
        token_lengths (pd.Series): Series containing token length of each text
        std_threshold (float): Standard deviation threshold (default: 3.0)
        min_token_threshold (int, optional): Minimum number of tokens required (default: None)
        
    Returns:
        tuple: Filtered texts and boolean mask to apply to original data
    """
    mean_length = token_lengths.mean()
    std_length = token_lengths.std()
    
    # Print original token statistics
    print(f"Token length statistics before filtering:")
    print(f"  Mean: {mean_length:.2f}, Std Dev: {std_length:.2f}")
    print(f"  Min: {token_lengths.min()}, Max: {token_lengths.max()}")
    print(f"  25th percentile: {token_lengths.quantile(0.25):.2f}")
    print(f"  50th percentile (median): {token_lengths.quantile(0.5):.2f}")
    print(f"  75th percentile: {token_lengths.quantile(0.75):.2f}")
    
    # Original data size
    original_size = len(texts)
    
    # Start with all True mask for the original data
    final_mask = pd.Series(True, index=texts.index)
    
    # Step 1: Apply standard deviation filtering if std_threshold is provided
    if std_threshold < float('inf'):
        # Define upper and lower bounds
        upper_bound = mean_length + std_threshold * std_length
        lower_bound = mean_length - std_threshold * std_length
        lower_bound = max(1, lower_bound)  # Ensure lower bound is at least 1
        
        # Create mask for samples within bounds
        std_mask = (token_lengths >= lower_bound) & (token_lengths <= upper_bound)
        
        # Update final mask with standard deviation condition
        final_mask = final_mask & std_mask
        
        std_removed = (~std_mask).sum()
        print(f"Applied {std_threshold} std dev threshold: ({lower_bound:.2f}, {upper_bound:.2f})")
        print(f"Removed {std_removed} samples by std dev filtering ({std_removed/original_size*100:.2f}% of data)")
    
    # Step 2: Apply minimum token threshold if specified
    if min_token_threshold is not None:
        # Create mask for minimum token threshold
        min_token_mask = token_lengths >= min_token_threshold
        
        # Track how many would be removed by this filter
        min_token_removed = (~min_token_mask).sum()
        
        # Track how many would be removed by this filter that weren't already filtered by std
        additional_removed = ((~min_token_mask) & final_mask).sum()
        
        # Update final mask with minimum token threshold condition
        final_mask = final_mask & min_token_mask
        
        print(f"Applied minimum token threshold of {min_token_threshold}")
        print(f"Removed {min_token_removed} samples below minimum token threshold ({min_token_removed/original_size*100:.2f}% of original data)")
        print(f"Of which {additional_removed} weren't already filtered by std deviation ({additional_removed/original_size*100:.2f}% of original data)")
    
    # Apply final mask to get filtered data
    filtered_texts = texts[final_mask]
    filtered_token_lengths = token_lengths[final_mask]
    
    # Calculate total removed
    total_removed = (~final_mask).sum()
    print(f"Total removed: {total_removed} samples ({total_removed/original_size*100:.2f}% of original data)")
    print(f"Remaining: {final_mask.sum()} samples ({final_mask.sum()/original_size*100:.2f}% of original data)")

    # Print final statistics
    print(f"\nToken length statistics after all filtering:")
    print(f"  Mean: {filtered_token_lengths.mean():.2f}, Std Dev: {filtered_token_lengths.std():.2f}")
    print(f"  Min: {filtered_token_lengths.min()}, Max: {filtered_token_lengths.max()}")
    print(f"  25th percentile: {filtered_token_lengths.quantile(0.25):.2f}")
    print(f"  50th percentile (median): {filtered_token_lengths.quantile(0.5):.2f}")
    print(f"  75th percentile: {filtered_token_lengths.quantile(0.75):.2f}")
    
    return filtered_texts, final_mask

class IssueDataset(Dataset):
    """
    Dataset for processing text data and multi-label classification.

    Args:
        texts (pd.Series): Series of input texts.
        labels (list or pd.Series): Corresponding labels (one-hot encoded for multi-label).
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer for converting text to tokens.
        max_length (int): Maximum length of tokenized sequences (default is 512).
    """
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts.reset_index(drop=True)
        # Reset index for labels if it's a pandas Series.
        if isinstance(labels, pd.Series):
            self.labels = labels.reset_index(drop=True)
        else:
            self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        encodings = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        
        # For multi-label classification, ensure we're passing the full label array
        # and not just a single value
        label = self.labels[idx]
        
        # Make sure we're getting a proper multi-dimensional label array
        # and not flattening it incorrectly
        if isinstance(label, (list, np.ndarray)):
            # Convert directly to tensor without modifying shape
            label = torch.tensor(label, dtype=torch.float)
        else:
            # If it's not already an array-like structure, this is likely a mistake
            # as we expect multi-label one-hot encoded data
            raise ValueError(f"Expected multi-dimensional label array but got {type(label)}")
        
        return {
            'input_ids': encodings['input_ids'].flatten(),
            'attention_mask': encodings['attention_mask'].flatten(),
            'labels': label
        }
    
class DeBERTaClassifier(nn.Module):
    """
    A classifier model based on DeBERTa for multi-label classification.
    
    This model uses a pre-trained DeBERTa model as the encoder and adds a 
    classification head on top with sigmoid activation for multi-label output.
    The DeBERTa model is completely frozen, only the classification layer is trained.
    
    Args:
        num_labels (int): Number of classes in the multi-label classification task.
    """
    def __init__(self, num_labels):
        super().__init__()
        self.deberta = DebertaModel.from_pretrained('microsoft/deberta-base')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, num_labels)
        
        # Freeze all parameters in DeBERTa
        for param in self.deberta.parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():  # Ensure no gradients flow through DeBERTa
            outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        
        # Get the [CLS] token representation
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.dropout(cls_output)
        # Return raw logits for BCEWithLogitsLoss
        return self.classifier(cls_output)
        
    def get_embeddings(self, input_ids, attention_mask):
        """
        Extract embeddings from the DeBERTa model without computing gradients.
        
        Args:
            input_ids: Input token IDs
            attention_mask: Attention mask
            
        Returns:
            torch.Tensor: CLS token embeddings for each input
        """
        with torch.no_grad():
            outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
            # Get the [CLS] token representation
            embeddings = outputs.last_hidden_state[:, 0, :]
        return embeddings

class EarlyStopping:
    """
    Early stopping to stop training when the validation loss does not improve sufficiently.
    
    For multi-label classification, we consider a loss improvement when 
    the validation loss decreases by at least min_delta.
    
    Args:
        patience (int): Number of epochs to wait for an improvement before stopping.
        min_delta (float): Minimum decrease in the monitored loss to qualify as an improvement.
    """
    def __init__(self, patience=3, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    def reset(self):
        """Reset the early stopping state."""
        self.counter = 0
        self.best_loss = None
        self.early_stop = False


def train_epoch(model, loader, criterion, optimizer, device, threshold=0.5, early_stopping=None):
    """
    Train the model for one epoch, computing loss and metrics for multi-label classification.

    Args:
        model (nn.Module): The multi-label classification model.
        loader (DataLoader): Training DataLoader.
        criterion: Loss function (BCEWithLogitsLoss).
        optimizer: Optimization algorithm.
        device: Device to perform training (CPU or GPU).
        threshold (float): Threshold for binary predictions (default is 0.5).
        early_stopping (EarlyStopping, optional): Instance to monitor improvement in loss.

    Returns:
        tuple: Average loss, Hamming accuracy, and a flag indicating if early stopping was triggered.
    """
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    for batch in tqdm(loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        # Apply sigmoid and threshold for predictions
        predictions = torch.sigmoid(outputs) >= threshold
        all_preds.append(predictions.cpu().detach().numpy())
        all_labels.append(labels.cpu().detach().numpy())
    
    # Calculate metrics for multi-label classification
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    
    # Use subset accuracy (exact match) for a strict measure
    exact_match = (all_preds == all_labels).all(axis=1).mean()
    
    avg_loss = total_loss / len(loader)
    
    if early_stopping:
        early_stopping(avg_loss)
        if early_stopping.early_stop:
            print("Early stopping triggered")
            return avg_loss, exact_match, True
            
    return avg_loss, exact_match, False
    

def validate(model, loader, criterion, device, threshold=0.5):
    """
    Evaluate the model on provided validation data for multi-label classification.

    Args:
        model (nn.Module): The multi-label classification model.
        loader (DataLoader): Validation DataLoader.
        criterion: Loss function (BCEWithLogitsLoss).
        device: Device to perform evaluation.
        threshold (float): Threshold for binary predictions (default is 0.5).

    Returns:
        tuple: Average loss, various accuracy metrics, precision, recall, and F1 score.
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            # Apply sigmoid and threshold for predictions
            predictions = (torch.sigmoid(outputs) >= threshold).float()
            all_preds.append(predictions.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    
    # Calculate different multi-label metrics
    
    # 1. Exact Match / Subset Accuracy (all labels must be correct)
    exact_match = (all_preds == all_labels).all(axis=1).mean()
    
    # 2. Partial Match Accuracy (only count correctly predicted 1s, ignore 0s)
    # Calculate true positives per sample
    true_positives = np.logical_and(all_preds == 1, all_labels == 1).sum(axis=1)
    # Calculate total actual positives per sample
    total_positives = (all_labels == 1).sum(axis=1)
    # Handle division by zero - samples with no positive labels get a score of 0
    partial_match = np.zeros_like(true_positives, dtype=float)
    # Only calculate ratio for samples with at least one positive label
    mask = total_positives > 0
    partial_match[mask] = true_positives[mask] / total_positives[mask]
    partial_match_accuracy = partial_match.mean()
    
    # 3. Jaccard Similarity (intersection over union)
    def jaccard_score(y_true, y_pred):
        intersection = np.logical_and(y_true, y_pred).sum(axis=1)
        union = np.logical_or(y_true, y_pred).sum(axis=1)
        # Create a float array for output to avoid type casting error
        result = np.zeros_like(intersection, dtype=float)
        # Avoid division by zero
        np.divide(intersection, union, out=result, where=union!=0)
        return np.mean(result)
    
    jaccard_sim = jaccard_score(all_labels.astype(bool), all_preds.astype(bool))
    
    # Add Hamming metric - this is the same as partial_match_accuracy
    hamming_sim = partial_match_accuracy
    
    # Sample-based metrics - Each sample contributes equally regardless of number of labels
    precision = precision_score(all_labels, all_preds, average='samples', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='samples', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='samples', zero_division=0)
    
    return (total_loss / len(loader), 
            {"exact_match": exact_match, 
             "partial_match": partial_match_accuracy,
             "hamming": hamming_sim,
             "jaccard": jaccard_sim}, 
            precision, recall, f1)

def plot_multilabel_confusion_matrix(y_true, y_pred, class_names):
    """
    Plot confusion matrices for each label in a multi-label classification problem.
    
    Args:
        y_true (numpy.ndarray): True binary labels.
        y_pred (numpy.ndarray): Predicted binary labels.
        class_names (list): Names of the classes/labels.
    """
    confusion_matrices = multilabel_confusion_matrix(y_true, y_pred)
    
    num_classes = len(class_names)
    fig, axes = plt.subplots(nrows=(num_classes + 3) // 4, ncols=min(4, num_classes), 
                             figsize=(20, 5 * ((num_classes + 3) // 4)))
    if num_classes == 1:
        axes = np.array([axes])  # Make it indexable for single class
    axes = axes.flatten()
    
    for i, matrix in enumerate(confusion_matrices):
        if i < num_classes:  # Ensure we don't exceed the number of classes
            ax = axes[i]
            sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_title(f'Label: {class_names[i]}')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('True')
            ax.set_xticklabels(['Negative', 'Positive'])
            ax.set_yticklabels(['Negative', 'Positive'])
    
    # Hide any unused subplots
    for i in range(num_classes, len(axes)):
        fig.delaxes(axes[i])
        
    plt.tight_layout()
    return fig

def prepare_data(df, text_column='all_text', min_label_freq=0, max_label_len=100, min_label_comb_freq=0, tokenizer=None, token_std_threshold=None, min_token_threshold=None):
    """
    Filter out infrequent labels, samples with too many labels, and token length outliers.
    
    Args:
        df (pd.DataFrame): DataFrame with text column and 'labels'
        text_column (str): Name of the column containing the text data to use
        min_label_freq (int): Minimum frequency for a label to be considered frequent.
        max_label_len (int): Maximum number of labels per sample.
        min_label_comb_freq (int): Minimum frequency for a label combination to be kept.
        tokenizer: Tokenizer to use for token length calculation (required if token_std_threshold is provided)
        token_std_threshold (float, optional): Standard deviation threshold for filtering token length outliers.
            If None, no token length filtering is applied. Common values are 2.0 or 3.0.
        min_token_threshold (int, optional): Minimum number of tokens required for a sample.
            If None, no minimum token threshold is applied.

    Returns:
        tuple: Filtered texts and labels.
    """
    # Print initial dataset size
    initial_size = len(df)
    print(f"\n=== DATA PREPROCESSING STATISTICS ===")
    print(f"Initial dataset size: {initial_size}")
    
    # Only keep text column and 'labels' columns
    if text_column in df.columns:
        df = df[[text_column, 'labels']]
        # Filter out rows with 'nan' text
        before_nan_filter = len(df)
        df = df[~df[text_column].apply(lambda x: x.startswith('nan') if isinstance(x, str) else False)]
        nan_removed = before_nan_filter - len(df)
        if nan_removed > 0:
            print(f"Step 1: Removed {nan_removed} rows with 'nan' text ({nan_removed/before_nan_filter*100:.2f}% of data)")
    else:
        raise ValueError(f"Text column '{text_column}' not found in the DataFrame. Available columns: {df.columns.tolist()}")
    
    # Drop rows with missing labels
    before_na_drop = len(df)
    df = df.dropna()
    na_removed = before_na_drop - len(df)
    if na_removed > 0:
        print(f"Step 2: Removed {na_removed} rows with missing labels ({na_removed/before_na_drop*100:.2f}% of data)")
    
    # Extract issue texts and labels
    texts = df[text_column]
    labels = df['labels'].apply(lambda x: x if isinstance(x, list) else [])  # Ensure labels are lists
    current_size = len(texts)
    print(f"Dataset size after basic cleaning: {current_size} ({current_size/initial_size*100:.2f}% of original data)")

    # Filter by token length if requested
    if (token_std_threshold is not None or min_token_threshold is not None) and tokenizer is not None:
        print(f"\nStep 3: Filtering by token length...")
        if token_std_threshold is not None:
            print(f"Using {token_std_threshold} standard deviation threshold")
        if min_token_threshold is not None:
            print(f"Using minimum token threshold of {min_token_threshold}")
        
        # Calculate token lengths
        token_lengths = calculate_token_lengths(texts, tokenizer)
        
        # Apply token length filtering
        before_token_filter = len(texts)
        filtered_texts, token_mask = filter_outliers_by_token_length(
            texts, 
            token_lengths, 
            std_threshold=token_std_threshold if token_std_threshold is not None else float('inf'),
            min_token_threshold=min_token_threshold
        )
        # Apply same filter to labels
        filtered_labels = labels[token_mask].reset_index(drop=True)
        token_removed = before_token_filter - len(filtered_texts)
        print(f"Removed {token_removed} samples by token length filtering ({token_removed/before_token_filter*100:.2f}% of data)")
        print(f"Texts after token length filtering: {len(filtered_texts)} ({len(filtered_texts)/initial_size*100:.2f}% of original data)")

    # Get labels count distribution
    label_distribution = Counter([label for labels in labels for label in labels])
    total_labels_before = len(label_distribution)
    print(f"\nStep 4: Filtering infrequent labels (min frequency: {min_label_freq})")
    print(f"Total unique labels before filtering: {total_labels_before}")

    # Labels to keep based on frequency
    frequent_labels = [label for label, count in label_distribution.items() if count >= min_label_freq]
    labels_removed = total_labels_before - len(frequent_labels)
    print(f"Removed {labels_removed} infrequent labels ({labels_removed/total_labels_before*100:.2f}% of labels)")
    print(f"Number of labels remaining: {len(frequent_labels)} ({len(frequent_labels)/total_labels_before*100:.2f}% of labels)")

    # Filter out infrequent labels
    before_label_filter = len(labels)
    filtered_labels = labels.apply(lambda x: [label for label in x if label in frequent_labels])
    
    # Count samples that have no labels after filtering
    empty_labels_mask = filtered_labels.apply(len) > 0
    empty_labels_count = (~empty_labels_mask).sum()
    if empty_labels_count > 0:
        print(f"Warning: {empty_labels_count} samples ({empty_labels_count/before_label_filter*100:.2f}%) now have no labels due to label frequency filtering")
        # Remove samples with no labels
        filtered_labels = filtered_labels[empty_labels_mask]
        texts = texts[empty_labels_mask]
        print(f"Removed {empty_labels_count} samples with no labels")
    
    print(f"Samples remaining after label filtering: {len(filtered_labels)} ({len(filtered_labels)/before_label_filter*100:.2f}% of data)")

    # Get label combinations distribution
    label_combinations = Counter([tuple(sorted(labels)) for labels in filtered_labels])
    total_combinations_before = len(label_combinations)
    
    print(f"\nStep 5: Filtering infrequent label combinations (min frequency: {min_label_comb_freq})")
    print(f"Total unique label combinations before filtering: {total_combinations_before}")
    
    frequent_combinations = {labels: count for labels, count in label_combinations.items() if count >= min_label_comb_freq}
    combinations_removed = total_combinations_before - len(frequent_combinations)
    print(f"Removed {combinations_removed} infrequent label combinations ({combinations_removed/total_combinations_before*100:.2f}% of combinations)")
    print(f"Number of label combinations remaining: {len(frequent_combinations)} ({len(frequent_combinations)/total_combinations_before*100:.2f}% of combinations)")
    
    # Create mask for samples with frequent label combinations (if min_label_comb_freq > 0)
    if min_label_comb_freq > 0:
        before_comb_filter = len(filtered_labels)
        comb_mask = filtered_labels.apply(lambda x: tuple(sorted(x)) in frequent_combinations)
        samples_removed_by_comb = before_comb_filter - comb_mask.sum()
        print(f"Removed {samples_removed_by_comb} samples with infrequent label combinations ({samples_removed_by_comb/before_comb_filter*100:.2f}% of data)")
        print(f"Samples remaining after combination filtering: {comb_mask.sum()} ({comb_mask.sum()/before_comb_filter*100:.2f}% of data)")
    else:
        comb_mask = pd.Series([True] * len(filtered_labels))
    
    # Filter by label length
    print(f"\nStep 6: Filtering samples with too many labels (max labels per sample: {max_label_len})")
    before_length_filter = len(filtered_labels)
    label_length = filtered_labels.apply(len)
    length_mask = (label_length > 0) & (label_length <= max_label_len)
    samples_removed_by_length = before_length_filter - length_mask.sum()
    print(f"Removed {samples_removed_by_length} samples with too many or zero labels ({samples_removed_by_length/before_length_filter*100:.2f}% of data)")
    
    # Combine both masks
    final_mask = comb_mask & length_mask
    
    # Now get the final filtered texts and labels
    texts = texts[final_mask].reset_index(drop=True)
    filtered_labels = filtered_labels[final_mask].reset_index(drop=True)
    
    print(f"\n=== FINAL PREPROCESSING RESULTS ===")
    print(f"Original dataset size: {initial_size}")
    print(f"Final dataset size: {len(filtered_labels)} ({len(filtered_labels)/initial_size*100:.2f}% of original data)")
    print(f"Total samples removed: {initial_size - len(filtered_labels)} ({(initial_size - len(filtered_labels))/initial_size*100:.2f}% of original data)")
    
    return texts, filtered_labels

# Add hybrid feature selection function
def hybrid_feature_selection(texts, labels_encoded, mlb, top_k_filter=20, top_k_final=10, vectorizer=None, random_seed=42, wrapper_method='rf'):
    """
    Perform hybrid feature selection using both filter and wrapper methods.
    
    Args:
        texts (pd.Series): Series of text data
        labels_encoded (np.array): One-hot encoded labels
        mlb (MultiLabelBinarizer): Label encoder used for transforming labels
        top_k_filter (int): Number of labels to retain after filter stage
        top_k_final (int): Final number of labels to select
        vectorizer (object): Text vectorizer with fit_transform method. If None, uses simple word count
        random_seed (int): Random seed for reproducibility
        wrapper_method (str): Wrapper method to use ('rf' for Random Forest or 'lr' for Logistic Regression)
        
    Returns:
        tuple: Selected indices, selected label names, and feature importance scores
    """
    print(f"Starting hybrid feature selection to select {top_k_final} out of {labels_encoded.shape[1]} labels...")
    
    # If no vectorizer provided, create a simple one using sklearn's CountVectorizer
    if vectorizer is None:
        from sklearn.feature_extraction.text import CountVectorizer
        vectorizer = CountVectorizer(max_features=5000)
    
    # Transform texts to feature vectors
    print("Vectorizing text data...")
    X_vec = vectorizer.fit_transform(texts)
    
    # STEP 1: Filter Method - Use chi-square test and mutual information
    print("Applying filter methods...")
    
    # Store scores from multiple filter methods
    feature_scores = np.zeros(labels_encoded.shape[1])
    
    # Chi-square test for each label
    for i in range(labels_encoded.shape[1]):
        chi_scores = chi2(X_vec, labels_encoded[:, i])
        feature_scores[i] += chi_scores[0].mean()  # Add chi-square statistic
    
    # Mutual information for each label
    for i in range(labels_encoded.shape[1]):
        mi_score = mutual_info_classif(X_vec, labels_encoded[:, i], random_state=random_seed)
        feature_scores[i] += mi_score.mean() * 10  # Scale and add MI score
    
    # Get top-k features from filter methods
    filter_selected_indices = np.argsort(-feature_scores)[:top_k_filter]
    filter_selected_labels = np.array(mlb.classes_)[filter_selected_indices]
    
    print(f"Filter stage selected {len(filter_selected_indices)} labels")
    
    # STEP 2: Wrapper Method - Use specified model to evaluate feature subsets
    print(f"Applying wrapper method using {wrapper_method.upper()}...")
    
    # Initialize the appropriate model based on wrapper_method
    if wrapper_method.lower() == 'rf':
        model = RandomForestClassifier(n_estimators=100, random_state=random_seed, n_jobs=-1)
    elif wrapper_method.lower() == 'lr':
        model = LogisticRegression(random_state=random_seed, max_iter=1000)
    else:
        raise ValueError(f"Unsupported wrapper method: {wrapper_method}. Use 'rf' or 'lr'.")
    
    X_filtered = labels_encoded[:, filter_selected_indices]
    
    # For wrapper method, we'll create a matrix where each sample is label presence/absence
    # and the target is other labels - a proxy for how well each label predicts others
    importance_scores = np.zeros(len(filter_selected_indices))
    
    # For each label, train a model to predict it using the other labels
    for i in tqdm(range(len(filter_selected_indices)), desc="Wrapper evaluation"):
        # Current target label
        y = X_filtered[:, i]
        
        # Features (other labels)
        X_others = np.delete(X_filtered, i, axis=1)
        
        # Train model
        model.fit(X_others, y)
        
        # Score based on model performance
        accuracy = model.score(X_others, y)
        importance_scores[i] = accuracy
    
    # STEP 3: Combine scores to select final features
    final_scores = 0.6 * feature_scores[filter_selected_indices] + 0.4 * importance_scores
    final_selected_indices = filter_selected_indices[np.argsort(-final_scores)[:top_k_final]]
    final_selected_labels = np.array(mlb.classes_)[final_selected_indices]
    
    print(f"Final selection: {len(final_selected_labels)} labels")
    print("Selected labels:", final_selected_labels)
    
    return final_selected_indices, final_selected_labels, final_scores

def extract_embeddings_and_apply_smote(model, dataloader, device, k_neighbors=5, random_state=42):
    """
    Extract embeddings from the DeBERTa model and apply SMOTE for data augmentation.
    Focuses on balancing specific area labels based on their frequencies.
    
    Args:
        model (DeBERTaClassifier): The model to extract embeddings from
        dataloader (DataLoader): DataLoader containing the training data
        device (torch.device): Device to run the model on
        k_neighbors (int): Number of neighbors to use for SMOTE
        random_state (int): Random seed for reproducibility
        
    Returns:
        tuple: (augmented_embeddings, augmented_labels) - the balanced dataset after SMOTE
    """
    print("Extracting embeddings for SMOTE augmentation...")
    all_embeddings = []
    all_labels = []
    
    model.eval()  # Set model to evaluation mode
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Extracting embeddings"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels']
            
            # Extract embeddings
            embeddings = model.get_embeddings(input_ids, attention_mask)
            
            all_embeddings.append(embeddings.cpu().numpy())
            all_labels.append(labels.numpy())
    
    # Concatenate all batches
    embeddings_array = np.vstack(all_embeddings)
    labels_array = np.vstack(all_labels)
    
    print(f"Extracted embeddings shape: {embeddings_array.shape}")
    print(f"Labels shape: {labels_array.shape}")
    
    # Define the specific area labels and their frequencies
    area_labels = {
        "area/kubelet": 352,
        "area/test": 297,
        "area/apiserver": 204,
        "area/cloudprovider": 178,
        "area/kubectl": 134,
        "area/provider/azure": 66,
        "area/dependency": 63,
        "area/code-generation": 47,
        "area/ipvs": 41,
        "area/kubeadm": 39,
        "area/kube-proxy": 27,
        "area/provider/gcp": 22,
        "area/e2e-test-framework": 17,
        "area/conformance": 16,
        "area/custom-resources": 15,
        "area/release-eng": 14,
        "area/security": 10,
        "area/etcd": 5,
        "area/provider/openstack": 5,
        "area/provider/vmware": 2
    }
    
    # Get the class indices from the label encoder
    label_encoder_classes = dataloader.dataset.labels[0].shape[0]
    
    # For multi-label data, we'll apply SMOTE for each label separately
    # This approach handles class imbalance for each label independently
    augmented_embeddings = embeddings_array.copy()
    augmented_labels = labels_array.copy()
    
    # Calculate class distribution before augmentation
    class_counts_before = labels_array.sum(axis=0)
    
    # Match area labels to their indices
    if hasattr(dataloader.dataset, 'mlb') and hasattr(dataloader.dataset.mlb, 'classes_'):
        mlb_classes = dataloader.dataset.mlb.classes_
    else:
        # If we don't have direct access to classes, try to infer from labels
        print("Warning: Could not access label encoder classes directly.")
        mlb_classes = [f"class_{i}" for i in range(labels_array.shape[1])]
    
    # Map area labels to their indices and filter to only include these specific labels
    target_indices = []
    for i, class_name in enumerate(mlb_classes):
        if class_name in area_labels:
            target_indices.append((i, class_name, area_labels[class_name]))
    
    if not target_indices:
        print("Warning: None of the specified area labels were found in the dataset. Falling back to all labels.")
        # Fall back to all labels
        target_indices = [(i, f"class_{i}", class_counts_before[i]) for i in range(labels_array.shape[1])]
    
    # Sort by frequency to handle rare classes first
    target_indices.sort(key=lambda x: x[2])
    
    # Get the frequency of the most common class
    max_frequency = max(item[2] for item in target_indices)
    
    print("\nClass distribution before augmentation:")
    for idx, class_name, freq in target_indices:
        print(f"  {class_name}: {int(class_counts_before[idx])} samples ({class_counts_before[idx]/len(labels_array)*100:.2f}%)")
    
    print("\nApplying SMOTE augmentation for target labels...")
    
    for idx, class_name, orig_freq in target_indices:
        # Skip the most frequent classes
        if orig_freq > max_frequency * 0.5:
            print(f"  Skipping {class_name}: Already has {int(class_counts_before[idx])} samples (>50% of max frequency)")
            continue
            
        # Get current label column
        y = labels_array[:, idx]
        
        # Check if label is imbalanced (fewer positives than negatives)
        pos_count = y.sum()
        neg_count = len(y) - pos_count
        
        # Only apply SMOTE if positive class is minority
        if pos_count < neg_count:
            print(f"  Processing {class_name}: Positive samples {int(pos_count)}/{len(y)} ({pos_count/len(y)*100:.2f}%)")
            
            # Calculate target ratio based on frequency
            # For very rare classes (< 10% of max), aim for 40% of max frequency
            # For rare classes (10-30% of max), aim for 30% of max frequency
            # For less rare classes (30-50% of max), aim for 20% of max frequency
            if orig_freq < max_frequency * 0.1:
                target_ratio = 0.4  # Very rare classes
            elif orig_freq < max_frequency * 0.3:
                target_ratio = 0.3  # Rare classes
            else:
                target_ratio = 0.2  # Less rare classes
                
            target_samples = int(max_frequency * target_ratio)
            print(f"    Target: {target_samples} samples ({target_ratio*100:.0f}% of max frequency)")
            
            try:
                # Apply SMOTE to generate synthetic samples
                # Ensure k_neighbors is less than the minority class count
                k = min(k_neighbors, int(pos_count) - 1)
                k = max(1, k)  # Ensure k is at least 1
                
                # Use sampling_strategy as a ratio to control how many samples to generate
                # Higher ratio = more synthetic samples
                sampling_ratio = min(1.0, target_samples / neg_count)
                
                smote = SMOTE(sampling_strategy=sampling_ratio,
                             k_neighbors=k,
                             random_state=random_state)
                
                # Use embeddings as features, the current label as target
                X_resampled, y_resampled = smote.fit_resample(embeddings_array, y)
                
                # Get only the newly generated samples (they come after the original samples)
                new_samples_mask = len(embeddings_array) < np.arange(len(X_resampled))
                new_embeddings = X_resampled[new_samples_mask]
                new_y = y_resampled[new_samples_mask]
                
                if len(new_embeddings) > 0:
                    # Create labels for new samples (initially all zeros)
                    new_labels = np.zeros((len(new_embeddings), labels_array.shape[1]))
                    # Set current label to 1 for all new samples
                    new_labels[:, idx] = 1
                    
                    # Add new samples to augmented dataset
                    augmented_embeddings = np.vstack([augmented_embeddings, new_embeddings])
                    augmented_labels = np.vstack([augmented_labels, new_labels])
                    
                    print(f"    Added {len(new_embeddings)} synthetic samples")
            except ValueError as e:
                print(f"    Error applying SMOTE: {str(e)}")
                if "Expected n_neighbors <= n_samples" in str(e):
                    print(f"    Not enough positive samples for SMOTE (need at least k+1={k+1})")
    
    # Calculate class distribution after augmentation
    class_counts_after = augmented_labels.sum(axis=0)
    print("\nClass distribution after augmentation:")
    for idx, class_name, _ in target_indices:
        before = class_counts_before[idx]
        after = class_counts_after[idx]
        print(f"  {class_name}: {int(before)} → {int(after)} samples ({int(after-before)} added, {after/len(augmented_labels)*100:.2f}%)")
    
    print(f"Final augmented dataset size: {len(augmented_embeddings)} samples " +
          f"({len(augmented_embeddings)-len(embeddings_array)} synthetic samples added)")
    
    return augmented_embeddings, augmented_labels

class EmbeddingDataset(Dataset):
    """
    Dataset for handling pre-extracted embeddings and labels.
    
    Args:
        embeddings (np.ndarray): Pre-extracted embeddings
        labels (np.ndarray): Corresponding labels
    """
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels
        
    def __len__(self):
        return len(self.embeddings)
    
    def __getitem__(self, idx):
        embedding = torch.tensor(self.embeddings[idx], dtype=torch.float)
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        
        return {
            'embedding': embedding,
            'labels': label
        }

def train_epoch_with_embeddings(model, dataloader, criterion, optimizer, device, threshold=0.5, early_stopping=None, epoch=1):
    """
    Train the model for one epoch using pre-computed embeddings.
    This training only updates the classification layer weights.
    
    Args:
        model (nn.Module): The multi-label classification model
        dataloader (DataLoader): Training DataLoader with embeddings
        criterion: Loss function (BCEWithLogitsLoss)
        optimizer: Optimization algorithm
        device: Device to perform training (CPU or GPU)
        threshold (float): Threshold for binary predictions (default is 0.5)
        early_stopping (EarlyStopping, optional): Instance to monitor improvement in loss
        epoch (int): Current epoch number for adaptive weighting
        
    Returns:
        tuple: Average loss, Hamming accuracy, and a flag indicating if early stopping was triggered
    """
    model.train()
    
    # Explicitly set classifier to training mode and ensure gradients are enabled
    model.classifier.train()
    for param in model.classifier.parameters():
        param.requires_grad = True
        
    total_loss = 0
    all_preds = []
    all_labels = []
    
    # Track positive predictions to monitor class balance
    pos_pred_rate = 0
    
    # Add dynamic weighting based on epoch
    # First few epochs - boost positives more to prevent all-zero predictions
    # Later epochs - gradually reduce weighting for more balanced predictions
    pos_weight_factor = max(5.0 - 0.3 * epoch, 2.0)  # Starts at 5.0, decreases to minimum of 2.0
    neg_weight_factor = min(0.5 + 0.025 * epoch, 0.8)  # Starts at 0.5, increases to maximum of 0.8
    
    print(f"Using dynamic weighting: positive={pos_weight_factor:.2f}x, negative={neg_weight_factor:.2f}x")
    
    for batch_idx, batch in enumerate(tqdm(dataloader, desc="Training (embeddings)")):
        embeddings = batch['embedding'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        # Apply dropout to embeddings (no gradient tracking here)
        with torch.no_grad():
            embeddings_with_dropout = model.dropout(embeddings)
        
        # Forward pass through classifier - WITH gradient tracking
        outputs = model.classifier(embeddings_with_dropout)
        
        # Apply focal loss modifier to upweight rare positives
        # This helps prevent the model from converging to all zeros
        pos_weight = (labels == 0).float() * neg_weight_factor + (labels == 1).float() * pos_weight_factor
        weighted_loss = criterion(outputs, labels) * pos_weight
        loss = weighted_loss.mean()
        
        loss.backward()
        
        # Verify gradients are flowing (only on first batch)
        if batch_idx == 0:
            has_grad = any(p.grad is not None and p.grad.abs().sum().item() > 0 for p in model.classifier.parameters())
            if not has_grad:
                print("WARNING: No gradients flowing to classifier!")
            else:
                print("✓ Gradients are flowing to classifier")
        
        # Add noise to gradients (helps escape local minima)
        if epoch < 5:  # Only in early epochs
            for p in model.classifier.parameters():
                if p.grad is not None:
                    noise = 0.01 * torch.randn_like(p.grad) * p.grad.std()
                    p.grad += noise
        
        # Calculate positive prediction rate for monitoring
        with torch.no_grad():
            pos_preds = (torch.sigmoid(outputs) > threshold).float()
            pos_pred_rate += pos_preds.mean().item()
                
        optimizer.step()
        
        total_loss += loss.item()
        
        # Apply sigmoid and threshold for predictions
        predictions = torch.sigmoid(outputs) >= threshold
        all_preds.append(predictions.cpu().detach().numpy())
        all_labels.append(labels.cpu().detach().numpy())
    
    # Print classifier gradient magnitudes to verify training
    grad_norms = [p.grad.norm().item() if p.grad is not None else 0 
                  for p in model.classifier.parameters()]
    if len(grad_norms) > 0:
        print(f"Classifier gradient norms: mean={np.mean(grad_norms):.6f}, max={max(grad_norms):.6f}")
    else:
        print("WARNING: No gradients in classifier parameters!")
    
    # Print positive prediction rate
    avg_pos_pred_rate = pos_pred_rate / len(dataloader)
    print(f"Positive prediction rate: {avg_pos_pred_rate:.4f}")
    if avg_pos_pred_rate < 0.01:
        print("WARNING: Very low positive prediction rate - model may be converging to all zeros")
    
    # Calculate metrics for multi-label classification
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    
    # Use subset accuracy (exact match) for a strict measure
    exact_match = (all_preds == all_labels).all(axis=1).mean()
    
    avg_loss = total_loss / len(dataloader)
    
    if early_stopping:
        early_stopping(avg_loss)
        if early_stopping.early_stop:
            print("Early stopping triggered")
            return avg_loss, exact_match, True
            
    return avg_loss, exact_match, False

def validate_with_embeddings(model, dataloader, criterion, device, threshold=0.5):
    """
    Evaluate the model using pre-computed embeddings.
    
    Args:
        model (nn.Module): The multi-label classification model
        dataloader (DataLoader): Validation DataLoader with embeddings
        criterion: Loss function (BCEWithLogitsLoss)
        device: Device to perform evaluation
        threshold (float): Threshold for binary predictions (default is 0.5)
        
    Returns:
        tuple: Average loss, various accuracy metrics, precision, recall, and F1 score
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            embeddings = batch['embedding'].to(device)
            labels = batch['labels'].to(device)
            
            # Apply dropout to embeddings (same as in forward pass)
            embeddings = model.dropout(embeddings)
            
            # Get outputs from classification layer
            outputs = model.classifier(embeddings)
            
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            # Apply sigmoid and threshold for predictions
            predictions = (torch.sigmoid(outputs) >= threshold).float()
            all_preds.append(predictions.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    
    # Calculate different multi-label metrics
    
    # 1. Exact Match / Subset Accuracy (all labels must be correct)
    exact_match = (all_preds == all_labels).all(axis=1).mean()
    
    # 2. Partial Match Accuracy (only count correctly predicted 1s, ignore 0s)
    # Calculate true positives per sample
    true_positives = np.logical_and(all_preds == 1, all_labels == 1).sum(axis=1)
    # Calculate total actual positives per sample
    total_positives = (all_labels == 1).sum(axis=1)
    # Handle division by zero - samples with no positive labels get a score of 0
    partial_match = np.zeros_like(true_positives, dtype=float)
    # Only calculate ratio for samples with at least one positive label
    mask = total_positives > 0
    partial_match[mask] = true_positives[mask] / total_positives[mask]
    partial_match_accuracy = partial_match.mean()
    
    # 3. Jaccard Similarity (intersection over union)
    def jaccard_score(y_true, y_pred):
        intersection = np.logical_and(y_true, y_pred).sum(axis=1)
        union = np.logical_or(y_true, y_pred).sum(axis=1)
        # Create a float array for output to avoid type casting error
        result = np.zeros_like(intersection, dtype=float)
        # Avoid division by zero
        np.divide(intersection, union, out=result, where=union!=0)
        return np.mean(result)
    
    jaccard_sim = jaccard_score(all_labels.astype(bool), all_preds.astype(bool))
    
    # Add Hamming metric - this is the same as partial_match_accuracy
    hamming_sim = partial_match_accuracy
    
    # Sample-based metrics - Each sample contributes equally regardless of number of labels
    precision = precision_score(all_labels, all_preds, average='samples', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='samples', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='samples', zero_division=0)
    
    return (total_loss / len(dataloader), 
            {"exact_match": exact_match, 
             "partial_match": partial_match_accuracy,
             "hamming": hamming_sim,
             "jaccard": jaccard_sim}, 
            precision, recall, f1)

def main(args):
    """
    Main function to run the multi-label classification pipeline with DeBERTa.
    This function loads data, preprocesses it, trains the model, and evaluates performance.
    
    Includes data augmentation with SMOTE to balance class distribution.
    """
    # Set random seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)
    
    # Check for GPU availability
    n_gpus, gpu_ids = get_available_gpus()
    if n_gpus >= 2:
        print(f"Using {n_gpus} GPUs: {gpu_ids}")
        device = torch.device("cuda")
        use_multi_gpu = True
    elif n_gpus == 1:
        print("Using 1 GPU")
        device = torch.device("cuda")
        use_multi_gpu = False
    else:
        print("No GPUs available, using CPU")
        device = torch.device("cpu")
        use_multi_gpu = False
    
    # Make results directory if it doesn't exist
    results_dir = args.results_dir
    os.makedirs(results_dir, exist_ok=True)
    
    # Create a timestamped directory for this run
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join(results_dir, f"run_{timestamp}_{args.text_column}_augmented")
    os.makedirs(run_dir, exist_ok=True)
    
    # Load data
    print(f"Loading data from {args.data_path}...")
    df = pd.read_json(args.data_path)
    
    # Check if the text column exists
    if args.text_column not in df.columns:
        available_columns = [col for col in df.columns if col.startswith('all_text')]
        print(f"Text column '{args.text_column}' not found. Available text columns: {available_columns}")
        if len(available_columns) == 0:
            raise ValueError("No text columns found in the data")
        args.text_column = available_columns[0]
        print(f"Using '{args.text_column}' instead")
    
    # Load the tokenizer for token length calculations
    print("Loading tokenizer...")
    tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
    
    # Extract issue texts and labels
    texts = df[args.text_column]
    labels = df['labels'].apply(lambda x: x if isinstance(x, list) else [])  # Ensure labels are lists
    
    # Determine token length filtering threshold based on args
    token_std_threshold = None
    if args.token_length_filter == '3std':
        token_std_threshold = 3.0
    elif args.token_length_filter == '2std':
        token_std_threshold = 2.0
    
    # Apply token length filtering first if requested
    if token_std_threshold is not None or args.min_token_threshold is not None:
        print(f"\nApplying token length filtering...")
        token_lengths = calculate_token_lengths(texts, tokenizer)
        
        # First filter by standard deviation, then by min threshold (in sequence)
        filtered_texts, token_mask = filter_outliers_by_token_length(
            texts, 
            token_lengths, 
            std_threshold=token_std_threshold if token_std_threshold is not None else float('inf'),
            min_token_threshold=args.min_token_threshold
        )
        
        # Apply same filter to labels and dataframe - keep original indices
        filtered_labels = labels[token_mask]
        filtered_df = df[token_mask]
        
        # Now reset indices for further processing
        texts = filtered_texts.reset_index(drop=True)
        labels = filtered_labels.reset_index(drop=True)
        filtered_df = filtered_df.reset_index(drop=True)
    else:
        filtered_df = df
    
    # Apply token reduction if requested (after outlier removal)
    if args.token_reduction_strategy:
        print(f"\nApplying token reduction strategy: {args.token_reduction_strategy}")
        texts = process_with_token_reduction(
            texts, 
            tokenizer, 
            max_length=args.max_length, 
            strategy=args.token_reduction_strategy
        )
        # Update filtered_df with the reduced texts
        filtered_df[args.text_column] = texts
    
    # Use prepare_data function to filter and prepare data, but skip token length filtering since we've done it
    texts, filtered_labels = prepare_data(
        filtered_df,
        text_column=args.text_column,
        min_label_freq=args.min_label_freq, 
        max_label_len=args.max_label_len, 
        min_label_comb_freq=args.min_label_comb_freq,
        tokenizer=tokenizer,
        token_std_threshold=None,  # Set to None to skip the token filtering in prepare_data
        min_token_threshold=args.min_token_threshold
    )
    
    # Print final dataset statistics
    print("\n=== FINAL DATASET STATISTICS ===")
    print(f"Initial dataset size: {len(df)}")
    print(f"Final dataset size: {len(texts)}")
    print(f"Total samples removed: {len(df) - len(texts)} ({(len(df) - len(texts))/len(df)*100:.2f}% of original data)")
    
    # Count the number of labels distribution
    label_distribution = Counter([label for labels in filtered_labels for label in labels])
    print('\nLabel Distribution:')
    for i, (label, count) in enumerate(sorted(label_distribution.items(), key=lambda x: x[1], reverse=True)):
        print(f'{i}. {label}: {count}')
    
    # Count the label length distribution
    label_length_distribution = Counter([len(labels) for labels in filtered_labels])
    print('\nLabel count per row distribution:')
    for label in sorted(label_length_distribution.keys()):
        print(f'Label: {label}, count: {label_length_distribution[label]}')
    
    # Save preprocessing metadata
    preprocessing_metadata = {
        'initial_dataset_size': len(df),
        'final_dataset_size': len(texts),
        'token_reduction': {
            'applied': args.token_reduction_strategy is not None,
            'strategy': args.token_reduction_strategy if args.token_reduction_strategy else None,
            'max_length': args.max_length
        },
        'token_length_filtering': {
            'applied': token_std_threshold is not None,
            'threshold': token_std_threshold
        },
        'label_filtering': {
            'min_label_freq': args.min_label_freq,
            'max_label_len': args.max_label_len,
            'min_label_comb_freq': args.min_label_comb_freq
        },
        'min_token_threshold': {
            'applied': args.min_token_threshold is not None,
            'threshold': args.min_token_threshold
        },
        'data_augmentation': {
            'enabled': args.use_data_augmentation,
            'augmentation_method': 'SMOTE'
        }
    }
    
    # Calculate and add max token length to metadata
    if tokenizer is not None:
        token_lengths = calculate_token_lengths(texts, tokenizer)
        max_token_length = int(token_lengths.max())
        preprocessing_metadata['token_stats'] = {
            'max_token_length': max_token_length,
            'mean_token_length': float(token_lengths.mean()),
            'median_token_length': float(token_lengths.median())
        }
        print(f"\n=== TOKEN LENGTH SUMMARY ===")
        print(f"Maximum token length: {max_token_length}")
        print(f"Mean token length: {token_lengths.mean():.2f}")
        print(f"Median token length: {token_lengths.median():.2f}")
    
    with open(os.path.join(run_dir, 'preprocessing_metadata.json'), 'w') as f:
        json.dump(preprocessing_metadata, f, indent=4)
    
    # Encode multi-labels using MultiLabelBinarizer
    print("Encoding labels...")
    mlb = MultiLabelBinarizer()
    labels_encoded = mlb.fit_transform(filtered_labels)
    
    # Save all original label classes
    all_classes = mlb.classes_.tolist()
    
    # Save label encoder for future use
    with open(os.path.join(run_dir, 'label_encoder.json'), 'w') as f:
        json.dump({
            'classes': all_classes
        }, f)
    
    # Calculate label distribution
    label_counts = labels_encoded.sum(axis=0)
    
    # Log class imbalance metrics
    label_density = label_counts.sum() / (labels_encoded.shape[0] * labels_encoded.shape[1])
    print(f"Label density: {label_density:.4f}")
    print(f"Average labels per sample: {label_counts.sum() / labels_encoded.shape[0]:.2f}")
    
    # Print hybrid feature selection args
    print(f"Feature selection enabled: {args.feature_selection}")
    if args.feature_selection:
        print(f"Filter top-k: {args.filter_k}, Final top-k: {args.final_k}")
        print(f"Wrapper method: {args.wrapper_method.upper()}")
    else:
        print("Feature selection disabled")
        
    # Perform hybrid feature selection if enabled
    if args.feature_selection:
        print(f"\nPerforming hybrid feature selection...")
        
        # Create appropriate vectorizer based on argument
        if args.vectorizer == 'tfidf':
            from sklearn.feature_extraction.text import TfidfVectorizer
            vectorizer = TfidfVectorizer(max_features=5000)
            print("Using TF-IDF vectorizer for feature selection")
        else:  # default to count
            from sklearn.feature_extraction.text import CountVectorizer
            vectorizer = CountVectorizer(max_features=5000)
            print("Using Count vectorizer for feature selection")
        
        selected_indices, selected_labels, feature_scores = hybrid_feature_selection(
            texts, labels_encoded, mlb, 
            top_k_filter=args.filter_k,
            top_k_final=args.final_k,
            vectorizer=vectorizer,
            random_seed=42,
            wrapper_method=args.wrapper_method
        )
        
        # Filter labels_encoded to keep only selected labels
        labels_encoded = labels_encoded[:, selected_indices]
        
        # Save selected labels to file
        with open(os.path.join(run_dir, 'selected_labels.json'), 'w') as f:
            json.dump({
                'selected_labels': selected_labels.tolist(),
                'feature_scores': feature_scores.tolist(),
                'selected_indices': selected_indices.tolist(),
                'vectorizer_type': args.vectorizer,
                'wrapper_method': args.wrapper_method
            }, f)
        
        # Update mlb.classes_ to only contain selected classes
        mlb.classes_ = np.array(selected_labels)
        
        # Recalculate label counts with selected labels
        label_counts = labels_encoded.sum(axis=0)
        print(f"Training with {len(selected_labels)} selected labels: {selected_labels}")
    else:
        print("Feature selection disabled, using all labels")
    
    # Split data into training and validation sets (80% training, 20% validation)
    split_idx = int(len(texts) * 0.8)
    train_texts, val_texts = texts[:split_idx], texts[split_idx:]
    train_labels, val_labels = labels_encoded[:split_idx], labels_encoded[split_idx:]
    
    print(f"Training samples: {len(train_texts)}, Validation samples: {len(val_texts)}")
    
    # Initialize tokenizer
    print("Loading tokenizer...")
    tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
    
    # Implement class weights for loss function to handle imbalance
    pos_weights = None
    if args.use_class_weights and label_counts.min() < label_counts.max() / 5:  # If there's significant imbalance
        print("Computing class weights for imbalanced labels...")
        pos_weights = torch.FloatTensor(
            (labels_encoded.shape[0] - label_counts) / label_counts
        ).clamp(0.5, 10).to(device)  # Limit range to prevent extreme weights
    
    # Create datasets and dataloaders
    batch_size = args.batch_size
    
    # Create original datasets for getting embeddings
    train_dataset = IssueDataset(train_texts, train_labels, tokenizer, max_length=args.max_length)
    val_dataset = IssueDataset(val_texts, val_labels, tokenizer, max_length=args.max_length)
    
    # Increase batch size for DataParallel if multiple GPUs
    if use_multi_gpu:
        batch_size = batch_size * n_gpus
        print(f"Using larger batch size of {batch_size} for {n_gpus} GPUs")
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)  # Don't shuffle yet
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    
    # Initialize model
    print("Initializing model...")
    model = DeBERTaClassifier(num_labels=len(mlb.classes_)).to(device)
    
    # Use DataParallel for multiple GPUs
    if use_multi_gpu:
        model = nn.DataParallel(model)
        print("Model wrapped in DataParallel")
    
    # Extract embeddings and apply SMOTE augmentation if enabled
    if args.use_data_augmentation:
        print("\n=== APPLYING DATA AUGMENTATION WITH SMOTE ===")
        
        # Extract embeddings from training set and apply SMOTE
        augmented_embeddings, augmented_labels = extract_embeddings_and_apply_smote(
            model.module if use_multi_gpu else model,
            train_loader,
            device,
            k_neighbors=5,
            random_state=42
        )
        
        # Create a new dataset with the augmented data
        train_embedding_dataset = EmbeddingDataset(augmented_embeddings, augmented_labels)
        augmented_train_loader = DataLoader(train_embedding_dataset, batch_size=batch_size, shuffle=True)
        
        # Also extract embeddings for validation set (no augmentation)
        print("\nExtracting embeddings for validation set...")
        val_embeddings = []
        val_labels_list = []
        
        model.eval()
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Extracting validation embeddings"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                
                # Extract embeddings
                if use_multi_gpu:
                    embeddings = model.module.get_embeddings(input_ids, attention_mask)
                else:
                    embeddings = model.get_embeddings(input_ids, attention_mask)
                
                val_embeddings.append(embeddings.cpu().numpy())
                val_labels_list.append(batch['labels'].numpy())
        
        val_embeddings = np.vstack(val_embeddings)
        val_labels_np = np.vstack(val_labels_list)
        
        val_embedding_dataset = EmbeddingDataset(val_embeddings, val_labels_np)
        val_embedding_loader = DataLoader(val_embedding_dataset, batch_size=batch_size)
        
        # Set flags to use embedding-based training
        use_embeddings = True
        
        # Save augmentation statistics to metadata
        with open(os.path.join(run_dir, 'augmentation_stats.json'), 'w') as f:
            json.dump({
                'original_train_samples': len(train_texts),
                'augmented_train_samples': len(augmented_embeddings),
                'synthetic_samples_added': len(augmented_embeddings) - len(train_texts),
                'augmentation_method': 'SMOTE'
            }, f)
    else:
        print("Data augmentation disabled")
        use_embeddings = False
    
    # Use weighted loss if we have weights
    if pos_weights is not None:
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weights)
        print("Using weighted BCE loss")
    else:
        criterion = nn.BCEWithLogitsLoss()
    
    # Using optimizer that only updates classifier parameters
    if use_multi_gpu:
        optimizer = torch.optim.AdamW(model.module.classifier.parameters(), 
                                    lr=args.learning_rate * 0.5,  # Higher learning rate (0.5x instead of 0.1x)
                                    weight_decay=0.01)
    else:
        optimizer = torch.optim.AdamW(model.classifier.parameters(), 
                                    lr=args.learning_rate * 0.5,  # Higher learning rate (0.5x instead of 0.1x)
                                    weight_decay=0.01)
    
    # Add learning rate scheduler for better convergence
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=2, verbose=True
    )
    
    # Initialize early stopping
    early_stopping = EarlyStopping(patience=args.patience, min_delta=0.01)
    
    # Training loop
    num_epochs = args.epochs
    print(f"Starting training for {num_epochs} epochs...")
    print(f"Training mode: {'Using pre-computed embeddings with augmentation' if use_embeddings else 'Standard training'}")
    
    train_losses = []
    val_losses = []
    best_f1 = 0.0
    best_model_saved = False  # Flag to track if we've saved at least one model
    stuck_epochs = 0  # Counter for epochs with no improvement
    
    # Define model path
    model_path = os.path.join(run_dir, f'best_model_{args.text_column}_augmented.pt')
    
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        
        # Check for stuck training and reinitialize if needed
        if epoch >= 5 and stuck_epochs >= 3:
            print("Model seems stuck. Reinitializing classifier layer...")
            # Reinitialize the classifier layer with different initialization
            if use_multi_gpu:
                nn.init.xavier_normal_(model.module.classifier.weight)
                if model.module.classifier.bias is not None:
                    nn.init.zeros_(model.module.classifier.bias)
            else:
                nn.init.xavier_normal_(model.classifier.weight)
                if model.classifier.bias is not None:
                    nn.init.zeros_(model.classifier.bias)
            
            # Reset optimizer with higher learning rate
            if use_multi_gpu:
                optimizer = torch.optim.AdamW(
                    model.module.classifier.parameters(),
                    lr=args.learning_rate * 1.0,  # Full learning rate for reinitialization
                    weight_decay=0.005
                )
            else:
                optimizer = torch.optim.AdamW(
                    model.classifier.parameters(),
                    lr=args.learning_rate * 1.0,  # Full learning rate for reinitialization
                    weight_decay=0.005
                )
            
            # Reset scheduler
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='max', factor=0.5, patience=2, verbose=True
            )
            
            stuck_epochs = 0  # Reset counter
        
        # Train for one epoch - choose appropriate training function based on mode
        if use_embeddings:
            train_loss, train_acc, stop_early = train_epoch_with_embeddings(
                model.module if use_multi_gpu else model,
                augmented_train_loader,
                criterion,
                optimizer,
                device,
                early_stopping=early_stopping,
                epoch=epoch+1
            )
        else:
            train_loss, train_acc, stop_early = train_epoch(
                model,
                train_loader,
                criterion,
                optimizer,
                device,
                early_stopping=early_stopping
            )
        
        # Validate - choose appropriate validation function based on mode
        if use_embeddings:
            val_loss, accuracy_metrics, val_precision, val_recall, val_f1 = validate_with_embeddings(
                model.module if use_multi_gpu else model,
                val_embedding_loader,
                criterion,
                device
            )
        else:
            val_loss, accuracy_metrics, val_precision, val_recall, val_f1 = validate(
                model,
                val_loader,
                criterion,
                device
            )
        
        # Update scheduler based on F1 score
        scheduler.step(val_f1)
        
        # Save metrics
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        
        print(f"Train Loss: {train_loss:.4f}, Train Accuracy (Exact Match): {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f}")
        print(f"Val Accuracy (Exact Match): {accuracy_metrics['exact_match']:.4f}")
        print(f"Val Accuracy (Partial Match): {accuracy_metrics['partial_match']:.4f}")
        print(f"Val Accuracy (Jaccard): {accuracy_metrics['jaccard']:.4f}")
        print(f"Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}")
        
        # Check for model improvement
        improved = False
        
        # Save best model based on F1 score
        if val_f1 > best_f1:
            best_f1 = val_f1
            improved = True
            
            # Save the model state_dict (handle DataParallel wrapper if needed)
            if use_multi_gpu:
                torch.save(model.module.state_dict(), model_path)
            else:
                torch.save(model.state_dict(), model_path)
                
            print(f"Saved new best model to {model_path}")
            best_model_saved = True
            stuck_epochs = 0  # Reset counter when we improve
        else:
            stuck_epochs += 1  # Increment counter when no improvement
            print(f"No improvement for {stuck_epochs} epochs. Best F1: {best_f1:.4f}")
        
        # Always save a model for the first epoch if no model has been saved yet
        # This ensures we have at least one model if early stopping occurs
        if epoch == 0 and not best_model_saved:
            if use_multi_gpu:
                torch.save(model.module.state_dict(), model_path)
            else:
                torch.save(model.state_dict(), model_path)
            print(f"Saved initial model to {model_path} as baseline")
            best_model_saved = True
            
        # Check for early stopping
        if stop_early:
            print("Early stopping triggered. Terminating training.")
            break
    
    # Save training history
    history = {
        'train_losses': train_losses,
        'val_losses': val_losses
    }
    with open(os.path.join(run_dir, 'training_history.json'), 'w') as f:
        json.dump(history, f)
    
    # Load best model for final evaluation
    print("\n=== FINAL EVALUATION ===")
    best_model_path = os.path.join(run_dir, f'best_model_{args.text_column}_augmented.pt')
    
    # Handle loading for DataParallel model
    if use_multi_gpu:
        model.module.load_state_dict(torch.load(best_model_path))
    else:
        model.load_state_dict(torch.load(best_model_path))
    
    # Evaluate the model with default threshold
    print("Final evaluation with best model:")
    if use_embeddings:
        final_loss, final_acc_metrics, final_precision, final_recall, final_f1 = validate_with_embeddings(
            model.module if use_multi_gpu else model,
            val_embedding_loader,
            criterion,
            device
        )
    else:
        final_loss, final_acc_metrics, final_precision, final_recall, final_f1 = validate(
            model,
            val_loader,
            criterion,
            device
        )
    
    print(f"Final Loss: {final_loss:.4f}")
    print(f"Final Exact Match Accuracy: {final_acc_metrics['exact_match']:.4f}")
    print(f"Final Partial Match Accuracy: {final_acc_metrics['partial_match']:.4f}")
    print(f"Final Jaccard Similarity: {final_acc_metrics['jaccard']:.4f}")
    print(f"Final Precision: {final_precision:.4f}")
    print(f"Final Recall: {final_recall:.4f}")
    print(f"Final F1 Score: {final_f1:.4f}")
    
    # Update results dictionary with final metrics
    results = {
        'text_column': args.text_column,
        'token_length_filter': args.token_length_filter,
        'token_reduction_strategy': args.token_reduction_strategy,
        'data_augmentation': {
            'enabled': args.use_data_augmentation,
            'method': 'SMOTE' if args.use_data_augmentation else None
        },
        'metrics': {
            'exact_match': float(final_acc_metrics['exact_match']),
            'partial_match': float(final_acc_metrics['partial_match']),
            'jaccard': float(final_acc_metrics['jaccard']),
            'precision': float(final_precision),
            'recall': float(final_recall), 
            'f1': float(final_f1),
        }
    }
    with open(os.path.join(run_dir, 'results.json'), 'w') as f:
        json.dump(results, f, indent=4)
    
    print(f"\nTraining completed! Results saved to {run_dir}")
    
    return {
        'metrics': results['metrics'],
        'model': model,
        'label_encoder': mlb,
        'results_dir': run_dir
    }

In [22]:
if __name__ == "__main__":
    # Create parser and handle Jupyter/Colab environment by ignoring unknown args
    parser = argparse.ArgumentParser(description='Train DeBERTa for multi-label classification')
    
    # Data parameters
    parser.add_argument('--data_path', type=str, 
                        default="/kaggle/input/kubernetes-final-bug-data-without-comments/cleaned_data_with_changed_files_no_comments.json",
                        help='Path to the JSON data file')
    parser.add_argument('--text_column', type=str, default='all_text_0.5',
                        help='Column name with the text data to use for training (e.g., all_text, all_text_0.5)')
    parser.add_argument('--results_dir', type=str, default='./results',
                        help='Directory to save results')
    
    # Label filtering parameters
    parser.add_argument('--min_label_freq', type=int, default=5,
                        help='Minimum frequency for a label to be considered')
    parser.add_argument('--max_label_len', type=int, default=5,
                        help='Maximum number of labels per sample (default: 5)')
    parser.add_argument('--min_label_comb_freq', type=int, default=2,
                        help='Minimum frequency for a label combination')
    
    # Training parameters
    parser.add_argument('--batch_size', type=int, default=16, help='Training batch size')
    parser.add_argument('--epochs', type=int, default=200, help='Number of training epochs')
    parser.add_argument('--learning_rate', type=float, default=2e-5, help='Learning rate')
    parser.add_argument('--patience', type=int, default=100, help='Early stopping patience')
    parser.add_argument('--use_class_weights', action='store_true', help='Use class weights for imbalanced data')
    
    # Token length parameters
    parser.add_argument('--max_length', type=int, default=512, help='Maximum token length for model input')
    
    # Token length filtering parameters
    parser.add_argument('--token_length_filter', type=str, choices=['2std', '3std', None], default='3std',
                        help='Remove token length outliers based on standard deviation threshold')
    parser.add_argument('--min_token_threshold', type=int, default=None,
                        help='Minimum number of tokens required for a sample')
    
    # Token reduction parameters for handling long tokens
    parser.add_argument('--token_reduction_strategy', type=str, 
                        choices=['simple', 'smart_truncation', 'extractive_summarization', 'hybrid'], 
                        default=None,
                        help='Strategy to handle long tokens exceeding max_length: '
                             'simple=simple truncation, '
                             'smart_truncation=keep beginning and end, '
                             'extractive_summarization=extract key sentences, '
                             'hybrid=combine summarization and truncation')
    
    # Feature selection parameters
    parser.add_argument('--feature_selection', action='store_true', #default=True, 
                        help='Enable hybrid feature selection')
    parser.add_argument('--filter_k', type=int, default=20, 
                        help='Number of labels to retain after filter stage')
    parser.add_argument('--final_k', type=int, default=15, 
                        help='Final number of labels to select')
    parser.add_argument('--vectorizer', type=str, choices=['count', 'tfidf'], default='tfidf',
                        help='Vectorizer to use for feature selection')
    parser.add_argument('--wrapper_method', type=str, choices=['rf', 'lr'], default='rf',
                        help='Wrapper method to use for feature selection (rf: Random Forest, lr: Logistic Regression)')
    
    # Data augmentation parameter
    parser.add_argument('--use_data_augmentation', action='store_true', default=True,
                        help='Enable data augmentation with SMOTE to balance class distribution')
    
    # Parse arguments, ignore unknown args for compatibility with Jupyter/Colab
    args, unknown = parser.parse_known_args()
    
    # If the script is run directly, not imported
    results = main(args)

Using 2 GPUs: [0, 1]
Loading data from /kaggle/input/kubernetes-final-bug-data-without-comments/cleaned_data_with_changed_files_no_comments.json...
Loading tokenizer...

Applying token length filtering...


Calculating token lengths: 100%|██████████| 1258/1258 [00:03<00:00, 373.70it/s]


Token length statistics before filtering:
  Mean: 379.62, Std Dev: 458.90
  Min: 32, Max: 5493
  25th percentile: 148.00
  50th percentile (median): 245.00
  75th percentile: 439.75
Applied 3.0 std dev threshold: (1.00, 1756.33)
Removed 21 samples by std dev filtering (1.67% of data)
Total removed: 21 samples (1.67% of original data)
Remaining: 1237 samples (98.33% of original data)

Token length statistics after all filtering:
  Mean: 336.26, Std Dev: 279.70
  Min: 32, Max: 1725
  25th percentile: 148.00
  50th percentile (median): 241.00
  75th percentile: 418.00

=== DATA PREPROCESSING STATISTICS ===
Initial dataset size: 1237
Dataset size after basic cleaning: 1237 (100.00% of original data)

Step 4: Filtering infrequent labels (min frequency: 5)
Total unique labels before filtering: 39
Removed 19 infrequent labels (48.72% of labels)
Number of labels remaining: 20 (51.28% of labels)
Removed 15 samples with no labels
Samples remaining after label filtering: 1222 (98.79% of data)

St

Calculating token lengths: 100%|██████████| 1156/1156 [00:02<00:00, 504.43it/s]



=== TOKEN LENGTH SUMMARY ===
Maximum token length: 1725
Mean token length: 325.18
Median token length: 231.50
Encoding labels...
Label density: 0.0672
Average labels per sample: 1.34
Feature selection enabled: False
Feature selection disabled
Feature selection disabled, using all labels
Training samples: 924, Validation samples: 232
Loading tokenizer...
Using larger batch size of 32 for 2 GPUs
Initializing model...
Model wrapped in DataParallel

=== APPLYING DATA AUGMENTATION WITH SMOTE ===
Extracting embeddings for SMOTE augmentation...


Extracting embeddings: 100%|██████████| 29/29 [00:58<00:00,  2.01s/it]


Extracted embeddings shape: (924, 768)
Labels shape: (924, 20)

Class distribution before augmentation:
  class_15: 0 samples (0.00%)
  class_16: 0 samples (0.00%)
  class_18: 4 samples (0.43%)
  class_7: 5 samples (0.54%)
  class_4: 6 samples (0.65%)
  class_3: 11 samples (1.19%)
  class_17: 14 samples (1.52%)
  class_6: 16 samples (1.73%)
  class_9: 22 samples (2.38%)
  class_14: 22 samples (2.38%)
  class_10: 24 samples (2.60%)
  class_8: 29 samples (3.14%)
  class_2: 42 samples (4.55%)
  class_5: 57 samples (6.17%)
  class_13: 63 samples (6.82%)
  class_11: 107 samples (11.58%)
  class_1: 139 samples (15.04%)
  class_0: 162 samples (17.53%)
  class_19: 270 samples (29.22%)
  class_12: 277 samples (29.98%)

Applying SMOTE augmentation for target labels...
  Processing class_15: Positive samples 0/924 (0.00%)
    Target: 110 samples (40% of max frequency)
    Error applying SMOTE: The target 'y' needs to have more than 1 class. Got 1 class instead
  Processing class_16: Positive samp

Extracting validation embeddings: 100%|██████████| 8/8 [00:13<00:00,  1.71s/it]


Starting training for 200 epochs...
Training mode: Using pre-computed embeddings with augmentation

Epoch 1/200
Using dynamic weighting: positive=4.70x, negative=0.53x


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 405.63it/s]


✓ Gradients are flowing to classifier
Classifier gradient norms: mean=0.601825, max=1.130166
Positive prediction rate: 0.3865
Train Loss: 0.5315, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6738
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6458
Val Accuracy (Jaccard): 0.1025
Val Precision: 0.1059, Val Recall: 0.6458, Val F1: 0.1791
Saved new best model to ./results/run_20250428_083800_all_text_0.5_augmented/best_model_all_text_0.5_augmented.pt

Epoch 2/200
Using dynamic weighting: positive=4.40x, negative=0.55x


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 423.02it/s]


✓ Gradients are flowing to classifier
Classifier gradient norms: mean=0.620763, max=1.167662
Positive prediction rate: 0.3646
Train Loss: 0.5227, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6576
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5328
Val Accuracy (Jaccard): 0.0998
Val Precision: 0.1042, Val Recall: 0.5328, Val F1: 0.1711
No improvement for 1 epochs. Best F1: 0.1791

Epoch 3/200
Using dynamic weighting: positive=4.10x, negative=0.57x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 417.12it/s]


Classifier gradient norms: mean=0.621638, max=1.170756
Positive prediction rate: 0.3256
Train Loss: 0.5140, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6418
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5328
Val Accuracy (Jaccard): 0.0998
Val Precision: 0.1042, Val Recall: 0.5328, Val F1: 0.1711
No improvement for 2 epochs. Best F1: 0.1791

Epoch 4/200
Using dynamic weighting: positive=3.80x, negative=0.60x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 412.46it/s]


Classifier gradient norms: mean=0.653077, max=1.232497
Positive prediction rate: 0.3053
Train Loss: 0.5055, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6265
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5328
Val Accuracy (Jaccard): 0.0998
Val Precision: 0.1042, Val Recall: 0.5328, Val F1: 0.1711
No improvement for 3 epochs. Best F1: 0.1791

Epoch 5/200
Using dynamic weighting: positive=3.50x, negative=0.62x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 429.71it/s]


Classifier gradient norms: mean=0.600352, max=1.128990
Positive prediction rate: 0.2880
Train Loss: 0.5004, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6189
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5328
Val Accuracy (Jaccard): 0.0998
Val Precision: 0.1042, Val Recall: 0.5328, Val F1: 0.1711
No improvement for 4 epochs. Best F1: 0.1791

Epoch 6/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=3.20x, negative=0.65x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 451.33it/s]


Classifier gradient norms: mean=0.801829, max=1.506977
Positive prediction rate: 0.7262
Train Loss: 0.7501, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.9103
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.7673
Val Accuracy (Jaccard): 0.0647
Val Precision: 0.0649, Val Recall: 0.7673, Val F1: 0.1181
No improvement for 1 epochs. Best F1: 0.1791

Epoch 7/200
Using dynamic weighting: positive=2.90x, negative=0.68x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 471.09it/s]


Classifier gradient norms: mean=0.773628, max=1.453209
Positive prediction rate: 0.7141
Train Loss: 0.7243, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8708
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.7662
Val Accuracy (Jaccard): 0.0646
Val Precision: 0.0648, Val Recall: 0.7662, Val F1: 0.1179
No improvement for 2 epochs. Best F1: 0.1791

Epoch 8/200
Using dynamic weighting: positive=2.60x, negative=0.70x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 452.32it/s]


Classifier gradient norms: mean=0.740976, max=1.390578
Positive prediction rate: 0.6754
Train Loss: 0.7002, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8330
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6183
Val Accuracy (Jaccard): 0.0598
Val Precision: 0.0603, Val Recall: 0.6183, Val F1: 0.1084
No improvement for 3 epochs. Best F1: 0.1791

Epoch 9/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.30x, negative=0.72x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 399.32it/s]


Classifier gradient norms: mean=0.797792, max=1.504830
Positive prediction rate: 0.5279
Train Loss: 0.6724, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7824
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5843
Val Accuracy (Jaccard): 0.0739
Val Precision: 0.0749, Val Recall: 0.5843, Val F1: 0.1303
No improvement for 1 epochs. Best F1: 0.1791

Epoch 10/200
Using dynamic weighting: positive=2.00x, negative=0.75x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 440.31it/s]


Classifier gradient norms: mean=0.754305, max=1.421458
Positive prediction rate: 0.4882
Train Loss: 0.6481, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7484
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5800
Val Accuracy (Jaccard): 0.0735
Val Precision: 0.0746, Val Recall: 0.5800, Val F1: 0.1296
No improvement for 2 epochs. Best F1: 0.1791

Epoch 11/200
Using dynamic weighting: positive=2.00x, negative=0.78x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 458.83it/s]


Classifier gradient norms: mean=0.717770, max=1.349571
Positive prediction rate: 0.4722
Train Loss: 0.6392, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7156
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5671
Val Accuracy (Jaccard): 0.0848
Val Precision: 0.0863, Val Recall: 0.5671, Val F1: 0.1465
No improvement for 3 epochs. Best F1: 0.1791

Epoch 12/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 434.66it/s]


Classifier gradient norms: mean=0.836737, max=1.572644
Positive prediction rate: 0.7186
Train Loss: 0.7661, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8343
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.8772
Val Accuracy (Jaccard): 0.0723
Val Precision: 0.0725, Val Recall: 0.8772, Val F1: 0.1320
No improvement for 1 epochs. Best F1: 0.1791

Epoch 13/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 454.90it/s]


Classifier gradient norms: mean=0.841459, max=1.586371
Positive prediction rate: 0.6622
Train Loss: 0.7342, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7970
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.7384
Val Accuracy (Jaccard): 0.0723
Val Precision: 0.0725, Val Recall: 0.7384, Val F1: 0.1298
No improvement for 2 epochs. Best F1: 0.1791

Epoch 14/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 407.22it/s]


Classifier gradient norms: mean=0.768737, max=1.444865
Positive prediction rate: 0.5722
Train Loss: 0.7017, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7618
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6593
Val Accuracy (Jaccard): 0.0748
Val Precision: 0.0757, Val Recall: 0.6593, Val F1: 0.1333
No improvement for 3 epochs. Best F1: 0.1791

Epoch 15/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 409.29it/s]


Classifier gradient norms: mean=0.716223, max=1.341949
Positive prediction rate: 0.5349
Train Loss: 0.6833, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7489
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6626
Val Accuracy (Jaccard): 0.0752
Val Precision: 0.0760, Val Recall: 0.6626, Val F1: 0.1342
No improvement for 1 epochs. Best F1: 0.1791

Epoch 16/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 420.86it/s]


Classifier gradient norms: mean=0.769155, max=1.448591
Positive prediction rate: 0.5148
Train Loss: 0.6536, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7156
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6152
Val Accuracy (Jaccard): 0.0773
Val Precision: 0.0783, Val Recall: 0.6152, Val F1: 0.1364
No improvement for 2 epochs. Best F1: 0.1791

Epoch 17/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 449.96it/s]


Classifier gradient norms: mean=0.704242, max=1.321923
Positive prediction rate: 0.4810
Train Loss: 0.6260, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6841
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6152
Val Accuracy (Jaccard): 0.0774
Val Precision: 0.0784, Val Recall: 0.6152, Val F1: 0.1366
No improvement for 3 epochs. Best F1: 0.1791

Epoch 18/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 463.63it/s]


Classifier gradient norms: mean=0.698601, max=1.311452
Positive prediction rate: 0.3666
Train Loss: 0.6462, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6992
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4180
Val Accuracy (Jaccard): 0.0776
Val Precision: 0.0810, Val Recall: 0.4180, Val F1: 0.1331
No improvement for 1 epochs. Best F1: 0.1791

Epoch 19/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 446.22it/s]


Classifier gradient norms: mean=0.672264, max=1.260926
Positive prediction rate: 0.3323
Train Loss: 0.6195, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6691
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4180
Val Accuracy (Jaccard): 0.0790
Val Precision: 0.0826, Val Recall: 0.4180, Val F1: 0.1351
No improvement for 2 epochs. Best F1: 0.1791

Epoch 20/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 443.98it/s]


Classifier gradient norms: mean=0.627841, max=1.175812
Positive prediction rate: 0.3122
Train Loss: 0.5935, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6408
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4180
Val Accuracy (Jaccard): 0.0790
Val Precision: 0.0826, Val Recall: 0.4180, Val F1: 0.1351
No improvement for 3 epochs. Best F1: 0.1791

Epoch 21/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 455.25it/s]


Classifier gradient norms: mean=0.791159, max=1.492409
Positive prediction rate: 0.4780
Train Loss: 0.6600, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7125
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6585
Val Accuracy (Jaccard): 0.0756
Val Precision: 0.0772, Val Recall: 0.6585, Val F1: 0.1360
No improvement for 1 epochs. Best F1: 0.1791

Epoch 22/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 447.15it/s]


Classifier gradient norms: mean=0.769426, max=1.451812
Positive prediction rate: 0.4567
Train Loss: 0.6323, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6816
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6369
Val Accuracy (Jaccard): 0.0813
Val Precision: 0.0832, Val Recall: 0.6369, Val F1: 0.1447
No improvement for 2 epochs. Best F1: 0.1791

Epoch 23/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 446.92it/s]


Classifier gradient norms: mean=0.715776, max=1.345625
Positive prediction rate: 0.4125
Train Loss: 0.6078, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6523
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6348
Val Accuracy (Jaccard): 0.0907
Val Precision: 0.0930, Val Recall: 0.6348, Val F1: 0.1594
No improvement for 3 epochs. Best F1: 0.1791

Epoch 24/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 452.43it/s]


Classifier gradient norms: mean=0.661799, max=1.245944
Positive prediction rate: 0.4052
Train Loss: 0.5568, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6249
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2075
Val Accuracy (Jaccard): 0.0311
Val Precision: 0.0325, Val Recall: 0.2075, Val F1: 0.0550
No improvement for 1 epochs. Best F1: 0.1791

Epoch 25/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 444.92it/s]


Classifier gradient norms: mean=0.663979, max=1.251063
Positive prediction rate: 0.3322
Train Loss: 0.5312, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5972
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.1731
Val Accuracy (Jaccard): 0.0329
Val Precision: 0.0349, Val Recall: 0.1731, Val F1: 0.0565
No improvement for 2 epochs. Best F1: 0.1791

Epoch 26/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 446.29it/s]


Classifier gradient norms: mean=0.625805, max=1.179444
Positive prediction rate: 0.2789
Train Loss: 0.5090, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5714
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.1618
Val Accuracy (Jaccard): 0.0347
Val Precision: 0.0362, Val Recall: 0.1618, Val F1: 0.0582
No improvement for 3 epochs. Best F1: 0.1791

Epoch 27/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 458.43it/s]


Classifier gradient norms: mean=0.755420, max=1.422198
Positive prediction rate: 0.3878
Train Loss: 0.6439, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7167
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3136
Val Accuracy (Jaccard): 0.0503
Val Precision: 0.0517, Val Recall: 0.3136, Val F1: 0.0868
No improvement for 1 epochs. Best F1: 0.1791

Epoch 28/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 441.84it/s]


Classifier gradient norms: mean=0.675394, max=1.266116
Positive prediction rate: 0.3673
Train Loss: 0.6160, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6852
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2921
Val Accuracy (Jaccard): 0.0494
Val Precision: 0.0509, Val Recall: 0.2921, Val F1: 0.0846
No improvement for 2 epochs. Best F1: 0.1791

Epoch 29/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 447.11it/s]


Classifier gradient norms: mean=0.702456, max=1.322510
Positive prediction rate: 0.3327
Train Loss: 0.5888, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6556
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2007
Val Accuracy (Jaccard): 0.0391
Val Precision: 0.0409, Val Recall: 0.2007, Val F1: 0.0660
No improvement for 3 epochs. Best F1: 0.1791

Epoch 30/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 414.71it/s]


Classifier gradient norms: mean=0.804497, max=1.511639
Positive prediction rate: 0.5409
Train Loss: 0.7356, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8139
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3618
Val Accuracy (Jaccard): 0.0434
Val Precision: 0.0448, Val Recall: 0.3618, Val F1: 0.0782
No improvement for 1 epochs. Best F1: 0.1791

Epoch 31/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 472.26it/s]


Classifier gradient norms: mean=0.835805, max=1.575427
Positive prediction rate: 0.5027
Train Loss: 0.7059, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7790
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3618
Val Accuracy (Jaccard): 0.0434
Val Precision: 0.0448, Val Recall: 0.3618, Val F1: 0.0782
No improvement for 2 epochs. Best F1: 0.1791

Epoch 32/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 458.70it/s]


Classifier gradient norms: mean=0.762460, max=1.432982
Positive prediction rate: 0.4935
Train Loss: 0.6734, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7461
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3618
Val Accuracy (Jaccard): 0.0434
Val Precision: 0.0448, Val Recall: 0.3618, Val F1: 0.0782
No improvement for 3 epochs. Best F1: 0.1791

Epoch 33/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 444.79it/s]


Classifier gradient norms: mean=0.645688, max=1.209179
Positive prediction rate: 0.3723
Train Loss: 0.5937, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6550
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3195
Val Accuracy (Jaccard): 0.0597
Val Precision: 0.0625, Val Recall: 0.3195, Val F1: 0.1023
No improvement for 1 epochs. Best F1: 0.1791

Epoch 34/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 410.86it/s]


Classifier gradient norms: mean=0.695829, max=1.311443
Positive prediction rate: 0.3364
Train Loss: 0.5666, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6265
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3195
Val Accuracy (Jaccard): 0.0597
Val Precision: 0.0625, Val Recall: 0.3195, Val F1: 0.1023
No improvement for 2 epochs. Best F1: 0.1791

Epoch 35/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 458.02it/s]


Classifier gradient norms: mean=0.637703, max=1.198423
Positive prediction rate: 0.3249
Train Loss: 0.5428, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5996
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3195
Val Accuracy (Jaccard): 0.0597
Val Precision: 0.0625, Val Recall: 0.3195, Val F1: 0.1023
No improvement for 3 epochs. Best F1: 0.1791

Epoch 36/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 452.29it/s]


Classifier gradient norms: mean=0.692690, max=1.299366
Positive prediction rate: 0.3912
Train Loss: 0.6320, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6952
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5719
Val Accuracy (Jaccard): 0.0988
Val Precision: 0.1016, Val Recall: 0.5719, Val F1: 0.1683
No improvement for 1 epochs. Best F1: 0.1791

Epoch 37/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 434.28it/s]


Classifier gradient norms: mean=0.685811, max=1.288655
Positive prediction rate: 0.3455
Train Loss: 0.6046, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6649
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4299
Val Accuracy (Jaccard): 0.0863
Val Precision: 0.0905, Val Recall: 0.4299, Val F1: 0.1455
No improvement for 2 epochs. Best F1: 0.1791

Epoch 38/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 469.39it/s]


Classifier gradient norms: mean=0.644434, max=1.209351
Positive prediction rate: 0.3090
Train Loss: 0.5791, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6365
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4299
Val Accuracy (Jaccard): 0.0863
Val Precision: 0.0905, Val Recall: 0.4299, Val F1: 0.1455
No improvement for 3 epochs. Best F1: 0.1791

Epoch 39/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 452.28it/s]


Classifier gradient norms: mean=0.734315, max=1.381313
Positive prediction rate: 0.5609
Train Loss: 0.6317, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7210
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2794
Val Accuracy (Jaccard): 0.0278
Val Precision: 0.0285, Val Recall: 0.2794, Val F1: 0.0511
No improvement for 1 epochs. Best F1: 0.1791

Epoch 40/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 435.23it/s]


Classifier gradient norms: mean=0.692907, max=1.301251
Positive prediction rate: 0.4937
Train Loss: 0.6034, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6897
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2305
Val Accuracy (Jaccard): 0.0304
Val Precision: 0.0313, Val Recall: 0.2305, Val F1: 0.0543
No improvement for 2 epochs. Best F1: 0.1791

Epoch 41/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 437.98it/s]


Classifier gradient norms: mean=0.622591, max=1.166014
Positive prediction rate: 0.3923
Train Loss: 0.5775, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6602
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2154
Val Accuracy (Jaccard): 0.0329
Val Precision: 0.0339, Val Recall: 0.2154, Val F1: 0.0576
No improvement for 3 epochs. Best F1: 0.1791

Epoch 42/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 461.84it/s]


Classifier gradient norms: mean=0.796362, max=1.498882
Positive prediction rate: 0.4913
Train Loss: 0.7013, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7673
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6838
Val Accuracy (Jaccard): 0.0869
Val Precision: 0.0886, Val Recall: 0.6838, Val F1: 0.1545
No improvement for 1 epochs. Best F1: 0.1791

Epoch 43/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 459.60it/s]


Classifier gradient norms: mean=0.719565, max=1.349274
Positive prediction rate: 0.4543
Train Loss: 0.6714, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7349
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6838
Val Accuracy (Jaccard): 0.0874
Val Precision: 0.0891, Val Recall: 0.6838, Val F1: 0.1553
No improvement for 2 epochs. Best F1: 0.1791

Epoch 44/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 447.64it/s]


Classifier gradient norms: mean=0.735851, max=1.383171
Positive prediction rate: 0.4339
Train Loss: 0.6453, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7041
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5540
Val Accuracy (Jaccard): 0.0789
Val Precision: 0.0806, Val Recall: 0.5540, Val F1: 0.1387
No improvement for 3 epochs. Best F1: 0.1791

Epoch 45/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 456.43it/s]


Classifier gradient norms: mean=0.703376, max=1.322378
Positive prediction rate: 0.3919
Train Loss: 0.6018, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6790
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2001
Val Accuracy (Jaccard): 0.0284
Val Precision: 0.0296, Val Recall: 0.2001, Val F1: 0.0502
No improvement for 1 epochs. Best F1: 0.1791

Epoch 46/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 444.41it/s]


Classifier gradient norms: mean=0.700027, max=1.317660
Positive prediction rate: 0.3723
Train Loss: 0.5763, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6506
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.1987
Val Accuracy (Jaccard): 0.0318
Val Precision: 0.0333, Val Recall: 0.1987, Val F1: 0.0554
No improvement for 2 epochs. Best F1: 0.1791

Epoch 47/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 457.82it/s]


Classifier gradient norms: mean=0.662445, max=1.247442
Positive prediction rate: 0.3267
Train Loss: 0.5502, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6240
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.0599
Val Accuracy (Jaccard): 0.0157
Val Precision: 0.0181, Val Recall: 0.0599, Val F1: 0.0262
No improvement for 3 epochs. Best F1: 0.1791

Epoch 48/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 414.11it/s]


Classifier gradient norms: mean=0.870200, max=1.638922
Positive prediction rate: 0.5157
Train Loss: 0.8076, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.9003
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3147
Val Accuracy (Jaccard): 0.0405
Val Precision: 0.0418, Val Recall: 0.3147, Val F1: 0.0721
No improvement for 1 epochs. Best F1: 0.1791

Epoch 49/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 437.88it/s]


Classifier gradient norms: mean=0.826476, max=1.554644
Positive prediction rate: 0.4783
Train Loss: 0.7746, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8649
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3147
Val Accuracy (Jaccard): 0.0448
Val Precision: 0.0465, Val Recall: 0.3147, Val F1: 0.0789
No improvement for 2 epochs. Best F1: 0.1791

Epoch 50/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 468.02it/s]


Classifier gradient norms: mean=0.768186, max=1.441424
Positive prediction rate: 0.4466
Train Loss: 0.7458, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8311
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3147
Val Accuracy (Jaccard): 0.0476
Val Precision: 0.0493, Val Recall: 0.3147, Val F1: 0.0832
No improvement for 3 epochs. Best F1: 0.1791

Epoch 51/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 460.57it/s]


Classifier gradient norms: mean=0.779146, max=1.462827
Positive prediction rate: 0.5472
Train Loss: 0.7363, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8069
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6416
Val Accuracy (Jaccard): 0.0774
Val Precision: 0.0789, Val Recall: 0.6416, Val F1: 0.1380
No improvement for 1 epochs. Best F1: 0.1791

Epoch 52/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 464.19it/s]


Classifier gradient norms: mean=0.779449, max=1.466506
Positive prediction rate: 0.4997
Train Loss: 0.7031, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7734
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6416
Val Accuracy (Jaccard): 0.0778
Val Precision: 0.0793, Val Recall: 0.6416, Val F1: 0.1386
No improvement for 2 epochs. Best F1: 0.1791

Epoch 53/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 462.97it/s]


Classifier gradient norms: mean=0.758377, max=1.425814
Positive prediction rate: 0.4866
Train Loss: 0.6769, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7415
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6157
Val Accuracy (Jaccard): 0.0821
Val Precision: 0.0838, Val Recall: 0.6157, Val F1: 0.1447
No improvement for 3 epochs. Best F1: 0.1791

Epoch 54/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 459.25it/s]


Classifier gradient norms: mean=0.681021, max=1.278856
Positive prediction rate: 0.4456
Train Loss: 0.6155, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6902
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5768
Val Accuracy (Jaccard): 0.0757
Val Precision: 0.0776, Val Recall: 0.5768, Val F1: 0.1345
No improvement for 1 epochs. Best F1: 0.1791

Epoch 55/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 453.29it/s]


Classifier gradient norms: mean=0.700914, max=1.320098
Positive prediction rate: 0.4382
Train Loss: 0.5901, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6615
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5768
Val Accuracy (Jaccard): 0.0757
Val Precision: 0.0776, Val Recall: 0.5768, Val F1: 0.1345
No improvement for 2 epochs. Best F1: 0.1791

Epoch 56/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 463.57it/s]


Classifier gradient norms: mean=0.723191, max=1.363719
Positive prediction rate: 0.4269
Train Loss: 0.5654, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6346
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4578
Val Accuracy (Jaccard): 0.0631
Val Precision: 0.0651, Val Recall: 0.4578, Val F1: 0.1125
No improvement for 3 epochs. Best F1: 0.1791

Epoch 57/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 451.22it/s]


Classifier gradient norms: mean=0.946079, max=1.785310
Positive prediction rate: 0.6272
Train Loss: 0.8071, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.9081
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5004
Val Accuracy (Jaccard): 0.0503
Val Precision: 0.0514, Val Recall: 0.5004, Val F1: 0.0917
No improvement for 1 epochs. Best F1: 0.1791

Epoch 58/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 468.78it/s]


Classifier gradient norms: mean=0.891954, max=1.681461
Positive prediction rate: 0.5966
Train Loss: 0.7731, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8709
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4961
Val Accuracy (Jaccard): 0.0499
Val Precision: 0.0510, Val Recall: 0.4961, Val F1: 0.0910
No improvement for 2 epochs. Best F1: 0.1791

Epoch 59/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 462.24it/s]


Classifier gradient norms: mean=0.795769, max=1.494174
Positive prediction rate: 0.5722
Train Loss: 0.7408, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8356
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3767
Val Accuracy (Jaccard): 0.0463
Val Precision: 0.0479, Val Recall: 0.3767, Val F1: 0.0833
No improvement for 3 epochs. Best F1: 0.1791

Epoch 60/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 461.57it/s]


Classifier gradient norms: mean=0.723176, max=1.361793
Positive prediction rate: 0.4011
Train Loss: 0.6065, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6860
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2516
Val Accuracy (Jaccard): 0.0366
Val Precision: 0.0379, Val Recall: 0.2516, Val F1: 0.0648
No improvement for 1 epochs. Best F1: 0.1791

Epoch 61/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 447.71it/s]


Classifier gradient norms: mean=0.690847, max=1.299893
Positive prediction rate: 0.3702
Train Loss: 0.5804, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6573
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2386
Val Accuracy (Jaccard): 0.0389
Val Precision: 0.0406, Val Recall: 0.2386, Val F1: 0.0681
No improvement for 2 epochs. Best F1: 0.1791

Epoch 62/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 442.30it/s]


Classifier gradient norms: mean=0.670997, max=1.264256
Positive prediction rate: 0.3358
Train Loss: 0.5572, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6304
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2372
Val Accuracy (Jaccard): 0.0444
Val Precision: 0.0467, Val Recall: 0.2372, Val F1: 0.0765
No improvement for 3 epochs. Best F1: 0.1791

Epoch 63/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 369.12it/s]


Classifier gradient norms: mean=0.823737, max=1.552160
Positive prediction rate: 0.5057
Train Loss: 0.6925, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7590
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4383
Val Accuracy (Jaccard): 0.0543
Val Precision: 0.0556, Val Recall: 0.4383, Val F1: 0.0967
No improvement for 1 epochs. Best F1: 0.1791

Epoch 64/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 389.51it/s]


Classifier gradient norms: mean=0.763834, max=1.438068
Positive prediction rate: 0.4780
Train Loss: 0.6629, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7267
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4383
Val Accuracy (Jaccard): 0.0601
Val Precision: 0.0617, Val Recall: 0.4383, Val F1: 0.1058
No improvement for 2 epochs. Best F1: 0.1791

Epoch 65/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 416.44it/s]


Classifier gradient norms: mean=0.716975, max=1.346852
Positive prediction rate: 0.4478
Train Loss: 0.6363, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6961
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4361
Val Accuracy (Jaccard): 0.0612
Val Precision: 0.0629, Val Recall: 0.4361, Val F1: 0.1076
No improvement for 3 epochs. Best F1: 0.1791

Epoch 66/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 450.06it/s]


Classifier gradient norms: mean=0.724724, max=1.364385
Positive prediction rate: 0.4203
Train Loss: 0.6184, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6690
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.7269
Val Accuracy (Jaccard): 0.1049
Val Precision: 0.1072, Val Recall: 0.7269, Val F1: 0.1839
Saved new best model to ./results/run_20250428_083800_all_text_0.5_augmented/best_model_all_text_0.5_augmented.pt

Epoch 67/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 448.65it/s]


✓ Gradients are flowing to classifier
Classifier gradient norms: mean=0.688784, max=1.293393
Positive prediction rate: 0.4047
Train Loss: 0.5911, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6388
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.7269
Val Accuracy (Jaccard): 0.1049
Val Precision: 0.1072, Val Recall: 0.7269, Val F1: 0.1839
No improvement for 1 epochs. Best F1: 0.1839

Epoch 68/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 446.24it/s]


Classifier gradient norms: mean=0.695005, max=1.309617
Positive prediction rate: 0.3930
Train Loss: 0.5654, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6104
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.7054
Val Accuracy (Jaccard): 0.1117
Val Precision: 0.1145, Val Recall: 0.7054, Val F1: 0.1935
Saved new best model to ./results/run_20250428_083800_all_text_0.5_augmented/best_model_all_text_0.5_augmented.pt

Epoch 69/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 456.41it/s]

✓ Gradients are flowing to classifier
Classifier gradient norms: mean=0.632519, max=1.189254
Positive prediction rate: 0.3626
Train Loss: 0.5411, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5837
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6687
Val Accuracy (Jaccard): 0.1247
Val Precision: 0.1285, Val Recall: 0.6687, Val F1: 0.2114





Saved new best model to ./results/run_20250428_083800_all_text_0.5_augmented/best_model_all_text_0.5_augmented.pt

Epoch 70/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 446.78it/s]

✓ Gradients are flowing to classifier
Classifier gradient norms: mean=0.637016, max=1.200244
Positive prediction rate: 0.3275
Train Loss: 0.5178, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5587
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6580
Val Accuracy (Jaccard): 0.1266
Val Precision: 0.1307, Val Recall: 0.6580, Val F1: 0.2137





Saved new best model to ./results/run_20250428_083800_all_text_0.5_augmented/best_model_all_text_0.5_augmented.pt

Epoch 71/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 419.25it/s]


✓ Gradients are flowing to classifier
Classifier gradient norms: mean=0.646124, max=1.219303
Positive prediction rate: 0.3086
Train Loss: 0.4970, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5353
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6580
Val Accuracy (Jaccard): 0.1266
Val Precision: 0.1307, Val Recall: 0.6580, Val F1: 0.2137
No improvement for 1 epochs. Best F1: 0.2137

Epoch 72/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 441.41it/s]


Classifier gradient norms: mean=0.572363, max=1.076087
Positive prediction rate: 0.2978
Train Loss: 0.4771, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5133
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6580
Val Accuracy (Jaccard): 0.1266
Val Precision: 0.1307, Val Recall: 0.6580, Val F1: 0.2137
No improvement for 2 epochs. Best F1: 0.2137

Epoch 73/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 425.90it/s]


Classifier gradient norms: mean=0.536747, max=1.008661
Positive prediction rate: 0.2834
Train Loss: 0.4582, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.4926
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6580
Val Accuracy (Jaccard): 0.1269
Val Precision: 0.1310, Val Recall: 0.6580, Val F1: 0.2141
Saved new best model to ./results/run_20250428_083800_all_text_0.5_augmented/best_model_all_text_0.5_augmented.pt

Epoch 74/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 428.50it/s]

✓ Gradients are flowing to classifier
Classifier gradient norms: mean=0.554645, max=1.045746
Positive prediction rate: 0.2665
Train Loss: 0.4405, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.4733
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6429
Val Accuracy (Jaccard): 0.1455
Val Precision: 0.1516, Val Recall: 0.6429, Val F1: 0.2402





Saved new best model to ./results/run_20250428_083800_all_text_0.5_augmented/best_model_all_text_0.5_augmented.pt

Epoch 75/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 439.93it/s]

✓ Gradients are flowing to classifier
Classifier gradient norms: mean=0.512112, max=0.963128
Positive prediction rate: 0.2424
Train Loss: 0.4252, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.4553
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6429
Val Accuracy (Jaccard): 0.1464
Val Precision: 0.1526, Val Recall: 0.6429, Val F1: 0.2412





Saved new best model to ./results/run_20250428_083800_all_text_0.5_augmented/best_model_all_text_0.5_augmented.pt

Epoch 76/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 438.64it/s]


✓ Gradients are flowing to classifier
Classifier gradient norms: mean=0.486113, max=0.914042
Positive prediction rate: 0.2212
Train Loss: 0.4088, Train Accuracy (Exact Match): 0.0016
Val Loss: 0.4384
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6386
Val Accuracy (Jaccard): 0.1455
Val Precision: 0.1517, Val Recall: 0.6386, Val F1: 0.2398
No improvement for 1 epochs. Best F1: 0.2412

Epoch 77/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 454.25it/s]


Classifier gradient norms: mean=0.478630, max=0.900667
Positive prediction rate: 0.2106
Train Loss: 0.3948, Train Accuracy (Exact Match): 0.0010
Val Loss: 0.4226
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3555
Val Accuracy (Jaccard): 0.1025
Val Precision: 0.1106, Val Recall: 0.3555, Val F1: 0.1638
No improvement for 2 epochs. Best F1: 0.2412

Epoch 78/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 427.12it/s]


Classifier gradient norms: mean=0.481866, max=0.909585
Positive prediction rate: 0.1784
Train Loss: 0.3809, Train Accuracy (Exact Match): 0.0021
Val Loss: 0.4079
Val Accuracy (Exact Match): 0.0905
Val Accuracy (Partial Match): 0.1463
Val Accuracy (Jaccard): 0.1248
Val Precision: 0.1480, Val Recall: 0.1463, Val F1: 0.1382
No improvement for 3 epochs. Best F1: 0.2412

Epoch 79/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 438.27it/s]


Classifier gradient norms: mean=0.673261, max=1.265404
Positive prediction rate: 0.3271
Train Loss: 0.5791, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6239
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6126
Val Accuracy (Jaccard): 0.1241
Val Precision: 0.1264, Val Recall: 0.6126, Val F1: 0.2050
No improvement for 1 epochs. Best F1: 0.2412

Epoch 80/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 415.23it/s]


Classifier gradient norms: mean=0.643539, max=1.208494
Positive prediction rate: 0.3143
Train Loss: 0.5543, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5953
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6126
Val Accuracy (Jaccard): 0.1241
Val Precision: 0.1264, Val Recall: 0.6126, Val F1: 0.2050
No improvement for 2 epochs. Best F1: 0.2412

Epoch 81/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 449.24it/s]


Classifier gradient norms: mean=0.635583, max=1.194468
Positive prediction rate: 0.3059
Train Loss: 0.5301, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5685
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6126
Val Accuracy (Jaccard): 0.1241
Val Precision: 0.1264, Val Recall: 0.6126, Val F1: 0.2050
No improvement for 3 epochs. Best F1: 0.2412

Epoch 82/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 442.09it/s]


Classifier gradient norms: mean=0.710151, max=1.335069
Positive prediction rate: 0.4177
Train Loss: 0.6095, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7002
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3568
Val Accuracy (Jaccard): 0.0561
Val Precision: 0.0585, Val Recall: 0.3568, Val F1: 0.0987
No improvement for 1 epochs. Best F1: 0.2412

Epoch 83/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 437.02it/s]


Classifier gradient norms: mean=0.734845, max=1.386889
Positive prediction rate: 0.3672
Train Loss: 0.5842, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6714
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3491
Val Accuracy (Jaccard): 0.0581
Val Precision: 0.0604, Val Recall: 0.3491, Val F1: 0.1014
No improvement for 2 epochs. Best F1: 0.2412

Epoch 84/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 433.84it/s]


Classifier gradient norms: mean=0.668143, max=1.255799
Positive prediction rate: 0.3232
Train Loss: 0.5598, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6444
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3276
Val Accuracy (Jaccard): 0.0617
Val Precision: 0.0647, Val Recall: 0.3276, Val F1: 0.1061
No improvement for 3 epochs. Best F1: 0.2412

Epoch 85/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 435.28it/s]


Classifier gradient norms: mean=0.596805, max=1.121940
Positive prediction rate: 0.3118
Train Loss: 0.4925, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5474
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2606
Val Accuracy (Jaccard): 0.0498
Val Precision: 0.0520, Val Recall: 0.2606, Val F1: 0.0852
No improvement for 1 epochs. Best F1: 0.2412

Epoch 86/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 441.81it/s]


Classifier gradient norms: mean=0.580895, max=1.094850
Positive prediction rate: 0.2457
Train Loss: 0.4706, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5242
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2110
Val Accuracy (Jaccard): 0.0705
Val Precision: 0.0761, Val Recall: 0.2110, Val F1: 0.1088
No improvement for 2 epochs. Best F1: 0.2412

Epoch 87/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 447.18it/s]


Classifier gradient norms: mean=0.560159, max=1.056869
Positive prediction rate: 0.1730
Train Loss: 0.4525, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5026
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.1571
Val Accuracy (Jaccard): 0.0831
Val Precision: 0.0948, Val Recall: 0.1571, Val F1: 0.1142
No improvement for 3 epochs. Best F1: 0.2412

Epoch 88/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 445.41it/s]


Classifier gradient norms: mean=0.719164, max=1.353234
Positive prediction rate: 0.4026
Train Loss: 0.6191, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6777
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4205
Val Accuracy (Jaccard): 0.0709
Val Precision: 0.0737, Val Recall: 0.4205, Val F1: 0.1221
No improvement for 1 epochs. Best F1: 0.2412

Epoch 89/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 447.10it/s]


Classifier gradient norms: mean=0.671882, max=1.259828
Positive prediction rate: 0.3624
Train Loss: 0.5919, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6474
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4205
Val Accuracy (Jaccard): 0.0713
Val Precision: 0.0742, Val Recall: 0.4205, Val F1: 0.1227
No improvement for 2 epochs. Best F1: 0.2412

Epoch 90/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 438.92it/s]


Classifier gradient norms: mean=0.689435, max=1.298362
Positive prediction rate: 0.3206
Train Loss: 0.5679, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6188
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3175
Val Accuracy (Jaccard): 0.0841
Val Precision: 0.0904, Val Recall: 0.3175, Val F1: 0.1367
No improvement for 3 epochs. Best F1: 0.2412

Epoch 91/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 451.36it/s]


Classifier gradient norms: mean=0.787117, max=1.479841
Positive prediction rate: 0.5693
Train Loss: 0.7094, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8009
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3061
Val Accuracy (Jaccard): 0.0360
Val Precision: 0.0371, Val Recall: 0.3061, Val F1: 0.0647
No improvement for 1 epochs. Best F1: 0.2412

Epoch 92/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 426.99it/s]


Classifier gradient norms: mean=0.766909, max=1.443244
Positive prediction rate: 0.5288
Train Loss: 0.6781, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7680
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3061
Val Accuracy (Jaccard): 0.0361
Val Precision: 0.0372, Val Recall: 0.3061, Val F1: 0.0648
No improvement for 2 epochs. Best F1: 0.2412

Epoch 93/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 433.73it/s]


Classifier gradient norms: mean=0.734864, max=1.379543
Positive prediction rate: 0.5095
Train Loss: 0.6512, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7366
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3047
Val Accuracy (Jaccard): 0.0391
Val Precision: 0.0404, Val Recall: 0.3047, Val F1: 0.0697
No improvement for 3 epochs. Best F1: 0.2412

Epoch 94/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 454.77it/s]


Classifier gradient norms: mean=0.814998, max=1.532839
Positive prediction rate: 0.6534
Train Loss: 0.7272, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8025
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.9231
Val Accuracy (Jaccard): 0.0867
Val Precision: 0.0869, Val Recall: 0.9231, Val F1: 0.1564
No improvement for 1 epochs. Best F1: 0.2412

Epoch 95/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 449.07it/s]


Classifier gradient norms: mean=0.814750, max=1.534305
Positive prediction rate: 0.5956
Train Loss: 0.6948, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7666
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5771
Val Accuracy (Jaccard): 0.0623
Val Precision: 0.0637, Val Recall: 0.5771, Val F1: 0.1129
No improvement for 2 epochs. Best F1: 0.2412

Epoch 96/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 435.45it/s]


Classifier gradient norms: mean=0.753802, max=1.416437
Positive prediction rate: 0.5188
Train Loss: 0.6651, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7326
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4214
Val Accuracy (Jaccard): 0.0526
Val Precision: 0.0541, Val Recall: 0.4214, Val F1: 0.0942
No improvement for 3 epochs. Best F1: 0.2412

Epoch 97/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 444.94it/s]


Classifier gradient norms: mean=0.818484, max=1.542865
Positive prediction rate: 0.5453
Train Loss: 0.6950, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7597
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.7389
Val Accuracy (Jaccard): 0.0816
Val Precision: 0.0828, Val Recall: 0.7389, Val F1: 0.1466
No improvement for 1 epochs. Best F1: 0.2412

Epoch 98/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 420.72it/s]


Classifier gradient norms: mean=0.750048, max=1.410069
Positive prediction rate: 0.4896
Train Loss: 0.6662, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7256
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6664
Val Accuracy (Jaccard): 0.0845
Val Precision: 0.0863, Val Recall: 0.6664, Val F1: 0.1504
No improvement for 2 epochs. Best F1: 0.2412

Epoch 99/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 418.29it/s]


Classifier gradient norms: mean=0.704167, max=1.321763
Positive prediction rate: 0.4486
Train Loss: 0.6357, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6934
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6664
Val Accuracy (Jaccard): 0.0849
Val Precision: 0.0867, Val Recall: 0.6664, Val F1: 0.1509
No improvement for 3 epochs. Best F1: 0.2412

Epoch 100/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 403.29it/s]


Classifier gradient norms: mean=0.761709, max=1.437277
Positive prediction rate: 0.4479
Train Loss: 0.6224, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7098
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4258
Val Accuracy (Jaccard): 0.0624
Val Precision: 0.0641, Val Recall: 0.4258, Val F1: 0.1096
No improvement for 1 epochs. Best F1: 0.2412

Epoch 101/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 455.99it/s]


Classifier gradient norms: mean=0.675639, max=1.269912
Positive prediction rate: 0.3959
Train Loss: 0.5955, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6806
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4042
Val Accuracy (Jaccard): 0.0684
Val Precision: 0.0708, Val Recall: 0.4042, Val F1: 0.1180
No improvement for 2 epochs. Best F1: 0.2412

Epoch 102/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 418.28it/s]


Classifier gradient norms: mean=0.657554, max=1.234354
Positive prediction rate: 0.3621
Train Loss: 0.5713, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6530
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3956
Val Accuracy (Jaccard): 0.0746
Val Precision: 0.0774, Val Recall: 0.3956, Val F1: 0.1264
No improvement for 3 epochs. Best F1: 0.2412

Epoch 103/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 457.23it/s]


Classifier gradient norms: mean=0.744527, max=1.402711
Positive prediction rate: 0.4847
Train Loss: 0.6175, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6676
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.8611
Val Accuracy (Jaccard): 0.1033
Val Precision: 0.1043, Val Recall: 0.8611, Val F1: 0.1831
No improvement for 1 epochs. Best F1: 0.2412

Epoch 104/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 465.98it/s]


Classifier gradient norms: mean=0.703675, max=1.324682
Positive prediction rate: 0.4653
Train Loss: 0.5912, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6377
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.8524
Val Accuracy (Jaccard): 0.1053
Val Precision: 0.1063, Val Recall: 0.8524, Val F1: 0.1859
No improvement for 2 epochs. Best F1: 0.2412

Epoch 105/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 461.33it/s]


Classifier gradient norms: mean=0.669437, max=1.259205
Positive prediction rate: 0.4043
Train Loss: 0.5654, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6097
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2677
Val Accuracy (Jaccard): 0.0545
Val Precision: 0.0572, Val Recall: 0.2677, Val F1: 0.0918
No improvement for 3 epochs. Best F1: 0.2412

Epoch 106/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 447.44it/s]


Classifier gradient norms: mean=0.868970, max=1.637025
Positive prediction rate: 0.5642
Train Loss: 0.7481, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8348
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4094
Val Accuracy (Jaccard): 0.0408
Val Precision: 0.0415, Val Recall: 0.4094, Val F1: 0.0746
No improvement for 1 epochs. Best F1: 0.2412

Epoch 107/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 439.20it/s]


Classifier gradient norms: mean=0.830778, max=1.564356
Positive prediction rate: 0.5440
Train Loss: 0.7185, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7994
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3965
Val Accuracy (Jaccard): 0.0429
Val Precision: 0.0437, Val Recall: 0.3965, Val F1: 0.0779
No improvement for 2 epochs. Best F1: 0.2412

Epoch 108/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 454.51it/s]


Classifier gradient norms: mean=0.714137, max=1.338250
Positive prediction rate: 0.4826
Train Loss: 0.6878, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7659
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3864
Val Accuracy (Jaccard): 0.0469
Val Precision: 0.0479, Val Recall: 0.3864, Val F1: 0.0841
No improvement for 3 epochs. Best F1: 0.2412

Epoch 109/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 463.08it/s]


Classifier gradient norms: mean=0.806252, max=1.519674
Positive prediction rate: 0.4283
Train Loss: 0.6803, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7644
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2670
Val Accuracy (Jaccard): 0.0441
Val Precision: 0.0463, Val Recall: 0.2670, Val F1: 0.0771
No improvement for 1 epochs. Best F1: 0.2412

Epoch 110/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 435.80it/s]


Classifier gradient norms: mean=0.730845, max=1.372634
Positive prediction rate: 0.4040
Train Loss: 0.6500, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7319
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2670
Val Accuracy (Jaccard): 0.0500
Val Precision: 0.0529, Val Recall: 0.2670, Val F1: 0.0860
No improvement for 2 epochs. Best F1: 0.2412

Epoch 111/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 432.45it/s]


Classifier gradient norms: mean=0.738974, max=1.392048
Positive prediction rate: 0.3651
Train Loss: 0.6221, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7014
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2670
Val Accuracy (Jaccard): 0.0501
Val Precision: 0.0530, Val Recall: 0.2670, Val F1: 0.0861
No improvement for 3 epochs. Best F1: 0.2412

Epoch 112/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 439.78it/s]


Classifier gradient norms: mean=0.828987, max=1.560377
Positive prediction rate: 0.6017
Train Loss: 0.7367, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8129
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5087
Val Accuracy (Jaccard): 0.0551
Val Precision: 0.0560, Val Recall: 0.5087, Val F1: 0.0991
No improvement for 1 epochs. Best F1: 0.2412

Epoch 113/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 455.07it/s]


Classifier gradient norms: mean=0.814783, max=1.534754
Positive prediction rate: 0.5803
Train Loss: 0.7049, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7769
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5087
Val Accuracy (Jaccard): 0.0551
Val Precision: 0.0560, Val Recall: 0.5087, Val F1: 0.0991
No improvement for 2 epochs. Best F1: 0.2412

Epoch 114/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 432.34it/s]


Classifier gradient norms: mean=0.816570, max=1.540761
Positive prediction rate: 0.5384
Train Loss: 0.6725, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7431
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4232
Val Accuracy (Jaccard): 0.0510
Val Precision: 0.0526, Val Recall: 0.4232, Val F1: 0.0917
No improvement for 3 epochs. Best F1: 0.2412

Epoch 115/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 422.43it/s]


Classifier gradient norms: mean=0.876486, max=1.650595
Positive prediction rate: 0.6809
Train Loss: 0.7753, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8600
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5871
Val Accuracy (Jaccard): 0.0531
Val Precision: 0.0540, Val Recall: 0.5871, Val F1: 0.0975
No improvement for 1 epochs. Best F1: 0.2412

Epoch 116/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 452.92it/s]


Classifier gradient norms: mean=0.880210, max=1.660296
Positive prediction rate: 0.6246
Train Loss: 0.7402, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8218
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5548
Val Accuracy (Jaccard): 0.0535
Val Precision: 0.0545, Val Recall: 0.5548, Val F1: 0.0977
No improvement for 2 epochs. Best F1: 0.2412

Epoch 117/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 458.25it/s]


Classifier gradient norms: mean=0.818993, max=1.541919
Positive prediction rate: 0.5584
Train Loss: 0.7086, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7856
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3665
Val Accuracy (Jaccard): 0.0454
Val Precision: 0.0469, Val Recall: 0.3665, Val F1: 0.0813
No improvement for 3 epochs. Best F1: 0.2412

Epoch 118/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 444.85it/s]


Classifier gradient norms: mean=0.680931, max=1.280019
Positive prediction rate: 0.4284
Train Loss: 0.6045, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6633
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5019
Val Accuracy (Jaccard): 0.0704
Val Precision: 0.0722, Val Recall: 0.5019, Val F1: 0.1243
No improvement for 1 epochs. Best F1: 0.2412

Epoch 119/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 454.42it/s]


Classifier gradient norms: mean=0.679591, max=1.279124
Positive prediction rate: 0.3970
Train Loss: 0.5789, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6345
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4976
Val Accuracy (Jaccard): 0.0701
Val Precision: 0.0720, Val Recall: 0.4976, Val F1: 0.1238
No improvement for 2 epochs. Best F1: 0.2412

Epoch 120/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 463.82it/s]


Classifier gradient norms: mean=0.638071, max=1.198300
Positive prediction rate: 0.3649
Train Loss: 0.5544, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6075
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4782
Val Accuracy (Jaccard): 0.0877
Val Precision: 0.0908, Val Recall: 0.4782, Val F1: 0.1500
No improvement for 3 epochs. Best F1: 0.2412

Epoch 121/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 436.95it/s]


Classifier gradient norms: mean=0.748515, max=1.408553
Positive prediction rate: 0.4393
Train Loss: 0.6322, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7271
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2616
Val Accuracy (Jaccard): 0.0315
Val Precision: 0.0322, Val Recall: 0.2616, Val F1: 0.0568
No improvement for 1 epochs. Best F1: 0.2412

Epoch 122/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 440.16it/s]


Classifier gradient norms: mean=0.746371, max=1.407879
Positive prediction rate: 0.3957
Train Loss: 0.6056, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6959
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2013
Val Accuracy (Jaccard): 0.0322
Val Precision: 0.0331, Val Recall: 0.2013, Val F1: 0.0562
No improvement for 2 epochs. Best F1: 0.2412

Epoch 123/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 443.13it/s]


Classifier gradient norms: mean=0.676633, max=1.273818
Positive prediction rate: 0.3286
Train Loss: 0.5792, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6667
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.1797
Val Accuracy (Jaccard): 0.0388
Val Precision: 0.0405, Val Recall: 0.1797, Val F1: 0.0651
No improvement for 3 epochs. Best F1: 0.2412

Epoch 124/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 431.13it/s]


Classifier gradient norms: mean=0.619454, max=1.159958
Positive prediction rate: 0.3193
Train Loss: 0.5698, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6226
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5165
Val Accuracy (Jaccard): 0.0989
Val Precision: 0.1027, Val Recall: 0.5165, Val F1: 0.1677
No improvement for 1 epochs. Best F1: 0.2412

Epoch 125/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 439.74it/s]


Classifier gradient norms: mean=0.647797, max=1.219292
Positive prediction rate: 0.3113
Train Loss: 0.5448, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5956
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5165
Val Accuracy (Jaccard): 0.0989
Val Precision: 0.1027, Val Recall: 0.5165, Val F1: 0.1677
No improvement for 2 epochs. Best F1: 0.2412

Epoch 126/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 433.33it/s]


Classifier gradient norms: mean=0.662128, max=1.247775
Positive prediction rate: 0.3048
Train Loss: 0.5217, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5703
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5165
Val Accuracy (Jaccard): 0.0989
Val Precision: 0.1027, Val Recall: 0.5165, Val F1: 0.1677
No improvement for 3 epochs. Best F1: 0.2412

Epoch 127/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 432.12it/s]


Classifier gradient norms: mean=0.820066, max=1.544146
Positive prediction rate: 0.6006
Train Loss: 0.7108, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7962
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5494
Val Accuracy (Jaccard): 0.0549
Val Precision: 0.0557, Val Recall: 0.5494, Val F1: 0.0998
No improvement for 1 epochs. Best F1: 0.2412

Epoch 128/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 443.96it/s]


Classifier gradient norms: mean=0.794864, max=1.497379
Positive prediction rate: 0.5508
Train Loss: 0.6793, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7611
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5192
Val Accuracy (Jaccard): 0.0617
Val Precision: 0.0629, Val Recall: 0.5192, Val F1: 0.1104
No improvement for 2 epochs. Best F1: 0.2412

Epoch 129/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 460.33it/s]


Classifier gradient norms: mean=0.751937, max=1.413961
Positive prediction rate: 0.5010
Train Loss: 0.6489, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7281
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4423
Val Accuracy (Jaccard): 0.0555
Val Precision: 0.0570, Val Recall: 0.4423, Val F1: 0.0995
No improvement for 3 epochs. Best F1: 0.2412

Epoch 130/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 449.10it/s]


Classifier gradient norms: mean=0.883247, max=1.664846
Positive prediction rate: 0.7004
Train Loss: 0.7677, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8557
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5935
Val Accuracy (Jaccard): 0.0511
Val Precision: 0.0520, Val Recall: 0.5935, Val F1: 0.0943
No improvement for 1 epochs. Best F1: 0.2412

Epoch 131/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 446.77it/s]


Classifier gradient norms: mean=0.854980, max=1.610586
Positive prediction rate: 0.6697
Train Loss: 0.7340, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8181
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5784
Val Accuracy (Jaccard): 0.0536
Val Precision: 0.0546, Val Recall: 0.5784, Val F1: 0.0983
No improvement for 2 epochs. Best F1: 0.2412

Epoch 132/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 453.16it/s]


Classifier gradient norms: mean=0.803021, max=1.511733
Positive prediction rate: 0.5943
Train Loss: 0.7031, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7824
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5654
Val Accuracy (Jaccard): 0.0630
Val Precision: 0.0643, Val Recall: 0.5654, Val F1: 0.1135
No improvement for 3 epochs. Best F1: 0.2412

Epoch 133/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 442.14it/s]


Classifier gradient norms: mean=0.612934, max=1.146978
Positive prediction rate: 0.3789
Train Loss: 0.5643, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6332
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2205
Val Accuracy (Jaccard): 0.0389
Val Precision: 0.0413, Val Recall: 0.2205, Val F1: 0.0681
No improvement for 1 epochs. Best F1: 0.2412

Epoch 134/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 453.58it/s]


Classifier gradient norms: mean=0.654971, max=1.233157
Positive prediction rate: 0.3493
Train Loss: 0.5394, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6057
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2205
Val Accuracy (Jaccard): 0.0389
Val Precision: 0.0413, Val Recall: 0.2205, Val F1: 0.0681
No improvement for 2 epochs. Best F1: 0.2412

Epoch 135/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 460.92it/s]


Classifier gradient norms: mean=0.654679, max=1.235770
Positive prediction rate: 0.3404
Train Loss: 0.5164, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5800
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2205
Val Accuracy (Jaccard): 0.0389
Val Precision: 0.0413, Val Recall: 0.2205, Val F1: 0.0681
No improvement for 3 epochs. Best F1: 0.2412

Epoch 136/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 456.05it/s]


Classifier gradient norms: mean=0.741716, max=1.390948
Positive prediction rate: 0.4768
Train Loss: 0.7086, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7696
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6222
Val Accuracy (Jaccard): 0.0771
Val Precision: 0.0787, Val Recall: 0.6222, Val F1: 0.1371
No improvement for 1 epochs. Best F1: 0.2412

Epoch 137/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 442.07it/s]


Classifier gradient norms: mean=0.767937, max=1.445009
Positive prediction rate: 0.4308
Train Loss: 0.6790, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7363
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6037
Val Accuracy (Jaccard): 0.0898
Val Precision: 0.0921, Val Recall: 0.6037, Val F1: 0.1567
No improvement for 2 epochs. Best F1: 0.2412

Epoch 138/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 470.36it/s]


Classifier gradient norms: mean=0.735247, max=1.383166
Positive prediction rate: 0.3972
Train Loss: 0.6510, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7049
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5951
Val Accuracy (Jaccard): 0.0909
Val Precision: 0.0934, Val Recall: 0.5951, Val F1: 0.1583
No improvement for 3 epochs. Best F1: 0.2412

Epoch 139/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 480.02it/s]


Classifier gradient norms: mean=0.870916, max=1.639575
Positive prediction rate: 0.6721
Train Loss: 0.7705, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8706
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3588
Val Accuracy (Jaccard): 0.0333
Val Precision: 0.0342, Val Recall: 0.3588, Val F1: 0.0614
No improvement for 1 epochs. Best F1: 0.2412

Epoch 140/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 461.79it/s]


Classifier gradient norms: mean=0.871848, max=1.644108
Positive prediction rate: 0.6148
Train Loss: 0.7382, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8333
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3351
Val Accuracy (Jaccard): 0.0348
Val Precision: 0.0358, Val Recall: 0.3351, Val F1: 0.0635
No improvement for 2 epochs. Best F1: 0.2412

Epoch 141/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 464.37it/s]


Classifier gradient norms: mean=0.798408, max=1.502014
Positive prediction rate: 0.5504
Train Loss: 0.7061, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7980
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2713
Val Accuracy (Jaccard): 0.0333
Val Precision: 0.0345, Val Recall: 0.2713, Val F1: 0.0601
No improvement for 3 epochs. Best F1: 0.2412

Epoch 142/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 475.16it/s]


Classifier gradient norms: mean=0.723055, max=1.359282
Positive prediction rate: 0.3853
Train Loss: 0.6516, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7139
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5300
Val Accuracy (Jaccard): 0.0852
Val Precision: 0.0887, Val Recall: 0.5300, Val F1: 0.1493
No improvement for 1 epochs. Best F1: 0.2412

Epoch 143/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 452.16it/s]


Classifier gradient norms: mean=0.679667, max=1.276377
Positive prediction rate: 0.3547
Train Loss: 0.6238, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6825
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5084
Val Accuracy (Jaccard): 0.0918
Val Precision: 0.0959, Val Recall: 0.5084, Val F1: 0.1583
No improvement for 2 epochs. Best F1: 0.2412

Epoch 144/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 438.90it/s]


Classifier gradient norms: mean=0.667064, max=1.251871
Positive prediction rate: 0.3198
Train Loss: 0.5967, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6530
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5041
Val Accuracy (Jaccard): 0.0930
Val Precision: 0.0970, Val Recall: 0.5041, Val F1: 0.1597
No improvement for 3 epochs. Best F1: 0.2412

Epoch 145/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 459.05it/s]


Classifier gradient norms: mean=0.661738, max=1.245562
Positive prediction rate: 0.3466
Train Loss: 0.5465, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6039
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3180
Val Accuracy (Jaccard): 0.0528
Val Precision: 0.0546, Val Recall: 0.3180, Val F1: 0.0913
No improvement for 1 epochs. Best F1: 0.2412

Epoch 146/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 461.35it/s]


Classifier gradient norms: mean=0.611308, max=1.148841
Positive prediction rate: 0.2963
Train Loss: 0.5220, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5766
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2521
Val Accuracy (Jaccard): 0.0457
Val Precision: 0.0476, Val Recall: 0.2521, Val F1: 0.0785
No improvement for 2 epochs. Best F1: 0.2412

Epoch 147/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 463.02it/s]


Classifier gradient norms: mean=0.592629, max=1.114937
Positive prediction rate: 0.2436
Train Loss: 0.4984, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5512
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.1702
Val Accuracy (Jaccard): 0.0482
Val Precision: 0.0514, Val Recall: 0.1702, Val F1: 0.0762
No improvement for 3 epochs. Best F1: 0.2412

Epoch 148/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 418.29it/s]


Classifier gradient norms: mean=0.841520, max=1.586483
Positive prediction rate: 0.5941
Train Loss: 0.7220, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8150
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3356
Val Accuracy (Jaccard): 0.0330
Val Precision: 0.0338, Val Recall: 0.3356, Val F1: 0.0603
No improvement for 1 epochs. Best F1: 0.2412

Epoch 149/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 428.64it/s]


Classifier gradient norms: mean=0.804264, max=1.514976
Positive prediction rate: 0.5620
Train Loss: 0.6909, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7797
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3356
Val Accuracy (Jaccard): 0.0356
Val Precision: 0.0364, Val Recall: 0.3356, Val F1: 0.0645
No improvement for 2 epochs. Best F1: 0.2412

Epoch 150/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 453.09it/s]


Classifier gradient norms: mean=0.751491, max=1.412689
Positive prediction rate: 0.4990
Train Loss: 0.6607, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7463
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2946
Val Accuracy (Jaccard): 0.0416
Val Precision: 0.0431, Val Recall: 0.2946, Val F1: 0.0734
No improvement for 3 epochs. Best F1: 0.2412

Epoch 151/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 462.87it/s]


Classifier gradient norms: mean=0.738900, max=1.390059
Positive prediction rate: 0.4915
Train Loss: 0.6301, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6998
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5610
Val Accuracy (Jaccard): 0.0685
Val Precision: 0.0702, Val Recall: 0.5610, Val F1: 0.1229
No improvement for 1 epochs. Best F1: 0.2412

Epoch 152/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 467.16it/s]


Classifier gradient norms: mean=0.661682, max=1.239887
Positive prediction rate: 0.4510
Train Loss: 0.5995, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6682
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4567
Val Accuracy (Jaccard): 0.0652
Val Precision: 0.0674, Val Recall: 0.4567, Val F1: 0.1151
No improvement for 2 epochs. Best F1: 0.2412

Epoch 153/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 456.60it/s]


Classifier gradient norms: mean=0.660512, max=1.242454
Positive prediction rate: 0.3914
Train Loss: 0.5737, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6384
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3841
Val Accuracy (Jaccard): 0.0684
Val Precision: 0.0712, Val Recall: 0.3841, Val F1: 0.1173
No improvement for 3 epochs. Best F1: 0.2412

Epoch 154/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 445.19it/s]


Classifier gradient norms: mean=0.806711, max=1.522426
Positive prediction rate: 0.5107
Train Loss: 0.6459, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7146
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4182
Val Accuracy (Jaccard): 0.0488
Val Precision: 0.0498, Val Recall: 0.4182, Val F1: 0.0871
No improvement for 1 epochs. Best F1: 0.2412

Epoch 155/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 459.17it/s]


Classifier gradient norms: mean=0.732657, max=1.380195
Positive prediction rate: 0.4964
Train Loss: 0.6177, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6835
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4139
Val Accuracy (Jaccard): 0.0520
Val Precision: 0.0533, Val Recall: 0.4139, Val F1: 0.0924
No improvement for 2 epochs. Best F1: 0.2412

Epoch 156/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 424.79it/s]


Classifier gradient norms: mean=0.705895, max=1.328429
Positive prediction rate: 0.4429
Train Loss: 0.5910, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6542
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3670
Val Accuracy (Jaccard): 0.0638
Val Precision: 0.0660, Val Recall: 0.3670, Val F1: 0.1092
No improvement for 3 epochs. Best F1: 0.2412

Epoch 157/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 450.59it/s]


Classifier gradient norms: mean=0.657343, max=1.232692
Positive prediction rate: 0.4370
Train Loss: 0.5854, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6548
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3445
Val Accuracy (Jaccard): 0.0533
Val Precision: 0.0556, Val Recall: 0.3445, Val F1: 0.0934
No improvement for 1 epochs. Best F1: 0.2412

Epoch 158/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 449.46it/s]


Classifier gradient norms: mean=0.661966, max=1.245583
Positive prediction rate: 0.3765
Train Loss: 0.5605, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6266
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2061
Val Accuracy (Jaccard): 0.0385
Val Precision: 0.0407, Val Recall: 0.2061, Val F1: 0.0664
No improvement for 2 epochs. Best F1: 0.2412

Epoch 159/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 470.89it/s]


Classifier gradient norms: mean=0.628175, max=1.180478
Positive prediction rate: 0.3254
Train Loss: 0.5369, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6000
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2018
Val Accuracy (Jaccard): 0.0380
Val Precision: 0.0402, Val Recall: 0.2018, Val F1: 0.0655
No improvement for 3 epochs. Best F1: 0.2412

Epoch 160/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 449.54it/s]


Classifier gradient norms: mean=0.792636, max=1.490214
Positive prediction rate: 0.6170
Train Loss: 0.7044, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7967
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5427
Val Accuracy (Jaccard): 0.0574
Val Precision: 0.0587, Val Recall: 0.5427, Val F1: 0.1043
No improvement for 1 epochs. Best F1: 0.2412

Epoch 161/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 470.76it/s]


Classifier gradient norms: mean=0.821814, max=1.549747
Positive prediction rate: 0.5620
Train Loss: 0.6746, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7625
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5254
Val Accuracy (Jaccard): 0.0586
Val Precision: 0.0600, Val Recall: 0.5254, Val F1: 0.1058
No improvement for 2 epochs. Best F1: 0.2412

Epoch 162/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 449.97it/s]


Classifier gradient norms: mean=0.755467, max=1.421905
Positive prediction rate: 0.5335
Train Loss: 0.6461, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7301
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5254
Val Accuracy (Jaccard): 0.0586
Val Precision: 0.0600, Val Recall: 0.5254, Val F1: 0.1058
No improvement for 3 epochs. Best F1: 0.2412

Epoch 163/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 467.12it/s]


Classifier gradient norms: mean=0.803278, max=1.509117
Positive prediction rate: 0.5416
Train Loss: 0.7578, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8622
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3009
Val Accuracy (Jaccard): 0.0299
Val Precision: 0.0306, Val Recall: 0.3009, Val F1: 0.0548
No improvement for 1 epochs. Best F1: 0.2412

Epoch 164/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 360.73it/s]


Classifier gradient norms: mean=0.821969, max=1.546290
Positive prediction rate: 0.5323
Train Loss: 0.7283, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.8275
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2923
Val Accuracy (Jaccard): 0.0297
Val Precision: 0.0304, Val Recall: 0.2923, Val F1: 0.0544
No improvement for 2 epochs. Best F1: 0.2412

Epoch 165/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 390.56it/s]


Classifier gradient norms: mean=0.779473, max=1.466130
Positive prediction rate: 0.4876
Train Loss: 0.6976, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7946
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.2157
Val Accuracy (Jaccard): 0.0289
Val Precision: 0.0296, Val Recall: 0.2157, Val F1: 0.0515
No improvement for 3 epochs. Best F1: 0.2412

Epoch 166/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 436.85it/s]


Classifier gradient norms: mean=0.651648, max=1.225994
Positive prediction rate: 0.3341
Train Loss: 0.5718, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6032
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6332
Val Accuracy (Jaccard): 0.1392
Val Precision: 0.1430, Val Recall: 0.6332, Val F1: 0.2264
No improvement for 1 epochs. Best F1: 0.2412

Epoch 167/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 448.76it/s]


Classifier gradient norms: mean=0.629475, max=1.183099
Positive prediction rate: 0.2788
Train Loss: 0.5468, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5756
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6318
Val Accuracy (Jaccard): 0.1529
Val Precision: 0.1578, Val Recall: 0.6318, Val F1: 0.2457
Saved new best model to ./results/run_20250428_083800_all_text_0.5_augmented/best_model_all_text_0.5_augmented.pt

Epoch 168/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 432.85it/s]


✓ Gradients are flowing to classifier
Classifier gradient norms: mean=0.632339, max=1.188222
Positive prediction rate: 0.2573
Train Loss: 0.5238, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5499
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6318
Val Accuracy (Jaccard): 0.1529
Val Precision: 0.1578, Val Recall: 0.6318, Val F1: 0.2457
No improvement for 1 epochs. Best F1: 0.2457

Epoch 169/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 471.10it/s]


Classifier gradient norms: mean=0.599218, max=1.127002
Positive prediction rate: 0.2460
Train Loss: 0.5018, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.5259
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6318
Val Accuracy (Jaccard): 0.1529
Val Precision: 0.1578, Val Recall: 0.6318, Val F1: 0.2457
No improvement for 2 epochs. Best F1: 0.2457

Epoch 170/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 461.25it/s]


Classifier gradient norms: mean=0.577753, max=1.087895
Positive prediction rate: 0.2402
Train Loss: 0.4802, Train Accuracy (Exact Match): 0.0010
Val Loss: 0.5034
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.6318
Val Accuracy (Jaccard): 0.1529
Val Precision: 0.1578, Val Recall: 0.6318, Val F1: 0.2457
No improvement for 3 epochs. Best F1: 0.2457

Epoch 171/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 420.79it/s]


Classifier gradient norms: mean=0.758417, max=1.425263
Positive prediction rate: 0.5848
Train Loss: 0.6721, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7697
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3287
Val Accuracy (Jaccard): 0.0342
Val Precision: 0.0352, Val Recall: 0.3287, Val F1: 0.0624
No improvement for 1 epochs. Best F1: 0.2457

Epoch 172/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 456.35it/s]


Classifier gradient norms: mean=0.754521, max=1.420707
Positive prediction rate: 0.5622
Train Loss: 0.6429, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7365
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3287
Val Accuracy (Jaccard): 0.0342
Val Precision: 0.0352, Val Recall: 0.3287, Val F1: 0.0624
No improvement for 2 epochs. Best F1: 0.2457

Epoch 173/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 451.72it/s]


Classifier gradient norms: mean=0.717909, max=1.351330
Positive prediction rate: 0.5246
Train Loss: 0.6140, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7051
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3153
Val Accuracy (Jaccard): 0.0383
Val Precision: 0.0397, Val Recall: 0.3153, Val F1: 0.0692
No improvement for 3 epochs. Best F1: 0.2457

Epoch 174/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 447.26it/s]


Classifier gradient norms: mean=0.742534, max=1.394700
Positive prediction rate: 0.4152
Train Loss: 0.6717, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7280
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4877
Val Accuracy (Jaccard): 0.0686
Val Precision: 0.0708, Val Recall: 0.4877, Val F1: 0.1218
No improvement for 1 epochs. Best F1: 0.2457

Epoch 175/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 462.77it/s]


Classifier gradient norms: mean=0.775894, max=1.463846
Positive prediction rate: 0.3824
Train Loss: 0.6456, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6967
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4877
Val Accuracy (Jaccard): 0.0767
Val Precision: 0.0794, Val Recall: 0.4877, Val F1: 0.1344
No improvement for 2 epochs. Best F1: 0.2457

Epoch 176/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 466.55it/s]


Classifier gradient norms: mean=0.739497, max=1.393292
Positive prediction rate: 0.3633
Train Loss: 0.6189, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6672
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.4877
Val Accuracy (Jaccard): 0.0767
Val Precision: 0.0794, Val Recall: 0.4877, Val F1: 0.1344
No improvement for 3 epochs. Best F1: 0.2457

Epoch 177/200
Model seems stuck. Reinitializing classifier layer...
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 449.46it/s]


Classifier gradient norms: mean=0.770034, max=1.449005
Positive prediction rate: 0.5497
Train Loss: 0.6640, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.7280
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.5004
Val Accuracy (Jaccard): 0.0597
Val Precision: 0.0610, Val Recall: 0.5004, Val F1: 0.1069
No improvement for 1 epochs. Best F1: 0.2457

Epoch 178/200
Using dynamic weighting: positive=2.00x, negative=0.80x


Training (embeddings):   0%|          | 0/60 [00:00<?, ?it/s]

✓ Gradients are flowing to classifier


Training (embeddings): 100%|██████████| 60/60 [00:00<00:00, 436.20it/s]
  model.module.load_state_dict(torch.load(best_model_path))


Classifier gradient norms: mean=0.711888, max=1.337139
Positive prediction rate: 0.4580
Early stopping triggered
Train Loss: 0.6341, Train Accuracy (Exact Match): 0.0000
Val Loss: 0.6956
Val Accuracy (Exact Match): 0.0000
Val Accuracy (Partial Match): 0.3713
Val Accuracy (Jaccard): 0.0514
Val Precision: 0.0528, Val Recall: 0.3713, Val F1: 0.0911
No improvement for 2 epochs. Best F1: 0.2457
Early stopping triggered. Terminating training.

=== FINAL EVALUATION ===
Final evaluation with best model:
Final Loss: 0.5756
Final Exact Match Accuracy: 0.0000
Final Partial Match Accuracy: 0.6318
Final Jaccard Similarity: 0.1529
Final Precision: 0.1578
Final Recall: 0.6318
Final F1 Score: 0.2457

Training completed! Results saved to ./results/run_20250428_083800_all_text_0.5_augmented


Your problem with SMOTE for text classification is common. SMOTE works well for numerical features but struggles with high-dimensional text embeddings. Here's why:

The embedding space quality is critical - DeBERTa embeddings represent semantic relationships that SMOTE likely disrupts
Text data synthetic samples may not maintain linguistic coherence
Class imbalance in NLP often requires different approaches than traditional oversampling