In [1]:
# --- Cell 1: Setup & Configuration (Modified for XLM-RoBERTa) ---

# 1. Install necessary libraries (Ensure 'transformers', 'datasets', etc. are installed)
# !pip install -q transformers datasets accelerate torch evaluate scikit-learn pandas # Keep this if running fresh

# 2. Imports (Keep imports as they were)
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig # Keep AutoConfig if needed later for inference loading
)
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import os
# from peft import LoraConfig, get_peft_model, TaskType, PeftModel # No longer needed for PEFT
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, hamming_loss
from transformers import EvalPrediction
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# 3. Configuration (MODIFIED)
MODEL_ID = "xlm-roberta-base" # <-- CHANGE Model ID
# Or use "xlm-roberta-large" for potentially better performance but more memory usage

TEST_CSV = "/kaggle/input/unlp-2025-shared-task-classification-techniques/test.csv"
OUTPUT_DIR = "xlm-roberta-multi-label-finetuned" # <-- CHANGE Output directory
# ADAPTER_SAVE_PATH = "..." # Not needed without LoRA
SUBMISSION_FILE = "submission_xlmr.csv" # <-- CHANGE Submission filename

# technique_columns definition remains the same
technique_columns = [
    'straw_man', 'appeal_to_fear', 'fud', 'bandwagon', 'whataboutism',
    'loaded_language', 'glittering_generalities', 'euphoria',
    'cherry_picking', 'cliche'
]
num_labels = len(technique_columns)

# Training Hyperparameters (MODIFIED for standard fine-tuning)
LEARNING_RATE = 2e-5 # Typical range for BERT/RoBERTa: 2e-5 to 5e-5
TRAIN_BATCH_SIZE = 8 # Can likely increase significantly vs Llama 3
EVAL_BATCH_SIZE = 32 # Can likely increase
NUM_EPOCHS = 15 # Might need a few more epochs than LoRA
WEIGHT_DECAY = 0.01
MAX_SEQ_LENGTH = 512 # Standard for RoBERTa

# LoRA Configuration (DISABLED)
USE_LORA = False
# Remove LORA_R, LORA_ALPHA, LORA_DROPOUT, LORA_TARGET_MODULES

# Quantization Configuration (DISABLED)
USE_4BIT_QUANT = False
# Remove bnb_config related lines

# 4. Check GPU and Login (Keep this section as is)
print("\nChecking GPU availability...")
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
    total_gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    print(f"Total GPU Memory: {total_gpu_mem:.2f} GB")
    use_gpu = True
else:
    device = torch.device("cpu")
    print("WARNING: GPU not found. Training will be very slow on CPU.")
    # Allow CPU training for smaller models like XLM-R, but warn user.
    use_gpu = False

# Optional Hugging Face Login (Keep as is)
# from huggingface_hub import notebook_login
# print("\nPlease login to Hugging Face Hub (requires a token):")
# notebook_login()

print(f"\nConfiguration:\nModel ID: {MODEL_ID}\nNum Labels: {num_labels}\nUse LoRA: {USE_LORA}\nUse Quantization: {USE_4BIT_QUANT}")
print("\n--- Setup Complete ---")


Checking GPU availability...
GPU is available: Tesla T4
Total GPU Memory: 14.74 GB

Configuration:
Model ID: xlm-roberta-base
Num Labels: 10
Use LoRA: False
Use Quantization: False

--- Setup Complete ---


In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# --- Cell 2: Data Preparation (Revised for 'techniques' column) ---

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import re # Import regular expressions for cleaning

# Ensure configuration variables from Cell 1 are accessible
# Required: TRAIN_CSV, technique_columns, MODEL_ID, MAX_SEQ_LENGTH

print(f"Loading training data")
try:
    df_train_full = pd.read_csv("/kaggle/input/unlp-dataset/train.csv")
    # df_train_s = pd.read_csv("/kaggle/input/unlp-dataset/synthetic_train_dataset.csv")
    # df_train_s['techniques'] = df_train_s['techniques'].apply(
    #     lambda x: [x] if isinstance(x, str) else x
    # )

    # df_train_full = pd.concat([df_train_full,df_train_s])
    
    # Basic text cleaning function
    def clean_text(text):
        text = text.lower() # Convert to lowercase
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove URLs
        text = re.sub(r'\@\w+|\#', '', text) # Remove mentions and hashtags
        text = re.sub(r'[^\w\s\']', '', text) # Remove punctuation except apostrophes
        text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
        return text
    
    # Apply cleaning to the 'content' column
    df_train_full['content'] = df_train_full['content'].apply(clean_text)
    
    print("\nSample of cleaned content:")
    print(df_train_full['content'].head())

    # df_train_full['content'] = df_train_full['content'].fillna('').apply(clean_text)
    df_train_full['content'] = df_train_full['content'].fillna('') # Ensure no NaN content
    # Fill NaN values in 'techniques' with a string representation of an empty list
    df_train_full['techniques'] = df_train_full['techniques'].fillna('[]')
except FileNotFoundError:
    print(f"ERROR: Training file not found at {TRAIN_CSV}")
    raise
except Exception as e:
    print(f"Error loading or processing {TRAIN_CSV}: {e}")
    raise

# Verify required columns 'content' and 'techniques'
if 'content' not in df_train_full.columns: raise ValueError("'content' column missing in train data.")
if 'techniques' not in df_train_full.columns: raise ValueError("'techniques' column missing in train data.")

# --- Define Label Parsing and Encoding Function ---
# technique_columns should be defined in Cell 1
if 'technique_columns' not in globals():
     raise NameError("Variable 'technique_columns' not defined. Ensure Cell 1 was executed.")
technique_to_index = {name: i for i, name in enumerate(technique_columns)}
num_labels = len(technique_columns)
unknown_techniques_found = set() # To track techniques not in our predefined list

# --- Define Label Parsing and Encoding Function (REVISED) ---
# technique_columns is defined in Cell 1
technique_to_index = {name: i for i, name in enumerate(technique_columns)}
num_labels = len(technique_columns)
unknown_techniques_found = set() # To track techniques not in our predefined list

def parse_and_encode_labels(technique_string):
    """
    Parses the string representation from the 'techniques' column
    (e.g., "['tech1' 'tech2']") and returns a multi-hot encoded
    list of labels (float32).
    """
    # Initialize label vector with zeros
    labels = np.zeros(num_labels, dtype=np.float32)

    # 1. Handle empty or invalid entries first
    if not isinstance(technique_string, str) or technique_string.strip() in ('[]', '', 'nan'):
        return labels.tolist() # Return all zeros

    try:
        # --- Revised Parsing Logic ---
        # a. Remove outer brackets and leading/trailing whitespace
        cleaned_str = technique_string.strip("[] ")

        # b. Handle empty string after stripping brackets (e.g., if input was '[]')
        if not cleaned_str:
            return labels.tolist()

        # c. Assume techniques are separated by spaces, potentially within single quotes
        #    Replace the common separator "' '" (quote-space-quote) with a unique delimiter
        #    Using regex substitution for robustness against multiple spaces between items
        delimited_str = re.sub(r"'\s+'", "|", cleaned_str) # Replace ' ' with |

        # d. Remove any remaining single quotes (e.g., around the first/last item or if only one item)
        delimited_str = delimited_str.replace("'", "")

        # e. Split the string by the delimiter
        parsed_techniques = [tech.strip() for tech in delimited_str.split('|') if tech.strip()]

        # --- Encoding Logic (remains the same) ---
        if not parsed_techniques:
             # This might happen if the format was unexpected, e.g. "['']"
             return labels.tolist()

        found_match = False
        for tech_name in parsed_techniques:
            if tech_name in technique_to_index:
                labels[technique_to_index[tech_name]] = 1.0
                found_match = True # Mark that at least one known technique was found
            else:
                # Optionally track or warn about techniques found in data but not expected
                if tech_name not in unknown_techniques_found:
                     print(f"Warning: Technique '{tech_name}' found in data but not in predefined technique_columns. Ignoring.")
                     unknown_techniques_found.add(tech_name)

        # Sanity check: If parsing resulted in something but no matches were found, print a warning
        # This helps catch cases where parsing extracted unexpected strings.
        # if parsed_techniques and not found_match:
        #      print(f"Warning: Parsed techniques {parsed_techniques} from '{technique_string}' but none matched known techniques.")

        return labels.tolist()

    except Exception as e:
        print(f"Error parsing technique string: '{technique_string}'. Error: {e}. Returning all zeros.")
        # Return all zeros in case of unexpected error during parsing
        return np.zeros(num_labels, dtype=np.float32).tolist()

# --- Apply the function to create the 'labels' column ---
print("\nParsing 'techniques' column and creating multi-hot encoded 'labels'...")
df_train_full['labels'] = df_train_full['techniques'].apply(parse_and_encode_labels)

# --- Verification ---
print("Example of created labels (first 5 rows):")
print(df_train_full[['content','techniques', 'labels']].head())

# Check if all label lists have the correct length
label_lengths = df_train_full['labels'].apply(len)
incorrect_length_count = (label_lengths != num_labels).sum()
if incorrect_length_count > 0:
    print(f"\nERROR: Found {incorrect_length_count} rows where the generated 'labels' list does not have the expected length ({num_labels})!")
    # Optionally print problematic rows:
    # print("Problematic rows:")
    # print(df_train_full[label_lengths != num_labels][['techniques', 'labels']])
    raise ValueError("Label length mismatch detected. Please check the parsing function or input data.")
else:
    print(f"\nVerified: All {len(df_train_full)} rows have a 'labels' list of length {num_labels}.")


# Select final columns needed for the dataset ('content' and the new 'labels')
df_train_final = df_train_full[['content', 'labels']]

print(df_train_final['labels'])

# --- Split data into Training and Validation sets ---
print("\nSplitting data into train/validation sets (90/10 split)...")
df_train, df_val = train_test_split(df_train_final, test_size=0.1, random_state=42) # Adjust test_size if needed

# --- Calculate Class Weights for BCEWithLogitsLoss ---
print("\nCalculating positive class weights for loss function...")

Loading training data

Sample of cleaned content:
0    новий огляд мапи deepstate від російського вій...
1    недавно 95 квартал жёстко поглумился над русск...
2    тим часом йде евакуація бєлгородського автовок...
3    в україні найближчим часом мають намір посилит...
4    расчёты 122мм сау 2с1 гвоздика 132й бригады 1г...
Name: content, dtype: object

Parsing 'techniques' column and creating multi-hot encoded 'labels'...
Example of created labels (first 5 rows):
                                             content  \
0  новий огляд мапи deepstate від російського вій...   
1  недавно 95 квартал жёстко поглумился над русск...   
2  тим часом йде евакуація бєлгородського автовок...   
3  в україні найближчим часом мають намір посилит...   
4  расчёты 122мм сау 2с1 гвоздика 132й бригады 1г...   

                             techniques  \
0        ['euphoria' 'loaded_language']   
1  ['loaded_language' 'cherry_picking']   
2        ['loaded_language' 'euphoria']   
3                      

In [5]:
# Use the training portion of the labels DataFrame (df_train contains 'labels' column)
train_labels_np = np.array(df_train['labels'].tolist()) # N_train_samples x num_labels

pos_weights_list = []
epsilon = 1e-6 # To avoid division by zero if a class has no positive examples

for i in range(num_labels): # num_labels defined in Cell 1
    num_positive = train_labels_np[:, i].sum()
    num_negative = len(train_labels_np) - num_positive

    # Formula: pos_weight = num_negative / num_positive
    # Add epsilon for numerical stability if num_positive is 0
    pos_weight = num_negative / (num_positive + epsilon)
    pos_weights_list.append(pos_weight)
    print(f"  Technique '{technique_columns[i]}': Positives={int(num_positive)}, Negatives={int(num_negative)}, PosWeight={pos_weight:.2f}")

# Convert weights to a PyTorch tensor (needs to be accessible in Cell 4)
# Making it global for simplicity, or pass via a shared context if preferred
global pos_weights_tensor
pos_weights_tensor = torch.tensor(pos_weights_list, dtype=torch.float32)
print("\nPositive class weights tensor created:")
print(pos_weights_tensor)

# --- (Rest of Cell 2: Convert to DatasetDict, Tokenize, etc.) ---
# ... (Keep the subsequent code in Cell 2) ...

  Technique 'straw_man': Positives=128, Negatives=3311, PosWeight=25.87
  Technique 'appeal_to_fear': Positives=270, Negatives=3169, PosWeight=11.74
  Technique 'fud': Positives=348, Negatives=3091, PosWeight=8.88
  Technique 'bandwagon': Positives=138, Negatives=3301, PosWeight=23.92
  Technique 'whataboutism': Positives=146, Negatives=3293, PosWeight=22.55
  Technique 'loaded_language': Positives=1788, Negatives=1651, PosWeight=0.92
  Technique 'glittering_generalities': Positives=444, Negatives=2995, PosWeight=6.75
  Technique 'euphoria': Positives=418, Negatives=3021, PosWeight=7.23
  Technique 'cherry_picking': Positives=463, Negatives=2976, PosWeight=6.43
  Technique 'cliche': Positives=418, Negatives=3021, PosWeight=7.23

Positive class weights tensor created:
tensor([25.8672, 11.7370,  8.8822, 23.9203, 22.5548,  0.9234,  6.7455,  7.2273,
         6.4276,  7.2273])


In [6]:
# --- Convert pandas DataFrames to Hugging Face Datasets ---
print("Converting to Hugging Face Dataset format...")
raw_dataset = DatasetDict({
    'train': Dataset.from_pandas(df_train, preserve_index=False),
    'validation': Dataset.from_pandas(df_val, preserve_index=False)
})

print("\nRaw Datasets:")
print(raw_dataset)

# --- Load Tokenizer ---
# Ensure tokenizer is available from Cell 1 or load it here
# --- Load Tokenizer (Ensure it matches MODEL_ID from Cell 1) ---
print(f"\nLoading tokenizer for {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# XLM-RoBERTa usually has padding defined, but check just in case
if tokenizer.pad_token is None:
    print("Warning: Tokenizer does not have a pad token. Adding EOS token as pad token.")
    # Some models might need specific pad tokens, but EOS often works.
    # Check XLM-R documentation if issues arise.
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right' # Standard for classification

# --- Tokenization Function (No changes needed in the function itself) ---
def tokenize_and_format(examples):
    # Tokenize the text content
    tokenized_inputs = tokenizer(
        examples['content'],
        truncation=True,       # Truncate sequences longer than max_seq_length
        padding='max_length',  # Pad sequences to max_seq_length
        max_length=MAX_SEQ_LENGTH,
        # return_tensors='pt' # Trainer handles tensor conversion
    )
    # Ensure labels are included
    tokenized_inputs['labels'] = examples['labels']
    return tokenized_inputs

# --- Apply tokenization (No changes needed) ---
print("\nTokenizing datasets...")
tokenized_dataset = raw_dataset.map(
    tokenize_and_format,
    batched=True,
    remove_columns=['content']
)

# --- Set format (No changes needed) ---
tokenized_dataset.set_format("torch")

print("\nTokenized Datasets:")
print(tokenized_dataset)
print(f"Columns in tokenized train dataset: {tokenized_dataset['train'].column_names}")
print(f"\nExample tokenized input:\nText (IDs): {tokenized_dataset['train'][0]['input_ids'][:20]}...\nAttention Mask: {tokenized_dataset['train'][0]['attention_mask'][:20]}...\nLabels: {tokenized_dataset['train'][0]['labels']}")

print("\n--- Data Preparation Complete ---")

Converting to Hugging Face Dataset format...

Raw Datasets:
DatasetDict({
    train: Dataset({
        features: ['content', 'labels'],
        num_rows: 3439
    })
    validation: Dataset({
        features: ['content', 'labels'],
        num_rows: 383
    })
})

Loading tokenizer for xlm-roberta-base...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]


Tokenizing datasets...


Map:   0%|          | 0/3439 [00:00<?, ? examples/s]

Map:   0%|          | 0/383 [00:00<?, ? examples/s]


Tokenized Datasets:
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 3439
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 383
    })
})
Columns in tokenized train dataset: ['labels', 'input_ids', 'attention_mask']

Example tokenized input:
Text (IDs): tensor([     0, 220007, 237041,    105,  52652,    260,  38565,   4335,  72681,
         52503,     29,    805,   1045,   6173,   7310,  52432, 167254,    518,
        119746,    105])...
Attention Mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])...
Labels: tensor([0., 0., 0., 0., 0., 0., 1., 0., 0., 0.])

--- Data Preparation Complete ---


In [7]:
# --- Cell 3: Model Configuration & Loading (Simplified for XLM-RoBERTa) ---

print(f"Loading model: {MODEL_ID}")

# 1. Load Base Model (No Quantization, No LoRA)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=num_labels,
    problem_type="multi_label_classification", # Still multi-label
    device_map="auto", # Automatically use GPU if available
    # trust_remote_code=True # Usually not needed for standard models like XLM-R
    ignore_mismatched_sizes=True # Add this if loading base model weights into classification head causes size mismatch warnings (common and usually okay)
)

# 2. Handle Padding Token ID (Good practice, though usually set for XLM-R)
if tokenizer.pad_token_id is not None and model.config.pad_token_id is None:
     print(f"Setting model's pad_token_id to tokenizer's: {tokenizer.pad_token_id}")
     model.config.pad_token_id = tokenizer.pad_token_id
elif tokenizer.pad_token_id is None and model.config.pad_token_id is None:
     print("Warning: Both tokenizer and model lack a pad_token_id.")
     # You might need to add one to the tokenizer and resize model embeddings if padding is necessary


print(f"\nModel loaded: {MODEL_ID}")
# Print model structure summary (optional)
# print(model)
# Verify model is on the correct device
print(f"Model is on device: {model.device}")

print("\n--- Model Configuration & Loading Complete ---")

Loading model: xlm-roberta-base


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model loaded: xlm-roberta-base
Model is on device: cuda:0

--- Model Configuration & Loading Complete ---


In [21]:
!pip install evaluate



In [10]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [14]:
# --- Cell 4: Training Configuration & Execution (Adjusted for XLM-RoBERTa) ---
import evaluate
print("Configuring Training...")


from torch.nn import BCEWithLogitsLoss # Import the loss function

# --- Cell 4: Training Configuration & Execution (with Class Weights) ---

# Imports for this cell
import torch
from torch.nn import BCEWithLogitsLoss
from transformers import Trainer, TrainingArguments, EvalPrediction
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, hamming_loss
import numpy as np
import evaluate # Use HF evaluate library

# Check if pos_weights_tensor exists from Cell 2
if 'pos_weights_tensor' not in globals():
     raise NameError("Variable 'pos_weights_tensor' not found. Ensure Cell 2 defining it was executed.")

# 1. Define Metrics Computation Function
# Using a threshold of 0.5 for F1/Hamming calculation during evaluation
def compute_metrics(p: EvalPrediction):
    # Predictions are logits, labels are already multi-hot
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = p.label_ids.astype(np.float32) # Ensure labels are float32

    if logits is None:
         print("Warning: compute_metrics received None for predictions.")
         return {} # Return empty dict if predictions are None

    # Check for NaN in logits before sigmoid
    if np.isnan(logits).any():
        print("ERROR: NaNs detected in model logits within compute_metrics! Returning zero metrics.")
        # Return default/zero metrics to avoid crashing Trainer evaluation loop
        return {'f1_macro': 0.0, 'f1_micro': 0.0, 'hamming_loss': 1.0}

    # Apply sigmoid activation to get probabilities
    sigmoid_preds = 1 / (1 + np.exp(-logits)) # Sigmoid function

    # Apply threshold (0.5) to get binary predictions
    binary_preds = (sigmoid_preds > 0.5).astype(int)

    # Calculate metrics using sklearn
    try:
        f1_macro = f1_score(y_true=labels, y_pred=binary_preds, average='macro', zero_division=0)
        f1_micro = f1_score(y_true=labels, y_pred=binary_preds, average='micro', zero_division=0)
        # f1_weighted = f1_score(y_true=labels, y_pred=binary_preds, average='weighted', zero_division=0)
        # f1_samples = f1_score(y_true=labels, y_pred=binary_preds, average='samples', zero_division=0)
        hamming = hamming_loss(y_true=labels, y_pred=binary_preds)

        return {
            'f1_macro': f1_macro,
            'f1_micro': f1_micro,
            # 'f1_weighted': f1_weighted,
            # 'f1_samples': f1_samples,
            'hamming_loss': hamming
        }
    except Exception as e:
        print(f"Error calculating metrics: {e}. Returning zero metrics.")
        return {'f1_macro': 0.0, 'f1_micro': 0.0, 'hamming_loss': 1.0}

# --- Define Custom Trainer to Apply Class Weights (Corrected for DataParallel) ---
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Overrides the default loss computation to use BCEWithLogitsLoss with pos_weight.
        Handles models potentially wrapped by DataParallel/DistributedDataParallel.
        Accepts **kwargs to handle unexpected arguments passed by Trainer.
        """
        if 'pos_weights_tensor' not in globals():
             raise NameError("pos_weights_tensor not found in global scope for CustomTrainer.")

        labels = inputs.pop("labels").float()
        outputs = model(**inputs) # model here might be the DataParallel wrapper
        logits = outputs.get("logits")

        # --- Get device correctly, handling DataParallel ---
        if hasattr(model, 'module'):
            model_device = model.module.device # Access device from underlying module
        else:
            model_device = model.device # Access device directly
        # -------------------------------------------------

        weights = pos_weights_tensor.to(model_device) # Move weights to the correct device
        loss_fct = BCEWithLogitsLoss(pos_weight=weights)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss
# --- End of Custom Trainer Definition ---

# 2. Define Training Arguments (Adjust as needed, e.g., add save_total_limit)
print("Setting Training Arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    logging_dir=f'{OUTPUT_DIR}/logs',
    logging_strategy="steps",
    logging_steps=max(10, int(len(tokenized_dataset["train"]) / (TRAIN_BATCH_SIZE * torch.cuda.device_count() if torch.cuda.is_available() else 1 ) / 10)),
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro", # Focus on Macro F1
    greater_is_better=True,
    save_total_limit=2, # Keep only the best and the latest checkpoint (suggestion 2)
    fp16=use_gpu, # Enable if using GPU
    report_to="none",
)

# 3. Initialize Trainer --> Use CustomTrainer <--
print("Initializing CustomTrainer...")
trainer = CustomTrainer( # <-- Use the custom class
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, # Your existing function
)

# 4. Start Training
print("\n--- Starting Training with Custom Loss ---")
torch.cuda.empty_cache()
train_result = trainer.train()

# 5. Save Training Metrics & Final Model (Keep as is)
# ... (Keep saving logic) ...

print("\n--- Training Complete ---")

Configuring Training...
Setting Training Arguments...
Initializing CustomTrainer...

--- Starting Training with Custom Loss ---


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
1,1.1365,1.106488,0.245509,0.261957,0.354569,6.8935,55.559,0.87
2,1.0505,0.967286,0.309029,0.35132,0.295039,6.9219,55.332,0.867
3,0.9152,0.973558,0.359606,0.410635,0.208355,6.9684,54.963,0.861
4,0.8486,0.921028,0.357716,0.426319,0.218538,6.9289,55.275,0.866
5,0.7222,0.952976,0.362068,0.435495,0.215927,6.9125,55.407,0.868
6,0.6644,1.054844,0.359668,0.44256,0.188773,6.9184,55.359,0.867
7,0.633,1.103434,0.379631,0.484026,0.168668,6.909,55.435,0.868
8,0.6099,1.189787,0.373412,0.460999,0.160574,6.935,55.227,0.865
9,0.5013,1.349015,0.334315,0.440617,0.161097,6.9236,55.318,0.867
10,0.486,1.349858,0.352266,0.477048,0.151697,6.9568,55.054,0.862


KeyboardInterrupt: 

In [23]:
# import shutil

# # Список чекпоінтів, які хочеш видалити
# checkpoints_to_delete = [
#     "checkpoint-356",
#     "checkpoint-712",
#     "checkpoint-1068",
#     "checkpoint-1424",
#     "checkpoint-1780"
# ]

# for checkpoint in checkpoints_to_delete:
#     path = f"/kaggle/working/xlm-roberta-multi-label-finetuned/{checkpoint}"
#     shutil.rmtree(path, ignore_errors=True)
#     print(f"Deleted: {checkpoint}")

Deleted: checkpoint-356
Deleted: checkpoint-712
Deleted: checkpoint-1068
Deleted: checkpoint-1424
Deleted: checkpoint-1780


In [16]:
# --- Cell 5: Inference & Submission (Adjusted for Loading from Checkpoint) ---
import math
from transformers import AutoConfig # Make sure AutoConfig is imported

print("--- Starting Inference and Submission File Generation ---")

# --- Specify the path to the BEST checkpoint directory ---
# Replace 'checkpoint-XXXX' with the actual directory name identified as the best
# Based on your image and logs (Epoch 5 best), let's assume it might be checkpoint-2365
# VERIFY THIS from your full logs or trainer_state.json if possible!
best_checkpoint_path = os.path.join(OUTPUT_DIR, "checkpoint-2365") # Or "checkpoint-1505", etc.
print(f"Attempting to load fine-tuned model from: {best_checkpoint_path}")
#------------------------------------------------------------

# 1. Load Fine-tuned Model for Inference from the Checkpoint
try:
    inference_model = AutoModelForSequenceClassification.from_pretrained(
        best_checkpoint_path, # <-- Load from the specific checkpoint path
        device_map="auto" # Load onto GPU if available
    )
    # Load the tokenizer associated with the fine-tuned model/checkpoint
    tokenizer = AutoTokenizer.from_pretrained(best_checkpoint_path)

    # Ensure model is on the correct device and in evaluation mode
    inference_model.to(device) # device should be defined from Cell 1 ('cuda' or 'cpu')
    inference_model.eval()
    print("Model and tokenizer loaded successfully from checkpoint and set to evaluation mode.")

except OSError as e:
     print(f"ERROR loading model from {best_checkpoint_path}: {e}")
     print("Please ensure the checkpoint path is correct and contains the model files.")
     print(f"Files in {OUTPUT_DIR}: {os.listdir(OUTPUT_DIR)}")
     if os.path.exists(best_checkpoint_path):
         print(f"Files in {best_checkpoint_path}: {os.listdir(best_checkpoint_path)}")
     raise SystemExit("Stopping due to model loading error.")


# 2. Load Test Data (Keep as is)
print(f"\nLoading test data from: {TEST_CSV}")
# ... (rest of test data loading code) ...
try:
    df_test = pd.read_csv(TEST_CSV)
    if 'id' not in df_test.columns: raise ValueError("'id' column missing in test data.")
    if 'content' not in df_test.columns: raise ValueError("'content' column missing in test data.")
    df_test['content'] = df_test['content'].fillna('')
    # Apply the same basic cleaning if done during training
    # df_test['content'] = df_test['content'].apply(clean_text)
except FileNotFoundError:
    print(f"ERROR: Test file not found at {TEST_CSV}")
    raise
except Exception as e:
    print(f"Error loading or processing {TEST_CSV}: {e}")
    raise
print(f"Test data loaded. Shape: {df_test.shape}")


# 3. Predict on Test Data (Using Batches - Keep as is)
print("\nGenerating predictions on test data...")
# ... (rest of prediction loop code using 'inference_model' and 'tokenizer') ...
results = []
test_texts = df_test['content'].tolist()
inference_batch_size = EVAL_BATCH_SIZE * 2 # Use eval batch size or larger defined in Cell 1

num_batches = math.ceil(len(test_texts) / inference_batch_size)

with torch.no_grad():
    for i in range(0, len(test_texts), inference_batch_size):
        batch_texts = test_texts[i : i + inference_batch_size]

        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_SEQ_LENGTH # MAX_SEQ_LENGTH defined in Cell 1
        ).to(device) # Move input tensors to the correct device

        outputs = inference_model(**inputs)
        logits = outputs.logits
        probabilities = torch.sigmoid(logits)
        predictions = (probabilities > 0.5).cpu().numpy().astype(int) # Thresholding
        results.extend(predictions.tolist())
        print(f"  Processed batch {i // inference_batch_size + 1} / {num_batches}")

print("\nPredictions generated.")


# 4. Create Submission File (Keep as is, using SUBMISSION_FILE defined in Cell 1)
print("Creating submission file...")
# ... (rest of submission file creation code) ...
df_predictions = pd.DataFrame(results, columns=technique_columns) # technique_columns defined in Cell 1
df_submission = pd.concat([df_test[['id']], df_predictions], axis=1)
expected_submission_columns = ['id'] + list(technique_columns)
if list(df_submission.columns) != expected_submission_columns:
     print("Warning: Column order mismatch in submission df. Reordering...")
     df_submission = df_submission[expected_submission_columns]

print(f"\nSubmission DataFrame preview (first 5 rows):")
print(df_submission.head())

# Save to CSV
try:
    df_submission.to_csv(SUBMISSION_FILE, index=False) # Use SUBMISSION_FILE variable
    print(f"\nSubmission file created successfully at: {SUBMISSION_FILE}")
    # Optional: Download link for Colab
    # from google.colab import files
    # files.download(SUBMISSION_FILE)
except Exception as e:
    print(f"\nError saving submission file: {e}")

print("\n--- Inference and Submission Complete ---")

--- Starting Inference and Submission File Generation ---
Attempting to load fine-tuned model from: xlm-roberta-multi-label-finetuned/checkpoint-2365
Model and tokenizer loaded successfully from checkpoint and set to evaluation mode.

Loading test data from: /kaggle/input/unlp-2025-shared-task-classification-techniques/test.csv
Test data loaded. Shape: (5735, 2)

Generating predictions on test data...
  Processed batch 1 / 90
  Processed batch 2 / 90
  Processed batch 3 / 90
  Processed batch 4 / 90
  Processed batch 5 / 90
  Processed batch 6 / 90
  Processed batch 7 / 90
  Processed batch 8 / 90
  Processed batch 9 / 90
  Processed batch 10 / 90
  Processed batch 11 / 90
  Processed batch 12 / 90
  Processed batch 13 / 90
  Processed batch 14 / 90
  Processed batch 15 / 90
  Processed batch 16 / 90
  Processed batch 17 / 90
  Processed batch 18 / 90
  Processed batch 19 / 90
  Processed batch 20 / 90
  Processed batch 21 / 90
  Processed batch 22 / 90
  Processed batch 23 / 90
  Proc

In [17]:
import pandas as pd
df = pd.read_csv(SUBMISSION_FILE)

df

Unnamed: 0,id,straw_man,appeal_to_fear,fud,bandwagon,whataboutism,loaded_language,glittering_generalities,euphoria,cherry_picking,cliche
0,521cd2e8-dd9f-42c4-98ba-c0c8890ff1ba,0,1,1,0,0,1,0,1,1,1
1,9b2a61e4-d14e-4ff7-b304-e73d720319bf,0,0,0,0,0,0,0,0,0,0
2,f0f1c236-80a8-4d25-b30c-a420a39be632,0,0,0,0,0,0,0,0,0,0
3,31ea05ba-2c2b-4b84-aba7-f3cf6841b204,0,0,0,0,0,0,0,0,0,0
4,a79e13ec-6d9a-40b5-b54c-7f4f743a7525,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5730,e8e22b6d-0068-4afb-b606-4a1baa8a8d4c,0,0,1,1,0,1,0,0,1,1
5731,8b1d69b4-69ce-4e40-b4ba-dd2f370a8b6f,0,0,1,0,0,1,0,0,1,0
5732,c2246217-3358-4f61-bda8-e2ec21aed5b2,0,0,0,0,0,0,0,0,0,0
5733,45aa63c4-2248-4a0e-8f66-f3d23b6828ed,0,0,0,0,0,0,0,0,0,0
