In [1]:
# -*- coding: utf-8 -*-
# %%capture
# Commented out IPython magic to ensure Python compatibility.
# --- Installations ---
import os
import sys

# Environment bootstrap: on Colab install the fine-tuning stack with pip; on a
# local machine only verify that the required packages can be imported.
# Check if running in Colab or locally
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Colab - installing dependencies.")
    # Install necessary libraries for Colab
    # NOTE: most installs below pass --no-deps, so pip does not resolve their
    # dependencies; every package that is needed must be listed explicitly.
    # Unsloth for efficient fine-tuning
    !pip install --no-deps "unsloth[colab-newest]>=2024.7"
    # vLLM (optional, might have compatibility issues in Colab)
    # !pip install "vllm<0.6.0"
    # Latest Transformers for Gemma-3 support
    !pip install --no-deps "transformers>=4.49.0"
    # Other required libraries
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl<0.9.0" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer pandas # Added pandas
    print("Colab installations complete.")
else:
    print("Running locally.")
    # Check if essential libraries are installed, prompt user if not
    # (nothing is installed automatically outside Colab; the first failing
    # import stops the check and is reported by name).
    try:
        import unsloth
        import transformers
        import datasets
        import torch
        import pandas
        import peft
        import trl
        print("Required libraries seem to be installed.")
    except ImportError as e:
        # e.name is the module that could not be imported
        print(f"Missing library: {e.name}")
        print("Please ensure unsloth, transformers, datasets, torch, pandas, peft, trl are installed.")
        print("Example: pip install unsloth transformers datasets torch pandas peft trl bitsandbytes accelerate sentencepiece protobuf huggingface_hub hf_transfer")
        # Optionally, uncomment the line below to attempt installation locally (use with caution)
        # !pip install unsloth transformers datasets torch pandas peft trl bitsandbytes accelerate sentencepiece protobuf huggingface_hub hf_transfer

# Verify GPU availability
# Only a warning is printed when CUDA is missing; execution continues.
import torch
if not torch.cuda.is_available():
    print("WARNING: No GPU detected. Training and inference will be very slow.")
else:
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")

Running in Colab - installing dependencies.
Colab installations complete.
GPU detected: NVIDIA A100-SXM4-40GB


In [2]:
# --- Core Libraries ---
from unsloth import FastModel

import os
import re
import sys
import torch
import pandas as pd
from datasets import load_dataset, Dataset # Added Dataset for dummy data fallback
from transformers import TrainingArguments, TextStreamer
from peft import PeftModel
from trl import SFTTrainer

# Check if running in Colab again for specific logic if needed later
IN_COLAB = 'google.colab' in sys.modules

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [52]:
# --- Configuration ---
# Central place for the tunable values read by the later cells: label
# vocabulary, model choice, file locations, LoRA and trainer hyperparameters.

# Labels (use submission column names)
# Ensure these exactly match the required columns in submission.csv (except 'id')
# sorted() gives the list a deterministic, alphabetical order.
SUBMISSION_LABELS = sorted([
    'straw_man', 'appeal_to_fear', 'fud', 'bandwagon', 'whataboutism',
    'loaded_language', 'glittering_generalities', 'euphoria',
    'cherry_picking', 'cliche'
])

# Mapping from your training data labels to SUBMISSION_LABELS
# Add/modify mappings based on *your* specific train.csv 'techniques' values
# Keys are lowercase: get_all_valid_labels() strips and lowercases a raw label
# before looking it up here.
TRAINING_LABEL_MAP = {
    'loaded_language': 'loaded_language',
    'glittering_generalities': 'glittering_generalities',
    'euphoria': 'euphoria',
    'appeal_to_fear': 'appeal_to_fear',
    'fud': 'fud',
    'fear_uncertainty_doubt': 'fud', # Example variation
    'bandwagon': 'bandwagon',
    'appeal_to_people': 'bandwagon',
    'thought_terminating_cliche': 'cliche',
    'cliche': 'cliche',
    'whataboutism': 'whataboutism',
    'cherry_picking': 'cherry_picking',
    'straw_man': 'straw_man',
    # Add more mappings here if needed...
}

# Model Parameters
# NOTE(review): the checkpoint name ends in "bnb-4bit" while both quantization
# flags below are False; the load log reports "Switching to 16bit LoRA".
# Confirm which precision is actually intended.
model_name = "unsloth/gemma-3-1b-it-unsloth-bnb-4bit" # Or choose another model
max_seq_length = 2048  # Adjust based on GPU memory and typical text length
load_in_4bit = False  # Set to True to use 4-bit quantization
load_in_8bit = False  # Set to True to use 8-bit quantization (ignored if 4-bit is True)

# File Paths (!!! ADJUST THESE PATHS !!!)
# The CSV paths are absolute Google Drive locations; output_dir and
# submission_path are relative to the current working directory.
train_csv_path = '/content/drive/MyDrive/For Colab/train.csv' # Path to your training CSV file
test_csv_path = '/content/drive/MyDrive/For Colab/test.csv'   # Path to your test CSV file
output_dir = "manipulation_classifier_finetune" # Directory for saving checkpoints and adapters
submission_path = "submission.csv" # Name for the output submission file

# LoRA Parameters
lora_r = 8
lora_alpha = 16
lora_dropout = 0.05
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                       "gate_proj", "up_proj", "down_proj"]

# Trainer hyperparameters
# Start by trying a larger batch size
batch_size = 64           # TRY THIS FIRST (adjust based on VRAM usage)
gradient_accumulation = 2 # Effective batch size = batch_size * gradient_accumulation = 128
warmup_steps = 10
num_epochs = 14
learning_rate = 2e-5
logging_steps = 10
# NOTE(review): the TrainingArguments cell passes save_strategy="epoch" as a
# literal and leaves save_steps commented out, so the two values below are
# currently not used by it.
# save_strategy = "epoch" # Option 1: Save only once per epoch
save_strategy = "steps"   # Option 2: Save every N steps
save_steps = 50          # Increase if using save_strategy="steps"
seed = 3407

# --- Derived Configuration ---
# Choose the tensor dtype from what the hardware reports: bfloat16 when the GPU
# supports it, float16 for any other CUDA device, float32 when CUDA is absent.
if not torch.cuda.is_available():
    # CPU or unsupported GPU
    compute_dtype = torch.float32
    print("GPU not available or doesn't support float16/bfloat16. Using float32 (warning: slow).")
elif torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    print("Using bfloat16")
else:
    compute_dtype = torch.float16
    print("Using float16")


print(f"Target labels for classification: {SUBMISSION_LABELS}")

Using bfloat16
Target labels for classification: ['appeal_to_fear', 'bandwagon', 'cherry_picking', 'cliche', 'euphoria', 'fud', 'glittering_generalities', 'loaded_language', 'straw_man', 'whataboutism']


In [12]:
# --- Load Model and Tokenizer ---
# Loads the checkpoint named by model_name through Unsloth's FastModel and keeps
# the two returned objects as the notebook-wide `model` and `tokenizer`.
# All arguments come from the configuration cell.
print(f"Loading model: {model_name}")

model, tokenizer = FastModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=compute_dtype,  # Set dtype based on detected capability
    load_in_4bit=load_in_4bit,
    load_in_8bit=load_in_8bit,
    # token = "hf_...", # Add Hugging Face token if using gated models like Llama
)
print("Model and tokenizer loaded.")

Loading model: unsloth/gemma-3-1b-it-unsloth-bnb-4bit
==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.
Model and tokenizer loaded.


In [13]:
# --- Add LoRA Adapters ---
# Rebinds `model` to the adapter-wrapped model returned by
# FastModel.get_peft_model, using the LoRA settings from the configuration
# cell, then prints how many parameters are trainable.
print("Applying LoRA adapters...")
model = FastModel.get_peft_model(
    model,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules=lora_target_modules,
    random_state=seed,
)
print("LoRA adapters applied.")
model.print_trainable_parameters()

Applying LoRA adapters...
Unsloth: Making `model.base_model.model.model` require gradients
LoRA adapters applied.
trainable params: 6,522,880 || all params: 1,006,408,832 || trainable%: 0.6481


In [27]:
# Human-readable definition for each submission label. The keys are looked up
# by label name when the prompt's "Available Techniques" list is built below;
# a label without an entry falls back to "No description available.".
LABEL_DESCRIPTIONS = {
    'appeal_to_fear': "The misuse of fear (often based on stereotypes or prejudices) to support a particular proposal.",
    'bandwagon': "An attempt to persuade the audience to join and take action because “others are doing the same thing.”",
    'cherry_picking': "Selective use of data or facts that support a hypothesis while ignoring counterarguments.",
    'cliche': "Commonly used phrases that mitigate cognitive dissonance and block critical thinking (e.g., “Everything is not so clear-cut”).",
    'euphoria': "Using an event that causes euphoria or a feeling of happiness, or a positive event to boost morale, often used to mobilize the population.",
    'fud': "Presenting information in a way that sows uncertainty and doubt, causing fear. A subtype of Appeal to Fear.",
    'glittering_generalities': "Exploitation of people's positive attitude towards abstract concepts such as “justice,” “freedom,” “democracy,” “patriotism,” “peace,” etc., to provoke strong emotional reactions without specific information.",
    'loaded_language': "The use of words and phrases with a strong emotional connotation (positive or negative) to influence the audience.",
    'straw_man': "Distorting the opponent's position by replacing it with a weaker or outwardly similar one and refuting it instead.",
    'whataboutism': "Discrediting the opponent's position by accusing them of hypocrisy without directly refuting their arguments."
}

# CELL 6: Data Preparation - Helper Functions (Multi-Label Revised)

# --- Data Formatting Logic ---

# Build one markdown bullet per submission label ("- **name**: description"),
# in SUBMISSION_LABELS order, and join the bullets with newlines.
# A label missing from LABEL_DESCRIPTIONS gets a generic fallback text.
detailed_labels_list = [
    "- **{}**: {}".format(
        name,
        LABEL_DESCRIPTIONS.get(name, "No description available."),
    )
    for name in SUBMISSION_LABELS
]
labels_string_with_descriptions = "\n".join(detailed_labels_list)

# Instruction template asking for ALL applicable techniques.
# It has two str.format placeholders: {labels_string_with_descriptions} (the
# bullet list of labels) and {text} (the passage to classify). The same
# template is used for training examples and for inference prompts.
# NOTE(review): the wording asks for a comma-separated list, while
# format_data_for_sft() trains on the raw 'techniques' string unchanged —
# confirm that the two are meant to differ.
instruction_prompt_template = """Identify ALL applicable manipulation techniques used in the following text.
Choose from the list below. If multiple techniques apply, list them separated by commas.

Available Techniques:
{labels_string_with_descriptions}

Text:
{text}

Applicable Manipulation Technique(s):""" # The model's answer follows this final line


# Regex to extract labels like 'euphoria' from "['euphoria' 'loaded_language']"
technique_pattern = re.compile(r"'(.*?)'")

# Function to parse the techniques string and get ALL valid mapped labels
def get_all_valid_labels(techniques_str):
    """
    Extract the submission labels named in a raw ``techniques`` value.

    Every single-quoted token in ``techniques_str`` (e.g. "['label1' 'label2']")
    is stripped, lowercased and translated through TRAINING_LABEL_MAP; only
    translations that are members of SUBMISSION_LABELS are kept.

    Returns a sorted, duplicate-free list of labels, or an empty list when
    the input is not a string or contains no recognised label.
    """
    if not isinstance(techniques_str, str):
        return []

    # Lazily translate each quoted token; unknown tokens map to None.
    translated = (
        TRAINING_LABEL_MAP.get(token.strip().lower())
        for token in technique_pattern.findall(techniques_str)
    )
    # The set removes duplicates; sorted() makes the order deterministic.
    return sorted({name for name in translated if name in SUBMISSION_LABELS})

# Function to format examples for SFTTrainer (using 'messages' format)
def format_data_for_sft(example):
    """
    Turn one raw training row into a two-turn chat example.

    The user turn is the instruction prompt filled with the row's 'content';
    the assistant turn is the row's 'techniques' value passed through
    unchanged. A missing or empty 'techniques' value yields an empty
    assistant reply.

    Always returns {'messages': [user_turn, assistant_turn]}; no row is
    skipped by this function.
    """
    prompt = instruction_prompt_template.format(
        text=example.get('content'),
        labels_string_with_descriptions=labels_string_with_descriptions,
    )

    # Falsy targets (None, "") become an empty string the model learns to emit.
    target = example.get('techniques') or ""

    return {
        "messages": [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": target},
        ]
    }




print("Data preparation helper functions defined (Multi-Label Revised).")
print("\nExample Prompt Structure:")
# Render the prompt once with placeholder text so the structure can be
# inspected before any real data is loaded.
example_prompt_text_only = instruction_prompt_template.format(
    labels_string_with_descriptions=labels_string_with_descriptions,
    text="[Sample text would go here]"
)
print(example_prompt_text_only) # Prints the whole rendered prompt (no truncation)

Data preparation helper functions defined (Multi-Label Revised).

Example Prompt Structure:
Identify ALL applicable manipulation techniques used in the following text.
Choose from the list below. If multiple techniques apply, list them separated by commas.

Available Techniques:
- **appeal_to_fear**: The misuse of fear (often based on stereotypes or prejudices) to support a particular proposal.
- **bandwagon**: An attempt to persuade the audience to join and take action because “others are doing the same thing.”
- **cherry_picking**: Selective use of data or facts that support a hypothesis while ignoring counterarguments.
- **cliche**: Commonly used phrases that mitigate cognitive dissonance and block critical thinking (e.g., “Everything is not so clear-cut”).
- **euphoria**: Using an event that causes euphoria or a feeling of happiness, or a positive event to boost morale, often used to mobilize the population.
- **fud**: Presenting information in a way that sows uncertainty and doubt

In [28]:
# CELL 7: Data Loading & Processing (REVISED FILTER AGAIN)

# --- Load and Process Datasets ---

# Reads the train and test CSV files into `train_dataset_raw` and
# `test_dataset_raw`. If anything in the try block raises, BOTH variables are
# replaced by two-row in-memory dummy datasets so the later cells still run.
print("Loading datasets...")
try:
    # Load training data (adjust column names if your CSV differs)
    # NOTE(review): column names are supplied explicitly here; if the CSV files
    # also contain a header line it may be read as a data row — confirm
    # against the files.
    train_dataset_raw = load_dataset(
        "csv",
        data_files={"train": train_csv_path},
        split="train",
        column_names=['id', 'content', 'lang', 'manipulative', 'techniques', 'trigger_words'],
        keep_in_memory=False # Keep False for large datasets
    )
    # Load test data (adjust column names if your CSV differs)
    test_dataset_raw = load_dataset(
        "csv",
        data_files={"test": test_csv_path},
        split="test",
        column_names=['id', 'content'], # Only need id and content
        keep_in_memory=False
    )
    print(f"Loaded {len(train_dataset_raw)} training examples and {len(test_dataset_raw)} test examples.")

except Exception as e:
    # Deliberate best-effort fallback: report the problem and continue with
    # placeholder data. Anything trained or predicted after this point is
    # based on the dummy rows, not on the real files.
    print(f"ERROR loading CSV datasets: {e}")
    print("Please ensure CSV files exist at the specified paths and have the expected columns.")
    print("Using dummy data for demonstration purposes.")
    # Fallback to dummy data if loading fails
    dummy_train_data = {'id': ['id1', 'id2'], 'content': ['Some text 1', 'Some text 2'], 'lang': ['uk', 'uk'], 'manipulative': [True, True], 'techniques': ["['euphoria']", "['loaded_language' 'straw_man']"], 'trigger_words': ['', '']}
    dummy_test_data = {'id': ['test_id1', 'test_id2'], 'content': ['Test text 1', 'Test text 2']}
    train_dataset_raw = Dataset.from_dict(dummy_train_data)
    test_dataset_raw = Dataset.from_dict(dummy_test_data)


# Apply formatting function (returns {'messages': [...] } or {'messages': None})
print("Formatting training data (Step 1: Structure)...")
original_columns = train_dataset_raw.column_names
def pick_num_proc(cpu_count=None):
    """
    Return the number of worker processes to use for dataset map/filter calls.

    Uses half of the available CPUs, but never fewer than one: a plain
    ``cpu_count // 2`` evaluates to 0 on a single-core host, and a
    non-positive ``num_proc`` is rejected by ``datasets``.

    Args:
        cpu_count: CPU count to base the decision on. ``None`` (default)
            queries ``os.cpu_count()``, which may itself return ``None``.

    Returns:
        An integer >= 1.
    """
    if cpu_count is None:
        cpu_count = os.cpu_count()
    return max(1, (cpu_count or 1) // 2)


num_proc = pick_num_proc()
print(f"Using num_proc={num_proc} for mapping.")

# Step 1: convert every raw row into a {'messages': [...]} chat example and
# drop the original CSV columns so only 'messages' remains.
structured_train_dataset = train_dataset_raw.map(
    format_data_for_sft, # Function defined in Cell 6
    remove_columns=original_columns,
    num_proc=num_proc,
    batched=False
)

# Filter out examples where 'messages' is None
# (format_data_for_sft as defined in this notebook always returns a messages
# list, so this filter is a safety net and removed nothing in the logged run:
# 3823 rows before and after.)
num_raw_examples = len(train_dataset_raw)
filtered_train_dataset = structured_train_dataset.filter(
    lambda example: example.get('messages') is not None,
    num_proc=num_proc
)
num_filtered_examples = len(filtered_train_dataset)

# --- NEW STEP: Apply chat template to create a 'text' column ---
print("Applying chat template (Step 2: Format to String)...")

def apply_chat_template_func(example):
    """
    Render an example's 'messages' list into one training string.

    The module-level ``tokenizer`` applies its chat template with
    ``tokenize=False`` (so a string is returned rather than token ids) and
    ``add_generation_prompt=False`` (the conversation already ends with the
    assistant turn, so no open-ended model prompt is appended).

    Returns {'text': <rendered string>}, or {'text': None} when 'messages' is
    missing/empty or the template raises; the error is printed in that case.
    """
    conversation = example.get('messages')
    if not conversation:
        return {"text": None}

    try:
        rendered = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
    except Exception as e:
        print(f"Error applying chat template: {e}\nMessages: {conversation}")
        return {"text": None}  # flagged for removal by the later filter

    return {"text": rendered}

# Step 2: render each chat example to a single string.
# Apply the function, keeping only the new 'text' column
final_train_dataset = filtered_train_dataset.map(
    apply_chat_template_func,
    remove_columns=filtered_train_dataset.column_names, # Remove the 'messages' column
    num_proc=num_proc,
    batched=False
)

# Filter out any examples where chat template application failed
# (apply_chat_template_func marks those rows with text=None)
final_train_dataset = final_train_dataset.filter(
    lambda example: example.get('text') is not None,
    num_proc=num_proc
)
# Read by the trainer cell to decide whether training can start.
num_final_examples = len(final_train_dataset)


# Report how many rows survived each stage and show one rendered example.
print("\n--- Data Processing Summary ---")
print(f"Original training examples: {num_raw_examples}")
print(f"Examples after structure formatting & filtering: {num_filtered_examples}")
print(f"Examples after applying chat template & final filtering: {num_final_examples}")
skipped_count = num_raw_examples - num_final_examples
if skipped_count > 0:
    print(f"NOTE: {skipped_count} examples were skipped in total during processing.")

if num_final_examples > 0:
    print("\nExample instance in final 'text' format:")
    print(final_train_dataset[0]['text']) # Display the formatted string
else:
    print("\nWARNING: No valid training examples remaining after applying chat template! Cannot train.")
    print("Check errors during chat template application or earlier steps.")

Loading datasets...
Loaded 3823 training examples and 5736 test examples.
Formatting training data (Step 1: Structure)...
Using num_proc=6 for mapping.


Map (num_proc=6):   0%|          | 0/3823 [00:00<?, ? examples/s]

Filter (num_proc=6):   0%|          | 0/3823 [00:00<?, ? examples/s]

Applying chat template (Step 2: Format to String)...


Map (num_proc=6):   0%|          | 0/3823 [00:00<?, ? examples/s]

Filter (num_proc=6):   0%|          | 0/3823 [00:00<?, ? examples/s]


--- Data Processing Summary ---
Original training examples: 3823
Examples after structure formatting & filtering: 3823
Examples after applying chat template & final filtering: 3823

Example instance in final 'text' format:
<bos><start_of_turn>user
Identify ALL applicable manipulation techniques used in the following text.
Choose from the list below. If multiple techniques apply, list them separated by commas.

Available Techniques:
- **appeal_to_fear**: The misuse of fear (often based on stereotypes or prejudices) to support a particular proposal.
- **bandwagon**: An attempt to persuade the audience to join and take action because “others are doing the same thing.”
- **cherry_picking**: Selective use of data or facts that support a hypothesis while ignoring counterarguments.
- **cliche**: Commonly used phrases that mitigate cognitive dissonance and block critical thinking (e.g., “Everything is not so clear-cut”).
- **euphoria**: Using an event that causes euphoria or a feeling of happ

In [54]:
# --- Configure Training Arguments ---
# Collects the trainer hyperparameters from the configuration cell into a
# transformers.TrainingArguments object used by the SFTTrainer cell.
print("Configuring training arguments...")

training_args = TrainingArguments(
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation,
    warmup_steps=warmup_steps,
    num_train_epochs=num_epochs,
    # max_steps=max_steps, # Uncomment to limit steps instead of epochs
    learning_rate=learning_rate,
    # Exactly one of bf16/fp16 is True on GPU; both are False when
    # compute_dtype is float32.
    bf16=compute_dtype == torch.bfloat16, # Enable bf16 if supported
    fp16=compute_dtype == torch.float16, # Enable fp16 if bf16 not supported but fp16 is
    logging_steps=logging_steps,
    optim="adamw_torch", # Standard optimizer
    weight_decay=0.01,
    lr_scheduler_type="linear", # Or "cosine"
    seed=seed,
    output_dir=output_dir,
    # NOTE(review): the strategy is a literal here; the `save_strategy` and
    # `save_steps` variables from the configuration cell are not used.
    save_strategy="epoch", # Save at the end of each epoch (or "steps")
    # save_steps=save_steps, # Define frequency if save_strategy="steps"
    save_total_limit=3,     # Keep at most 3 checkpoints on disk
    report_to="none",     # Disable wandb/tensorboard reporting unless configured
    dataloader_num_workers = 2, # Adjust based on your system for data loading speed
)

print(f"Training arguments configured. Output directory: {output_dir}")

Configuring training arguments...
Training arguments configured. Output directory: manipulation_classifier_finetune


In [None]:
# CELL 9: Initialize & Run SFT Trainer (Use 'text' Field)

# --- Initialize and Run Trainer ---
# Builds an SFTTrainer over the pre-rendered 'text' column, trains (resuming
# from a saved checkpoint when one is present on disk) and stores the final
# LoRA adapters plus tokenizer under <output_dir>/final_adapters.

trainer = None
final_adapter_path = None  # set only after the adapters were saved successfully

# Checkpoint to continue from. It exists only after an earlier run has written
# it, so it is used when present and ignored otherwise. Passing a missing path
# to trainer.train() raises, which previously made a fresh run skip training.
resume_checkpoint_path = "/content/manipulation_classifier_finetune/checkpoint-240"

# Use num_final_examples from Cell 7
if num_final_examples > 0:
    print("Initializing SFTTrainer...")
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=final_train_dataset, # Use the dataset with the 'text' column
        args=training_args,
        max_seq_length=max_seq_length,
        dataset_text_field="text", # Train on the already-templated strings
        # packing=True, # Consider packing=True if sequences are short relative to max_seq_length
    )

    print("\n--- Starting Training ---")
    try:
        if os.path.isdir(resume_checkpoint_path):
            print(f"Resuming training from checkpoint: {resume_checkpoint_path}")
            train_result = trainer.train(resume_from_checkpoint=resume_checkpoint_path)
        else:
            print(f"Checkpoint not found at {resume_checkpoint_path}; training from scratch.")
            train_result = trainer.train()
        print("--- Training Finished ---")

        # --- Save Final Adapters ---
        print("Saving final LoRA adapters...")
        adapter_dir = os.path.join(output_dir, "final_adapters")
        trainer.model.save_pretrained(adapter_dir)
        tokenizer.save_pretrained(adapter_dir)
        # Publish the path only now, so a failed save leaves it as None.
        final_adapter_path = adapter_dir
        print(f"Final LoRA adapters saved to: {final_adapter_path}")

        # Optional: Log metrics
        metrics = train_result.metrics
        print("Training Metrics:", metrics)

    except Exception as e:
        # Best-effort: report and let the notebook continue.
        print(f"An error occurred during training: {e}")
        # final_adapter_path remains None

else:
    print("Skipping training because no valid formatted training data is available.")
    # final_adapter_path remains None

Initializing SFTTrainer...

--- Starting Training ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,823 | Num Epochs = 14 | Total steps = 420
O^O/ \_/ \    Batch size per device = 64 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (64 x 2 x 1) = 128
 "-____-"     Trainable parameters = 6,522,880/1,006,408,832 (0.65% trained)


Step,Training Loss
250,1.2837
260,1.2582
270,1.2453
280,1.2179
290,1.2742
300,1.2061
310,1.2342
320,1.2347
330,1.2066
340,1.2063


In [56]:
# CELL 10: Inference Setup - Load Trained Model from Specific Checkpoint

# --- Prepare Model for Inference ---
# Reloads the base model, attaches the LoRA adapters stored in one specific
# checkpoint directory, merges them into the base weights and leaves the result
# in `model_for_inference` (None if the checkpoint is missing or loading fails).

model_for_inference = None
tokenizer_for_inference = tokenizer # Use the tokenizer loaded earlier (should match base model)

# --- Specify the checkpoint path to load ---
# NOTE(review): this path is hard-coded and independent of `final_adapter_path`
# produced by the training cell.
specific_checkpoint_path = "/content/manipulation_classifier_finetune/checkpoint-420"
print(f"Attempting to load adapters from checkpoint: {specific_checkpoint_path}")

# Check if the specified checkpoint path exists
if specific_checkpoint_path and os.path.exists(specific_checkpoint_path):
    print(f"Checkpoint directory found. Loading base model and merging adapters...")

    # Determine dtype for inference (usually same as training or float16)
    # float32 (the no-GPU choice) is replaced by float16 here.
    inference_dtype = compute_dtype if compute_dtype != torch.float32 else torch.float16
    print(f"Using inference dtype: {inference_dtype}")

    # Reload the base model in the desired inference precision
    # Important: Load base model WITHOUT quantization for merging adapters
    print(f"Reloading base model ({model_name}) in {inference_dtype}...")
    try:
        # This also replaces tokenizer_for_inference with the freshly loaded one.
        base_model, tokenizer_for_inference = FastModel.from_pretrained(
            model_name=model_name,
            max_seq_length=max_seq_length,
            dtype=inference_dtype,
            load_in_4bit=False, # Must load in float16/bfloat16 to merge
            load_in_8bit=False,
            # token = "hf_...", # Add token if needed
        )
        print("Base model reloaded.")

        # Load the LoRA adapters onto the base model FROM THE CHECKPOINT PATH
        print(f"Applying trained LoRA adapters from {specific_checkpoint_path}...")
        # Make sure PeftModel loads correctly from the checkpoint subdirectory
        model_for_inference = PeftModel.from_pretrained(base_model, specific_checkpoint_path)
        print("Adapters loaded.")

        # Merge the adapters into the base model and unload PeftModel
        print("Merging adapters...")
        model_for_inference = model_for_inference.merge_and_unload()
        print("Adapters merged. Model ready for inference.")

        # Set to evaluation mode and move to GPU if available
        model_for_inference.eval()
        if torch.cuda.is_available():
            print("Moving model to GPU...")
            model_for_inference.to("cuda")
            print("Model on GPU.")
        else:
            print("Warning: No GPU available for inference.")

    except Exception as e:
        # Any failure above is reported and leaves no usable inference model.
        print(f"ERROR loading base model or applying/merging adapters: {e}")
        model_for_inference = None # Ensure model is None if loading fails

else:
    print(f"ERROR: Specified checkpoint path not found: {specific_checkpoint_path}")
    print("Skipping inference setup.")
    model_for_inference = None

Attempting to load adapters from checkpoint: /content/manipulation_classifier_finetune/checkpoint-420
Checkpoint directory found. Loading base model and merging adapters...
Using inference dtype: torch.bfloat16
Reloading base model (unsloth/gemma-3-1b-it-unsloth-bnb-4bit) in torch.bfloat16...
==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.
Base model reloaded.
Applying trained LoRA adapters from /content/manipulation_classifier_finetune/checkpoint-420...
Adapters loaded.
Merging adapters...
Adap

In [57]:
# CELL 11: Generate and Save RAW Predictions
#
# For every test row: build the same instruction prompt used in training,
# generate a completion with the merged model, and record the decoded text
# as-is. The raw strings are written to raw_predictions.csv and parsed into
# label columns by a later cell.

# Check if the inference model is ready
if model_for_inference is not None and test_dataset_raw is not None:
    print("\n--- Generating Raw Predictions ---")
    raw_results = [] # Store dictionaries {'id': ..., 'raw_prediction': ...}
    total_test_examples = len(test_dataset_raw)
    print(f"Processing {total_test_examples} test examples...")

    # Set generation parameters
    generation_max_new_tokens = 100 # Max length for expected label list
    generation_temperature = 0.1
    generation_do_sample = True # Use sampling with low temp
    generation_top_p = 0.9

    for i, example in enumerate(test_dataset_raw):
        test_id = example.get('id')
        test_content = example.get('content')

        # Rows without an id or text are skipped and therefore absent from
        # the raw predictions file.
        if not test_id or not test_content:
            print(f"Warning: Skipping test example at index {i} due to missing id or content.")
            continue

        # 1. Format input using the prompt structure
        user_content = instruction_prompt_template.format(
            labels_string_with_descriptions=labels_string_with_descriptions, # Use detailed prompt
            text=test_content
        )
        messages = [{"role": "user", "content": user_content}]
        input_text = tokenizer_for_inference.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # 2. Tokenize input
        # The chat template already put the special tokens (including the
        # leading <bos>) into input_text, so the tokenizer must not add them a
        # second time; otherwise the prompt starts with a duplicated BOS.
        # NOTE(review): if truncation triggers, tokens are presumably removed
        # from the end, which would cut the generation prompt — confirm the
        # tokenizer's truncation side for very long texts.
        inputs = tokenizer_for_inference(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_seq_length - generation_max_new_tokens, # Reserve space
            add_special_tokens=False,
        )

        # 3. Move inputs to GPU if available
        if torch.cuda.is_available():
            inputs = inputs.to("cuda")

        # 4. Generate prediction
        predicted_text_raw = "" # Initialize
        try:
            with torch.no_grad():
                outputs = model_for_inference.generate(
                    **inputs,
                    max_new_tokens=generation_max_new_tokens,
                    temperature=generation_temperature,
                    do_sample=generation_do_sample,
                    top_p=generation_top_p,
                    pad_token_id=tokenizer_for_inference.eos_token_id,
                )

            # 5. Decode raw output string
            # Keep only the newly generated tokens (everything after the prompt).
            prediction_ids = outputs[0][inputs['input_ids'].shape[1]:]
            predicted_text_raw = tokenizer_for_inference.decode(prediction_ids, skip_special_tokens=True).strip()

        except Exception as e:
            # One failing row must not abort the whole run; mark it instead.
            print(f"Error during generation for ID {test_id}: {e}")
            predicted_text_raw = "[GENERATION_ERROR]" # Placeholder on error

        # 6. Store ID and Raw Prediction
        raw_results.append({'id': test_id, 'raw_prediction': predicted_text_raw})

        # Print progress
        if (i + 1) % 50 == 0 or (i + 1) == total_test_examples:
            print(f"Processed {i+1}/{total_test_examples}... Raw Output: '{predicted_text_raw[:60]}...'")

    # --- Save Raw Results to CSV ---
    raw_predictions_path = "raw_predictions.csv"
    print(f"\nSaving raw predictions to: {raw_predictions_path}")
    if raw_results:
        raw_df = pd.DataFrame(raw_results)
        raw_df.to_csv(raw_predictions_path, index=False)
        print("Raw predictions saved.")
        print("\nRaw predictions sample:")
        print(raw_df.head())
    else:
        print("No raw results generated.")

else:
    print("\nSkipping raw prediction generation: Inference model not loaded or test data missing.")


--- Generating Raw Predictions ---
Processing 5736 test examples...
Processed 50/5736... Raw Output: '['cherry_picking' 'loaded_language']...'
Processed 100/5736... Raw Output: '['loaded_language' 'glittering_generalities']...'
Processed 150/5736... Raw Output: '['loaded_language']...'
Processed 200/5736... Raw Output: '['cliche' 'fud' 'glittering_generalities']...'
Processed 250/5736... Raw Output: '['cliche' 'euphoria' 'fud' 'glittering_generalities' 'loaded...'
Processed 300/5736... Raw Output: '['loaded_language']...'
Processed 350/5736... Raw Output: '['loaded_language']...'
Processed 400/5736... Raw Output: '['euphoria' 'glittering_generalities']...'
Processed 450/5736... Raw Output: '['straw_man']...'
Processed 500/5736... Raw Output: '['loaded_language' 'euphoria']...'
Processed 550/5736... Raw Output: '['cherry_picking' 'loaded_language']...'
Processed 600/5736... Raw Output: '['euphoria' 'loaded_language' 'cherry_picking']...'
Processed 650/5736... Raw Output: '['loaded_lang

In [58]:
# CELL 12: Stage 1 - Parse Strict Format & Separate Exceptions (Includes Empty as Good)
#
# Loads the raw model outputs from `raw_predictions.csv`, turns every cleanly
# parseable prediction into a one-hot submission row, and writes everything that
# cannot be parsed strictly to a separate exceptions CSV for later review.

import os  # used for os.path.exists below; imported here so the cell does not depend on Cell 1
import re
import pandas as pd
import numpy as np # Import numpy for NaN checking

# --- Configuration (Ensure SUBMISSION_LABELS is available from Cell 3) ---
# SUBMISSION_LABELS = sorted([...]) # Copy from Cell 3 if needed

raw_predictions_path = "raw_predictions.csv"
good_rows_output_path = "submission_part1_good_format.csv" # Output for rows with definite parsed labels (0 or more)
exceptions_output_path = "parsing_exceptions.csv" # Output for rows with parsing errors or unrecognized text parts

# --- Regex Pattern for ['label' 'label'] format (from Cell 6) ---
technique_pattern = re.compile(r"'(.*?)'")
# A bare empty list ("[]" or "[ ]"): presumably the zero-label counterpart of the
# ['label' 'label'] format, so it is accepted as a clean "no labels" answer.
empty_list_pattern = re.compile(r"\[\s*\]")


def classify_raw_prediction(raw_prediction, lowercase_to_canonical_label):
    """Strictly parse one raw model output.

    Args:
        raw_prediction: The value read from the `raw_prediction` column. May be
            None (pandas reads an empty CSV field as NaN, which this cell
            replaces with None) or any non-string value.
        lowercase_to_canonical_label: Mapping from lower-cased label text to the
            canonical label name used for the submission columns.

    Returns:
        A `(labels, is_exception)` tuple. `labels` is the set of canonical
        labels found; it is always empty when `is_exception` is True.

    Rules, in order:
        * None / non-string / whitespace-only  -> good, no labels.
        * contains "[GENERATION_ERROR]"        -> exception.
        * bare empty list such as "[]"         -> good, no labels.
        * "['a' 'b']" style                    -> the quoted parts are the candidates;
                                                  no quoted part at all is an exception.
        * anything else                        -> split on commas.
        * any non-empty candidate that is not a known label (case-insensitive)
          makes the whole row an exception.
    """
    # Missing values are the CSV round-trip form of an empty prediction, so they
    # are "good" rows with no labels rather than exceptions.
    if raw_prediction is None or not isinstance(raw_prediction, str):
        return set(), False

    cleaned_prediction = raw_prediction.strip()
    if not cleaned_prediction:
        return set(), False

    if "[GENERATION_ERROR]" in raw_prediction:
        return set(), True

    if empty_list_pattern.fullmatch(cleaned_prediction):
        return set(), False

    if cleaned_prediction.startswith("['") and cleaned_prediction.endswith("']"):
        candidates = technique_pattern.findall(cleaned_prediction)
        if not candidates:
            # Looks bracketed but holds no complete quoted label -> malformed.
            return set(), True
    else:
        # Fallback: comma-separated labels (or a single bare label).
        candidates = cleaned_prediction.split(',')

    labels = set()
    for candidate in candidates:
        key = candidate.strip().lower()
        if not key:
            # Ignore empty fragments, e.g. between consecutive commas.
            continue
        if key not in lowercase_to_canonical_label:
            # One unrecognized part is enough to send the row to review.
            return set(), True
        labels.add(lowercase_to_canonical_label[key])
    return labels, False


# Check if raw predictions file exists
if os.path.exists(raw_predictions_path):
    print(f"Loading raw predictions from {raw_predictions_path}...")
    try:
        raw_df = pd.read_csv(raw_predictions_path).replace({np.nan: None}) # Load and replace NaN
        print(f"Loaded {len(raw_df)} raw predictions.")

        # --- Prepare for Validation ---
        lowercase_to_canonical_label = {label.lower(): label for label in SUBMISSION_LABELS}
        # Kept as a notebook-level name in case other cells read it.
        valid_lowercase_labels = set(lowercase_to_canonical_label.keys())

        # --- Initialize Lists for Results ---
        good_submission_rows = []
        exception_rows = []
        print("Parsing predictions - Stage 1: Creating rows for strictly formatted AND empty/no-label predictions...")

        total_rows = len(raw_df)
        row_values = zip(raw_df['id'], raw_df['raw_prediction'])
        for position, (test_id, raw_prediction) in enumerate(row_values, start=1):
            parsed_labels_for_row, is_exception = classify_raw_prediction(
                raw_prediction, lowercase_to_canonical_label
            )

            # --- Assign to Correct List ---
            if is_exception:
                # Explicit error, malformed, or contained unrecognized text parts
                exception_rows.append({'id': test_id, 'raw_prediction': raw_prediction})
            else:
                # "Good" row: one column per label, all zeros when no label was found
                submission_row = {'id': test_id}
                for label in SUBMISSION_LABELS:
                    submission_row[label] = 1 if label in parsed_labels_for_row else 0
                good_submission_rows.append(submission_row)

            # Optional: Print progress
            if position % 500 == 0 or position == total_rows:
                status = "EXCEPTION" if is_exception else "GOOD"
                found_labels_str = str(parsed_labels_for_row) if status == "GOOD" else "N/A"
                print(f"Processed {position}/{total_rows}... ID: {test_id} -> Status: {status}, Found: {found_labels_str}")


        # --- Save Good Rows ---
        if good_submission_rows:
            print(f"\nGenerated {len(good_submission_rows)} rows for submission (including all-zero rows).")
            good_df = pd.DataFrame(good_submission_rows)
            final_columns = ['id'] + SUBMISSION_LABELS
            good_df = good_df.reindex(columns=final_columns, fill_value=0)
            print(f"Saving 'good' rows (including empty) to: {good_rows_output_path}")
            good_df.to_csv(good_rows_output_path, index=False)
            print("'Good' rows saved.")
        else:
            print("\nNo 'good' rows generated for submission.")

        # --- Save Exception Rows ---
        if exception_rows:
            print(f"\nFound {len(exception_rows)} exceptions (malformed or contained unrecognized text).")
            exceptions_df = pd.DataFrame(exception_rows)
            print(f"Saving exceptions to: {exceptions_output_path}")
            exceptions_df.to_csv(exceptions_output_path, index=False)
            print("Exceptions saved for review.")
        else:
            print("\nNo exceptions found.")

    except Exception as e:
        # Best-effort cell: report the problem and let the notebook continue.
        print(f"An error occurred during Stage 1 processing: {e}")

else:
    print(f"ERROR: Raw predictions file not found at {raw_predictions_path}. Cannot process Stage 1.")

print("\n--- Stage 1 Processing Finished ---")

Loading raw predictions from raw_predictions.csv...
Loaded 5736 raw predictions.
Parsing predictions - Stage 1: Creating rows for strictly formatted AND empty/no-label predictions...
Processed 500/5736... ID: c172c4c3-6753-4a7f-818b-505c156eb3f1 -> Status: GOOD, Found: {'loaded_language', 'euphoria'}
Processed 1000/5736... ID: 0f2ed9ad-ed12-4e49-bfd3-2c2342780344 -> Status: GOOD, Found: {'loaded_language', 'euphoria', 'glittering_generalities'}
Processed 1500/5736... ID: 3f10588b-d226-428c-ba0e-272b1c2b6bd0 -> Status: GOOD, Found: {'euphoria'}
Processed 2000/5736... ID: cb1ec277-8304-433f-85a4-772694f93c1e -> Status: GOOD, Found: {'loaded_language'}
Processed 2500/5736... ID: 0ee21de4-c1b3-4329-9640-1eb533e16558 -> Status: GOOD, Found: {'glittering_generalities', 'loaded_language', 'fud', 'cliche', 'straw_man'}
Processed 3000/5736... ID: 4152e588-7bc5-4221-902d-d18fda24028f -> Status: GOOD, Found: {'loaded_language'}
Processed 3500/5736... ID: e6e58fbc-6564-4690-be62-370b1a2dbb15 -> St

In [59]:
# CELL 13: Stage 2 - Regex Parsing for Exceptions & Final Submission (Get Content from test_dataset_raw)
#
# Re-parses the Stage 1 exceptions with a lenient "find any known label" regex,
# saves whatever still cannot be parsed (together with the original test
# content) for manual review, and concatenates the Stage 1 and Stage 2 good rows
# into the final submission CSV.

import re
import pandas as pd
import numpy as np
import os

# --- Configuration (Ensure SUBMISSION_LABELS is available) ---
# SUBMISSION_LABELS = sorted([...]) # Copy from Cell 3 if needed

# --- Input / Output Files ---
exceptions_input_path = "parsing_exceptions_fixed.csv" # OR "parsing_exceptions_retry.csv", etc.
stage1_good_rows_path = "submission_part1_good_format.csv" # OR "submission_part1_retry.csv", etc.
stage2_exceptions_output_path = "parsing_exceptions_stage2_with_content.csv"
final_submission_path = "submission.csv"


def find_known_labels(raw_prediction, label_pattern, lowercase_to_canonical_label):
    """Return the canonical labels mentioned anywhere in a raw prediction.

    Args:
        raw_prediction: Raw model output; None, empty and non-string values
            yield an empty set.
        label_pattern: Compiled regex with a single capture group that matches
            any known label as a whole word.
        lowercase_to_canonical_label: Mapping from lower-cased label text to the
            canonical label name.

    Returns:
        A set of canonical labels (empty when nothing matched).
    """
    if not raw_prediction or not isinstance(raw_prediction, str):
        return set()
    # Lower-case first so every match is directly a key of the mapping.
    matches = label_pattern.findall(raw_prediction.lower())
    return {
        lowercase_to_canonical_label[match]
        for match in matches
        if match in lowercase_to_canonical_label
    }


# --- Load Original Test Content from test_dataset_raw ---
test_content_map = {}
# Check if test_dataset_raw (from Cell 7) exists and has data
if 'test_dataset_raw' in locals() and test_dataset_raw is not None and len(test_dataset_raw) > 0:
    print("Creating content map from 'test_dataset_raw' object...")
    try:
        # Iterate through the dataset object once to build the map
        for example in test_dataset_raw:
            test_content_map[example['id']] = example['content']
        print(f"Created content map for {len(test_content_map)} test IDs.")
        if len(test_content_map) != len(test_dataset_raw):
             print(f"Warning: Number of items in map ({len(test_content_map)}) differs from test_dataset_raw size ({len(test_dataset_raw)}). Possible duplicate IDs?")
    except Exception as e:
        print(f"Warning: Could not create content map from 'test_dataset_raw': {e}")
        print("Final exceptions file will not contain original content.")
else:
    print("Warning: 'test_dataset_raw' object not found or empty. Cannot add content to exceptions.")
    print("Consider re-running Cell 7 or using the previous version of Cell 13 that loads test.csv.")


# --- Check if Stage 1 Exceptions File Exists ---
if os.path.exists(exceptions_input_path):
    print(f"\nLoading Stage 1 exceptions from {exceptions_input_path}...")
    try:
        exceptions_df_s1 = pd.read_csv(exceptions_input_path).replace({np.nan: None}) # Load/clean
        print(f"Loaded {len(exceptions_df_s1)} exceptions for final processing.")

        # --- Prepare Regex and Validation Info ---
        if 'SUBMISSION_LABELS' not in locals():
             print("ERROR: SUBMISSION_LABELS not defined. Please run Cell 3 or define it.")
             raise NameError("SUBMISSION_LABELS not defined")

        lowercase_to_canonical_label = {label.lower(): label for label in SUBMISSION_LABELS}
        pattern_string = r'\b(' + '|'.join(map(re.escape, lowercase_to_canonical_label.keys())) + r')\b'
        label_pattern = re.compile(pattern_string, re.IGNORECASE)
        print(f"Using regex pattern: {label_pattern.pattern}")

        # --- Initialize Lists for Stage 2 Results ---
        stage2_good_rows = []
        stage2_exception_rows = []
        print("Parsing exceptions - Stage 2: Applying lenient regex...")

        total_exceptions = len(exceptions_df_s1)
        exception_values = zip(exceptions_df_s1['id'], exceptions_df_s1['raw_prediction'])
        for position, (test_id, raw_prediction) in enumerate(exception_values, start=1):
            parsed_labels_for_row = find_known_labels(
                raw_prediction, label_pattern, lowercase_to_canonical_label
            )

            if parsed_labels_for_row:
                # At least one known label was mentioned -> recovered row
                submission_row = {'id': test_id}
                for label in SUBMISSION_LABELS:
                    submission_row[label] = 1 if label in parsed_labels_for_row else 0
                stage2_good_rows.append(submission_row)
            else:
                # No known label anywhere in the text: this is a final exception.
                # Attach the original content (when available) to ease manual review.
                original_content = test_content_map.get(test_id, "[CONTENT NOT FOUND IN test_dataset_raw]")
                stage2_exception_rows.append({
                    'id': test_id,
                    'raw_prediction': raw_prediction,
                    'content': original_content
                })

            # Optional: Print progress
            if position % 500 == 0 or position == total_exceptions:
                 status = "RECOVERED" if parsed_labels_for_row else "EXCEPTION_FINAL"
                 found_labels_str = str(parsed_labels_for_row) if parsed_labels_for_row else "None"
                 print(f"Processed S2 {position}/{total_exceptions}... ID: {test_id} -> Status: {status}, Found: {found_labels_str}")


        # --- Save Final (Stage 2) Exceptions (Now with Content) ---
        if stage2_exception_rows:
            print(f"\nFound {len(stage2_exception_rows)} final exceptions after Stage 2 regex parsing.")
            exceptions_df_s2 = pd.DataFrame(stage2_exception_rows)
            # Only keep the 'content' column when the content map could be built
            if not test_content_map: # Check if map creation failed
                 exceptions_df_s2 = exceptions_df_s2[['id', 'raw_prediction']]
                 print("Warning: Saving exceptions without 'content' column as map creation failed.")
            else:
                 exceptions_df_s2 = exceptions_df_s2[['id', 'content', 'raw_prediction']]
            print(f"Saving final exceptions to: {stage2_exceptions_output_path}")
            exceptions_df_s2.to_csv(stage2_exceptions_output_path, index=False)
            print("Final exceptions saved.")
        else:
            print("\nNo remaining exceptions after Stage 2.")


        # --- Combine Good Rows from Stage 1 and Stage 2 ---
        print("\nCombining results for final submission...")
        all_processed_rows_list = []
        # Stage 1 good rows are read back from the CSV written by the Stage 1 cell
        if os.path.exists(stage1_good_rows_path):
             try:
                 good_df_s1 = pd.read_csv(stage1_good_rows_path)
                 all_processed_rows_list.append(good_df_s1)
                 print(f"Loaded {len(good_df_s1)} rows from Stage 1 ({stage1_good_rows_path}).")
             except Exception as e:
                 print(f"Warning: Could not load good rows from Stage 1 file ({stage1_good_rows_path}): {e}")
        else:
             print(f"Warning: Stage 1 good rows file not found at {stage1_good_rows_path}.")
        # Stage 2 recovered rows come from the loop above
        if stage2_good_rows:
            good_df_s2 = pd.DataFrame(stage2_good_rows)
            final_columns = ['id'] + SUBMISSION_LABELS
            good_df_s2 = good_df_s2.reindex(columns=final_columns, fill_value=0)
            all_processed_rows_list.append(good_df_s2)
            print(f"Recovered {len(good_df_s2)} rows in Stage 2.")

        # Concatenate, keep the first row per id (Stage 1 wins), fix column order, save
        if all_processed_rows_list:
            final_submission_df = pd.concat(all_processed_rows_list, ignore_index=True)
            final_submission_df = final_submission_df.drop_duplicates(subset=['id'], keep='first')
            print(f"Total unique rows in final submission: {len(final_submission_df)}")
            final_submission_df = final_submission_df.reindex(columns=['id'] + SUBMISSION_LABELS, fill_value=0)
            print(f"Saving final combined submission file to: {final_submission_path}")
            final_submission_df.to_csv(final_submission_path, index=False)
            print("\nFinal submission file head:")
            print(final_submission_df.head())
            # Sanity check against the test set; only possible when the content map exists
            if test_content_map and len(test_content_map) != len(final_submission_df):
                 print(f"\nWarning: Final submission row count ({len(final_submission_df)}) does not match original test set size ({len(test_content_map)}). Check for dropped IDs.")
                 # Name the IDs that are absent so they can be fixed by hand
                 # (final exceptions are not written to the submission).
                 missing_ids = sorted(set(test_content_map) - set(final_submission_df['id']), key=str)
                 if missing_ids:
                     missing_preview = ", ".join(str(missing_id) for missing_id in missing_ids[:10])
                     print(f"Missing {len(missing_ids)} test ID(s) from the submission (showing up to 10): {missing_preview}")
            elif test_content_map:
                 print(f"\nFinal submission row count ({len(final_submission_df)}) matches original test set size.")

        else:
            print("ERROR: No rows processed successfully from any stage. Final submission file not created.")


    except FileNotFoundError:
        print(f"ERROR: Input exceptions file not found at {exceptions_input_path}. Cannot run final parsing stage.")
    except Exception as e:
        print(f"An error occurred during final parsing stage: {e}")

else:
    print(f"Skipping final parsing stage: Input exceptions file not found at {exceptions_input_path}.")

print("\n--- Final Parsing Stage Finished ---")

Creating content map from 'test_dataset_raw' object...
Created content map for 5736 test IDs.

Loading Stage 1 exceptions from parsing_exceptions_fixed.csv...
Loaded 98 exceptions for final processing.
Using regex pattern: \b(appeal_to_fear|bandwagon|cherry_picking|cliche|euphoria|fud|glittering_generalities|loaded_language|straw_man|whataboutism)\b
Parsing exceptions - Stage 2: Applying lenient regex...
Processed S2 98/98... ID: 0369e166-88f0-4522-a127-51e542e3fa85 -> Status: RECOVERED, Found: {'whataboutism'}

Found 3 final exceptions after Stage 2 regex parsing.
Saving final exceptions to: parsing_exceptions_stage2_with_content.csv
Final exceptions saved.

Combining results for final submission...
Loaded 5638 rows from Stage 1 (submission_part1_good_format.csv).
Recovered 95 rows in Stage 2.
Total unique rows in final submission: 5733
Saving final combined submission file to: submission.csv

Final submission file head:
                                     id  appeal_to_fear  bandwag

In [None]:
# --- Optional: Save Merged Model or GGUF ---

# Optional export step. Both switches default to False, so this cell only
# prints a message or does nothing unless one of them is enabled.
# Requires 'model_for_inference' (the merged model) from the previous cell.

save_merged_16bit = False # Set to True to save the full 16-bit merged model
save_gguf = False       # Set to True to save in GGUF format

if model_for_inference is None:
    print("\nSkipping optional saving: Inference model not available.")
else:
    # --- Save Merged Model (float16 / bfloat16) ---
    if save_merged_16bit:
        merged_path = os.path.join(output_dir, "final_merged_16bit")
        print(f"\nSaving merged 16-bit model to {merged_path}...")
        try:
            # Model and tokenizer are written to the same directory
            for saveable in (model_for_inference, tokenizer_for_inference):
                saveable.save_pretrained(merged_path)
            print("Merged model saved.")
        except Exception as e:
            print(f"Error saving merged model: {e}")

    # --- Save GGUF Model ---
    if save_gguf:
        # Quantization type for GGUF (e.g., "Q8_0", "Q4_K_M", "F16")
        # Note: Unsloth's GGUF saving might only support specific types. Check their docs.
        gguf_quant_type = "Q8_0"
        gguf_path = os.path.join(output_dir, f"final_model_{gguf_quant_type}.gguf")
        print(f"\nSaving GGUF model ({gguf_quant_type}) to {gguf_path}...")
        try:
            # Assumes 'model_for_inference' exposes save_pretrained_gguf;
            # the AttributeError handler below covers the case where it does not.
            model_for_inference.save_pretrained_gguf(
                output_dir, # Specifies the base name/directory
                tokenizer_for_inference,
                quantization_type = gguf_quant_type
            )
            # Locate the file the export produced and move it to gguf_path
            quantized_gguf_name = os.path.join(output_dir, f"ggml-model-{gguf_quant_type}.gguf")
            default_gguf_name = os.path.join(output_dir, "ggml-model-f16.gguf") # Adjust if default name differs
            if os.path.exists(quantized_gguf_name):
                produced_gguf_name = quantized_gguf_name
            elif gguf_quant_type.upper() == "F16" and os.path.exists(default_gguf_name):
                # The default f16 file only counts when F16 was actually requested
                produced_gguf_name = default_gguf_name
            else:
                produced_gguf_name = None

            if produced_gguf_name is None:
                print(f"GGUF file with expected name not found after save attempt.")
            else:
                os.rename(produced_gguf_name, gguf_path)
                print(f"GGUF model saved as {gguf_path}")

        except AttributeError:
            print("`save_pretrained_gguf` method not found on the model object.")
            print("Ensure you are using a recent version of Unsloth/Transformers that supports it, or use llama.cpp conversion tools.")
        except Exception as e:
            print(f"Error saving GGUF model: {e}")