Main Code

In [None]:
# 1. Install libraries
# Notebook shell magics (`!`): each line runs pip in a subshell, so this cell only
# works inside an IPython/Colab kernel, not as a plain Python script.
!pip install "unsloth[colab-new]"
# TRL is the only package pinned to an exact version here.
!pip install trl==0.22.0
# --no-deps: install these four packages without letting pip resolve (and possibly
# replace) their own dependencies.
!pip install --no-deps xformers peft accelerate bitsandbytes
# NOTE(review): jsonlines is installed but never imported in the visible cells;
# JSONL files are parsed with the stdlib `json` module instead.
!pip install jsonlines

import os
import json
import re
import pandas as pd
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


300 step training

In [None]:
# 2. Download and prepare data
# Delete any previous checkout first so that re-running this cell does not make
# `git clone` fail on an already existing directory.
!rm -rf semeval2026-task12-dataset
!git clone https://github.com/sooo66/semeval2026-task12-dataset.git

# Root of the cloned repository. The clone above lands in the current working
# directory, so this absolute path assumes the kernel runs in /content
# (presumably Colab) — TODO confirm when running elsewhere.
BASE_PATH = "/content/semeval2026-task12-dataset"

# --- Function to load all docs.json files from all folders ---
def load_all_docs(base_path):
    """Collect the documents of every split into one ``topic_id -> docs`` mapping.

    Looks for ``<base_path>/<split>/docs.json`` in the train, dev, test and sample
    splits (in that order). A file holding a JSON list contributes one entry per
    item that has a non-null ``topic_id``; a file holding a JSON object is merged
    in as-is. Later splits overwrite earlier ones on duplicate keys.

    Args:
        base_path: directory that contains the split sub-folders.

    Returns:
        dict mapping each topic id to its list of documents. Missing files are
        skipped silently; unreadable or malformed files are reported with a
        printed message and otherwise ignored.
    """
    topic_to_docs = {}

    print("Loading documents...")
    for split_name in ('train_data', 'dev_data', 'test_data', 'sample_data'):
        docs_file = os.path.join(base_path, split_name, 'docs.json')
        if not os.path.exists(docs_file):
            continue

        try:
            with open(docs_file, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)

            if isinstance(payload, list):
                for record in payload:
                    topic = record.get('topic_id')
                    if topic is not None:
                        topic_to_docs[topic] = record.get('docs', [])
            elif isinstance(payload, dict):
                topic_to_docs.update(payload)

            print(f"Loaded docs from {split_name}")
        except Exception as err:
            # Best effort: one broken split must not prevent loading the others.
            print(f"Error loading {docs_file}: {err}")

    return topic_to_docs

docs_map = load_all_docs(BASE_PATH)

def read_jsonl(path):
    """Parse a JSON Lines file into a list of objects.

    Args:
        path: location of the ``.jsonl`` file.

    Returns:
        One parsed object per non-blank line, in file order. An empty list is
        returned when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return []

    with open(path, 'r', encoding='utf-8') as handle:
        # Whitespace-only lines are skipped rather than handed to the JSON parser.
        return [json.loads(raw_line) for raw_line in handle if raw_line.strip()]

# --- Modified to keep track of golden answers for evaluation ---
def format_data(split_folder, is_test=False):
    """Build prompt records for one dataset split.

    Reads ``<BASE_PATH>/<split_folder>/questions.jsonl``, looks up the documents
    registered in ``docs_map`` under each question's ``topic_id``, and renders the
    context, event and four options into a single prompt.

    Args:
        split_folder: name of the split directory under ``BASE_PATH``.
        is_test: when True the answer is never appended to ``text``.

    Returns:
        A list of dicts with the keys:
          * ``prompt`` - the prompt that ends with ``### Answer:\\n``
          * ``text``   - the prompt, plus the answer and ``<|end_of_text|>`` when
                         ``is_test`` is False and a golden answer exists
          * ``id``     - the question id
          * ``golden`` - the golden answer as a list of individual option labels
                         (``"A,C"`` becomes ``["A", "C"]``)
    """
    file_path = os.path.join(BASE_PATH, split_folder, 'questions.jsonl')
    raw_data = read_jsonl(file_path)
    formatted_data = []

    print(f"Processing {split_folder} - Found {len(raw_data)} questions.")

    for item in raw_data:
        topic_id = item.get('topic_id')
        context_text = ""

        if topic_id in docs_map:
            docs_list = docs_map[topic_id]
            texts = []
            for d in docs_list:
                if isinstance(d, dict):
                    text = d.get('content', d.get('snippet', ''))
                    if text: texts.append(text)
                else:
                    texts.append(str(d))
            context_text = "\n\n".join(texts)

        # Truncate text to prevent memory errors
        context_text = context_text[:2500]
        event = item.get('target_event', '')
        options = f"A: {item.get('option_A', '')}\nB: {item.get('option_B', '')}\nC: {item.get('option_C', '')}\nD: {item.get('option_D', '')}"

        prompt = f"""Below is an event and context documents. Identify the most probable cause using Abductive Reasoning.

### Context:
{context_text}

### Event:
{event}

### Options:
{options}

### Answer:
"""
        raw_answer = item.get('golden_answer', [])

        # Store one label per list element. Wrapping a comma-joined string such as
        # "A,C" in a list would yield the single label "A,C", which can never equal
        # a predicted set like {"A", "C"} during scoring.
        golden = _split_golden_labels(raw_answer)

        entry = {"text": prompt, "prompt": prompt, "id": item.get("id"), "golden": golden}

        if not is_test and raw_answer:
            # The training target keeps the dataset's own spelling: a string answer
            # is appended verbatim, a list answer is joined with ", ".
            ans_str = raw_answer if isinstance(raw_answer, str) else ", ".join(raw_answer)
            entry["text"] += ans_str + "<|end_of_text|>"

        formatted_data.append(entry)

    return formatted_data


def _split_golden_labels(raw_answer):
    """Return the individual option labels contained in a golden answer.

    Accepts ``None``, a (possibly comma-separated) string, or a list of such
    strings. Labels are stripped, empty pieces are dropped, and duplicates are
    removed while keeping first-seen order.
    """
    if raw_answer is None:
        return []
    if isinstance(raw_answer, str):
        raw_answer = [raw_answer]

    labels = []
    for part in raw_answer:
        for label in str(part).split(','):
            label = label.strip()
            if label and label not in labels:
                labels.append(label)
    return labels

# Load datasets
# dev_data is built with is_test=False so its golden answers are kept for scoring;
# only the "prompt" field of each record is fed to the model at evaluation time.
train_data = format_data('train_data', is_test=False)
dev_data = format_data('dev_data', is_test=False)
test_data = format_data('test_data', is_test=True)

train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))

# 3. Load model and train
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    load_in_4bit = True,
)

# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, target_modules =["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16, lora_dropout = 0, bias = "none",
    use_gradient_checkpointing = "unsloth", random_state = 3407,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,  # effective batch size 2 x 4 = 8
        warmup_steps = 10,
        max_steps = 300,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        output_dir = "outputs",
        # Disable logging integrations: otherwise Weights & Biases stops the run
        # with an interactive account prompt, which breaks "Restart & Run All".
        report_to = "none",
    ),
)

print("Starting training...")
trainer.train()

# 4. EVALUATION MODULE (Added for the Report)
# Switch the fine-tuned model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

def extract_predicted_options(text):
    """Pull the option letters A-D out of the model's raw answer text.

    The text is upper-cased first, so lower-case letters count as well. Only
    letters standing alone as a word are accepted (``"ABC"`` yields nothing).

    Returns:
        The set of distinct letters found; empty when there is none.
    """
    upper_text = text.upper()
    return {hit.group(0) for hit in re.finditer(r'\b[A-D]\b', upper_text)}

def calculate_score(pred_set, gold_set):
    """Score one prediction against the gold labels.

    Returns:
        1.0 when both sets are identical, 0.5 when the prediction is a non-empty
        subset of the gold set, and 0.0 otherwise (including when either set is
        empty).
    """
    if pred_set and gold_set:
        if pred_set == gold_set:
            return 1.0
        if pred_set.issubset(gold_set):
            return 0.5
    return 0.0

def evaluate_on_dev(dev_data):
    """Generate an answer for every dev record and report the average score.

    For each record the prompt is sent to the model, the option letters are
    extracted from the text generated after ``### Answer:`` and scored against
    the record's golden labels with ``calculate_score``. Up to three records that
    did not receive the full score are printed for error analysis.

    Args:
        dev_data: list of records with a ``prompt`` string and a ``golden`` list.
            Golden entries may hold one label each or several labels joined by
            commas (e.g. ``"A,C"``); both forms are split into individual labels.

    Returns:
        The average score as a float (0 when ``dev_data`` is empty).
    """
    print(f"\nEvaluating on DEV set ({len(dev_data)} samples)...")
    total_score = 0.0
    error_samples = []

    for i, item in enumerate(dev_data):
        if i % 20 == 0 and i > 0: print(f"Evaluated {i}/{len(dev_data)}")

        inputs = tokenizer([item["prompt"]], return_tensors="pt").to("cuda")
        outputs = model.generate(
            **inputs, max_new_tokens=10, use_cache=True, pad_token_id=tokenizer.eos_token_id
        )

        # The decoded text contains the prompt too; keep only what follows the
        # last answer marker.
        decoded = tokenizer.batch_decode(outputs)[0]
        response = decoded.split("### Answer:\n")[-1].replace("<|end_of_text|>", "").strip()

        pred_set = extract_predicted_options(response)

        # Split comma-joined gold entries into single labels. Without this a gold
        # answer stored as "A,C" becomes the one-element set {"A,C"} and a correct
        # prediction {"A", "C"} is scored 0.
        golden = item.get("golden") or []
        if isinstance(golden, str):
            golden = [golden]
        gold_set = {
            label.strip()
            for gold_entry in golden
            for label in str(gold_entry).split(",")
            if label.strip()
        }

        score = calculate_score(pred_set, gold_set)
        total_score += score

        # Store a few errors for "Error Analysis" in the report
        if score < 1.0 and len(error_samples) < 3:
            error_samples.append({
                # Sorted so the printed samples are stable between runs.
                "predicted": sorted(pred_set),
                "golden": sorted(gold_set),
                "raw_response": response,
            })

    avg_score = total_score / len(dev_data) if len(dev_data) > 0 else 0
    print(f"\n=========================================")
    print(f"üèÜ FINAL DEV SCORE (Accuracy): {avg_score:.4f}")
    print(f"=========================================\n")

    print("üîç Error Analysis Samples (Include these in your report!):")
    for idx, err in enumerate(error_samples):
        print(f"  Error {idx+1}:")
        print(f"  - Model's Raw Output: '{err['raw_response']}'")
        print(f"  - Extracted Prediction: {err['predicted']}")
        print(f"  - Actual Golden Answer: {err['golden']}\n")

    return avg_score

# Run Evaluation
# Skipped entirely when the dev split produced no records.
if len(dev_data) > 0:
    evaluate_on_dev(dev_data)

# 5. Predict Test Data and Save
def generate_test_predictions(data_list):
    """Generate one prediction record per test item.

    Args:
        data_list: list of records with a ``prompt`` string and an ``id``.

    Returns:
        A list of ``{"id": ..., "prediction": [...]}`` dicts, one per item, where
        ``prediction`` holds the extracted option letters in alphabetical order,
        or ``["D"]`` when no letter could be extracted.
    """
    predictions = []
    print(f"Generating Codabench predictions for test set ({len(data_list)} items)...")
    for item in data_list:
        inputs = tokenizer([item["prompt"]], return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=10, use_cache=True, pad_token_id=tokenizer.eos_token_id)

        # The decoded text contains the prompt too; keep only what follows the
        # last answer marker.
        decoded = tokenizer.batch_decode(outputs)[0]
        response = decoded.split("### Answer:\n")[-1].replace("<|end_of_text|>", "").strip()

        # Format as list of strings for Codabench. Sorting makes the label order
        # deterministic; iterating a set directly yields an arbitrary order that
        # can differ between runs.
        pred_list = sorted(extract_predicted_options(response))

        predictions.append({
            "id": item["id"],
            "prediction": pred_list if pred_list else ["D"]  # Default to D if extraction fails
        })
    return predictions

test_preds = generate_test_predictions(test_data)

# Save file for Codabench (even if site is closed, include in zip)
# One JSON object per line (JSON Lines).
output_filename = "predictions.jsonl"
with open(output_filename, 'w') as out_file:
    out_file.writelines(json.dumps(entry) + '\n' for entry in test_preds)

print(f"\nDone! File '{output_filename}' saved.")

Cloning into 'semeval2026-task12-dataset'...
remote: Enumerating objects: 69, done.
remote: Counting objects: 100% (69/69), done.
remote: Compressing objects: 100% (52/52), done.
remote: Total 69 (delta 30), reused 51 (delta 15), pack-reused 0 (from 0)
Receiving objects: 100% (69/69), 6.72 MiB | 14.30 MiB/s, done.
Resolving deltas: 100% (30/30), done.
Loading documents...
Loaded docs from train_data
Loaded docs from dev_data
Loaded docs from test_data
Loaded docs from sample_data
Processing train_data - Found 1819 questions.
Processing dev_data - Found 400 questions.
Processing test_data - Found 612 questions.
==((====))==  Unsloth 2026.2.1: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.35. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unslo

Unsloth 2026.2.1 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/1819 [00:00<?, ? examples/s]

Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,819 | Num Epochs = 2 | Total steps = 300
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)
wandb: (1) Create a W&B account
wandb: (2) Use an existing W&B account
wandb: (3) Don't visualize my results
wandb: Enter your choice:wandb: You chose "Don't visualize my results"
wandb: Using W&B in offline mode.
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin


wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
10,2.8809
20,2.5126
30,2.231
40,1.8794
50,1.5688
60,1.1414
70,0.9568
80,0.6946
90,0.5891
100,0.4605




0,1
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/grad_norm,▂▁▂▃▅█▄▅▆▂▄▅▃▂▄▁▂▃▁▂▂▂▂▂▁▁▂▂▂▂
train/learning_rate,▇██▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁
train/loss,█▇▆▅▅▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
total_flos,9239322963800064.0
train/epoch,1.31648
train/global_step,300.0
train/grad_norm,0.53527
train/learning_rate,0.0
train/loss,0.0781
train_loss,0.59848
train_runtime,616.6255
train_samples_per_second,3.892
train_steps_per_second,0.487



Evaluating on DEV set (400 samples)...
Evaluated 20/400
Evaluated 40/400
Evaluated 60/400
Evaluated 80/400
Evaluated 100/400
Evaluated 120/400
Evaluated 140/400
Evaluated 160/400
Evaluated 180/400
Evaluated 200/400
Evaluated 220/400
Evaluated 240/400
Evaluated 260/400
Evaluated 280/400
Evaluated 300/400
Evaluated 320/400
Evaluated 340/400
Evaluated 360/400
Evaluated 380/400

üèÜ FINAL DEV SCORE (Accuracy): 0.3225

üîç Error Analysis Samples (Include these in your report!):
  Error 1:
  - Model's Raw Output: 'A'
  - Extracted Prediction: ['A']
  - Actual Golden Answer: ['B']

  Error 2:
  - Model's Raw Output: 'B'
  - Extracted Prediction: ['B']
  - Actual Golden Answer: ['A,B,C']

  Error 3:
  - Model's Raw Output: 'A,C'
  - Extracted Prediction: ['C', 'A']
  - Actual Golden Answer: ['A,C']

Generating Codabench predictions for test set (612 items)...

Done! File 'predictions.jsonl' saved.
