In [9]:
# ! pip install transformers accelerate datasets peft bitsandbytes evaluate sentencepiece

In [None]:
# train_bert_mnli_kaggle.py
# Fine-tune BERT-base on MNLI and push final model to HF Hub repo "{HF_USERNAME}/BERT-Base-MNLI-Orig"
import os
import argparse
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    AutoConfig
)
import evaluate
from huggingface_hub import HfApi, login
from kaggle_secrets import UserSecretsClient

def push_folder_to_hub(folder_path, repo_id, token):
    api = HfApi()
    api.create_repo(repo_id=repo_id, token=token, exist_ok=True)
    api.upload_folder(folder_path=folder_path, repo_id=repo_id, token=token, repo_type="model")

def preprocess_batch(examples, tokenizer, max_length=256):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True, max_length=max_length)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", default="bert-base-uncased")
    parser.add_argument("--output_dir", default="/kaggle/working/bert_mnli_out")
    parser.add_argument("--epochs", type=int, default=5)
    parser.add_argument("--train_bs", type=int, default=8)
    parser.add_argument("--eval_bs", type=int, default=16)
    parser.add_argument("--lr", type=float, default=2e-5)
    parser.add_argument("--max_len", type=int, default=256)
    args, unknown = parser.parse_known_args()

    # HF secrets (Kaggle)
    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
    HF_USERNAME = user_secrets.get_secret("HF_USERNAME")
    if not HF_TOKEN or not HF_USERNAME:
        raise RuntimeError("Set HF_TOKEN and HF_USERNAME as Kaggle Secrets before running.")
    repo_id = f"{HF_USERNAME}/BERT-Base-MNLI-Orig"

    # Login for uploads
    login(token=HF_TOKEN)

    # Load tokenizer & (sequence-classification) model
    tokenizer = AutoTokenizer.from_pretrained(args.model_id, use_fast=True)
    # Create config with correct num_labels
    cfg = AutoConfig.from_pretrained(args.model_id)
    cfg.num_labels = 3
    model = AutoModelForSequenceClassification.from_pretrained(args.model_id, config=cfg)

    # Load MNLI
    ds = load_dataset("multi_nli")
    train = ds["train"]
    val_matched = ds["validation_matched"]
    val_mismatched = ds["validation_mismatched"]

    # Preprocess (batched)
    train = train.map(lambda x: preprocess_batch(x, tokenizer, args.max_len), batched=True, remove_columns=[c for c in train.column_names if c not in ("premise","hypothesis","label")])
    val_matched = val_matched.map(lambda x: preprocess_batch(x, tokenizer, args.max_len), batched=True, remove_columns=[c for c in val_matched.column_names if c not in ("premise","hypothesis","label")])
    val_mismatched = val_mismatched.map(lambda x: preprocess_batch(x, tokenizer, args.max_len), batched=True, remove_columns=[c for c in val_mismatched.column_names if c not in ("premise","hypothesis","label")])

    # Combine matched + mismatched as test set for final evaluation
    from datasets import concatenate_datasets

# Combine two datasets
    test = concatenate_datasets([val_matched, val_mismatched])
    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer)
    print(test[:2])
    # Metrics
    metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")
    metric_prec = evaluate.load("precision")
    metric_rec = evaluate.load("recall")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = logits.argmax(axis=-1)
        return {
            "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
            "precision_macro": metric_prec.compute(predictions=preds, references=labels, average="macro")["precision"],
            "recall_macro": metric_rec.compute(predictions=preds, references=labels, average="macro")["recall"],
            "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        }

    # Training args
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.train_bs,
        per_device_eval_batch_size=args.eval_bs,
        learning_rate=args.lr,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="steps",
        weight_decay=0.01,
        warmup_steps=500,
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        remove_unused_columns=True,
        fp16=True,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=test,   # evaluate on combined validation during training
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train
    trainer.train()

    # Final evaluation on test (validation_matched + validation_mismatched)
    eval_res = trainer.evaluate(eval_dataset=test)
    print("Final evaluation (combined val_matched + val_mismatched):")
    print(eval_res)

    # Save locally and push to Hugging Face hub
    final_dir = os.path.join(args.output_dir, "final_model")
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)

    push_folder_to_hub(final_dir, repo_id, HF_TOKEN)
    print("Pushed model to:", repo_id)

if __name__ == "__main__":
    main()


## Debiased Model training and evaluation

In [11]:
#Create and process debiased MNLI dataset

# Create folder and download debiased MNLI JSONL (works on Kaggle)
import os, sys, subprocess
os.makedirs('data/debiased', exist_ok=True)
# Prefer wget on Kaggle; fallback to curl if needed
url = 'https://storage.googleapis.com/allennlp-public-data/gen-debiased-nli/mnli_z-aug.jsonl'
out = 'data/debiased/mnli_aug-z.jsonl'
if not os.path.exists(out):
    try:
        # Try wget first
        subprocess.run(['wget', '-O', out, url], check=True)
    except Exception:
        # Fallback to curl
        subprocess.run(['curl', '-L', url, '-o', out], check=True)
print('File present:', os.path.exists(out), out)

import random, numpy as np

seed = 42
random.seed(seed); np.random.seed(seed);


File present: True data/debiased/mnli_aug-z.jsonl


In [12]:
import pandas as pd
import json
from pathlib import Path
from sklearn.model_selection import train_test_split

# Load debiased MNLI JSONL
path = Path('./data/debiased/mnli_aug-z.jsonl')
data = pd.read_json(path, lines=True)

# Default split ratios and seed
train_frac, eval_frac, test_frac = 0.8, 0.1, 0.1
seed = 42

# Two-step split with stratification by 'type' to preserve distribution across splits
# 1) Split into train and temp (eval+test)
temp_frac = eval_frac + test_frac
stratify_train = data['type'] if 'type' in data.columns else None
train_df, temp_df = train_test_split(
    data,
    test_size=temp_frac,
    random_state=seed,
    shuffle=True,
    stratify=stratify_train
)

# 2) Split temp into eval and test, preserving overall ratios
stratify_temp = temp_df['type'] if 'type' in temp_df.columns else None
eval_df, test_df = train_test_split(
    temp_df,
    test_size=(test_frac / temp_frac) if temp_frac > 0 else 0.5,
    random_state=seed + 1,
    shuffle=True,
    stratify=stratify_temp
)

# Report sizes and per-type counts for sanity
print({
    'total': len(data),
    'train': len(train_df),
    'eval': len(eval_df),
    'test': len(test_df)
})
if 'type' in data.columns:
    print('Per-type counts:')
    print('train:', train_df['type'].value_counts().to_dict())
    print('eval :', eval_df['type'].value_counts().to_dict())
    print('test :', test_df['type'].value_counts().to_dict())
#Print per label counts
print('Per label counts:')
print('train:', train_df['label'].value_counts().to_dict())
print('eval :', eval_df['label'].value_counts().to_dict())
print('test :', test_df['label'].value_counts().to_dict())

# Optionally save splits (JSONL)
out_dir = Path('./data/debiased')
train_df.to_json(out_dir / 'mnli_aug-z.train.jsonl', orient='records', lines=True, force_ascii=False)
eval_df.to_json(out_dir / 'mnli_aug-z.eval.jsonl', orient='records', lines=True, force_ascii=False)
test_df.to_json(out_dir / 'mnli_aug-z.test.jsonl', orient='records', lines=True, force_ascii=False)
print('Saved splits to', str(out_dir))

{'total': 744326, 'train': 595460, 'eval': 74433, 'test': 74433}
Per-type counts:
train: {'original': 306161, 'generated': 289299}
eval : {'original': 38271, 'generated': 36162}
test : {'original': 38270, 'generated': 36163}
Per label counts:
train: {2: 204637, 1: 201242, 0: 189581}
eval : {2: 25676, 1: 25118, 0: 23639}
test : {2: 25575, 1: 25038, 0: 23820}
Saved splits to data\debiased


In [None]:
# train_bert_mnli_kaggle.py
# Fine-tune BERT-base on MNLI and push final model to HF Hub repo "{HF_USERNAME}/BERT-Base-MNLI-Orig"
import os
import argparse
from datasets import load_dataset
from transformers import EarlyStoppingCallback

import datasets
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    AutoConfig
)
import evaluate
from huggingface_hub import HfApi, login

def push_folder_to_hub(folder_path, repo_id, token):
    api = HfApi()
    api.create_repo(repo_id=repo_id, token=token, exist_ok=True)
    api.upload_folder(folder_path=folder_path, repo_id=repo_id, token=token, repo_type="model")

def preprocess_batch(examples, tokenizer, max_length=256):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True, max_length=max_length)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", default="bert-base-uncased")
    parser.add_argument("--output_dir", default="/kaggle/working/bert_mnli_out")
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_bs", type=int, default=8)
    parser.add_argument("--eval_bs", type=int, default=16)
    parser.add_argument("--lr", type=float, default=2e-5)
    parser.add_argument("--max_len", type=int, default=256)
    args, unknown = parser.parse_known_args()


    HF_TOKEN = ""
    HF_USERNAME = "ndsanjana"
    wandb_token = os.environ.get("WANDB_TOKEN")
    # os.environ["WANDB_API_KEY"] = wandb_token
    os.environ["WANDB_PROJECT"] = "mnli-debiased"
    import wandb
    wandb.login(key=wandb_token)  # Usually you only need to run this once per machine
    wandb.init(entity="rahul-krishnan27-universit-t-trier", project="mnli-debiased")
    if not HF_TOKEN or not HF_USERNAME:
        raise RuntimeError("Set HF_TOKEN and HF_USERNAME as Kaggle Secrets before running.")
    repo_id = f"{HF_USERNAME}/BERT-Base-MNLI-Debiased"

    # Login for uploads
    login(token=HF_TOKEN)

    # Load tokenizer & (sequence-classification) model
    tokenizer = AutoTokenizer.from_pretrained(args.model_id, use_fast=True)
    # Create config with correct num_labels and explicit label mapping
    cfg = AutoConfig.from_pretrained(args.model_id)
    cfg.num_labels = 3
    # Explicit class order: 0=ENTAILMENT, 1=NEUTRAL, 2=CONTRADICTION, plus LABEL_* for eval compatibility
    cfg.id2label = {0: "LABEL_0", 1: "LABEL_1", 2: "LABEL_2"}
    cfg.label2id = {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2,
                    "ENTAILMENT": 0, "NEUTRAL": 1, "CONTRADICTION": 2}
    model = AutoModelForSequenceClassification.from_pretrained(args.model_id, config=cfg)
    model.config.hidden_dropout_prob = 0.2
    model.config.attention_probs_dropout_prob = 0.1
    # Load MNLI
    train = Dataset.from_pandas(train_df)
    eval_set = Dataset.from_pandas(eval_df)
    test = Dataset.from_pandas(test_df)
    # Preprocess (batched)
    train = train.map(lambda x: preprocess_batch(x, tokenizer, args.max_len), batched=True, remove_columns=[c for c in train.column_names if c not in ("premise","hypothesis","label")])
    eval_set = eval_set.map(lambda x: preprocess_batch(x, tokenizer, args.max_len), batched=True, remove_columns=[c for c in eval_set.column_names if c not in ("premise","hypothesis","label")])
    test = test.map(lambda x: preprocess_batch(x, tokenizer, args.max_len), batched=True, remove_columns=[c for c in test.column_names if c not in ("premise","hypothesis","label")])
    print(eval_set[:2])
    # Combine matched + mismatched as test set for final evaluation
    #test = val_matched.concatenate(val_mismatched)
    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer)

    # Metrics
    metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")
    metric_prec = evaluate.load("precision")
    metric_rec = evaluate.load("recall")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = logits.argmax(axis=-1)
        return {
            "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
            "precision_macro": metric_prec.compute(predictions=preds, references=labels, average="macro")["precision"],
            "recall_macro": metric_rec.compute(predictions=preds, references=labels, average="macro")["recall"],
            "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
            }

    # Training args
    training_args = TrainingArguments(
        output_dir="your_output_dir",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=16,
        learning_rate=1e-5,
        num_train_epochs=5,
        lr_scheduler_type="constant_with_warmup",
        warmup_steps=2000,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        logging_steps=200,
        fp16=True,  # if you have GPU support
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=eval_set,   # evaluate on combined validation during training
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Train
    trainer.train()

    # Final evaluation on test (validation_matched + validation_mismatched)

    eval_res = trainer.evaluate(eval_dataset=test)
    print("Final evaluation - MNLI Debiased:")
    print(eval_res)

    # Save locally and push to Hugging Face hub
    final_dir = os.path.join(args.output_dir, "final_model_debiased")
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)

    push_folder_to_hub(final_dir, repo_id, HF_TOKEN)
    print("Pushed model to:", repo_id)

if __name__ == "__main__":
    main()




[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/accuracy,▁▇█▇█▇
eval/f1_macro,▁▇█▇▇▇
eval/loss,▂▁▂▆█▂
eval/precision_macro,▁▆█▇▇▇
eval/recall_macro,▁▇███▇
eval/runtime,▁▁▅▅██
eval/samples_per_second,██▃▄▁▁
eval/steps_per_second,██▃▄▁▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████

0,1
eval/accuracy,0.88817
eval/f1_macro,0.88861
eval/loss,0.33112
eval/precision_macro,0.88986
eval/recall_macro,0.88808
eval/runtime,266.9512
eval/samples_per_second,278.826
eval/steps_per_second,17.43
total_flos,1.3269533217921792e+17
train/epoch,5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 595460/595460 [00:40<00:00, 14608.39 examples/s]
Map: 100%|██████████| 74433/74433 [00:05<00:00, 12919.73 examples/s]
Map: 100%|██████████| 74433/74433 [00:04<00:00, 15036.66 examples/s]


{'premise': ["JERUSALEM (AP) - Israel's top court Wednesday upheld the right of the country's largest daily paper to publish editorially critical of the government, a decision that outraged politicians and human rights groups but won admiration from most Israelis.", 'And each year at festival time, Edinburgh willingly gives its streets over to stilt walkers, automatons, satirists, and barbershop quartets along with 500,000 visitors belying its reputation for being sober and staid.'], 'hypothesis': ['Politicians and other groups were upset by the ruling.', 'With all the people and activity, Edinburgh is quite chaotic around festival time. '], 'label': [0, 1], 'input_ids': [[101, 6744, 1006, 9706, 1007, 1011, 3956, 1005, 1055, 2327, 2457, 9317, 16813, 1996, 2157, 1997, 1996, 2406, 1005, 1055, 2922, 3679, 3259, 2000, 10172, 8368, 2135, 4187, 1997, 1996, 2231, 1010, 1037, 3247, 2008, 23558, 8801, 1998, 2529, 2916, 2967, 2021, 2180, 17005, 2013, 2087, 28363, 1012, 102, 8801, 1998, 2060, 296

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.3546,0.334551,0.875821,0.876144,0.876054,0.876097
2,0.2889,0.309098,0.888692,0.889359,0.888806,0.889008
3,0.2108,0.327444,0.891003,0.892059,0.890956,0.891341
4,0.1658,0.373436,0.89111,0.891512,0.891265,0.891357
5,0.1287,0.455744,0.888154,0.888611,0.888076,0.888293


Final evaluation - MNLI Debiased:
{'eval_loss': 0.3783320486545563, 'eval_accuracy': 0.8899547243830022, 'eval_precision_macro': 0.8902772518489513, 'eval_recall_macro': 0.8901263932942888, 'eval_f1_macro': 0.8901749240240694, 'eval_runtime': 159.853, 'eval_samples_per_second': 465.634, 'eval_steps_per_second': 29.108, 'epoch': 5.0}


model.safetensors: 100%|██████████| 438M/438M [06:18<00:00, 1.16MB/s]   


Pushed model to: ndsanjana/BERT-Base-MNLI-Debiased


In [14]:
# # kaggle_train_gemma_nli.py
# # Kaggle-ready trainer: Gemma-3-270m + LoRA -> merge -> push to HF Hub repo "{HF_USERNAME}/gemma3_mnli_orig"
# import os
# import argparse
# from datasets import load_dataset
# import evaluate
# from transformers import (
#     AutoTokenizer,
#     AutoConfig,
#     AutoModelForSequenceClassification,
#     Trainer,
#     TrainingArguments,
#     DataCollatorWithPadding
# )
# from peft import LoraConfig, get_peft_model
# from peft import PeftModel
# from huggingface_hub import HfApi, login

# def get_model_and_tokenizer(model_name, num_labels, token):
#     tok = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=token)
#     try:
#         model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, trust_remote_code=True, token=token)
#     except Exception:
#         from transformers import AutoModel, AutoConfig, PreTrainedModel
#         import torch.nn as nn, torch
#         cfg = AutoConfig.from_pretrained(model_name, token=token)
#         cfg.num_labels = num_labels
#         base = AutoModel.from_pretrained(model_name, config=cfg, trust_remote_code=True, token=token)
#         class SimpleClsModel(PreTrainedModel):
#             config_class = type(cfg)
#             def __init__(self, base_model, config):
#                 super().__init__(config)
#                 self.base = base_model
#                 hidden = base_model.config.hidden_size
#                 self.pooler = nn.Linear(hidden, hidden)
#                 self.classifier = nn.Linear(hidden, config.num_labels)
#             def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
#                 outputs = self.base(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
#                 last = outputs.last_hidden_state
#                 pooled = last.mean(dim=1)
#                 pooled = torch.tanh(self.pooler(pooled))
#                 logits = self.classifier(pooled)
#                 from transformers.modeling_outputs import SequenceClassifierOutput
#                 loss = None
#                 if labels is not None:
#                     import torch.nn.functional as F
#                     if self.config.num_labels == 1:
#                         loss = F.mse_loss(logits.view(-1), labels.view(-1).float())
#                     else:
#                         loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1))
#                 return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=None, attentions=None)
#         model = SimpleClsModel(base, cfg)
#     return tok, model

# def preprocess(batch, tokenizer, max_length=256):
#     # ensure truncation; do NOT set return_tensors here
#     return tokenizer(batch["premise"], batch["hypothesis"], truncation=True, max_length=max_length)

# def push_folder_to_hub(folder_path, repo_id, token):
#     api = HfApi()
#     # create repo if not exists
#     api.create_repo(repo_id=repo_id, token=token, exist_ok=True)
#     # upload entire folder
#     api.upload_folder(folder_path=folder_path, repo_id=repo_id, token=token, repo_type="model")

# def main():
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--model_name", default="google/gemma-3-270m")
#     parser.add_argument("--output_dir", default="/kaggle/working/gemma_nli_out")
#     # fixed sensible defaults (term-paper friendly)
#     parser.add_argument("--per_device_train_batch_size", type=int, default=8)
#     parser.add_argument("--per_device_eval_batch_size", type=int, default=16)
#     parser.add_argument("--epochs", type=int, default=3)
#     parser.add_argument("--lr", type=float, default=2e-5)
#     parser.add_argument("--lora_r", type=int, default=16)
#     parser.add_argument("--lora_alpha", type=int, default=32)
#     parser.add_argument("--max_len", type=int, default=256)
#     args, unknown = parser.parse_known_args()
#     from kaggle_secrets import UserSecretsClient
#     user_secrets = UserSecretsClient()
#     HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
#     HF_USERNAME = user_secrets.get_secret("HF_USERNAME")

#     #HF_TOKEN = os.environ.get("HF_TOKEN")
#     #HF_USERNAME = os.environ.get("HF_USERNAME")
#     if not HF_TOKEN or not HF_USERNAME:
#         raise RuntimeError("Set HF_TOKEN and HF_USERNAME as Kaggle Secrets or environment variables before running.")

#     repo_id = f"{HF_USERNAME}/gemma3_mnli_debiased"
#     from huggingface_hub import login
#     from datasets import Dataset
#     login(token=HF_TOKEN)
#     num_labels = 3
#     tokenizer, model = get_model_and_tokenizer(args.model_name, num_labels, token=HF_TOKEN)

#     # Apply LoRA / PEFT
#     lora_config = LoraConfig(
#         task_type="SEQ_CLS",
#         r=args.lora_r,
#         lora_alpha=args.lora_alpha,
#         target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#         inference_mode=False
#     )
#     model = get_peft_model(model, lora_config)
#         # Unfreeze typical head names — print names first to confirm an exact substring
#     for n,p in model.named_parameters():
#         if any(k in n.lower() for k in ("classifier","pooler","out_proj","dense")):
#             p.requires_grad = True
#     # Load MNLI
#     #dataset = load_dataset("multi_nli")
#     train = Dataset.from_pandas(train_df)
#     eval_set = Dataset.from_pandas(eval_df)

#     #eval_set = dataset["validation_matched"]
#     n = len(train)               # total samples
#     print(n)
#     # tokenization (batched)
#     train = train.map(lambda x: preprocess(x, tokenizer, args.max_len), batched=True, remove_columns=[c for c in train.column_names if c not in ("premise","hypothesis","label")])
#     eval_set = eval_set.map(lambda x: preprocess(x, tokenizer, args.max_len), batched=True, remove_columns=[c for c in eval_set.column_names if c not in ("premise","hypothesis","label")])

#     data_collator = DataCollatorWithPadding(tokenizer)
#     metric = evaluate.load("accuracy")

#     def compute_metrics(eval_pred):
#         logits, labels = eval_pred
#         preds = logits.argmax(axis=-1)
#         return metric.compute(predictions=preds, references=labels)

#     training_args = TrainingArguments(
#         output_dir=args.output_dir,
#         per_device_train_batch_size=args.per_device_train_batch_size,
#         per_device_eval_batch_size=args.per_device_eval_batch_size,
#         num_train_epochs=args.epochs,
#         learning_rate=args.lr,
#         eval_strategy="epoch",   # older/newer transformers differences: this is compatible on many versions; if error, use eval_strategy
#         save_strategy="epoch",
#         logging_strategy="steps",
#         logging_steps=100,
#         remove_unused_columns=True,
#         fp16=True,
#         push_to_hub=False,   # we will push merged model ourselves
#         report_to="none"
#     )

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=train,
#         eval_dataset=eval_set,
#         tokenizer=tokenizer,
#         data_collator=data_collator,
#         compute_metrics=compute_metrics
#     )

#     trainer.train()
#     trainer.evaluate()

#     # Save the PEFT adapter first
#     adapter_dir = os.path.join(args.output_dir, "peft_adapter_debias")
#     model.save_pretrained(adapter_dir)

#     # Merge PEFT weights into a standalone model for inference
#     # PeftModel.merge_and_unload() returns a merged model (may be framework dependent)
#     try:
#         # If model is a PeftModel, merge adapters
#         if hasattr(model, "merge_and_unload"):
#             merged_model = model.merge_and_unload()
#         else:
#             # fallback: load base and PeftModel then merge
#             base = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=num_labels, trust_remote_code=True, token=HF_TOKEN)
#             pft = PeftModel.from_pretrained(base, adapter_dir, token=HF_TOKEN)
#             merged_model = pft.merge_and_unload()
#     except Exception as e:
#         print("Warning: merging adapters failed — saving adapter only. Error:", e)
#         merged_model = None

#     # Save merged model if available; otherwise save adapter + base reference
#     if merged_model is not None:
#         merged_dir = os.path.join(args.output_dir, "merged_model_debias")
#         merged_model.save_pretrained(merged_dir)
#         tokenizer.save_pretrained(merged_dir)
#         # push merged_dir to Hub
#         push_folder_to_hub(merged_dir, repo_id, HF_TOKEN)
#         print("Merged model pushed to:", repo_id)
#     else:
#         # push adapter + instruction for loading
#         push_folder_to_hub(adapter_dir, repo_id, HF_TOKEN)
#         tokenizer.save_pretrained(adapter_dir)
#         print("Adapter pushed to:", repo_id, " — note: loading requires base model + adapter")

# if __name__ == "__main__":
#     main()


In [15]:
# # Evaluate fine-tuned model on test_df with accuracy, precision, recall, F1
# import os
# import numpy as np
# from datasets import Dataset
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
# from peft import PeftModel
# import evaluate

# # Ensure the split cell has run
# assert 'test_df' in globals(), "test_df not found. Run the split cell first."

# # Paths consistent with the training cell's defaults
# output_dir = "/kaggle/working/gemma_nli_out"
# merged_dir = os.path.join(output_dir, "merged_model_debias")
# adapter_dir = os.path.join(output_dir, "peft_adapter_debias")
# model_name = "google/gemma-3-270m"
# num_labels = 3
# max_len = 256

# # Tokenizer: prefer merged_dir if available, else base model
# tok_path = merged_dir if os.path.isdir(merged_dir) else model_name
# tokenizer = AutoTokenizer.from_pretrained(tok_path, use_fast=True)

# # Model: prefer merged standalone model; else base + adapter
# model = None
# if os.path.isdir(merged_dir):
#     try:
#         model = AutoModelForSequenceClassification.from_pretrained(merged_dir, trust_remote_code=True)
#     except Exception as e:
#         print("Warning: failed to load merged model; falling back to base + adapter.", e)

# if model is None:
#     base = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, trust_remote_code=True)
#     model = PeftModel.from_pretrained(base, adapter_dir)

# # Convert pandas -> HF Dataset
# test_ds = Dataset.from_pandas(test_df)

# # Tokenize
# def preprocess_eval(batch):
#     return tokenizer(batch["premise"], batch["hypothesis"], truncation=True, max_length=max_len)

# keep_cols = ["premise", "hypothesis", "label"]
# remove_cols = [c for c in test_ds.column_names if c not in keep_cols]
# test_ds_tok = test_ds.map(preprocess_eval, batched=True, remove_columns=remove_cols)

# data_collator = DataCollatorWithPadding(tokenizer)

# eval_args = TrainingArguments(
#     output_dir=output_dir,
#     per_device_eval_batch_size=32,
#     dataloader_drop_last=False,
#     report_to="none"
# )

# trainer = Trainer(
#     model=model,
#     args=eval_args,
#     tokenizer=tokenizer,
#     data_collator=data_collator
# )

# pred_out = trainer.predict(test_ds_tok)
# logits = pred_out.predictions
# labels = pred_out.label_ids
# preds = np.argmax(logits, axis=-1)

# acc = evaluate.load("accuracy")
# prec = evaluate.load("precision")
# rec = evaluate.load("recall")
# f1 = evaluate.load("f1")

# results = {
#     "accuracy": acc.compute(predictions=preds, references=labels)["accuracy"],
#     "precision": prec.compute(predictions=preds, references=labels, average="macro")["precision"],
#     "recall": rec.compute(predictions=preds, references=labels, average="macro")["recall"],
#     "f1": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
# }
# print(results)

In [16]:
links = [
    'https://github.com/Aatlantise/syntactic-augmentation-nli/blob/master/datasets/comb_orig_large.tsv',
    'https://github.com/Aatlantise/syntactic-augmentation-nli/blob/master/datasets/comb_trsf_large.tsv'
    'https://github.com/Aatlantise/syntactic-augmentation-nli/blob/master/datasets/inv_orig_large.tsv',
    'https://github.com/Aatlantise/syntactic-augmentation-nli/blob/master/datasets/inv_trsf_large.tsv',
    'https://github.com/Aatlantise/syntactic-augmentation-nli/blob/master/datasets/pass_orig_large.tsv',
    'https://github.com/Aatlantise/syntactic-augmentation-nli/blob/master/datasets/pass_trsf_large.tsv'
]
