# Team Name: Team NeuralSight<br>
Model: *facebook/nllb-200-1.3B* with LoRA fine-tuning<br>
Technique: Parameter-Efficient Fine-Tuning (PEFT) using LoRA




Table of Contents:
1. Environment Setup & Dependencies
2. Data Loading
3. Model & Tokenizer Initialization
4. Data Preprocessing
5. LoRA Configuration
6. Training Configuration
7. Model Training
8. Inference & Prediction
9. Submission Generation

In [None]:
# Install dependencies
!pip install -q transformers datasets accelerate peft sentencepiece sacrebleu

In [None]:
# Imports
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model
import torch


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Read the train and test splits from their CSV files
train_csv_path = "/kaggle/input/aification/train.csv"
test_csv_path = "/kaggle/input/aification/test.csv"

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)


In [None]:
# Drop unusable training rows, then reset the index to avoid HF Dataset KeyErrors.
# A training pair with a missing source or target would reach the tokenizer as a
# non-string value, so such rows are removed before the Dataset is built.
# Test rows are deliberately left untouched so the submission keeps one row per question.
train_df = (
    train_df
    .dropna(subset=["bangla_question", "english_question"])
    .reset_index(drop=True)
)
test_df = test_df.reset_index(drop=True)

In [None]:
# Wrap both pandas frames as Hugging Face Dataset objects
train_dataset, test_dataset = map(Dataset.from_pandas, (train_df, test_df))

In [None]:
# Load tokenizer & model
model_name = "facebook/nllb-200-1.3B"

# NLLB prefixes every sequence with a language-code token. Without an explicit
# src_lang the tokenizer falls back to its default source language, so the
# Bangla inputs would be tagged with the wrong language. Declare both sides
# of the Bangla -> English direction up front.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    src_lang="ben_Beng",
    tgt_lang="eng_Latn",
)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

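# Preprocessing
# The two functions below only tokenize the text (truncated/padded to 128 tokens).
# Data-cleaning steps that are NOT performed by these functions:
# - Remove rows with missing source/target
# - Normalize whitespace
# - Optionally drop very long examples that will cause OOM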

def preprocess_train(examples):
    """Tokenize a batch of Bangla/English question pairs for seq2seq training.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)`` providing the
            columns ``"bangla_question"`` (source) and ``"english_question"``
            (target).

    Returns:
        The tokenizer output for the source texts (padded/truncated to 128
        tokens) with an added ``"labels"`` entry holding the target token ids,
        where padding positions are replaced by -100.
    """
    # Make the translation direction explicit so source and target sequences
    # carry the correct NLLB language-code tokens.
    tokenizer.src_lang = "ben_Beng"
    tokenizer.tgt_lang = "eng_Latn"

    # `text_target` tokenizes the targets in target-language mode; tokenizing
    # them with a plain call would treat them as source-language text.
    model_inputs = tokenizer(
        list(examples["bangla_question"]),
        text_target=list(examples["english_question"]),
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    # Labels are padded to a fixed length, so mask the pad positions with -100
    # (the index ignored by the loss); otherwise padding dominates the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs


def preprocess_test(examples):
    """Tokenize a batch of Bangla questions (no targets).

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)`` providing the
            column ``"bangla_question"``.

    Returns:
        The tokenizer output for the source texts, padded/truncated to 128
        tokens. No ``"labels"`` entry is produced.
    """
    # Tag the inputs as Bangla so they carry the correct NLLB language-code
    # token instead of the tokenizer's default source language.
    tokenizer.src_lang = "ben_Beng"

    return tokenizer(
        list(examples["bangla_question"]),
        max_length=128,
        padding="max_length",
        truncation=True,
    )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.48G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.48G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
# Tokenize both splits in batches with their respective preprocessing function
map_options = {"batched": True}
tokenized_train = train_dataset.map(preprocess_train, **map_options)
tokenized_test = test_dataset.map(preprocess_test, **map_options)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# Seq2seq batch collator built from the tokenizer and the model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# LoRA adapter settings (lightweight fine-tuning)
lora_settings = {
    "r": 16,
    "lora_alpha": 32,
    "target_modules": ["q_proj", "v_proj"],  # attention layers
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "SEQ_2_SEQ_LM",
}
lora_config = LoraConfig(**lora_settings)

# Wrap the loaded model with the LoRA adapters.
# NOTE: this rebinds `model`, so re-running the cell wraps the already-wrapped model again.
model = get_peft_model(model, lora_config)

In [None]:
# Training hyperparameters, collected in one place before building the arguments object
training_options = {
    "output_dir": "./ben_en_stem_model",
    "learning_rate": 2e-4,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 30,
    "save_total_limit": 2,
    "predict_with_generate": True,
    "fp16": True,
    "logging_steps": 50,
    "report_to": "none",
}
training_args = Seq2SeqTrainingArguments(**training_options)

In [None]:
# Trainer
import inspect

# Newer transformers releases deprecate the `tokenizer=` keyword of the trainer
# in favour of `processing_class=`. Pick whichever name the installed version
# accepts so the cell runs without a deprecation warning (or a TypeError once
# the old keyword is removed).
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# NOTE(review): eval_dataset is the tokenized test split, which carries no labels;
# it is kept for backward compatibility but cannot yield an evaluation loss.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)


  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output
train_output = trainer.train()
train_output

Step,Training Loss
50,9.4561
100,7.7105
150,7.0045
200,6.9763
250,6.7749
300,6.8085
350,6.8107
400,6.9519
450,6.7333
500,6.8024


TrainOutput(global_step=18750, training_loss=6.661796681315105, metrics={'train_runtime': 9473.6279, 'train_samples_per_second': 15.833, 'train_steps_per_second': 1.979, 'total_flos': 1.282178285568e+17, 'train_loss': 6.661796681315105, 'epoch': 30.0})

In [None]:
# Translation inference function
def translate_text(texts, max_len=128, src_lang="ben_Beng", tgt_lang="eng_Latn"):
    """Translate a batch of texts with the fine-tuned model.

    Args:
        texts: A list of source strings (a single string is also accepted and
            treated as a one-element batch).
        max_len: Maximum token length used both for truncating the inputs and
            for generation. Defaults to 128, matching the training preprocessing.
        src_lang: NLLB language code of the input texts.
        tgt_lang: NLLB language code the model is forced to generate in.

    Returns:
        A list of decoded translations, one per input text (empty list for
        empty input).
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return []

    # Tag the inputs with the correct source-language token.
    tokenizer.src_lang = src_lang

    # Follow the model's device instead of assuming CUDA is available.
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len,
    ).to(model.device)

    # Training leaves the model in train mode; switch dropout off for inference.
    model.eval()

    # NLLB needs the target language code forced as the first generated token,
    # otherwise the output language is not guaranteed.
    outputs = model.generate(
        **inputs,
        max_length=max_len,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Translate every test question and write the submission file.
# Nothing happens when the source column is absent (e.g. the cell was already run,
# since the column is dropped below).
source_column = "bangla_question"
if source_column in test_df.columns:
    # One translate_text call per row; each call receives a single-item batch
    test_df["english_question"] = test_df[source_column].map(
        lambda question: translate_text([question])[0]
    )

    # The submission only keeps the remaining columns plus the translation
    test_df = test_df.drop(columns=[source_column])

    test_df.to_csv("submission.csv", index=False)
