In [None]:
from google.colab import drive
drive.mount('/content/drive')

#installing required libraries
!pip install -q transformers accelerate peft bitsandbytes datasets evaluate bert-score sacremoses sacrebleu
!pip install sentencepiece  # required for Gemma tokenizer
!pip install torchmetrics  #SARI calculation

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [None]:
from datasets import load_dataset
import json

dataset_path = "/content/drive/MyDrive/Sinhala-simp-complex/Data/sitse_instruct_data.jsonl" #dataset uploaded and loaded from my drive

#loading dataset with Hugging Face
try:
    dataset = load_dataset("json", data_files=dataset_path)["train"]
    print("Dataset loaded successfully with Hugging Face")
    print(dataset[0])
except Exception as e:
    print(f"Hugging Face loading failed: {e}")
    print("Loading manually...")
    with open(dataset_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    dataset = [json.loads(line) for line in lines]
    print("Manual loading successful")
    print(dataset[0])

Hugging Face loading failed: Loading a dataset cached in a LocalFileSystem is not supported.
Loading manually...
Manual loading successful
{'instruction': 'Simplify the following Sinhala sentence.', 'input': '‡∂∏‡∑ô‡∂∏ ‡∂Ü‡∑É‡∑í‡∂∫‡∑è‡∂±‡∑î ‡∑É‡∂Ç‡∑Ä‡∂ª‡∑ä‡∂∞‡∂± ‡∂∂‡∑ê‡∂Ç‡∂ö‡∑î ‡∂´‡∂∫ ‡∂Ü‡∂∞‡∑è‡∂ª ‡∂∏‡∑î‡∂Ø‡∂Ω ‡∂ú‡∑ô‡∑Ä‡∑ì‡∂∏ ‡∑É‡∂≥‡∑Ñ‡∑è ‡∑Ä‡∑É‡∂ª ‡∂¥‡∑Ñ‡∂ö ‡∑É‡∑Ñ‡∂± ‡∂ö‡∑è‡∂Ω‡∂∫‡∂ö‡∑ä ‡∂Ø , ‡∂´‡∂∫ ‡∂∏‡∑î‡∂Ø‡∂Ω ‡∂ú‡∑ô‡∑Ä‡∑ì‡∂∏ ‡∑É‡∂≥‡∑Ñ‡∑è ‡∑Ä‡∑É‡∂ª 25 ‡∂ö‡∑è‡∂Ω‡∂∫‡∂ö‡∑ä‡∂Ø ‡∂Ω‡∑ê‡∂∂‡∑ô‡∂± ‡∂Ö‡∂≠‡∂ª ‡∂∏‡∑ô‡∂∏ ‡∂´‡∂∫ ‡∂∏‡∑î‡∂Ø‡∂Ω ‡∑É‡∂≥‡∑Ñ‡∑è ‡∂Ö‡∂∫‡∂ö‡∂ª‡∂± ‡∂¥‡∑ú‡∑Ö‡∑ì ‡∂Ö‡∂±‡∑î‡∂¥‡∑è‡∂≠‡∑í‡∂ö‡∂∫ ‡∑Ä‡∂±‡∑ä‡∂±‡∑ö ‡∂Ü‡∑É‡∑í‡∂∫‡∑è‡∂±‡∑î ‡∑É‡∂Ç‡∑Ä‡∂ª‡∑ä‡∂∞‡∂± ‡∂∂‡∑ê‡∂Ç‡∂ö‡∑î‡∑Ä‡∂ß ‡∂â‡∑Ñ‡∂Ω ‡∂Ω‡∂±‡∑ä‡∂©‡∂±‡∑ä ‡∂Ö‡∂±‡∑ä‡∂≠‡∂ª ‡∂∂‡∑ê‡∂Ç‡∂ö‡∑î‡∑Ä‡∑ö ‡∂¥‡∑ú‡∑Ö‡∑ì ‡∂Ö‡∂±‡∑î‡∂¥‡∑è‡∂≠‡∑í‡∂ö‡∂∫‡∂ß 0.6 % ‡∂ö‡∑ä ‡∂ë‡∂ö‡∂≠‡∑î ‡∂ö‡∂ª ‡∑Ä‡∑ö .', 'output': '‡∂∏‡∑ô‡∂∏ ‡∂Ü‡∑É‡∑í‡∂∫‡∑è‡∂±‡∑î ‡∑É‡∂Ç‡∑Ä‡∂ª‡∑ä‡∂∞‡∂± ‡∂∂‡∑ê‡∂Ç‡∂ö‡∑î ‡∂´‡∂∫ ‡∂Ü‡∂∞‡∑è‡∂ª ‡∂∏‡∑î‡∂Ø‡∂Ω ‡∂ú‡∑ô‡∑Ä‡∑ì‡∂∏ ‡∑É‡∂≥‡∑Ñ‡∑è ‡∑Ä‡∑É‡∂ª ‡∂¥‡

In [None]:
def format_prompt(example):
    return {
        "prompt": f"{example['instruction']}\n{example['input']}",
        "output": example["output"]
    }

#formatting
if isinstance(dataset, list):
    formatted_data = [format_prompt(ex) for ex in dataset]
else:
    formatted_data = dataset.map(format_prompt)

# Train-test split
from sklearn.model_selection import train_test_split

if isinstance(formatted_data, list):
    train_data, test_data = train_test_split(formatted_data, test_size=0.1)
else:
    split_data = formatted_data.train_test_split(test_size=0.1)
    train_data = split_data["train"]
    test_data = split_data["test"]

print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

Train samples: 2700
Test samples: 300


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_id = "google/gemma-2b"

hf_token = "your_huggingface_token_here......."  #hugging face token (add yours here)

# quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)


tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=hf_token  
)
tokenizer.pad_token = tokenizer.eos_token  

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    token=hf_token  
)

print("Model and tokenizer loaded successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and tokenizer loaded successfully!


In [None]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 9,805,824 || all params: 2,515,978,240 || trainable%: 0.3897


In [None]:
import evaluate

bertscore = evaluate.load("bertscore")
sari = evaluate.load("sari")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    source_texts = [p.split("\n")[-1] for p in pred_texts]

    #compute BERTScore
    bertscore_results = bertscore.compute(
        predictions=pred_texts,
        references=label_texts,
        lang="en",  # Sinhala not directly supported
        model_type="bert-base-multilingual-cased"
    )

    #SARI
    sari_score = sari.compute(
        sources=source_texts,
        predictions=pred_texts,
        references=[[ref] for ref in label_texts]
    )["sari"]

    return {
        "bertscore_precision": sum(bertscore_results["precision"]) / len(bertscore_results["precision"]),
        "bertscore_recall": sum(bertscore_results["recall"]) / len(bertscore_results["recall"]),
        "bertscore_f1": sum(bertscore_results["f1"]) / len(bertscore_results["f1"]),
        "sari": sari_score
    }


In [None]:
from transformers import TrainingArguments, Trainer

# Convert data to tokenized format
def tokenize_function(examples):
    model_inputs = tokenizer(examples["prompt"], truncation=True, max_length=1024, padding="max_length")
    labels = tokenizer(examples["output"], truncation=True, max_length=1024, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


if isinstance(train_data, list):
    from datasets import Dataset
    train_dataset = Dataset.from_dict({"prompt": [x["prompt"] for x in train_data],
                                    "output": [x["output"] for x in train_data]})
    test_dataset = Dataset.from_dict({"prompt": [x["prompt"] for x in test_data],
                                   "output": [x["output"] for x in test_data]})
else:
    train_dataset = train_data
    test_dataset = test_data

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# training arguments
training_args = TrainingArguments(
    output_dir="./gemma-lora-sinhala",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,        #  low memory eval
    eval_accumulation_steps=1,          
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    fp16=True,
    bf16=False,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

#check token lengths of input and output
input_lengths = [len(tokenizer(x["prompt"])["input_ids"]) for x in formatted_data]
output_lengths = [len(tokenizer(x["output"])["input_ids"]) for x in formatted_data]

print("Max input length:", max(input_lengths))
print("Max output length:", max(output_lengths))


Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Max input length: 590
Max output length: 622


In [None]:
#training
trainer.train()

# Save model
model.save_pretrained("gemma-lora-sinhala")
tokenizer.save_pretrained("gemma-lora-sinhala")

[34m[1mwandb[0m: Currently logged in as: [33manshath7[0m ([33manshath7-albukhary-international-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.6583,0.648146
2,0.6791,0.63916
3,0.6084,0.636996



Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Access to model google/gemma-2b is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in google/gemma-2b.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Access to model google/gemma-2b is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in google/gemma-2b.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Access to model google/gemma-2b is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in google/gemma-2b.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Acce

('gemma-lora-sinhala/tokenizer_config.json',
 'gemma-lora-sinhala/special_tokens_map.json',
 'gemma-lora-sinhala/tokenizer.model',
 'gemma-lora-sinhala/added_tokens.json',
 'gemma-lora-sinhala/tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm
import evaluate

# load model and tokenizer
model_path = "./gemma-lora-sinhala"
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

# evaluate metrics setup
bertscore = evaluate.load("bertscore")
sari_metric = evaluate.load("sari")

#convert dataset to normal dict if needed
if not isinstance(test_dataset, list):
    test_dataset = test_dataset.to_dict()
    formatted_test_data = [{"prompt": test_dataset["prompt"][i], "output": test_dataset["output"][i]} for i in range(len(test_dataset["prompt"]))]
else:
    formatted_test_data = test_dataset

generated = []
references = []
sources = []

print("üîç Generating predictions...")

for example in tqdm(formatted_test_data):
    prompt = example["prompt"]
    source = prompt.split("\n")[-1]
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).input_ids.to(model.device)

    with torch.no_grad():
        output_ids = model.generate(input_ids, max_new_tokens=512, do_sample=False)

    pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Post-process to remove instruction if still present
    if "\n" in pred:
        pred = pred.split("\n")[-1]

    generated.append(pred.strip())
    references.append([example["output"].strip()])
    sources.append(source.strip())

print("‚úÖ Generation complete. Now computing metrics...")

#BERTScore
bertscore_result = bertscore.compute(predictions=generated, references=[r[0] for r in references], lang="en", model_type="bert-base-multilingual-cased")

#SARI
sari_scores = [
    sari_metric.compute(
        sources=[sources[i]],
        predictions=[generated[i]],
        references=[references[i]]
    )["sari"]
    for i in range(len(generated))
]

avg_sari = sum(sari_scores) / len(sari_scores)

#results
print("\nüìä Evaluation Metrics:")
print(f"BERTScore Precision: {sum(bertscore_result['precision']) / len(bertscore_result['precision']):.4f}")
print(f"BERTScore Recall:    {sum(bertscore_result['recall']) / len(bertscore_result['recall']):.4f}")
print(f"BERTScore F1:        {sum(bertscore_result['f1']) / len(bertscore_result['f1']):.4f}")
print(f"SARI (avg):          {avg_sari:.4f}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

üîç Generating predictions...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 300/300 [1:56:02<00:00, 23.21s/it]


‚úÖ Generation complete. Now computing metrics...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]


üìä Evaluation Metrics:
BERTScore Precision: 0.8418
BERTScore Recall:    0.8844
BERTScore F1:        0.8603
SARI (avg):          61.1118


In [None]:
!cp -r ./gemma-lora-sinhala /content/drive/MyDrive/Sinhala-simp-complex

model.save_pretrained("/content/drive/MyDrive/Sinhala-simp-complex/model/gemma-lora-sinhala")
tokenizer.save_pretrained("/content/drive/MyDrive/Sinhala-simp-complex/model/gemma-lora-sinhala-tokenizer")