In [2]:
import sys
import subprocess

def install(package):
    """Install ``package`` with pip into the running interpreter's environment.

    Executes ``<sys.executable> -m pip install <package>`` as a subprocess.

    Args:
        package: Requirement string handed to ``pip install`` unchanged.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # Going through sys.executable targets the interpreter running this kernel.
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Install the notebook's extra dependencies one at a time, in a fixed order.
for _package in ("trl", "bitsandbytes", "datasets"):
    install(_package)



In [None]:
import sys
sys.path.append("..")  # Add parent directory to the path

import os
from typing import List
from pathlib import Path
import numpy as np

# DO NOT EDIT
# create submission file
import pandas as pd
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
)
from utils import (
    eval,
    model_function,
    multitask,
    experiment_logger
    )

import torch
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, concatenate_datasets, Dataset, Value
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

## **Load and Explore Datasets**

In [None]:
print("# Loading datasets")
# Build the combined train and test splits through the project helper.
train_dataset = multitask.load_and_combine_datasets("Train")
test_dataset = multitask.load_and_combine_datasets("Test")

# Show the first record of each split under its own heading.
print("\n# Example from training dataset:", train_dataset[0], sep="\n")
print("\n# Example from test dataset:", test_dataset[0], sep="\n")

In [None]:
print("# Converting to DataFrame and extracting task types")
# Add a `task` column by passing every ID through multitask.extract_task_from_id.
train_df = train_dataset.to_pandas().assign(
    task=lambda frame: frame["ID"].apply(multitask.extract_task_from_id)
)

# Number of training rows per task value.
print("\n# Dataset distribution by task:", train_df["task"].value_counts(), sep="\n")

In [None]:
# Hand the task-annotated training frame to the project helper that displays
# examples (presumably a few formatted rows per task — see utils.multitask).
multitask.display_formatted_examples(train_df)

Analyse target sequence lengths (without balancing)

In [None]:
# Run the project's length analysis on the raw (unbalanced) training frame and
# keep whatever it returns in `task_stats_before`.
# NOTE(review): presumably per-task target-length statistics — confirm in utils.multitask.
print("\n# Analyzing target sequence lengths without balancing")
task_stats_before = multitask.analyze_task_lengths(train_df)

In [None]:
# Wrap the task-annotated frame, with its index reset and the old index dropped,
# as a `datasets.Dataset` for the unbalanced training run.
unbalanced_dataset = Dataset.from_pandas(
    train_df.reset_index(drop=True)
)

print(
    "\n# The problem: Tasks with shorter outputs (like sentiment) will be underrepresented",
    "in the loss function compared to tasks with longer outputs (like mt).",
    sep="\n",
)

In [None]:
# Produce a balanced copy of the training frame via the project helper.
# NOTE(review): the balancing strategy lives in utils.multitask and is not visible here.
print("\n# Applying target length balancing fix")
balanced_df = multitask.balance_target_lengths(train_df)

# Re-run the same length analysis on the balanced frame.
print("\n# Analyzing target sequence lengths after balancing")
task_stats_after = multitask.analyze_task_lengths(balanced_df)

# Wrap the balanced frame (index reset, old index dropped) as a `datasets.Dataset`.
balanced_dataset = Dataset.from_pandas(balanced_df.reset_index(drop=True))

In [None]:
# Pass the original and the balanced frames to the project's plotting helper
# (presumably a before/after comparison of target lengths — see utils.multitask).
multitask.plot_target_lengths(train_df, balanced_df)

### Load the Model

In [None]:
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantisation settings with double quantisation and bfloat16 compute.
# NOTE(review): nothing reads this object before the model-setup cell rebinds
# `bnb_config` from multitask.setup_model_and_tokenizer(...), so these values only
# take effect if that helper happens to use the same settings — confirm or remove.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # or load_in_8bit=True for 8-bit quantization
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",  # or "fp4"
    bnb_4bit_compute_dtype=torch.bfloat16  # or torch.float16, depending on your hardware
)

In [None]:
# Load the base checkpoint through the project helper, which returns the model,
# its tokenizer and a quantisation config (this rebinds any earlier `bnb_config`).
print("\n# Setting up model with QLoRA")
model_name = "lelapa/InkubaLM-0.4B"
model, tokenizer, bnb_config = multitask.setup_model_and_tokenizer(model_name)

## Apply LoRA adapters

In [None]:
# Rebind `model` to the result of the project's LoRA helper.
# NOTE(review): adapter rank/targets are set inside utils.multitask, not here.
model = multitask.apply_lora_adapters(model)

In [None]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably the tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Build a trainer over the unbalanced dataset; its artefacts go to
# ./sft_model/unbalanced.
print("\n# Training with unbalanced dataset (demonstrating the problem)")
unbalanced_trainer = multitask.setup_trainer(
    model=model,
    dataset= unbalanced_dataset,
    tokenizer=tokenizer,
    output_dir="./sft_model/unbalanced"
)
# Last expression of the cell: the value returned by train() is shown as cell output.
unbalanced_trainer.train()

In [None]:
# Start the balanced run from a freshly loaded model with new LoRA adapters.
# The previous `model` object was already passed to the unbalanced trainer and
# trained by it; reusing it here (unless setup_trainer copies the model) would
# make the "balanced" run continue from weights fine-tuned on the unbalanced
# data, contaminating both the comparison and the checkpoint saved afterwards.
# The three calls below repeat the notebook's own model-setup steps.
model, tokenizer, bnb_config = multitask.setup_model_and_tokenizer(model_name)
model = multitask.apply_lora_adapters(model)
tokenizer.pad_token = tokenizer.eos_token

# Train on the length-balanced dataset; artefacts go to ./sft_model/balanced.
balanced_trainer = multitask.setup_trainer(
    model=model,
    dataset=balanced_dataset,
    tokenizer=tokenizer,
    output_dir="./sft_model/balanced"
)
# Keep the training result: later cells read `train_output.training_loss`.
train_output = balanced_trainer.train()

## Save model to Google Drive

In [None]:
# Destination directory for the saved checkpoint and the training metadata.
# NOTE(review): absolute Colab/Google Drive path — assumes Drive is already
# mounted at /content/drive; no mount step is visible in this notebook.
output_dir = "/content/drive/MyDrive/InkubaLM/outputs/lora_checkpoint"

In [None]:
# Write the current `model` and the tokenizer files into `output_dir`.
# NOTE(review): if `model` is a PEFT-wrapped model this presumably stores only
# the adapter weights, not the base model — confirm before relying on it.
model.save_pretrained(output_dir)
# Last expression of the cell: its return value is shown as the cell output.
tokenizer.save_pretrained(output_dir)

In [None]:
# Persist the final training loss next to the checkpoint. Only `training_loss`
# is written; other fields of `train_output` are not stored.
import json

training_summary = {"training_loss": train_output.training_loss}
with open(f"{output_dir}/training_output.json", "w") as summary_file:
    json.dump(training_summary, summary_file)


## Extract Training Results and Log Experiment

In [None]:
# Training loss reported by the balanced run's train() result.
# NOTE(review): `final_loss` is not read by any later cell visible in this notebook.
final_loss = train_output.training_loss

In [None]:
# Score the submission file with the project's evaluator, which returns the
# overall score and a dict of scores.
# `output_path` was never defined in this notebook, so the original call raised
# NameError on a fresh kernel; the only output location the notebook defines is
# `output_dir`.
# NOTE(review): assumes submission_test.csv is written to `output_dir` — no
# cell in this notebook creates that file, so confirm where it is generated.
submission_csv = os.path.join(output_dir, "submission_test.csv")
zindi_score, dict_scores = eval.evaluate_zindi(submission_csv)

In [None]:
# Bare expression: displays the overall score returned by the evaluation cell.
zindi_score

In [None]:
# Record the balanced run through the project's experiment logger.
# `prompt_variant` and `notes` are free-text labels hard-coded for this run —
# update them whenever the prompt or adapter configuration changes.
# NOTE(review): `lb_score` is read from dict_scores["zindi_score"]; the
# evaluation cell also binds a separate `zindi_score` value — confirm that the
# key exists in `dict_scores` and that both hold the same number.
experiment_logger.log_experiment_auto(
    trainer=balanced_trainer,
    train_output=train_output,
    prompt_variant="Instruction v3 + few-shot",
    task_metrics=dict_scores,
    lb_score=dict_scores["zindi_score"],
    notes="Added larger rank, new Swahili MT prompt"
)

### Check output

In [None]:
# Read the experiment log CSV back in to check its contents.
# NOTE(review): presumably this is the file experiment_logger.log_experiment_auto
# appends to — the logger's target path is not visible here, so confirm they match.
log_path = "/content/drive/MyDrive/InkubaLM/outputs/experiment_log.csv"
df = pd.read_csv(log_path)
df.head()  # Show first 5 rows
