In [None]:
import subprocess
import sys
import os
import pickle


def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install packages
install("trl")
install("bitsandbytes")
install("datasets")




In [None]:
import sys
sys.path.append("..")  # Add parent directory to the path

import os
from typing import List
from pathlib import Path
import numpy as np

# DO NOT EDIT
# create submission file
import pandas as pd
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
)
from utils import (
    eval,
    model_function,
    multitask,
    supplement,
    ab_testing
    )

import torch
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, concatenate_datasets, Dataset, Value
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import PeftModel

import shap

In [None]:
def build_prompt(example):
    return f"{example['instruction']}\n{example['inputs']}\nAnswer:"

def log_likelihood_score(prompt, label):
    full_input = prompt + " " + label
    input_ids = tokenizer(full_input, return_tensors="pt").input_ids.to("cuda")
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
    return -loss.item()


def predict_fn(prompt_list):
    all_scores = []
    for prompt in prompt_list:
        scores = [log_likelihood_score(prompt, " " + label) for label in labels]
        probs = torch.nn.functional.softmax(torch.tensor(scores), dim=0)
        all_scores.append(probs.numpy())
    return all_scores

In [None]:
print("# Loading datasets")
se_dataset, mt_dataset,xn_dataset = multitask.load_and_combine_datasets("Train")
test_dataset = multitask.load_and_combine_datasets("Test")


test_dataset_list = list(test_dataset)
combined_test_dataset = concatenate_datasets(test_dataset_list)

## Load Base Model and Fine-Tuned QLoRA model

## Base Model:

In [None]:
model_name = "lelapa/InkubaLM-0.4B"
model, tokenizer, bnb_config = multitask.setup_model_and_tokenizer(model_name, token=token,use_4bit=False)

In [21]:
prompt = "Translate to Swahili: I love science."

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Translate to Swahili: I love science. Translate to Swahili: I love science. Translate to Swahili: I love science.


### QLoRA Model

In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("melissafasol/Inkuba_QLoRA", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("melissafasol/Inkuba_QLoRA")



In [19]:
prompt = "Translate to Swahili: I love science."

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Translate to Swahili: I love science. | Apg29 Translate to Swahili: I love science. | Apg29 Trans


### Code to implement SHAP analysis on base model

In [None]:
#Separate by language
se_df = pd.DataFrame(se_dataset)
xn_df = pd.DataFrame(xn_dataset)

swa_se_dset = Dataset.from_pandas(se_df.loc[se_df['langs'] == 'swahili'])
hau_se_dset = Dataset.from_pandas(se_df.loc[se_df['langs'] == 'hausa'])
swa_xn_dset = Dataset.from_pandas(se_df.loc[xn_df['langs'] == 'swahili'])
hau_xn_dset = Dataset.from_pandas(se_df.loc[xn_df['langs'] == 'hausa'])


In [37]:
labels = swa_se_dset['targets']
sample_data = swa_se_dset.select(range(3))
sample_prompts = [build_prompt(example) for example in sample_data]

In [38]:
model.to("cuda")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(61788, 2048)
    (layers): ModuleList(
      (0-7): 8 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [None]:
explainer = shap.Explainer(predict_fn, tokenizer, algorithm = "partition")
shap_values = explainer(sample_prompts)

In [None]:
# Define save path
save_dir = "/content/drive/MyDrive/InkubaLM/outputs/swahili_sentiment"
os.makedirs(save_dir, exist_ok=True)

# Recreate explainer and generate SHAP values
explainer = shap.Explainer(predict_fn, tokenizer, algorithm="partition")
shap_values = explainer(sample_prompts)

# Save explainer
with open(os.path.join(save_dir, "explainer.pkl"), "wb") as f:
    pickle.dump(explainer, f)

# Save shap values
with open(os.path.join(save_dir, "shap_values.pkl"), "wb") as f:
    pickle.dump(shap_values, f)

print("✅ SHAP values and explainer saved successfully.")


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  33%|███▎      | 1/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 100%|██████████| 3/3 [1:20:22<00:00, 1195.53s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 4it [2:00:13, 2404.37s/it]

✅ SHAP values and explainer saved successfully.



