In [1]:
# Clone the challenge repository and make it the working directory
# (presumably where the `utils` package imported below lives — confirm).
# NOTE: not idempotent. `%cd` changes the kernel's working directory, so
# re-running this cell in the same runtime clones a nested copy inside it.
!git clone https://github.com/melissafasol/InkubaLM-Challenge.git
%cd InkubaLM-Challenge

Cloning into 'InkubaLM-Challenge'...
remote: Enumerating objects: 29, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 29 (delta 9), reused 22 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (29/29), 252.47 KiB | 5.74 MiB/s, done.
Resolving deltas: 100% (9/9), done.
/content/InkubaLM-Challenge


In [2]:
# Reload imported modules automatically before each cell executes, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [21]:
import sys
sys.path.append("..")  # Add parent directory to the path

import os
from typing import List
from pathlib import Path

import numpy as np

# DO NOT EDIT
# create submission file
import pandas as pd
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
)
from utils import (
    eval,
    model_function,  # functions to load model and run inference
)

In [4]:
# `%pip` installs into the environment of the running kernel; the version is
# pinned to the release this notebook was last executed with (see output below).
%pip install -q datasets==3.4.1


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [5]:
from google.colab import drive

# Mount Google Drive inside the Colab VM.
drive.mount('/content/drive')

# Folder on Drive that receives every prediction / submission CSV; it is
# created (including missing parents) if it does not exist yet.
output_path = "/content/drive/MyDrive/InkubaLM-Challenge/Output"
Path(output_path).mkdir(parents=True, exist_ok=True)

Mounted at /content/drive


In [6]:
# Disable tokenizers' internal parallelism for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

try:
    from google.colab import userdata

    # Note: `userdata.get` is a Colab API. If you're not using Colab, set the env
    # vars as appropriate for your system.
    # userdata.get("HF_TOKEN") indicates that the name of the token in the Colab env is HF_TOKEN
    os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
except Exception:
    # Not running on Colab, or the secret could not be read. Keep an HF_TOKEN
    # that was already exported in the environment; only fall back to the
    # placeholder when nothing is set. `except Exception` (rather than a bare
    # `except`) lets KeyboardInterrupt / SystemExit propagate.
    os.environ.setdefault("HF_TOKEN", "----")

# Authenticate against the Hugging Face Hub with whichever token was resolved.
login(token=os.environ["HF_TOKEN"])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [7]:
def process_likelihood(likelihood_str: str) -> List[float]:
    """
    Parse the textual form of a likelihood tensor/list into a flat list of floats.

    Accepts strings such as ``"tensor([[-1.2, -0.3]], device='cuda:0')"`` or a
    plain ``"[-1.2, -0.3]"``. Any ``keyword=value`` annotation (``device=...``,
    ``dtype=...``, ...) is discarded regardless of its value, and all bracket /
    parenthesis nesting is flattened.

    Args:
        likelihood_str: String representation of the likelihood values.

    Returns:
        The numeric values in their original order; an empty list when the
        string holds no values.

    Raises:
        ValueError: If a comma-separated piece left after cleaning is not a
            valid float literal.
    """
    import re  # local import: only this helper needs regular expressions

    # Remove every `name=value` annotation up to the next comma or closing
    # bracket, so the parser does not depend on a specific device or dtype.
    without_kwargs = re.sub(r"\b[A-Za-z_]\w*\s*=\s*[^,\])]+", "", likelihood_str)

    # Remove the `tensor` wrapper and all bracket / parenthesis characters.
    numbers_only = re.sub(r"tensor|[()\[\]]", "", without_kwargs)

    # Blank pieces (left behind by removed annotations or trailing commas) are skipped.
    return [float(piece) for piece in numbers_only.split(",") if piece.strip()]

def create_submission(output_path, test_flag: bool):
    """
    Merge the six per-task prediction CSVs into a single submission file.

    The files ``{hau,swa}_{sent,mt,xnli}_prediction[_dev].csv`` are read from
    ``output_path``. For sentiment and XNLI files the ``Response`` column is
    dropped and ``Log-Likelihood`` takes its place; rows whose ``ID`` contains
    "sent" or "xnli" then get the argmax index of their parsed likelihoods as
    ``Response``. Translation rows keep their ``Response`` unchanged.

    Args:
        output_path: Directory holding the prediction CSVs; the submission
            file is written to the same directory.
        test_flag (bool): If True, read the ``*_dev.csv`` files and write
            ``submission_test.csv`` with columns ID, Response, Targets;
            otherwise read the final files and write ``submission.csv`` with
            columns ID, Response.

    Returns:
        The DataFrame that was written to disk.

    Raises:
        FileNotFoundError: If any of the six prediction files is missing.
    """
    # Dev-set predictions carry a "_dev" suffix; final predictions have none.
    suffix = "_dev" if test_flag else ""

    def _load(lang: str, task: str) -> pd.DataFrame:
        # Read one `<lang>_<task>_prediction<suffix>.csv` from output_path.
        return pd.read_csv(
            os.path.join(output_path, f"{lang}_{task}_prediction{suffix}.csv")
        )

    try:
        # Classification order: hau_sent, swa_sent, hau_xnli, swa_xnli.
        classification_frames = [
            _load(lang, task) for task in ("sent", "xnli") for lang in ("hau", "swa")
        ]
        # Translation order: hau_mt, swa_mt.
        translation_frames = [_load(lang, "mt") for lang in ("hau", "swa")]
    except FileNotFoundError:
        print(
            "Seems you have not completed all the tasks, please complete all the tasks before attempting to create your submission file"
        )
        raise

    # Combine and process data: classification rows expose their
    # log-likelihoods under the "Response" column.
    res_log = (
        pd.concat(classification_frames, ignore_index=True)
        .drop(columns=["Response"])
        .rename(columns={"Log-Likelihood": "Response"})
    )
    resmt = pd.concat(translation_frames, ignore_index=True)
    res = pd.concat([res_log, resmt], ignore_index=True)

    def process_row(row):
        # Classification rows: predicted label = index of the largest likelihood.
        if "xnli" in row["ID"] or "sent" in row["ID"]:
            likelihoods = process_likelihood(row["Response"])
            return np.argmax(likelihoods)
        return row["Response"]  # Default for other cases

    # Update the Response column
    res["Response"] = res.apply(process_row, axis=1)

    # The dev submission keeps the gold targets; the final one does not.
    if test_flag:
        filename = os.path.join(output_path, "submission_test.csv")
        columns = ["ID", "Response", "Targets"]
    else:
        filename = os.path.join(output_path, "submission.csv")
        columns = ["ID", "Response"]

    # Save the submission file
    submission = res[columns]
    submission.to_csv(filename, index=False)
    return submission

### Load the Model

In [8]:
# Hub identifier of the checkpoint used for every task in this notebook.
model_name = "lelapa/InkubaLM-0.4B"
# `token` replaces the deprecated `use_auth_token` argument of `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.environ["HF_TOKEN"])
model = model_function.load_model(model_name)



tokenizer_config.json:   0%|          | 0.00/960 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/991k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.95M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/763 [00:00<?, ?B/s]

vulavulaslm.py:   0%|          | 0.00/42.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/lelapa/InkubaLM-0.4B:
- vulavulaslm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

#### Sentiment Analysis Dataset

In [9]:
#!pip uninstall -y fsspec
#!pip install --no-cache-dir -U fsspec hf_transfer

Found existing installation: fsspec 2024.12.0
Uninstalling fsspec-2024.12.0:
  Successfully uninstalled fsspec-2024.12.0


In [11]:
from datasets import Dataset, DatasetDict

# Read the sentiment training split directly from the Hugging Face Hub.
sentiment_train_df = pd.read_parquet("hf://datasets/lelapa/SentimentTrain/data/train-00000-of-00001.parquet")

# Row masks selecting each language through the `langs` column.
is_hausa = sentiment_train_df['langs'] == 'hausa'
is_swahili = sentiment_train_df['langs'] == 'swahili'

hau_dataset = Dataset.from_pandas(sentiment_train_df[is_hausa])
swa_dataset = Dataset.from_pandas(sentiment_train_df[is_swahili])

# If you need a DatasetDict to mimic the Hugging Face structure
dataset_dict = DatasetDict({"swahili": swa_dataset, "hausa": hau_dataset})

# Print to verify
print(swa_dataset)
print(hau_dataset)


Dataset({
    features: ['ID', 'task', 'langs', 'data_source', 'instruction', 'inputs', 'targets', '__index_level_0__'],
    num_rows: 200
})
Dataset({
    features: ['ID', 'task', 'langs', 'data_source', 'instruction', 'inputs', 'targets', '__index_level_0__'],
    num_rows: 200
})


In [29]:
# Inspect one Swahili record to see the fields each example carries.
swa_dataset[0]

{'ID': 'ID_dfb02831_sentiment_ dev_swahili',
 'task': 'sentiment',
 'langs': 'swahili',
 'data_source': 'swahili_tweet',
 'instruction': 'Tafadhali tambua mawazo yaliyoonyeshwa kwenye matini haya kwa kutegemea miongozo ifuatayo: Chanya: iwapo matini yanadokeza mawazo, mtazamo na hali chanya ya kihisia. Hasi: iwapo matini yanadokeza mawazo au hisia hasi. Wastani: iwapo matini hayadokezi lugha chanya au hasi kwa njia ya moja kwa moja au isiyo ya moja kwa moja.',
 'inputs': 'picha mbunge wa kilombero peter lijualikali akitoka leo katika gereza la ukonga baada ya mahakama kuu kufuta hukumu',
 'targets': 'Wastani',
 '__index_level_0__': 2}

In [12]:
# don't change this instruction
# English task prompt handed to the sentiment inference calls below as `task_instruction`.
sent_instruction = "Please identify the sentiment reflected in this text based on the following guidelines: Positive: if a text implies positive sentiment, attitude, and emotional state. Negative: if a text implies negative sentiment or emotion. Neutral: if a text does not imply positive or negative language directly or indirectly. Provide sentiment labels only"

In [14]:
# Prompt template with a single `{}` placeholder, located after "### Instruction:".
# NOTE(review): the template mentions "an input" but has no separate slot for it;
# how `model_function.main` fills the placeholder is not visible here — confirm.
BASE_PROMPT = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n ### Instruction: {}\n\n ### Response: "

In [15]:
# for swahili
model_function.main(
    model,
    tokenizer,
    BASE_PROMPT,
    sample_size=3,
    max_new_tokens=15,
    task_instruction=sent_instruction,
    dataset=swa_dataset,
    csv_file_path=os.path.join(
        output_path,
        "swa_sent_prediction_dev.csv"
    ),
    custom_instruct=False,
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [23]:
import time
import os
import pandas as pd

def benchmark_task(dataset, task_instruction, output_path, csv_name, sample_size=3, max_new_tokens=15):
    """
    Run model inference on a dataset, time it, and score the dev submission file.

    Uses the notebook-level `model`, `tokenizer` and `BASE_PROMPT`.

    Args:
        dataset: Dataset handed to `model_function.main`; must support `len()`.
        task_instruction: Instruction text for the task.
        output_path: Directory containing the prediction CSV and `submission_test.csv`.
        csv_name: File name (inside `output_path`) passed to inference as `csv_file_path`.
        sample_size: Number of examples requested from `model_function.main`.
        max_new_tokens: Generation length limit forwarded to `model_function.main`.

    Returns:
        - zindi_score: Value returned by `eval.evaluate_zindi` for
          `<output_path>/submission_test.csv`
        - avg_time_per_sample: Wall-clock inference seconds divided by the
          number of examples requested (NaN when that number is zero)
    """
    csv_path = os.path.join(output_path, csv_name)

    start = time.time()

    # Run inference
    model_function.main(
        model=model,
        tokenizer=tokenizer,
        BASE_PROMPT=BASE_PROMPT,
        task_instruction=task_instruction,
        dataset=dataset,
        csv_file_path=csv_path,
        sample_size=sample_size,
        max_new_tokens=max_new_tokens,
        custom_instruct=False,
    )

    elapsed = time.time() - start

    # Evaluate using provided function
    # NOTE(review): this scores whatever `submission_test.csv` already exists in
    # `output_path`. Nothing in this function rebuilds it from `csv_path`, so the
    # score may not reflect the run above unless `create_submission(output_path,
    # test_flag=True)` was executed in between — confirm the intended workflow.
    zindi_score = eval.evaluate_zindi(os.path.join(output_path, "submission_test.csv"))

    # Compute time per sample over the examples that were requested, not the
    # full dataset: with sample_size < len(dataset) the old denominator
    # under-reported the per-sample latency.
    if sample_size is None:
        n_processed = len(dataset)
    else:
        n_processed = min(sample_size, len(dataset))
    avg_time_per_sample = elapsed / n_processed if n_processed > 0 else float("nan")

    return zindi_score, avg_time_per_sample


In [26]:
# Benchmark Swahili sentiment on the whole split (pass a smaller
# `sample_size` for a quicker test run).
zindi_score, time_per_sample = benchmark_task(
    swa_dataset,
    sent_instruction,
    output_path,
    "swa_sent_prediction_dev.csv",
    sample_size=len(swa_dataset),
    max_new_tokens=15,
)

print(f"Swahili Sentiment | Zindi Score: {zindi_score:.3f} | Time/sample: {time_per_sample:.2f}s")


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Swahili Sentiment | Zindi Score: 0.205 | Time/sample: 0.24s
