<a href="https://colab.research.google.com/github/msamwelmollel/Evaluate-Liger/blob/main/Evaluate_Liger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install liger-kernel
!pip install datasets



In [10]:
import os
import torch
import time
import transformers
from datasets import load_dataset
from google.colab import userdata

In [11]:
# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN'
# so it never has to be written into the notebook itself.
HF_TOKEN = userdata.get('HF_TOKEN')

# Log the CLI in with that token. IPython expands $HF_TOKEN from the Python
# variable above before handing the line to the shell.
# NOTE(review): this passes the token as a command-line argument; confirm that
# is acceptable for the environment this notebook runs in.
!huggingface-cli login --token $HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [12]:
from transformers import AutoTokenizer
from liger_kernel.transformers import apply_liger_kernel_to_llama, apply_liger_kernel_to_gemma

In [13]:
model_name = "sartifyllc/sartify_gemma2-2B-16bit"

# Patch first, load second: the Liger helper swaps its optimized kernels into
# the Hugging Face modeling code, so it has to run before the model is
# instantiated. A model that was built earlier keeps the stock modules.
# NOTE(review): the checkpoint name suggests a Gemma 2 architecture. If that is
# the case the matching helper is apply_liger_kernel_to_gemma2 (not imported in
# this notebook) -- confirm against the checkpoint's config.
apply_liger_kernel_to_gemma()

model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Generation below feeds CUDA tensors, so the weights must live on the GPU too.
model = model.to('cuda')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Alpaca-style prompt template (in Swahili) with three positional slots:
# the instruction, the additional context, and the answer.
alpaca_prompt = """Hapo chini kuna maelezo ya kazi, pamoja na maelezo ya ziada yanayotoa muktadha zaidi. Andika jibu ambalo linakamilisha ombi hilo ipasavyo.

### Maelezo:
{}

### Ziada:
{}

### Jibu:
{}"""

def formatting_prompts_func(examples):
    """Build one generation prompt per example in a batch.

    Args:
        examples: Mapping of column name to list of values. The
            "instruction" and "input" columns are read; any other column
            (e.g. "output") is ignored.

    Returns:
        A dict with a single key, "questions", holding one formatted prompt
        per (instruction, input) pair.

    Raises:
        KeyError: If "instruction" or "input" is missing from ``examples``.
    """
    # The answer slot is filled with an empty string on purpose: these prompts
    # are fed to the model for generation, so they end right after the answer
    # header and no reference answer or EOS token is appended.
    questions = [
        alpaca_prompt.format(instruction, context, "")
        for instruction, context in zip(examples["instruction"], examples["input"])
    ]
    return {"questions": questions}



# Take the first 30 training examples and attach a "questions" column holding
# the formatted prompt for each one.
dataset = (
    load_dataset("Mollel/alpaca-swahili", split="train")
    .select(range(30))
    .map(formatting_prompts_func, batched=True)
)

In [15]:
# Tokenize every prompt as its own batch of one and move the tensors to the GPU.
# The "questions" column is read once and iterated, rather than being
# re-materialized from the dataset on every loop iteration.
inputs = [
    tokenizer([question], return_tensors="pt").to("cuda")
    for question in dataset["questions"]
]

In [None]:
# Start from a clean allocator state so the peak-memory figure reported below
# reflects only the generation runs in this cell.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# Running totals across all prompts
total_generated_tokens = 0
total_time = 0.0

for prompt_inputs in inputs:
    # CUDA work is queued asynchronously. Drain the queue before starting the
    # clock and again before stopping it, so the interval covers exactly this
    # generate() call. perf_counter() is a monotonic, high-resolution clock.
    torch.cuda.synchronize()
    start_time = time.perf_counter()
    outputs = model.generate(**prompt_inputs, max_new_tokens=4000, use_cache=True)
    torch.cuda.synchronize()
    end_time = time.perf_counter()

    # New tokens = output length minus prompt length (the output sequence is
    # treated as the prompt followed by the generated continuation).
    num_generated_tokens = outputs.size(-1) - prompt_inputs['input_ids'].size(-1)
    total_generated_tokens += num_generated_tokens

    # Accumulate the wall-clock time of this generation
    total_time += (end_time - start_time)

# Throughput in tokens per second; guarded so an empty run reports 0 instead of
# raising ZeroDivisionError.
throughput = total_generated_tokens / total_time if total_time > 0 else 0.0

# Peak memory reserved by the CUDA caching allocator since the reset above,
# converted from bytes to MB.
peak_memory = torch.cuda.max_memory_reserved()
peak_memory_mb = peak_memory / (1024 ** 2)

# Output the results
print(f"Total generated tokens: {total_generated_tokens}")
print(f"Total time taken: {total_time:.2f} seconds")
print(f"Throughput (tokens per second): {throughput:.2f}")
print(f"Peak memory reserved (MB): {peak_memory_mb:.2f}")