<a href="https://colab.research.google.com/github/msamwelmollel/Evaluate-Liger/blob/main/Evaluate_Liger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install liger-kernel
!pip install datasets

Collecting liger-kernel
  Downloading liger_kernel-0.1.1-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.1.2->liger-kernel)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.1.2->liger-kernel)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.1.2->liger-kernel)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2.1.2->liger-kernel)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=2.1.2->liger-kernel)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=

In [2]:
import os
import torch
import time
import transformers
from datasets import load_dataset
from google.colab import userdata

In [3]:
# Fetch the Hugging Face access token stored under the name 'HF_TOKEN' via
# google.colab's `userdata` helper, so the secret never appears in the notebook.
HF_TOKEN = userdata.get('HF_TOKEN')

# Authenticate through the CLI; IPython expands `$HF_TOKEN` to the Python
# variable's value before the shell command runs.
# NOTE(review): the token is passed as a command-line argument — confirm that
# is acceptable for the environment this notebook runs in.
!huggingface-cli login --token $HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from transformers import AutoTokenizer
from liger_kernel.transformers import apply_liger_kernel_to_llama, apply_liger_kernel_to_gemma

In [5]:
# Checkpoint under evaluation; switch to the commented alternative to benchmark
# a different model.
model_name = "sartifyllc/sartify_gemma2-2B-16bit"
# model_name = "google/gemma-2-9b-it"

# Liger monkey-patches the Hugging Face modeling classes, so the patch must be
# applied BEFORE the model is instantiated; modules built earlier keep the
# original (unpatched) implementations.
# For Llama checkpoints call apply_liger_kernel_to_llama() here instead.
# NOTE(review): the checkpoint name suggests a Gemma 2 model — confirm that the
# Gemma patch covers this architecture in the installed liger-kernel version.
apply_liger_kernel_to_gemma()

model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Move the weights to the GPU used for generation and memory measurements.
model = model.to('cuda')

config.json:   0%|          | 0.00/951 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.3k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

In [6]:
# Alpaca-style prompt template (in Swahili) with three positional slots:
# instruction, additional context, and response.
alpaca_prompt = """Hapo chini kuna maelezo ya kazi, pamoja na maelezo ya ziada yanayotoa muktadha zaidi. Andika jibu ambalo linakamilisha ombi hilo ipasavyo.

### Maelezo:
{}

### Ziada:
{}

### Jibu:
{}"""


def formatting_prompts_func(examples):
    """Build one generation prompt per example in a batch.

    The response slot of ``alpaca_prompt`` is deliberately left empty so that
    the model is asked to produce the answer itself.

    Args:
        examples: Mapping from column name to a list of values (the shape
            handed over by ``Dataset.map(..., batched=True)``). Only the
            "instruction" and "input" columns are read.

    Returns:
        A dict with the single key "questions" holding the formatted prompts,
        in the same order as the incoming examples.

    Raises:
        KeyError: If the "instruction" or "input" column is missing.
    """
    return {
        "questions": [
            alpaca_prompt.format(instruction, context, "")
            for instruction, context in zip(examples["instruction"], examples["input"])
        ],
    }



# Take the first five training examples of the Swahili Alpaca dataset and
# attach a "questions" column containing the formatted prompt for each one.
dataset = (
    load_dataset("Mollel/alpaca-swahili", split="train")
    .select(range(5))
    .map(formatting_prompts_func, batched=True)
)

Downloading readme:   0%|          | 0.00/354 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [7]:
# Tokenize every prompt up front (as a batch of one) and place the tensors on
# the GPU, so tokenization cost is excluded from the timed generation loop.
# The "questions" column is fetched once instead of on every iteration.
inputs = [
    tokenizer([question], return_tensors="pt").to("cuda")
    for question in dataset["questions"]
]

In [8]:
# Benchmark generation: measure throughput (generated tokens per second) and
# peak reserved GPU memory over all pre-tokenized prompts in `inputs`.

# Start from a clean allocator state so the peak reflects this run only.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# Running totals across all prompts.
total_generated_tokens = 0
total_time = 0.0

for encoded_prompt in inputs:
    # CUDA kernels are launched asynchronously: synchronize before reading the
    # clock so pending GPU work is not attributed to the wrong interval.
    torch.cuda.synchronize()
    start_time = time.perf_counter()  # monotonic clock, suited to intervals
    outputs = model.generate(**encoded_prompt, max_new_tokens=300, use_cache=True)
    torch.cuda.synchronize()
    end_time = time.perf_counter()

    # The returned sequence contains the prompt followed by the new tokens, so
    # subtract the prompt length to count only what was generated.
    num_generated_tokens = outputs.size(-1) - encoded_prompt['input_ids'].size(-1)
    total_generated_tokens += num_generated_tokens

    # Accumulate the time spent in this generation call.
    total_time += (end_time - start_time)

# Throughput in tokens per second; guarded so an empty prompt list reports 0
# instead of raising ZeroDivisionError.
throughput = total_generated_tokens / total_time if total_time > 0 else 0.0

# Peak memory reserved by the CUDA caching allocator since the reset above.
peak_memory = torch.cuda.max_memory_reserved()

# Convert bytes to MB (MiB).
peak_memory_mb = peak_memory / (1024 ** 2)

# Report the results.
print(f"Total generated tokens: {total_generated_tokens}")
print(f"Total time taken: {total_time:.2f} seconds")
print(f"Throughput (tokens per second): {throughput:.2f}")
print(f"Peak memory (MB): {peak_memory_mb:.2f}")

Total generated tokens: 1100
Total time taken: 64.29 seconds
Throughput (tokens per second): 17.11
Peak memory (MB): 10490.00
