# Evaluation of quantized models (pruned and non-pruned versions)

In [1]:
!pip install -q -U transformers bitsandbytes accelerate lm-eval datasets jinja2

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m110.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0

In [2]:
import datasets
from datasets import load_dataset, Dataset
from transformers.testing_utils import CaptureLogger
from transformers.models.mixtral.modeling_mixtral import (
    MixtralForCausalLM,
    MixtralSparseMoeBlock,
    MixtralBlockSparseTop2MLP
    )
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM, set_seed
import torch
import bitsandbytes as bnb
from lm_eval.api.model import LM
from lm_eval.models.huggingface import HFLM

from lm_eval import evaluator
from jinja2 import Template
import json
import os

In [3]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Detect Google Colab. Only a failed import means "not on Colab"; any other
# error is allowed to surface instead of being silently swallowed.
try:
  import google.colab
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

if IN_COLAB:
  # On Colab the results are stored on the mounted Google Drive
  from google.colab import drive
  drive.mount('/content/drive')
  results_dir = "/content/drive/MyDrive/MVA/LLM/results"
else:
  results_dir = "./results"

# Create the results directory (and any missing parent) if it does not exist
os.makedirs(results_dir, exist_ok=True)


Mounted at /content/drive


# 1. Testing inference with the quantized, pruned Mixtral-8x7B-Instruct-v0.1

A few examples of text generation with the pruned version of Mixtral-8x7B-Instruct-v0.1.

In [None]:
# Tokenizer comes from the original Mixtral repository, weights from the pruned 4-bit checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="mistralai/Mixtral-8x7B-Instruct-v0.1"
)
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="JavierLopetegui/Mixtral8x7B-4bit-pruned_4_experts"
)

In [None]:
# Display the module tree of the loaded model (rich repr of the last expression)
model

MixtralForCausalLM(
  (model): MixtralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MixtralDecoderLayer(
        (self_attn): MixtralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (block_sparse_moe): MixtralSparseMoeBlock(
          (gate): Linear4bit(in_features=4096, out_features=4, bias=False)
          (experts): ModuleList(
            (0-3): 4 x MixtralBlockSparseTop2MLP(
              (w1): Linear4bit(in_features=4096, out_features=14336, bias=False)
              (w2): Linear4bit(in_features=14336, out_features=4096, bias=False)
              (w3): Linear4bit(in_features=4096, out_features=14336, bias=False)
              (act_fn): S

In [None]:
def print_instance_params(instance):
    '''
    Print a fixed set of attributes of `instance`, one `name: value` line each.

    Attributes that `instance` does not have are reported as 'Not Found'.
    '''
    attribute_names = (
        "blocksize",
        "compress_statistics",
        "quant_type",
        "quant_state",
        "quant_storage",
        "bnb_quantized",
        "data",
        "module",
    )
    for name in attribute_names:
        value = getattr(instance, name, 'Not Found')
        print(f"{name}: {value}")


In [None]:
def print_instance_params_v2(instance):
    '''
    Print a second fixed set of attributes of `instance`, one `name: value` line each.

    Attributes that `instance` does not have are reported as 'Not Found'.
    '''
    attribute_names = (
        "weight",
        "compute_dtype",
        "compute_type_is_set",
        "quant_state",
        "quant_storage",
    )
    for name in attribute_names:
        value = getattr(instance, name, 'Not Found')
        print(f"{name}: {value}")


In [None]:
def generate_text_given_prompt(model,tokenizer, prompt, max_new_tokens=500, temperature=0.7, num_return_sequences=1, repetition_penalty=1.0):
    '''
    Sample one or more continuations of a prompt and return only the newly generated text.

    Args:
        model:
            Causal language model exposing `generate`. Its `device` attribute decides where
            the inputs are placed; 'cuda' is used when the model has no such attribute.
        tokenizer:
            Tokenizer matching the model. Must provide `apply_chat_template` when `prompt`
            is a list of chat messages.
        prompt (str or list):
            Plain text prompt, or a list of chat messages that is rendered with the
            tokenizer's chat template.
        max_new_tokens (int, optional):
            Maximum number of tokens to generate after the prompt.
        temperature (float, optional):
            Sampling temperature (sampling is always enabled).
        num_return_sequences (int, optional):
            Number of sampled continuations to produce.
        repetition_penalty (float, optional):
            Repetition penalty forwarded to `model.generate`.

    Returns:
        The generated text (str) when `num_return_sequences` is 1,
        otherwise a list with one string per returned sequence.
    '''
    is_chat = isinstance(prompt, list)
    if is_chat:
        prompt = tokenizer.apply_chat_template(prompt, tokenize=False)

    # Text rendered by a chat template normally already contains the special tokens
    # (e.g. BOS), so they must not be added a second time when tokenizing it.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=not is_chat)

    # Place the inputs on the model's own device instead of assuming CUDA
    target_device = getattr(model, "device", "cuda")
    input_ids = inputs.input_ids.to(target_device)
    attention_mask = inputs.attention_mask.to(target_device)
    prompt_length = input_ids.shape[-1]

    with torch.inference_mode():
        output = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            attention_mask=attention_mask,
            temperature=temperature,
            do_sample=True,
            num_return_sequences=num_return_sequences,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=repetition_penalty,
        )

    # Every returned sequence starts with the prompt tokens; keep only what follows them
    generated_texts = [
        tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        for sequence in output
    ]

    return generated_texts if num_return_sequences > 1 else generated_texts[0]

In [None]:
# Plain-text prompts used for a quick qualitative check of the pruned model.
# Every entry needs its trailing comma: adjacent string literals are otherwise
# silently concatenated into a single prompt.
prompts = [
    "Complete the story: Once upon a time, my mother taught me how to make a simple loaf of bread",
    "Give me the result of the next operation: 15 + 25",
    "Brief summary of France history:",
    "The result of multiplying 13 by 3 is: ",
    "Once upon a time,",
    "Yo voy todos los días a la escuela se traduce a frances como: ",
]

In [None]:
# Show each plain prompt followed by the text sampled from the model
for prompt in prompts:
    print("Given prompt: \n\t{}\n\n".format(prompt))
    print("Generated answer: \n\t{}\n".format(generate_text_given_prompt(model, tokenizer, prompt)))
    print(100 * "-")

Given prompt: 
	Complete the story: Once upon a time, my mother taught me how to make a simple loaf of bread


Generated answer: 
	. This recipe was passed to me by my grandmother. At that time, I was too young to know what importance laid in that small piece of bread. Little did I know that one day I might have to rely on this recipe to feed my family. Now I am elderly, and this recipe is my only treasure. I am grateful that my mother and grandmother passed this recipe to me. I wish that someday I could pass this recipe to someone I love.

This is a great recipe. I’ve only made the loaf but I am sure I will be making the rest. The instructions are easy and clear, the ingredients are easy and available, and the taste is great. I am less than 80 years old, but I hope to have the opportunity to pass this recipe on to someone I love.

Ingredients

1

----------------------------------------------------------------------------------------------------
Given prompt: 
	Give me the result of t

It can be seen that the model tends to hallucinate or to answer extra questions that were not asked. We try to fix this by using structured chat templates as prompts.

In [None]:
# Functions to use templates as normal prompt

def chat_messages_to_prompt(messages):
    '''
    Flatten a list of chat messages into a single plain-text prompt.

    Each message with role "system", "user" or "assistant" becomes one
    "<Role>: <content>" line; messages with any other role are skipped.
    The prompt ends with "Assistant: " so the model continues from there.
    '''
    role_labels = {"system": "System", "user": "User", "assistant": "Assistant"}
    lines = [
        f"{role_labels[msg['role']]}: {msg['content']}\n"
        for msg in messages
        if msg["role"] in role_labels
    ]
    # Trailing cue that signals the model to continue
    return "".join(lines) + "Assistant: "

def prepare_prompt(prompt_or_messages):
    '''
    Return a plain-text prompt for either kind of input.

    A list of chat messages is flattened with `chat_messages_to_prompt`;
    anything else is returned unchanged.
    '''
    if not isinstance(prompt_or_messages, list):
        return prompt_or_messages
    return chat_messages_to_prompt(prompt_or_messages)

In [None]:
# Chat-style prompts (lists of role/content messages) covering the same kinds of
# requests as the plain prompts: story, arithmetic, reasoning, history, translation.
chat_prompts = [
    [
        {"role": "system", "content": "You are a creative and imaginative storyteller AI."},
        {"role": "user", "content": "Complete the story: Once upon a time, my mother taught me how to make a simple loaf of bread."}
    ],
    [
        {"role": "system", "content": "You are a helpful math expert. ONLY provide the result of the operation, just one answer."},
        {"role": "user", "content": "What is the result of the following operation: 15 + 25"},
        {"role": "assistant", "content": "The answer is: 40"},
        {"role": "user", "content": "What is the result of the following operation: 15 - 25"}
    ],
    [
        {"role": "system", "content": "You are a helpful math expert. Provide first the explanation, then the result. END after the result. Think step-by-step."},
        # Adjacent literals are used instead of a backslash continuation, which
        # would embed the next line's indentation inside the prompt text.
        {"role": "user", "content": "I want you to solve the following mathematical problem. Please, you should be detailed in you answer and do it in logical steps. "
                                    "This is the problem: In how many ways can 5 students be selected from a group of 6 students?"}
    ],
    [
        {"role": "system", "content": "You are a knowledgeable historian."},
        {"role": "user", "content": "Give me a brief summary of the history of France."}
    ],
    [
        {"role": "system", "content": "You are a math expert. END after the result. Think step-by-step."},
        {"role": "user", "content": "What is the result of multiplying 13 by 3?"}
    ],
    [
        {"role": "system", "content": "You are a translator Spanish-to-French expert. ONLY provide the translated sentence in FRENCH. Do NOT add any explanations, or additional content. End after the translation."},
        {"role": "user", "content": "Translate the following: Yo voy todos los días a la escuela"}
    ]
]

In [None]:
# Show a readable rendering of each conversation, then the model's sampled answer
for prompt in chat_prompts:
    print("Given prompt: \n{}\n\n".format(prepare_prompt(prompt)))
    print("{}\n".format(generate_text_given_prompt(model, tokenizer, prompt)))
    print(100 * "-")

Given prompt: 
System: You are a creative and imaginative storyteller AI.
User: Complete the story: Once upon a time, my mother taught me how to make a simple loaf of bread.



Once upon a time, my mother taught me how to make a simple loaf of bread. It was a magical experience, as if I had been transported into a mystical world of baking and feasting. I was only six years old, but my mother believed in my abilities. She guided me through the process, step by step. I remember the warmth of the oven, the smell of the bread, and the pride I had when I pulled out the loaf, all on my own. I knew that I had created something truly special. I was not just a simple loaf of bread, but a piece of art, a product of my creativity, and the beginnings of a master baker.

----------------------------------------------------------------------------------------------------
Given prompt: 
System: You are a helpful math expert. ONLY provide the result of the operation, just one answer.
User: What is the

As these simple examples show, using a template with proper prompts improves the quality of the predictions. We will use this kind of prompt to compare the performance of the pruned and non-pruned models.

# 2. Experimental pipeline functions

Following the same general tasks that were evaluated in [1], we use the library [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness/tree/2a47159caff00135b026f724ace2a2011f3c7621), a unified framework for testing generative language models on a large number of different evaluation tasks.

**Remark:** For compatibility with the library, a new class is defined that wraps the tokenizer and the model inside the HFLM/LM class from lm-eval.

In [4]:
class MyCustomQuantizedLM(HFLM):
    '''
    lm-eval `HFLM` wrapper around a causal language model and its tokenizer.

    Args:
        model:
            Either an already instantiated model or a string passed to
            `AutoModelForCausalLM.from_pretrained`.
        tokenizer:
            Either an already instantiated tokenizer or a string passed to
            `AutoTokenizer.from_pretrained`.
        already_quant (bool, optional):
            Only used when `model` is a string. If True the checkpoint is loaded as is;
            otherwise it is quantized to 4-bit NF4 (double quantization, float16 compute)
            while loading.
        device (str or torch.device, optional):
            Device forwarded to `HFLM` and used as `device_map` when quantizing on load.
        max_length (int, optional):
            Maximum sequence length forwarded to `HFLM`. Default is 2048.
    '''
    def __init__(self, model, tokenizer, already_quant = False, device=device, max_length=2048):
        if isinstance(model, str):
            if already_quant:
                # NOTE(review): no device_map is passed on this path — confirm the
                # checkpoint ends up on the intended device.
                model = AutoModelForCausalLM.from_pretrained(model)
            else:
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_use_double_quant=True,
                )
                model = AutoModelForCausalLM.from_pretrained(
                    model,
                    device_map=device,
                    quantization_config=quantization_config
                )

        if isinstance(tokenizer, str):
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)

        super().__init__(
            pretrained=model,
            tokenizer=tokenizer,
            device=device,
            max_length=max_length
        )
        # Reuse the end-of-sequence token as padding token
        self.tokenizer.pad_token = self.tokenizer.eos_token

In [12]:
class CustomEncoder(json.JSONEncoder):
    '''
    JSON encoder that writes `torch.dtype` and `torch.device` objects as their string form.

    Every other unsupported object is handed to the base encoder, which raises `TypeError`.
    '''
    def default(self, obj):
        if isinstance(obj, (torch.dtype, torch.device)):
            return str(obj)
        return super().default(obj)

In [23]:
def performance_evaluation(model, tasks, new_fewshots=0, apply_chat_template=True, results_dir='./', device=device):
    '''
    Evaluates a generative language model on a list of evaluation tasks using the lm-eval framework.

    Args:
        model:
            The language model to be evaluated. Should be compatible with lm-eval's evaluation API.
        tasks (list):
            A list of task names (strings) representing evaluation benchmarks or datasets supported by lm-eval.
        new_fewshots (int, optional):
            Number of few-shot examples to use during evaluation. Default is 0 (zero-shot).
        apply_chat_template (bool, optional):
            Whether to apply chat formatting templates during evaluation. Useful for chat-based models. Default is True.
        results_dir (str, optional):
            Directory path where the evaluation results (JSON) will be saved. It is created,
            including missing parent directories, if it does not exist. Default is './'.
        device (str or torch.device):
            The device on which the model should be evaluated (e.g., 'cuda', 'cpu').

    Output:
        JSON result files named "<task>_eval_results.json" saved in the results directory.
    '''
    # Create the output directory up front so that an invalid path fails before
    # any expensive evaluation has been run.
    os.makedirs(results_dir, exist_ok=True)

    for task in tasks:
        # Evaluate one task at a time so that each task gets its own results file
        results = evaluator.simple_evaluate(
            model=model,
            tasks=[task],
            num_fewshot=new_fewshots,
            device=device,
            apply_chat_template=apply_chat_template
        )

        # CustomEncoder serialises the torch dtype/device objects found in the results
        output_path = os.path.join(results_dir, f"{task}_eval_results.json")
        with open(output_path, "w") as f:
            json.dump(results, f, indent=2, cls=CustomEncoder)

        print(f" Evaluation complete for {task}. Results saved.\n")

# 3. Evaluation: Mixtral-Pruned (r=4)

**Remark:** Due to computational limitations, we just run the inference evaluation on the `["arc_challenge","arc_easy", "boolq", "openbookqa","rte"]` datasets.

In [None]:
# Uncomment if the tokenizer and model were not loaded in the previous section
#tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
#model = AutoModelForCausalLM.from_pretrained("JavierLopetegui/Mixtral8x7B-4bit-pruned_4_experts")

# Wrap the loaded model and tokenizer for lm-eval
quantized_lm = MyCustomQuantizedLM(model=model, tokenizer=tokenizer)

In [None]:
# Zero-shot evaluation with the chat template; one JSON file per task
performance_evaluation(
    model=quantized_lm,
    tasks=["arc_challenge", "arc_easy", "boolq", "openbookqa", "rte"],
    results_dir=f"{results_dir}/mixtral-pruned-r4",
    new_fewshots=0,
    apply_chat_template=True,
    device=device,
)

# 4. Evaluation: Mixtral-Pruned (r=6)

In [None]:
# Tokenizer comes from the original Mixtral repository, weights from the pruned 4-bit checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="mistralai/Mixtral-8x7B-Instruct-v0.1"
)
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="JavierLopetegui/Mixtral8x7B-4bit-pruned"
)

# Wrap the loaded model and tokenizer for lm-eval
quantized_lm = MyCustomQuantizedLM(model=model, tokenizer=tokenizer)

In [None]:
# Zero-shot evaluation with the chat template; one JSON file per task
performance_evaluation(
    model=quantized_lm,
    tasks=["arc_challenge", "arc_easy", "boolq", "openbookqa", "rte"],
    results_dir=f"{results_dir}/mixtral-pruned-r6",
    new_fewshots=0,
    apply_chat_template=True,
    device=device,
)

# 5. Evaluation: Mixtral no-pruned

In [None]:
# 4-bit NF4 quantization with double quantization and float16 compute
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="mistralai/Mixtral-8x7B-Instruct-v0.1"
)

# Quantize the original checkpoint while loading it onto the GPU
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="mistralai/Mixtral-8x7B-Instruct-v0.1",
    quantization_config=quantization_config,
    device_map='cuda',
)

# Wrap the loaded model and tokenizer for lm-eval
quantized_lm = MyCustomQuantizedLM(model=model, tokenizer=tokenizer)

In [None]:
# Zero-shot evaluation with the chat template; one JSON file per task
performance_evaluation(
    model=quantized_lm,
    tasks=["arc_challenge", "arc_easy", "boolq", "openbookqa", "rte"],
    results_dir=f"{results_dir}/mixtral-no-pruned",
    new_fewshots=0,
    apply_chat_template=True,
    device=device,
)

# 6. Evaluation: DeepSeek pruned

In [None]:
# Tokenizer and weights both come from the pruned DeepSeek repository;
# trust_remote_code lets transformers run the custom code shipped with it.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="olijacklu/deepseek-moe-16b-pruned",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="olijacklu/deepseek-moe-16b-pruned",
    trust_remote_code=True,
)

# Wrap the loaded model and tokenizer for lm-eval
quantized_lm = MyCustomQuantizedLM(model=model, tokenizer=tokenizer)

In [None]:
# Zero-shot evaluation without a chat template; one JSON file per task
performance_evaluation(
    model=quantized_lm,
    tasks=["arc_challenge", "arc_easy", "boolq", "openbookqa", "rte"],
    results_dir=f"{results_dir}/deepseek-pruned",
    new_fewshots=0,
    apply_chat_template=False,
    device=device,
)

# 7. Evaluation: DeepSeek no-pruned

In [None]:
# 4-bit NF4 quantization with double quantization and float16 compute
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="deepseek-ai/deepseek-moe-16b-base",
    trust_remote_code=True,
)

# Quantize the original checkpoint while loading it onto the GPU
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="deepseek-ai/deepseek-moe-16b-base",
    trust_remote_code=True,
    quantization_config=quantization_config,
    device_map='cuda',
)

# Wrap the loaded model and tokenizer for lm-eval
quantized_lm = MyCustomQuantizedLM(model=model, tokenizer=tokenizer)

In [None]:
# Zero-shot evaluation without a chat template; one JSON file per task
performance_evaluation(
    model=quantized_lm,
    tasks=["arc_challenge", "arc_easy", "boolq", "openbookqa", "rte"],
    results_dir=f"{results_dir}/deepseek-no-pruned",
    new_fewshots=0,
    apply_chat_template=False,
    device=device,
)