In [1]:
# pip install cpufeature
# pip install cmake
# git clone https://github.com/kvcache-ai/ktransformers.git
# cd ktransformers
# git submodule init
# git submodule update
# sudo apt-get install gcc g++ cmake ninja-build
# pip install torch packaging ninja
# bash install.sh

In [2]:
# python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2.5 --gguf_path ./DeepSeek-V2.5-GGUF

In [3]:
model_id = "deepseek-ai/DeepSeek-V2.5"

In [4]:
import torch
print(torch.__version__)
import transformers
print(transformers.__version__)
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.server.config.config import Config

model_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

torch.set_grad_enabled(False)
torch.set_default_dtype(model_config.torch_dtype)
Config().cpu_infer = 44

with torch.device("meta"):
    model = DeepseekV2ForCausalLM(model_config)
    model.eval()
    model.use_cache = False

model

2.4.0+cu121


  from .autonotebook import tqdm as notebook_tqdm


4.43.2


DeepseekV2ForCausalLM(
  (model): DeepseekV2Model(
    (embed_tokens): Embedding(102400, 5120)
    (layers): ModuleList(
      (0): DeepseekV2DecoderLayer(
        (self_attn): DeepseekV2Attention(
          (q_a_proj): Linear(in_features=5120, out_features=1536, bias=False)
          (q_a_layernorm): DeepseekV2RMSNorm()
          (q_b_proj): Linear(in_features=1536, out_features=24576, bias=False)
          (kv_a_proj_with_mqa): Linear(in_features=5120, out_features=576, bias=False)
          (kv_a_layernorm): DeepseekV2RMSNorm()
          (kv_b_proj): Linear(in_features=512, out_features=32768, bias=False)
          (o_proj): Linear(in_features=16384, out_features=5120, bias=False)
          (rotary_emb): DeepseekV2YarnRotaryEmbedding()
        )
        (mlp): DeepseekV2MLP(
          (gate_proj): Linear(in_features=5120, out_features=12288, bias=False)
          (up_proj): Linear(in_features=5120, out_features=12288, bias=False)
          (down_proj): Linear(in_features=12288, out_

In [5]:
optimize_rule_path = "ktransformers/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml"

gguf_path = "DeepSeek-V2.5-GGUF"

In [None]:
# %%capture
from ktransformers.optimize.optimize import optimize_and_load_gguf

optimize_and_load_gguf(model, optimize_rule_path, gguf_path, model_config)

In [7]:
model

DeepseekV2ForCausalLM(
  (model): KDeepseekV2Model(
    (orig_module): DeepseekV2Model(
      (embed_tokens): Embedding(102400, 5120)
      (layers): ModuleList(
        (0): DeepseekV2DecoderLayer(
          (self_attn): KDeepseekV2Attention(
            (orig_module): DeepseekV2Attention(
              (q_a_proj): KTransformersLinear(
                (orig_module): Linear(in_features=5120, out_features=1536, bias=False)
              )
              (q_a_layernorm): DeepseekV2RMSNorm()
              (q_b_proj): KTransformersLinear(
                (orig_module): Linear(in_features=1536, out_features=24576, bias=False)
              )
              (kv_a_proj_with_mqa): KTransformersLinear(
                (orig_module): Linear(in_features=5120, out_features=576, bias=False)
              )
              (kv_a_layernorm): DeepseekV2RMSNorm()
              (kv_b_proj): Linear(in_features=512, out_features=32768, bias=False)
              (o_proj): KTransformersLinear(
                (

# MMLU

In [1]:
import argparse
import json
import logging
import os
import pathlib
import typing as tp

import pandas as pd
import datasets
import transformers
import torch
from tqdm.auto import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


LANGUAGE_CONFIG: tp.Dict[str, tp.Dict[str, str]] = {
    "en": {
        "headline_prefix": "The following are multiple choice questions (with answers) about",
        "answer_prefix": "Answer:",
    },
    "ru": {
        "headline_prefix": "Ниже приведены вопросы с множественным выбором (с ответами) по",
        "answer_prefix": "Ответ:",
    },
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
subcategories = {
    "abstract_algebra": ["math"],
    "anatomy": ["health"],
    "astronomy": ["physics"],
    "business_ethics": ["business"],
    "clinical_knowledge": ["health"],
    "college_biology": ["biology"],
    "college_chemistry": ["chemistry"],
    "college_computer_science": ["computer science"],
    "college_mathematics": ["math"],
    "college_medicine": ["health"],
    "college_physics": ["physics"],
    "computer_security": ["computer science"],
    "conceptual_physics": ["physics"],
    "econometrics": ["economics"],
    "electrical_engineering": ["engineering"],
    "elementary_mathematics": ["math"],
    "formal_logic": ["philosophy"],
    "global_facts": ["other"],
    "high_school_biology": ["biology"],
    "high_school_chemistry": ["chemistry"],
    "high_school_computer_science": ["computer science"],
    "high_school_european_history": ["history"],
    "high_school_geography": ["geography"],
    "high_school_government_and_politics": ["politics"],
    "high_school_macroeconomics": ["economics"],
    "high_school_mathematics": ["math"],
    "high_school_microeconomics": ["economics"],
    "high_school_physics": ["physics"],
    "high_school_psychology": ["psychology"],
    "high_school_statistics": ["math"],
    "high_school_us_history": ["history"],
    "high_school_world_history": ["history"],
    "human_aging": ["health"],
    "human_sexuality": ["culture"],
    "international_law": ["law"],
    "jurisprudence": ["law"],
    "logical_fallacies": ["philosophy"],
    "machine_learning": ["computer science"],
    "management": ["business"],
    "marketing": ["business"],
    "medical_genetics": ["health"],
    "miscellaneous": ["other"],
    "moral_disputes": ["philosophy"],
    "moral_scenarios": ["philosophy"],
    "nutrition": ["health"],
    "philosophy": ["philosophy"],
    "prehistory": ["history"],
    "professional_accounting": ["other"],
    "professional_law": ["law"],
    "professional_medicine": ["health"],
    "professional_psychology": ["psychology"],
    "public_relations": ["politics"],
    "security_studies": ["politics"],
    "sociology": ["culture"],
    "us_foreign_policy": ["politics"],
    "virology": ["health"],
    "world_religions": ["philosophy"],
}

categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
    "humanities": ["history", "philosophy", "law"],
    "social sciences": ["politics", "culture", "economics", "geography", "psychology"],
    "other (business, health, misc.)": ["other", "business", "health"],
}

# in the form to fit the prompt headline
subcategories_en2ru = {
    "abstract_algebra": "абстрактной_алгебре",
    "anatomy": "анатомии",
    "astronomy": "астрономии",
    "business_ethics": "деловой_этике",
    "clinical_knowledge": "медицинским_знаниям",
    "college_biology": "биологии_в_вузе",
    "college_chemistry": "химии_в_вузе",
    "college_computer_science": "компьютерным_наукам_в_вузе",
    "college_mathematics": "математике_в_вузе",
    "college_medicine": "медицине_в_вузе",
    "college_physics": "физике_в_вузе",
    "computer_security": "компьютерной_безопасности",
    "conceptual_physics": "теоретической_физике",
    "econometrics": "эконометрике",
    "electrical_engineering": "электротехнике",
    "elementary_mathematics": "элементарной_математике",
    "formal_logic": "формальной_логике",
    "global_facts": "фактам_о_мире",
    "high_school_biology": "биологии_в_старшей_школе",
    "high_school_chemistry": "химии_в_старшей_школе",
    "high_school_computer_science": "информатике_в_старшей_школе",
    "high_school_european_history": "истории_Европы_в_старшей_школе",
    "high_school_geography": "географии_в_старшей_школе",
    "high_school_government_and_politics": "государству_и_политике_в_старшей_школе",
    "high_school_macroeconomics": "макроэкономике_в_старшей_школе",
    "high_school_mathematics": "математике_в_старшей_школе",
    "high_school_microeconomics": "микроэкономике_в_старшей_школе",
    "high_school_physics": "физике_в_старшей_школе",
    "high_school_psychology": "психологии_в_старшей_школе",
    "high_school_statistics": "статистике_в_старшей_школе",
    "high_school_us_history": "истории_США_в_старшей_школе",
    "high_school_world_history": "всемирной_истории_в_старшей_школе",
    "human_aging": "старению_человека",
    "human_sexuality": "человеческой_сексуальности",
    "international_law": "международному_праву",
    "jurisprudence": "юриспруденции",
    "logical_fallacies": "логическим_ошибкам",
    "machine_learning": "машинному_обучению",
    "management": "менеджменту",
    "marketing": "маркетингу",
    "medical_genetics": "медицинской_генетике",
    "miscellaneous": "разным_темам",
    "moral_disputes": "нравственным_спорам",
    "moral_scenarios": "нравственным_сценариям",
    "nutrition": "правильному_питанию",
    "philosophy": "философии",
    "prehistory": "доисторической_эпохе",
    "professional_accounting": "профессиональному_бухгалтерскому_учету",
    "professional_law": "профессиональному_праву",
    "professional_medicine": "профессиональной_медицине",
    "professional_psychology": "профессиональной_психологии",
    "public_relations": "связям_с_общественностью",
    "security_studies": "исследованиям_в_области_безопасности",
    "sociology": "социологии",
    "us_foreign_policy": "внешней_политике_США",
    "virology": "вирусологии",
    "world_religions": "мировым_религиям",
}

In [10]:
import abc
import typing as tp

class Conversation(abc.ABC):
    """
    Inspired by https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
    """
    def __init__(self, system_prompt: str, roles: tp.Tuple[str, str]):
        self.system_prompt = system_prompt
        self.roles = roles
        self.messages: tp.List[tp.List[str, str]] = []

    @abc.abstractmethod
    def get_prompt(self) -> str:
        pass

    def update_last_message(self, text: str) -> None:
        self.messages[-1] = (self.messages[-1][0], text)

    def append_message(self, role: str, text: str) -> None:
        self.messages.append({"role":role, "content":text})

class EmptyConversation(Conversation):

    #"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:"
    def __init__(self):
        super().__init__(
            system_prompt="",
            roles=("user", "assistant"),
        )

    def get_prompt(self) -> str:
        prompt = self.system_prompt
        for m in self.messages:
            prompt += str(m)
        return prompt

conversation_classes = {
    "empy_prompt_conv": EmptyConversation,
}

In [11]:
def get_df_in_hendrycks_format(subject: str, split: str, lang: str) -> pd.DataFrame:
    dataset = datasets.load_dataset("NLPCoreTeam/mmlu_ru", name=subject, split=split)
    wanted_cols = {
        "en": ["question_en", "choices_en", "answer"],
        "ru": ["question_ru", "choices_ru", "answer"],
    }[lang]
    df = dataset.to_pandas()[wanted_cols]
    int2str = dataset.features["answer"].int2str
    df[df.columns[2]] = df[df.columns[2]].apply(lambda x: int2str(x))
    df = pd.concat([
        df[[df.columns[0]]],
        pd.DataFrame(df[df.columns[1]].tolist()),
        df[[df.columns[2]]],
    ], axis=1)
    df.columns = range(len(df.columns))
    return df

In [12]:
def format_subject(subject: str) -> str:
    l = subject.split("_")
    s = ""
    for entry in l:
        s += " " + entry
    return s.strip()

def get_pretty_subject(subject: str, lang: str) -> str:
    return format_subject({
        "en": subject,
        "ru": subcategories_en2ru[subject],  # predefined map
    }[lang])

def get_prompt_from_dataframes(dev_df: pd.DataFrame, test_df: pd.DataFrame,
                               k: int, test_iloc_idx: int, lang: str, subject: str, conversation_type: str):
    assert 0 <= k <= 5
    headline_prefix = LANGUAGE_CONFIG[lang]["headline_prefix"]
    headline_postfix = get_pretty_subject(subject=subject, lang=lang)
    headline = f"{headline_prefix} {headline_postfix}.\n\n"

    answer_prefix = LANGUAGE_CONFIG[lang]["answer_prefix"]

    conv = conversation_classes[conversation_type]()

    is_already_taken_headline = False
    for row_idx, row in dev_df.head(k).iterrows():
        q = row[0].strip()
        options = row[1:5].tolist()
        lettered_options = [f"{x}) {y}" for x, y in zip(["A", "B", "C", "D"], options)]
        q_with_lettered_options = "\n".join([q, "\n".join(lettered_options)])
        if row_idx == 0:
            q_with_lettered_options = headline + q_with_lettered_options
            is_already_taken_headline = True
        conv.append_message(conv.roles[0], q_with_lettered_options + "\n\n" + answer_prefix)
        a = row[5]
        
        # if is not instruct, needed to be manually separated for mmlu examples
        conv.append_message(conv.roles[1], f"{a}")

    row = test_df.iloc[test_iloc_idx]
    q = row[0]
    options = row[1:5].tolist()
    lettered_options = [f"{x}) {y}" for x, y in zip(["A", "B", "C", "D"], options)]
    q_with_lettered_options = "\n".join([q, "\n".join(lettered_options)])
    if not is_already_taken_headline:
        q_with_lettered_options = headline + q_with_lettered_options
        is_already_taken_headline = True
    conv.append_message(conv.roles[0], q_with_lettered_options + "\n\n" + answer_prefix)
    a = row[5]
    # conv.append_message(conv.roles[1], "\n" + answer_prefix)
    # prompt = f"{conv.get_prompt()}{answer_prefix}"
    return conv.messages



def calculate_token_interest_probs(
    input_ids,
    tokenizer: transformers.PreTrainedTokenizerBase,
    model,
) -> tp.Dict[str, float]:

    torch_device = "cuda:0"
    batch_size, seq_length = input_ids.shape
    cache_position = torch.arange(seq_length, device=torch_device)
    device_map = model.gguf_loader.tensor_device_map
    from ktransformers.models.custom_cache import StaticCache
    past_key_values = StaticCache(config = model.config, max_batch_size = batch_size, max_cache_len = seq_length + 1, device = device_map, dtype = model.dtype)

    with torch.no_grad():
        inputs_embeds = model.model.embed_tokens(input_ids.to("cpu")).to(torch_device)
        logits = model(inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=False)
    # for device in ["cuda:0", "cuda:1"]:
    #     torch.cuda.synchronize(device)
    next_token_logits = logits[0][:, -1, :]  # shape (batch_size, vocab_size)

    next_token_logits = next_token_logits.flatten()
    assert next_token_logits.shape == torch.Size((model.config.vocab_size, ))

    next_token_probs = torch.nn.functional.softmax(next_token_logits, dim=-1).cpu()  # all probs over vocab
    # assert torch.isclose(next_token_probs.sum(), torch.tensor(1.0).to(next_token_probs.dtype), atol=1e-03)  # dtype for half/nothalf, -03 for float16
    
    tokens_of_interest = [
        tokenizer("A", add_special_tokens=False).input_ids[-1],
        tokenizer("B", add_special_tokens=False).input_ids[-1],
        tokenizer("C", add_special_tokens=False).input_ids[-1],
        tokenizer("D", add_special_tokens=False).input_ids[-1],
    ]

    probs = next_token_probs[tokens_of_interest].tolist()
    res = dict(zip(["A", "B", "C", "D"], probs))
    return res

def append_to_jsonl(data: list, filename: str) -> None:
    with open(filename, "a") as f:
        f.write(json.dumps(data) + "\n")

def evaluate_subject(
    subject: str,
    lang: str,
    k_shot: int,
    jsonl_filepath: str,
    maxlen: int,
    convtype: str,
    tokenizer: transformers.PreTrainedTokenizerBase,
    model,
) -> None:

    dev_df = get_df_in_hendrycks_format(subject=subject, split="dev", lang=lang)
    test_df = get_df_in_hendrycks_format(subject=subject, split="test", lang=lang)

    for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc=subject):

        current_k_shot = k_shot
        skip_too_lengthy = False
        while True:
            if current_k_shot < 0:
                logger.info("Skip too lengthy.")
                skip_too_lengthy = True
                break
            input_messages = get_prompt_from_dataframes(
                dev_df=dev_df,
                test_df=test_df,
                k=current_k_shot,
                test_iloc_idx=idx,
                lang=lang,
                subject=subject,
                conversation_type=convtype,
            )

            input_prompt = tokenizer.apply_chat_template(input_messages, tokenize=False, add_generation_prompt=True)
            input_ids = tokenizer.apply_chat_template(input_messages, add_generation_prompt=True, return_tensors="pt")

            if input_ids.shape[-1] > maxlen and current_k_shot >= 0:
                logger.info("Takes smaller current_k_shot since maxlen.")
                current_k_shot -= 1
            elif current_k_shot < 0:
                logger.info("Skip too lengthy.")
                skip_too_lengthy = True
            else:
                break
        if skip_too_lengthy:
            continue

        label = row[5]

        preds = calculate_token_interest_probs(
            input_ids=input_ids,
            tokenizer=tokenizer,
            model=model,
        )

        append_to_jsonl(data=[input_prompt, label, preds], filename=jsonl_filepath)

In [None]:
output_dir = "mmlu/mmlu_ru_DeepSeek-V2.5-Q4_K_M"
lang = "ru"
k_shot = 5
convtype = "empy_prompt_conv"
MAX_INPUT_LENGTH = 8192

subjects = list(subcategories.keys())
for each_subject in subjects:
    jsonl_filepath = str(pathlib.Path(output_dir) / f"{each_subject}.jsonl")
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    logger.info(f"Filepath JSONL: {jsonl_filepath}")
    if pathlib.Path(jsonl_filepath).exists():
        logger.info(f"File already exists! Please manually verify that it wasn't partially interrupted.")
        continue
    evaluate_subject(
            subject=each_subject,
            lang=lang,
            k_shot=k_shot,
            jsonl_filepath=jsonl_filepath,
            maxlen=MAX_INPUT_LENGTH, convtype=convtype,
            tokenizer=tokenizer,
            model=model,
    )

INFO:__main__:Filepath JSONL: mmlu/mmlu_ru_DeepSeek-V2.5-Q4_K_M/abstract_algebra.jsonl
INFO:__main__:File already exists! Please manually verify that it wasn't partially interrupted.
INFO:__main__:Filepath JSONL: mmlu/mmlu_ru_DeepSeek-V2.5-Q4_K_M/anatomy.jsonl
INFO:__main__:File already exists! Please manually verify that it wasn't partially interrupted.
INFO:__main__:Filepath JSONL: mmlu/mmlu_ru_DeepSeek-V2.5-Q4_K_M/astronomy.jsonl
astronomy: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 152/152 [20:49<00:00,  8.22s/it]
INFO:__main__:Filepath JSONL: mmlu/mmlu_ru_DeepSeek-V2.5-Q4_K_M/business_ethics.jsonl
Downloading data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10.1k/10.1k [00:00<00:00, 28.4kB/s]
Downloading data: 100%|██████████████████████████████████████████████████████████████████

In [None]:
# INFO:__main__:Filepath JSONL: mmlu/mmlu_ru_DeepSeek-V2.5-Q4_K_M/abstract_algebra.jsonl
# abstract_algebra: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [06:47<00:00,  4.08s/it]
# anatomy:  27%|████████████████████████████████████████                                                                                                          | 37/135 [02:18<05:56,  3.64s/it]

In [3]:
import numpy as np

category_to_main_category = {value: key for key, sublist in categories.items() for value in sublist}
subcategories2categories = {key: category_to_main_category[value[0]] for key, value in subcategories.items()}

def calculate_accuracy_from_directory(dirpath: str) -> tp.Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    assert pathlib.Path(dirpath).exists()
    filepaths = [str(x) for x in pathlib.Path(dirpath).glob('*.jsonl')]
    # assert len(filepaths) == 57
    res = {}
    for each_filepath in filepaths:
        df = pd.read_json(each_filepath, lines=True)
        df.columns = ['prompt', 'label', 'preds']
        cors = []
        for idx, row in df.iterrows():
            preds = row['preds']
            best_idx = np.argmax(list(preds.values()))
            y_pred = list(preds.keys())[best_idx]
            y_true = row['label']
            y_pred = y_pred.strip()
            y_true = y_true.strip()
            cors.append(y_true == y_pred)
        acc = np.mean(cors)
        res[pathlib.Path(each_filepath).stem] = acc * 100

    df = pd.DataFrame({pathlib.Path(dirpath).stem: res}).reset_index()
    df = df.rename(columns={'index': 'subcategory'})
    subcategories_df = df.copy()
    
    df = subcategories_df.copy()
    df['subcategory'] = df['subcategory'].map(subcategories2categories)
    df = df.rename(columns={'subcategory': 'category'})
    df = df.groupby('category').mean().reset_index()
    categories_df = df.copy()

    total_df = pd.DataFrame({pathlib.Path(dirpath).stem: [categories_df[pathlib.Path(dirpath).stem].mean()]})

    # assert subcategories_df.shape == (57, 2)
    # assert categories_df.shape == (4, 2)
    # assert total_df.shape == (1, 1)
    return (subcategories_df, categories_df, total_df)

In [4]:
output_dir = "mmlu/mmlu_ru_DeepSeek-V2.5-Q4_K_M"

subcategories_df, categories_df, total_df = calculate_accuracy_from_directory(dirpath=output_dir)
print("mmlu", total_df.values[0][0])
subcategories_df

mmlu 69.56973829796362


Unnamed: 0,subcategory,mmlu_ru_DeepSeek-V2
0,abstract_algebra,53.0
1,astronomy,84.868421
2,anatomy,57.037037
3,business_ethics,77.0
4,college_biology,73.4375
5,clinical_knowledge,72.075472
