In [2]:
# Maps every MMLU subject id to a one-element list holding its fine-grained
# subcategory. The subcategory names are the ones grouped into top-level
# categories by the `categories` dict.
subcategories = {
    "abstract_algebra": ["math"],
    "anatomy": ["health"],
    "astronomy": ["physics"],
    "business_ethics": ["business"],
    "clinical_knowledge": ["health"],
    "college_biology": ["biology"],
    "college_chemistry": ["chemistry"],
    "college_computer_science": ["computer science"],
    "college_mathematics": ["math"],
    "college_medicine": ["health"],
    "college_physics": ["physics"],
    "computer_security": ["computer science"],
    "conceptual_physics": ["physics"],
    "econometrics": ["economics"],
    "electrical_engineering": ["engineering"],
    "elementary_mathematics": ["math"],
    "formal_logic": ["philosophy"],
    "global_facts": ["other"],
    "high_school_biology": ["biology"],
    "high_school_chemistry": ["chemistry"],
    "high_school_computer_science": ["computer science"],
    "high_school_european_history": ["history"],
    "high_school_geography": ["geography"],
    "high_school_government_and_politics": ["politics"],
    "high_school_macroeconomics": ["economics"],
    "high_school_mathematics": ["math"],
    "high_school_microeconomics": ["economics"],
    "high_school_physics": ["physics"],
    "high_school_psychology": ["psychology"],
    "high_school_statistics": ["math"],
    "high_school_us_history": ["history"],
    "high_school_world_history": ["history"],
    "human_aging": ["health"],
    "human_sexuality": ["culture"],
    "international_law": ["law"],
    "jurisprudence": ["law"],
    "logical_fallacies": ["philosophy"],
    "machine_learning": ["computer science"],
    "management": ["business"],
    "marketing": ["business"],
    "medical_genetics": ["health"],
    "miscellaneous": ["other"],
    "moral_disputes": ["philosophy"],
    "moral_scenarios": ["philosophy"],
    "nutrition": ["health"],
    "philosophy": ["philosophy"],
    "prehistory": ["history"],
    "professional_accounting": ["other"],
    "professional_law": ["law"],
    "professional_medicine": ["health"],
    "professional_psychology": ["psychology"],
    "public_relations": ["politics"],
    "security_studies": ["politics"],
    "sociology": ["culture"],
    "us_foreign_policy": ["politics"],
    "virology": ["health"],
    "world_religions": ["philosophy"],
}

# Top-level reporting categories: category name -> list of the fine-grained
# subcategory names (the values used in `subcategories`) that belong to it.
categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
    "humanities": ["history", "philosophy", "law"],
    "social sciences": ["politics", "culture", "economics", "geography", "psychology"],
    "other (business, health, misc.)": ["other", "business", "health"],
}

# English subject id -> Russian subject name, with words joined by underscores.
# The Russian names are written in the grammatical form that fits directly
# after the Russian prompt headline prefix ("... по <subject>").
subcategories_en2ru = {
    "abstract_algebra": "абстрактной_алгебре",
    "anatomy": "анатомии",
    "astronomy": "астрономии",
    "business_ethics": "деловой_этике",
    "clinical_knowledge": "медицинским_знаниям",
    "college_biology": "биологии_в_вузе",
    "college_chemistry": "химии_в_вузе",
    "college_computer_science": "компьютерным_наукам_в_вузе",
    "college_mathematics": "математике_в_вузе",
    "college_medicine": "медицине_в_вузе",
    "college_physics": "физике_в_вузе",
    "computer_security": "компьютерной_безопасности",
    "conceptual_physics": "теоретической_физике",
    "econometrics": "эконометрике",
    "electrical_engineering": "электротехнике",
    "elementary_mathematics": "элементарной_математике",
    "formal_logic": "формальной_логике",
    "global_facts": "фактам_о_мире",
    "high_school_biology": "биологии_в_старшей_школе",
    "high_school_chemistry": "химии_в_старшей_школе",
    "high_school_computer_science": "информатике_в_старшей_школе",
    "high_school_european_history": "истории_Европы_в_старшей_школе",
    "high_school_geography": "географии_в_старшей_школе",
    "high_school_government_and_politics": "государству_и_политике_в_старшей_школе",
    "high_school_macroeconomics": "макроэкономике_в_старшей_школе",
    "high_school_mathematics": "математике_в_старшей_школе",
    "high_school_microeconomics": "микроэкономике_в_старшей_школе",
    "high_school_physics": "физике_в_старшей_школе",
    "high_school_psychology": "психологии_в_старшей_школе",
    "high_school_statistics": "статистике_в_старшей_школе",
    "high_school_us_history": "истории_США_в_старшей_школе",
    "high_school_world_history": "всемирной_истории_в_старшей_школе",
    "human_aging": "старению_человека",
    "human_sexuality": "человеческой_сексуальности",
    "international_law": "международному_праву",
    "jurisprudence": "юриспруденции",
    "logical_fallacies": "логическим_ошибкам",
    "machine_learning": "машинному_обучению",
    "management": "менеджменту",
    "marketing": "маркетингу",
    "medical_genetics": "медицинской_генетике",
    "miscellaneous": "разным_темам",
    "moral_disputes": "нравственным_спорам",
    "moral_scenarios": "нравственным_сценариям",
    "nutrition": "правильному_питанию",
    "philosophy": "философии",
    "prehistory": "доисторической_эпохе",
    "professional_accounting": "профессиональному_бухгалтерскому_учету",
    "professional_law": "профессиональному_праву",
    "professional_medicine": "профессиональной_медицине",
    "professional_psychology": "профессиональной_психологии",
    "public_relations": "связям_с_общественностью",
    "security_studies": "исследованиям_в_области_безопасности",
    "sociology": "социологии",
    "us_foreign_policy": "внешней_политике_США",
    "virology": "вирусологии",
    "world_religions": "мировым_религиям",
}

In [3]:
import abc
import typing as tp

class Conversation(abc.ABC):
    """
    Base class for prompt builders that accumulate ``(role, text)`` messages.

    Inspired by https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py

    Subclasses implement `get_prompt` to render `system_prompt` and `messages`
    into a single prompt string.
    """
    def __init__(self, system_prompt: str, roles: tp.Tuple[str, str]):
        """
        Args:
            system_prompt: text stored for use by `get_prompt`.
            roles: pair of role markers (first speaker, second speaker).
        """
        self.system_prompt = system_prompt
        self.roles = roles
        # Each entry is an immutable (role, text) pair; text may be None for a
        # turn that is still to be completed.
        self.messages: tp.List[tp.Tuple[str, tp.Optional[str]]] = []

    @abc.abstractmethod
    def get_prompt(self) -> str:
        """Return the full prompt string built from the stored messages."""
        pass

    def update_last_message(self, text: tp.Optional[str]) -> None:
        """
        Replace the text of the most recent message, keeping its role.

        Raises:
            IndexError: if no message has been appended yet.
        """
        self.messages[-1] = (self.messages[-1][0], text)

    def append_message(self, role: str, text: tp.Optional[str]) -> None:
        """Append a new ``(role, text)`` message to the conversation."""
        # Stored as a tuple so that entries have the same type as the ones
        # written by `update_last_message` and declared on `self.messages`.
        self.messages.append((role, text))

class EmptyConversation(Conversation):
    """Conversation with an empty system prompt and empty role markers."""

    # Example of an instruction-style template (not used by this class):
    # "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:"
    def __init__(self):
        super().__init__(system_prompt="", roles=("", ""))

    def get_prompt(self) -> str:
        """
        Concatenate the system prompt and all messages in order.

        Each message contributes its role followed by its text; a message whose
        text is empty or None contributes only its role.
        """
        pieces = [self.system_prompt]
        for role, text in self.messages:
            pieces.append(f"{role}{text}" if text else f"{role}")
        return "".join(pieces)

# Registry of conversation templates, looked up by the conversation-type string.
conversation_classes = {
    # Misspelled key kept for backward compatibility: existing cells pass it.
    "empy_prompt_conv": EmptyConversation,
    # Correctly spelled alias for the same template.
    "empty_prompt_conv": EmptyConversation,
}

In [4]:
import argparse
import json
import logging
import os
import pathlib
import typing as tp

import pandas as pd
import datasets
import peft
import transformers
import torch
from tqdm.auto import tqdm

# Configure logging at INFO level and create the module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Per-language text fragments used when building prompts:
#   headline_prefix - beginning of the prompt's first line; the subject name
#                     is appended after it
#   answer_prefix   - marker placed in front of every answer letter
LANGUAGE_CONFIG: tp.Dict[str, tp.Dict[str, str]] = {
    "en": {
        "headline_prefix": "The following are multiple choice questions (with answers) about",
        "answer_prefix": "Answer:",
    },
    "ru": {
        "headline_prefix": "Ниже приведены вопросы с множественным выбором (с ответами) по",
        "answer_prefix": "Ответ:",
    },
}

[2023-09-08 08:09:43,768] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [5]:
def get_df_in_hendrycks_format(subject: str, split: str, lang: str) -> pd.DataFrame:
    """
    Load one subject/split of "NLPCoreTeam/mmlu_ru" as a positional table.

    Args:
        subject: dataset configuration name (the subject id).
        split: dataset split to load.
        lang: "en" or "ru"; selects which question/choices columns are used.

    Returns:
        DataFrame with integer column labels 0..n-1: column 0 is the question,
        the following columns hold one answer option each (the choices list
        expanded), and the last column is the answer converted with the
        dataset's `int2str`.

    Raises:
        KeyError: if `lang` is neither "en" nor "ru".
    """
    dataset = datasets.load_dataset("NLPCoreTeam/mmlu_ru", name=subject, split=split, use_auth_token=True)
    columns_by_lang = {
        "en": ["question_en", "choices_en", "answer"],
        "ru": ["question_ru", "choices_ru", "answer"],
    }
    question_col, choices_col, answer_col = columns_by_lang[lang]

    raw = dataset.to_pandas()
    label_of = dataset.features["answer"].int2str

    questions = raw[[question_col]]
    # One column per element of the choices list.
    options = pd.DataFrame(raw[choices_col].tolist())
    answers = raw[answer_col].apply(label_of).to_frame()

    wide = pd.concat([questions, options, answers], axis=1)
    wide.columns = range(len(wide.columns))
    return wide

In [6]:
# Repository id of the checkpoint to evaluate; passed to `from_pretrained` for
# both the model and the tokenizer.
model_id = "TheBloke/Llama-2-70B-GPTQ"

In [7]:
import torch
import transformers
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes as bnb
# Record library versions next to the results for reproducibility.
print("torch", torch.__version__)
print("transformers", transformers.__version__)

# Branch (revision) of the model repository to load.
git_branch = "gptq-4bit-32g-actorder_True"
device_map = "auto"
# Per-device memory budget passed to `from_pretrained`, keyed by device index.
max_memory = {0: "20GiB", 1: "22GiB"}

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=git_branch,
    # NOTE(review): this allows code shipped in the model repository to run
    # locally; confirm it is actually required for this checkpoint.
    trust_remote_code=True,
    device_map=device_map,
    max_memory=max_memory,
)

model

torch 2.0.1+cu118
transformers 4.33.1


Downloading (…)der_True/config.json:   0%|          | 0.00/840 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/40.7G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 8192, padding_idx=0)
    (layers): ModuleList(
      (0-79): 80 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (rotary_emb): LlamaRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): LlamaMLP(
          (act_fn): SiLUActivation()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=8192, out_features=32000, bias=False)
)

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer for the same checkpoint; the slow implementation is
# requested explicitly via use_fast=False.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
# Set the EOS/BOS tokens from the ids stored in the model config so that the
# tokenizer and the model agree on them.
tokenizer.add_special_tokens({
    "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
    "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
})
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/745 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

LlamaTokenizer(name_or_path='TheBloke/Llama-2-70B-GPTQ', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '</s>'}, clean_up_tokenization_spaces=False)

In [9]:
def format_subject(subject: str) -> str:
    """
    Turn an underscore-separated subject id into space-separated words.

    Example: "abstract_algebra" -> "abstract algebra". Leading and trailing
    whitespace of the result is stripped.
    """
    return subject.replace("_", " ").strip()

def get_pretty_subject(subject: str, lang: str) -> str:
    """
    Return the human-readable subject name used in the prompt headline.

    Args:
        subject: underscore-separated English subject id.
        lang: "en" to format the id itself, "ru" to format its Russian name
            taken from `subcategories_en2ru`.

    Raises:
        KeyError: if `lang` is not "en"/"ru", or if `lang` is "ru" and the
            subject has no Russian name.
    """
    # The Russian map is consulted only when it is needed, so an English
    # request does not fail for a subject that lacks a Russian translation.
    if lang == "en":
        raw_subject = subject
    elif lang == "ru":
        raw_subject = subcategories_en2ru[subject]  # predefined map
    else:
        raise KeyError(lang)
    return format_subject(raw_subject)

def _format_question_with_options(row: pd.Series) -> str:
    """Render a row as the question text followed by options lettered A-D, one per line."""
    question = row.iloc[0]
    options = row.iloc[1:5].tolist()
    lettered_options = [f"{letter}. {option}" for letter, option in zip(["A", "B", "C", "D"], options)]
    return "\n".join([question] + lettered_options)


def get_prompt_from_dataframes(dev_df: pd.DataFrame, test_df: pd.DataFrame,
                               k: int, test_iloc_idx: int, lang: str, subject: str, conversation_type: str) -> str:
    """
    Build a k-shot multiple-choice prompt that ends right after the answer prefix.

    Args:
        dev_df: few-shot pool; positional columns are question, four options, answer.
        test_df: questions to evaluate, same layout as `dev_df`.
        k: number of few-shot examples taken from the head of `dev_df` (0..5).
        test_iloc_idx: positional index of the test question inside `test_df`.
        lang: key of `LANGUAGE_CONFIG` ("en" or "ru").
        subject: English subject id, used for the headline.
        conversation_type: key of `conversation_classes`.

    Returns:
        The prompt: headline, k solved examples, then the test question
        followed by a newline and the answer prefix.

    Raises:
        AssertionError: if `k` is outside 0..5.
        KeyError: for an unknown `lang`, `subject` or `conversation_type`.
    """
    assert 0 <= k <= 5
    headline_prefix = LANGUAGE_CONFIG[lang]["headline_prefix"]
    headline_postfix = get_pretty_subject(subject=subject, lang=lang)
    headline = f"{headline_prefix} {headline_postfix}.\n\n"

    answer_prefix = LANGUAGE_CONFIG[lang]["answer_prefix"]

    conv = conversation_classes[conversation_type]()

    # Without role markers (non-instruct prompt) the solved examples have to be
    # separated from each other manually with a blank line.
    example_separator = "\n\n" if conv.roles == ("", "") else ""

    is_already_taken_headline = False
    # The headline goes in front of the first example by position, so it does
    # not depend on the index labels of `dev_df`.
    for position, (_, row) in enumerate(dev_df.head(k).iterrows()):
        q_with_lettered_options = _format_question_with_options(row)
        if position == 0:
            q_with_lettered_options = headline + q_with_lettered_options
            is_already_taken_headline = True
        conv.append_message(conv.roles[0], q_with_lettered_options)
        answer = row.iloc[5]
        conv.append_message(conv.roles[1], f"\n{answer_prefix}{answer}{example_separator}")

    q_with_lettered_options = _format_question_with_options(test_df.iloc[test_iloc_idx])
    if not is_already_taken_headline:
        # Zero-shot case: the headline precedes the test question itself.
        q_with_lettered_options = headline + q_with_lettered_options
    conv.append_message(conv.roles[0], q_with_lettered_options)
    conv.append_message(conv.roles[1], None)

    # The answer prefix starts on its own line, exactly as in the solved
    # examples above; without the newline it would be glued to option D.
    prompt = f"{conv.get_prompt()}\n{answer_prefix}"
    return prompt

def calculate_token_interest_probs(
    input_prompt: str,
    tokenizer: transformers.PreTrainedTokenizerBase,
    model: tp.Union[transformers.PreTrainedModel, peft.peft_model.PeftModelForCausalLM],
) -> tp.Dict[str, float]:
    """
    Return the next-token probabilities of the answer letters A-D for a prompt.

    The prompt is run through the model once; the logits at the last position
    are softmax-normalized over the whole vocabulary and the entries for the
    four letter tokens are read out (they are not re-normalized among themselves).

    Args:
        input_prompt: prompt text ending where the answer letter is expected.
        tokenizer: tokenizer matching `model`.
        model: causal language model.

    Returns:
        Dict mapping "A", "B", "C", "D" to their probabilities.

    Raises:
        AssertionError: if `input_prompt` is not a string or the flattened
            last-position logits do not have `model.config.vocab_size` entries.
    """
    assert isinstance(input_prompt, str)
    encoded = tokenizer(input_prompt, return_tensors="pt")
    input_ids = encoded.input_ids.to(model.device)

    with torch.no_grad():
        # logits: (batch_size, sequence_length, vocab_size) -> keep the last position only
        last_position_logits = model(input_ids=input_ids).logits[:, -1, :]

    flat_logits = last_position_logits.flatten()
    assert flat_logits.shape == torch.Size((model.config.vocab_size, ))

    vocab_probs = torch.nn.functional.softmax(flat_logits, dim=-1).cpu()  # all probs over vocab

    letters = ["A", "B", "C", "D"]
    # NOTE(review): the id is the last token of each letter tokenized on its
    # own; confirm it is the same token the model emits right after the answer
    # prefix in the prompt.
    letter_token_ids = [
        tokenizer(letter, add_special_tokens=False).input_ids[-1]
        for letter in letters
    ]
    return dict(zip(letters, vocab_probs[letter_token_ids].tolist()))

def append_to_jsonl(data: list, filename: str) -> None:
    """Append `data` to `filename` as one JSON-encoded line (file opened in append mode)."""
    with open(filename, mode="a") as sink:
        print(json.dumps(data), file=sink)

def evaluate_subject(
    subject: str,
    lang: str,
    k_shot: int,
    jsonl_filepath: str,
    maxlen: int,
    convtype: str,
    tokenizer: transformers.PreTrainedTokenizerBase,
    model: tp.Union[transformers.PreTrainedModel, peft.peft_model.PeftModelForCausalLM],
) -> None:
    """
    Evaluate every test question of one subject and append the results to a JSONL file.

    For each question the prompt is built with `k_shot` examples; while the
    tokenized prompt is longer than `maxlen`, the number of examples is reduced
    by one. A question that does not fit even with zero examples is skipped.
    Each evaluated question appends ``[prompt, label, letter_probabilities]``
    to `jsonl_filepath`.

    Args:
        subject: English subject id.
        lang: "en" or "ru".
        k_shot: maximum number of few-shot examples.
        jsonl_filepath: output file, opened in append mode for each record.
        maxlen: maximum allowed prompt length in tokens.
        convtype: key of `conversation_classes`.
        tokenizer: tokenizer matching `model`.
        model: causal language model.
    """
    dev_df = get_df_in_hendrycks_format(subject=subject, split="dev", lang=lang)
    test_df = get_df_in_hendrycks_format(subject=subject, split="test", lang=lang)

    progress = tqdm(test_df.iterrows(), total=len(test_df), desc=subject)
    # The running position (not the index label) is what `iloc` needs downstream.
    for position, (_, row) in enumerate(progress):

        input_prompt = None
        for current_k_shot in range(k_shot, -1, -1):
            candidate_prompt = get_prompt_from_dataframes(
                dev_df=dev_df,
                test_df=test_df,
                k=current_k_shot,
                test_iloc_idx=position,
                lang=lang,
                subject=subject,
                conversation_type=convtype,
            )
            # Only the token count matters here, so the ids are not moved to
            # the model device.
            n_tokens = tokenizer(candidate_prompt, return_tensors="pt").input_ids.shape[-1]
            if n_tokens <= maxlen:
                input_prompt = candidate_prompt
                break
            logger.info("Takes smaller current_k_shot since maxlen.")

        if input_prompt is None:
            # Even the zero-shot prompt exceeds `maxlen`.
            logger.info("Skip too lengthy.")
            continue

        label = row.iloc[5]

        preds = calculate_token_interest_probs(
            input_prompt=input_prompt,
            tokenizer=tokenizer,
            model=model,
        )

        append_to_jsonl(data=[input_prompt, label, preds], filename=jsonl_filepath)

In [10]:
%%time
lang = "ru"
subject = "abstract_algebra"
convtype = "empy_prompt_conv"
current_k_shot = 5
idx = 0
dev_df = get_df_in_hendrycks_format(subject=subject, split="dev", lang=lang)
test_df = get_df_in_hendrycks_format(subject=subject, split="test", lang=lang)
input_prompt = get_prompt_from_dataframes(
    dev_df=dev_df,
                test_df=test_df,
                k=current_k_shot,
                test_iloc_idx=idx,
                lang=lang,
                subject=subject,
                conversation_type=convtype,
            )
print(input_prompt)
input_ids = tokenizer(
    input_prompt,
    return_tensors="pt",
    return_token_type_ids=False,
    truncation=True,
    padding=True,
)['input_ids']
output_ids = model.generate(input_ids, max_new_tokens=10)
ouput_str = tokenizer.decode(output_ids[0])
ouput_str

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Ниже приведены вопросы с множественным выбором (с ответами) по абстрактной алгебре.

Найдите все c в Z_3 таким образом, чтобы Z_3[x]/(x ^ 2 + c) было полем.
A. 0
B. 1
C. 2
D. 3
Ответ:B

Утверждение 1 | Если aH является элементом факторной группы, то |aH| делит |a|. Утверждение 2 | Если H и K являются подгруппами G, то HK является подгруппой G.
A. Верно, верно
B. Ложь, ложь
C. Правда, ложь
D. Ложь, истина
Ответ:B

Утверждение 1 | Каждый элемент группы порождает циклическую подгруппу группы. Утверждение 2 | Симметричная группа S_10 состоит из 10 элементов.
A. Верно, верно
B. Ложь, ложь
C. Правда, ложь
D. Ложь, истина
Ответ:C

Утверждение 1| Каждая функция из конечного множества на саму себя должна быть один к одному. Утверждение 2 | Каждая подгруппа абелевой группы является абелевой.
A. Верно, верно
B. Ложь, ложь
C. Правда, ложь
D. Ложь, истина
Ответ:A

Найдите характеристику кольца 2Z.
A. 0
B. 3
C. 12
D. 30
Ответ:A

Найдите степень для данного расширения поля Q(sqrt(2), sqrt(3), sqrt(18

'<s>Ниже приведены вопросы с множественным выбором (с ответами) по абстрактной алгебре.\n\nНайдите все c в Z_3 таким образом, чтобы Z_3[x]/(x ^ 2 + c) было полем.\nA. 0\nB. 1\nC. 2\nD. 3\nОтвет:B\n\nУтверждение 1 | Если aH является элементом факторной группы, то |aH| делит |a|. Утверждение 2 | Если H и K являются подгруппами G, то HK является подгруппой G.\nA. Верно, верно\nB. Ложь, ложь\nC. Правда, ложь\nD. Ложь, истина\nОтвет:B\n\nУтверждение 1 | Каждый элемент группы порождает циклическую подгруппу группы. Утверждение 2 | Симметричная группа S_10 состоит из 10 элементов.\nA. Верно, верно\nB. Ложь, ложь\nC. Правда, ложь\nD. Ложь, истина\nОтвет:C\n\nУтверждение 1| Каждая функция из конечного множества на саму себя должна быть один к одному. Утверждение 2 | Каждая подгруппа абелевой группы является абелевой.\nA. Верно, верно\nB. Ложь, ложь\nC. Правда, ложь\nD. Ложь, истина\nОтвет:A\n\nНайдите характеристику кольца 2Z.\nA. 0\nB. 3\nC. 12\nD. 30\nОтвет:A\n\nНайдите степень для данного ра

In [None]:
# Full evaluation run: one JSONL result file per subject inside `output_dir`.
output_dir = "mmlu_ru_Llama-2-70B-GPTQ"
lang = "ru"
k_shot = 5
maxlen = 2048
convtype = "empy_prompt_conv"

# The output directory is the same for every subject, so create it once.
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

subjects = list(subcategories.keys())
for each_subject in subjects:
    jsonl_filepath = str(pathlib.Path(output_dir) / f"{each_subject}.jsonl")
    logger.info(f"Filepath JSONL: {jsonl_filepath}")
    if pathlib.Path(jsonl_filepath).exists():
        # Subjects that already have a result file are skipped, which lets an
        # interrupted run be resumed; a partially written file is NOT detected.
        logger.info("File already exists! Please manually verify that it wasn't partially interrupted.")
        continue
    evaluate_subject(
        subject=each_subject,
        lang=lang,
        k_shot=k_shot,
        jsonl_filepath=jsonl_filepath,
        maxlen=maxlen,
        convtype=convtype,
        tokenizer=tokenizer,
        model=model,
    )

In [18]:
# Directory holding the per-subject JSONL result files to analyse. It is set
# again here, so the analysis cells do not depend on the evaluation cell
# having been run in this session.
output_dir = "mmlu_ru_Llama-2-70B-GPTQ"

In [19]:
import numpy as np

# Inverse of `categories`: fine-grained subcategory name -> top-level category name.
category_to_main_category = {value: key for key, sublist in categories.items() for value in sublist}
# Subject id -> top-level category, resolved through the first subcategory
# listed for the subject in `subcategories`.
subcategories2categories = {key: category_to_main_category[value[0]] for key, value in subcategories.items()}

def calculate_accuracy_from_directory(
    dirpath: str,
    subcategory_to_category: tp.Optional[tp.Mapping[str, str]] = None,
) -> tp.Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Compute accuracy tables from a directory of per-subject JSONL result files.

    Every ``<subject>.jsonl`` file holds one ``[prompt, label, preds]`` record
    per line, where `preds` maps answer letters to probabilities. The predicted
    letter is the one with the highest probability.

    Args:
        dirpath: directory with the ``*.jsonl`` files; its name becomes the
            name of the accuracy column.
        subcategory_to_category: subject id -> top-level category. Defaults to
            the module-level `subcategories2categories`.

    Returns:
        ``(subcategories_df, categories_df, total_df)``:
        per-subject accuracy in percent; per-category accuracy (unweighted mean
        of the subject accuracies of that category); and a 1x1 frame with the
        unweighted mean of the category accuracies. Subjects missing from the
        mapping do not contribute to the category and total tables.

    Raises:
        AssertionError: if `dirpath` does not exist.
    """
    directory = pathlib.Path(dirpath)
    assert directory.exists(), f"Directory not found: {dirpath}"
    if subcategory_to_category is None:
        subcategory_to_category = subcategories2categories

    # `.name`, not `.stem`: a directory name such as "model-1.5" must not lose
    # the part after its last dot.
    run_name = directory.name

    accuracy_by_subject: tp.Dict[str, float] = {}
    # Sorted so that the row order does not depend on the file system.
    for each_filepath in sorted(directory.glob('*.jsonl')):
        df = pd.read_json(each_filepath, lines=True)
        df.columns = ['prompt', 'label', 'preds']
        cors = []
        for preds, y_true in zip(df['preds'], df['label']):
            letters = list(preds.keys())
            y_pred = letters[np.argmax(list(preds.values()))]
            cors.append(y_true.strip() == y_pred.strip())
        accuracy_by_subject[each_filepath.stem] = float(np.mean(cors)) * 100

    subcategories_df = pd.DataFrame({
        'subcategory': list(accuracy_by_subject.keys()),
        run_name: pd.Series(list(accuracy_by_subject.values()), dtype=float),
    })

    categories_df = (
        subcategories_df
        .assign(subcategory=subcategories_df['subcategory'].map(dict(subcategory_to_category)))
        .rename(columns={'subcategory': 'category'})
        .groupby('category')
        .mean()
        .reset_index()
    )

    total_df = pd.DataFrame({run_name: [categories_df[run_name].mean()]})

    return (subcategories_df, categories_df, total_df)

# Compute the per-subject, per-category and overall accuracy tables for the run.
subcategories_df, categories_df, total_df = calculate_accuracy_from_directory(dirpath=output_dir)
print(total_df.shape)
# Overall score: mean of the per-category accuracies.
total_df

(1, 1)


Unnamed: 0,mmlu_ru_Llama-2-70B-GPTQ
0,60.205311


In [20]:
# Accuracy (percent) per top-level category.
categories_df

Unnamed: 0,category,mmlu_ru_Llama-2-70B-GPTQ
0,STEM,49.651062
1,humanities,66.031919
2,"other (business, health, misc.)",58.215279
3,social sciences,66.922983


In [21]:
# Accuracy (percent) per subject.
subcategories_df

Unnamed: 0,subcategory,mmlu_ru_Llama-2-70B-GPTQ
0,abstract_algebra,33.0
1,anatomy,53.333333
2,astronomy,70.394737
3,business_ethics,60.0
4,clinical_knowledge,57.358491
5,college_biology,61.805556
6,college_chemistry,43.0
7,college_computer_science,48.0
8,college_mathematics,34.0
9,college_medicine,50.289017
