In [1]:

import sys

sys.path.append("../")

In [2]:
import polars as pl
from dotenv import load_dotenv

from evaluation.annotation import TaggedText
from evaluation.eval_algorithm import TaggedTextComparison

load_dotenv()

True

In [3]:
from huggingface_hub import login
import os

login(token=os.getenv("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/ostapbodnar/.cache/huggingface/token
Login successful


In [4]:
from huggingface_hub import InferenceClient

client = InferenceClient(base_url="https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud")

In [8]:
compl = client.chat.completions.create(
    [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Давай відповіді на запитання. \n Input: Яка столиця України?\n Output:"},
    ],
    max_tokens=2096,
    # top_k=30,
    top_p=0.9,
    temperature=0.2,
    # repetition_penalty=1.02,
    stop=["\nUser:", "<|endoftext|>", "</s>"],
)
compl

ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content='Київ', tool_calls=None), logprobs=None)], created=1726219613, id='', model='/repository', system_fingerprint='2.2.1-dev0-sha-f852190', usage=ChatCompletionOutputUsage(completion_tokens=3, prompt_tokens=36, total_tokens=39))

In [9]:
compl.choices[0].message['content']



'Київ'

In [10]:
from datasets import load_dataset, Features, Value, concatenate_datasets
from collections import Counter

features = Features({
    'input': Value('string'),
    'output': Value('string'),
    'instruct': Value('string'),
    'dataset_type': Value('string'),
    'dataloader_name': Value('string')
})


def custom_sample(dataset, sampling_fractions):
    sampled_datasets = []
    for name, fraction in sampling_fractions.items():
        filtered_dataset = dataset.filter(lambda x: x['dataloader_name'] == name)
        if fraction < 1.0:
            sampled_dataset = filtered_dataset.train_test_split(train_size=fraction, seed=42)['train']
        else:
            sampled_dataset = filtered_dataset
        sampled_datasets.append(sampled_dataset)

    return concatenate_datasets(sampled_datasets)


def value_counts(data):
    value_counts = Counter(data)
    print(value_counts)

In [11]:
artificial_ds = load_dataset("ostapbodnar/ua-gec-pos-ner-artificial", data_dir="small", split="validation",
                             features=features)

In [12]:
artificial_ds

Dataset({
    features: ['input', 'output', 'instruct', 'dataset_type', 'dataloader_name'],
    num_rows: 25549
})

In [13]:
sampling_fractions = {
    "PapersDataset": 0.65,
    "UbertextV2Dataset": 1.0,
}

artificial_ds = artificial_ds.filter(lambda x: x['dataloader_name'] in sampling_fractions)
artificial_ds = custom_sample(artificial_ds, sampling_fractions)
artificial_ds = artificial_ds.shuffle(seed=42)

artificial_ds = artificial_ds.shuffle(seed=42).select(range(1000))

value_counts(artificial_ds['dataloader_name'])

Counter({'PapersDataset': 548, 'UbertextV2Dataset': 452})


In [14]:
golden_ds = load_dataset("ostapbodnar/ua-gec-pos-ner-golden", features=features, split="validation")

In [15]:
sampling_fractions = {
    "UaSqaudDataset": 0.35,
    "NewsTopicClassificationDataset": 0.03,
    "NewsKeywordDataset": 0.03,
    "WscDataset": 1.0,
    'ZnoDataset': 0.85,
}

golden_ds_class = golden_ds.filter(lambda x: x['dataloader_name'] in sampling_fractions)
golden_ds_class = custom_sample(golden_ds_class, sampling_fractions)
golden_ds_class = golden_ds_class.shuffle(seed=42)

golden_ds_class = golden_ds_class.shuffle(seed=42).select(range(1000))

value_counts(golden_ds_class['dataloader_name'])

Counter({'UaSqaudDataset': 295, 'NewsTopicClassificationDataset': 276, 'NewsKeywordDataset': 241, 'ZnoDataset': 176, 'WscDataset': 12})


In [16]:
sampling_fractions = {
    "UaGecDataset": 0.55,
    "NerDataset": 1.0,
    'MovaInstPosDataset': 0.55,
}

golden_ds_tag = golden_ds.filter(lambda x: x['dataloader_name'] in sampling_fractions)
golden_ds_tag = custom_sample(golden_ds_tag, sampling_fractions)
golden_ds_tag = golden_ds_tag.shuffle(seed=42)

golden_ds_tag = golden_ds_tag.shuffle(seed=42).select(range(1000))
value_counts(golden_ds_tag['dataloader_name'])

Counter({'UaGecDataset': 485, 'MovaInstPosDataset': 471, 'NerDataset': 44})


In [17]:
import os
from openai import OpenAI, AsyncOpenAI
from huggingface_hub import AsyncInferenceClient

openai_client = AsyncOpenAI(api_key=os.environ['OPENAI_API_KEY'])

models = {
    'microsoft/Phi-3.5-mini-instruct': AsyncInferenceClient(
        base_url="https://jo64cpqkvayl7daj.us-east-1.aws.endpoints.huggingface.cloud"),
    # 'ostapbodnar/Phi3-mini-4k-instruct-UA': InferenceClient(base_url="https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud"),
    'ostapbodnar/Phi3.5-mini-instruct-UA-qlora': AsyncInferenceClient(
        base_url="https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud"),
    'ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial': AsyncInferenceClient(
        base_url="https://kg2x3u556icag7u2.eu-west-1.aws.endpoints.huggingface.cloud"),
    'ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed': AsyncInferenceClient(
        base_url="https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud"),
    'ft:gpt-4o-mini-2024-07-18:personal::A6RBdxHu': openai_client,
    'gpt-4o-mini': openai_client,
    'gpt-4o': openai_client
}


In [18]:
compl = await openai_client.chat.completions.create(

    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Давай відповіді на запитання. \n Input: Яка столиця України?\n Output:"},
    ],
        model="gpt-4o",
    max_tokens=2096,
    # top_k=30,
    top_p=0.9,
    temperature=0.2,
    # repetition_penalty=1.02,
    stop=["\nUser:", "<|endoftext|>", "</s>"],
)
compl

ChatCompletion(id='chatcmpl-A6wrtCmxonN8j7mpaVm0Doea6FCZK', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Київ є столицею України.', refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1726219621, model='gpt-4o-2024-05-13', object='chat.completion', service_tier=None, system_fingerprint='fp_25624ae3a5', usage=CompletionUsage(completion_tokens=9, prompt_tokens=39, total_tokens=48, completion_tokens_details={'reasoning_tokens': 0}))

In [19]:
from difflib import SequenceMatcher
from datasets import Dataset
from tqdm.notebook import tqdm_notebook
import asyncio


async def _eval_model(model_name, client: AsyncOpenAI | AsyncInferenceClient, dataset, scoring_func, max_tokens,
                      concurrency_limit=10):
    print(f"Evaluation of {model_name}")

    semaphore = asyncio.Semaphore(concurrency_limit)  # Limit concurrent tasks
    results = [None] * len(dataset)  # Pre-allocate results list

    # Create a tqdm progress bar
    progress_bar = tqdm_notebook(total=len(dataset), desc=f'Evaluating {model_name}')

    async def process_row(index, row):
        async with semaphore:
            input_request = f'{row["instruct"]} \n Input: {row["input"]}\n Output:'
            messages = [
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": input_request},
            ]

            try:
                response = await client.chat.completions.create(
                    model=model_name,
                    messages=messages,
                    max_tokens=max_tokens,
                    top_p=0.9,
                    temperature=0.2,
                    frequency_penalty=1.7,
                    stop=["\nUser:", "<|endoftext|>", "</s>", "\n"],
                )
                generated_text = response.choices[0].message.content
            except Exception as e:
                print(e)
                generated_text = None
                input_request = None

            try:
                accuracy_score = scoring_func(row, generated_text)
            except Exception as e:
                print(e)
                accuracy_score = None

            # Store results with proper ordering
            results[index] = {
                "model": model_name,
                "input": input_request,
                "output": row["output"],
                "generated_output": generated_text,
                "dataset_type": row['dataloader_name'],
                "accuracy": accuracy_score
            }

            progress_bar.update(1)

    if 'gpt' in model_name:
        for i, row in enumerate(dataset):
            await process_row(i, row)
    else:
        tasks = [process_row(i, row) for i, row in enumerate(dataset)]
        await asyncio.gather(*tasks)

    progress_bar.close()
    df = pl.DataFrame(results)
    return df


async def perform_eval_async(dataset: Dataset, scoring_func, max_tokens=2096):
    tasks = [
        _eval_model(model_name, client, dataset, scoring_func, max_tokens)
        for model_name, client in models.items()
    ]
    results = await asyncio.gather(*tasks)
    return pl.concat(results)


def get_score_grammar_func(*args, **kwargs):
    comparator = TaggedTextComparison(*args, **(dict(max_position_deviation=2, ignore_additional_tags=False) | kwargs))

    def score_grammar(row, generated_text):
        reference_text = TaggedText(row["output"])
        proposed_text = TaggedText(generated_text)
        return comparator.compute_accuracy_score(reference_text, proposed_text)

    return score_grammar


def score_q_and_a_func(row, generated_text):
    reference_text = row["output"]
    similarity_ratio = SequenceMatcher(None, reference_text, generated_text).ratio()
    return similarity_ratio

In [55]:
def calc_stats(df):
    accuracy_df = df.group_by(['model', 'dataset_type']).agg([
            (pl.col('accuracy').sum() / pl.len()).alias('accuracy')
        ])
    pivot_df = (
        accuracy_df.pivot(
            values='accuracy',
            index='model',
            columns='dataset_type'
        )
    )
    
    col_means = pivot_df.select(pl.all().exclude('model')).mean_horizontal()
    pivot_df = pivot_df.with_columns(mean=col_means)
    
    row_means = pivot_df.select(pl.all().exclude('model')).mean()
    row_means = row_means.with_columns(model=pl.lit('mean')).select(pivot_df.columns)
    return pl.concat([pivot_df, row_means])

In [20]:
models.keys()

dict_keys(['microsoft/Phi-3.5-mini-instruct', 'ostapbodnar/Phi3.5-mini-instruct-UA-qlora', 'ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial', 'ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed', 'ft:gpt-4o-mini-2024-07-18:personal::A6RBdxHu', 'gpt-4o-mini', 'gpt-4o'])

In [21]:
artificial_results = await perform_eval_async(artificial_ds.shuffle(seed=42).select(range(10)),
                                              get_score_grammar_func())
artificial_results.write_csv("artificial_scores.csv")
artificial_results

Evaluation of microsoft/Phi-3.5-mini-instruct


Evaluating microsoft/Phi-3.5-mini-instruct:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-qlora


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-qlora:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ft:gpt-4o-mini-2024-07-18:personal::A6RBdxHu


Evaluating ft:gpt-4o-mini-2024-07-18:personal::A6RBdxHu:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of gpt-4o-mini


Evaluating gpt-4o-mini:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of gpt-4o


Evaluating gpt-4o:   0%|          | 0/10 [00:00<?, ?it/s]

422, message='Unprocessable Entity', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>
422, message='Unprocessable Entity', url='https://kg2x3u556icag7u2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>
422, message='Unprocessable Entity', url='https://jo64cpqkvayl7daj.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>
422, message='Unprocessable Entity', url='https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>


model,input,output,generated_output,dataset_type,accuracy
str,str,str,str,str,f64
"""microsoft/Phi-3.5-mini-instruc…","""Використай наведені нижче інст…","""<p t=""X"">уничтожены</p> <p t=""…","""<g ed=""G/Case"" et=""Тип помилки…","""PapersDataset""",0.005952
"""microsoft/Phi-3.5-mini-instruc…","""Використай наведені нижче інст…","""<p t=""ADJ"">Оперативная</p> <p …","""<g ed=""G/Case"" et=""Тип помилки…","""PapersDataset""",0.004587
"""microsoft/Phi-3.5-mini-instruc…","""Використай наведені нижче інст…","""<p t=""NOUN"">Року</p> <p t=""ADJ…","""<g ed=""G/Case"" et=""Тип помилки…","""PapersDataset""",0.0
"""microsoft/Phi-3.5-mini-instruc…","""Використай наведені нижче інст…","""<g ed=""."" et=""G/Case""><p t=""PU…","""<g ed=""G/Date"" et=""Тип помилки…","""PapersDataset""",0.0
"""microsoft/Phi-3.5-mini-instruc…",,"""<p t=""X"">Ɍɚɤɢɦ</p> <p t=""X"">ɱɢ…",,"""PapersDataset""",
…,…,…,…,…,…
"""gpt-4o""","""Використай наведені нижче інст…","""<p t=""ADJ"">Мотивувальна</p> <p…","""Мотивувальна частина <n t=""ORG…","""UbertextV2Dataset""",0.009375
"""gpt-4o""","""Використай наведені нижче інст…","""<p t=""NOUN"">Тренинг</p> <p t=""…","""Тренінг в сучасній <g ed=""літе…","""PapersDataset""",0.0
"""gpt-4o""","""Використай наведені нижче інст…","""<p t=""NOUN"">Співробітники</p> …","""Співробітники <n t=""LOC"">Вінни…","""UbertextV2Dataset""",0.0
"""gpt-4o""","""Використай наведені нижче інст…","""<p t=""ADV"">Сьогодні</p> <p t=""…","""Сьогодні ці ідеї використання …","""PapersDataset""",0.006024


In [56]:
calc_stats(artificial_results)

  accuracy_df.pivot(


model,PapersDataset,UbertextV2Dataset,mean
str,f64,f64,f64
"""gpt-4o""",0.004325,0.005408,0.004867
"""ft:gpt-4o-mini-2024-07-18:pers…",0.0061,0.003805,0.004953
"""ostapbodnar/Phi3.5-mini-instru…",0.092747,0.130036,0.111391
"""ostapbodnar/Phi3.5-mini-instru…",0.111426,0.101297,0.106362
"""microsoft/Phi-3.5-mini-instruc…",0.002009,0.000963,0.001486
"""ostapbodnar/Phi3.5-mini-instru…",0.0,0.000761,0.000381
"""gpt-4o-mini""",0.001311,0.0,0.000655
"""mean""",0.031131,0.03461,0.032871


In [23]:
golden_grammar_results = await perform_eval_async(golden_ds_tag.shuffle(seed=42).select(range(10)),
                                                  get_score_grammar_func())
golden_grammar_results.write_csv("golden_grammar_results_scores.csv")
golden_grammar_results

Evaluation of microsoft/Phi-3.5-mini-instruct


Evaluating microsoft/Phi-3.5-mini-instruct:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-qlora


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-qlora:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ft:gpt-4o-mini-2024-07-18:personal::A6RBdxHu


Evaluating ft:gpt-4o-mini-2024-07-18:personal::A6RBdxHu:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of gpt-4o-mini


Evaluating gpt-4o-mini:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of gpt-4o


Evaluating gpt-4o:   0%|          | 0/10 [00:00<?, ?it/s]

model,input,output,generated_output,dataset_type,accuracy
str,str,str,str,str,f64
"""microsoft/Phi-3.5-mini-instruc…","""Визнач частину мови для кожног…","""<p t=""VERB"">Заглянула</p> <p t…","""`<p t=""VERB"">Заглянула</p> <p …","""MovaInstPosDataset""",0.833333
"""microsoft/Phi-3.5-mini-instruc…","""Виправ граматичні помилки в по…",""" Варто згадати, що цей період …","""<g ed=""Prep"" et=""G/Prep"">супро…","""UaGecDataset""",0.0
"""microsoft/Phi-3.5-mini-instruc…","""Виправ граматичні помилки в по…","""Льоня відірвав очі від стелі і…","""<g ed=""D/Prep"" et=""Grammar"">Ло…","""UaGecDataset""",0.0
"""microsoft/Phi-3.5-mini-instruc…","""Визнач частину мови для кожног…","""<p t=""PART"">Не</p> <p t=""NOUN""…","""`<p t=""PREP"">Не</p> <p t=""PART…","""MovaInstPosDataset""",0.5
"""microsoft/Phi-3.5-mini-instruc…","""Виправ граматичні помилки в по…","""Якщо <g ed=""в заняттях"" et=""G/…","""<g ed=""Punctuation"" et=""Punctu…","""UaGecDataset""",0.0
…,…,…,…,…,…
"""gpt-4o""","""Визнач частину мови для кожног…","""<p t=""NOUN"">Посол</p> <p t=""PR…","""<p t=""NOUN"">Посол</p> <p t=""NO…","""MovaInstPosDataset""",0.195652
"""gpt-4o""","""Визнач частину мови для кожног…","""<p t=""NOUN"">Художник</p> <p t=…","""<p t=""NOUN"">Художник</p> <p t=…","""MovaInstPosDataset""",0.07377
"""gpt-4o""","""Визнач частину мови для кожног…","""<p t=""INTJ"">Бліна</p> <p t=""PU…","""<p t=""NOUN"">Бліна</p> <p t=""PU…","""MovaInstPosDataset""",0.75
"""gpt-4o""","""Виправ граматичні помилки в по…","""— Свіські сірники! <g ed=""О"" e…","""— <g ed=""Сільські"" et=""Spellin…","""UaGecDataset""",0.0


In [58]:
calc_stats(golden_grammar_results)

  accuracy_df.pivot(


model,UaGecDataset,MovaInstPosDataset,mean
str,f64,f64,f64
"""ostapbodnar/Phi3.5-mini-instru…",0.0,0.425,0.2125
"""ostapbodnar/Phi3.5-mini-instru…",0.0,0.255601,0.127801
"""ostapbodnar/Phi3.5-mini-instru…",0.0,0.0,0.0
"""gpt-4o""",0.0,0.549954,0.274977
"""gpt-4o-mini""",0.007353,0.524616,0.265984
"""microsoft/Phi-3.5-mini-instruc…",0.0,0.473726,0.236863
"""ft:gpt-4o-mini-2024-07-18:pers…",0.044118,0.298,0.171059
"""mean""",0.007353,0.360985,0.184169


In [59]:
golden_results = await perform_eval_async(golden_ds_class.shuffle(seed=42).select(range(10)), score_q_and_a_func, max_tokens=128)
golden_results.write_csv("golden_q_and_a_scores.csv")
golden_results

Evaluation of microsoft/Phi-3.5-mini-instruct


Evaluating microsoft/Phi-3.5-mini-instruct:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-qlora


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-qlora:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of ft:gpt-4o-mini-2024-07-18:personal::A6RBdxHu


Evaluating ft:gpt-4o-mini-2024-07-18:personal::A6RBdxHu:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of gpt-4o-mini


Evaluating gpt-4o-mini:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation of gpt-4o


Evaluating gpt-4o:   0%|          | 0/10 [00:00<?, ?it/s]

400, message='Bad Request', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'NoneType' has no len()
400, message='Bad Request', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'NoneType' has no len()
400, message='Bad Request', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'NoneType' has no len()
400, message='Bad Request', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'NoneType' has no len()
400, message='Bad Request', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'NoneType' has no len()
400, message='Bad Request', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'NoneType' has no len()
400, message='Bad Request', url='https:/

CancelledError: 

In [60]:
calc_stats(golden_results)

  accuracy_df.pivot(


model,NewsTopicClassificationDataset,ZnoDataset,NewsKeywordDataset,mean
str,f64,f64,f64,f64
"""ostapbodnar/Phi3.5-mini-instru…",0.049828,0.003401,0.198051,0.08376
"""microsoft/Phi-3.5-mini-instruc…",0.151098,0.007576,0.267253,0.141976
"""ft:gpt-4o-mini-2024-07-18:pers…",0.8125,0.061279,0.642144,0.505308
"""ostapbodnar/Phi3.5-mini-instru…",0.124008,0.003268,0.005698,0.044324
"""gpt-4o""",0.247146,0.006944,0.205545,0.153212
"""gpt-4o-mini""",0.225103,0.005556,0.125078,0.118579
"""ostapbodnar/Phi3.5-mini-instru…",0.604423,0.007576,0.61127,0.407756
"""mean""",0.316301,0.013657,0.293577,0.207845


python main.py     --tasks=mmlu_uk --limit=100     --model_args pretrained=microsoft/Phi-3.5-mini-instruct     --device=cuda

python main.py     --tasks=mmlu_uk --limit=100     --model_args pretrained=ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed-16bit     --device=cuda

export HF_DATASETS_TRUST_REMOTE_CODE=true