In [1]:

import sys

sys.path.append("../")

In [2]:
import polars as pl
from dotenv import load_dotenv

from evaluation.annotation import TaggedText
from evaluation.eval_algorithm import TaggedTextComparison

load_dotenv()

True

In [3]:
from huggingface_hub import login
import os

login(token=os.getenv("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/ostapbodnar/.cache/huggingface/token
Login successful


In [4]:
from huggingface_hub import InferenceClient

client = InferenceClient(base_url="https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud")

In [5]:
compl = client.chat.completions.create(
    [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Давай відповіді на запитання. \n Input: Яка столиця України?\n Output:"},
    ],
    max_tokens=2096,
    # top_k=30,
    top_p=0.9,
    temperature=0.2,
    # repetition_penalty=1.02,
    stop=["\nUser:", "<|endoftext|>", "</s>"],
)
compl

ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content='Київ', tool_calls=None), logprobs=None)], created=1726403409, id='', model='/repository', system_fingerprint='2.2.1-dev0-sha-f852190', usage=ChatCompletionOutputUsage(completion_tokens=3, prompt_tokens=36, total_tokens=39))

In [6]:
compl.choices[0].message['content']



'Київ'

In [7]:
from datasets import load_dataset, Features, Value, concatenate_datasets
from collections import Counter

features = Features({
    'input': Value('string'),
    'output': Value('string'),
    'instruct': Value('string'),
    'dataset_type': Value('string'),
    'dataloader_name': Value('string')
})


def custom_sample(dataset, sampling_fractions):
    sampled_datasets = []
    for name, fraction in sampling_fractions.items():
        filtered_dataset = dataset.filter(lambda x: x['dataloader_name'] == name)
        if fraction < 1.0:
            sampled_dataset = filtered_dataset.train_test_split(train_size=fraction, seed=42)['train']
        else:
            sampled_dataset = filtered_dataset
        sampled_datasets.append(sampled_dataset)

    return concatenate_datasets(sampled_datasets)


def value_counts(data):
    value_counts = Counter(data)
    print(value_counts)

In [8]:
artificial_ds = load_dataset("ostapbodnar/ua-gec-pos-ner-artificial", data_dir="small", split="validation",
                             features=features)

In [9]:
artificial_ds

Dataset({
    features: ['input', 'output', 'instruct', 'dataset_type', 'dataloader_name'],
    num_rows: 25549
})

In [10]:
sampling_fractions = {
    "PapersDataset": 0.65,
    "UbertextV2Dataset": 1.0,
}

artificial_ds = artificial_ds.filter(lambda x: x['dataloader_name'] in sampling_fractions)
artificial_ds = custom_sample(artificial_ds, sampling_fractions)
artificial_ds = artificial_ds.shuffle(seed=42)

artificial_ds = artificial_ds.shuffle(seed=42).select(range(1000))

value_counts(artificial_ds['dataloader_name'])

Counter({'PapersDataset': 548, 'UbertextV2Dataset': 452})


In [11]:
golden_ds = load_dataset("ostapbodnar/ua-gec-pos-ner-golden", features=features, split="validation")

In [12]:
sampling_fractions = {
    "UaSqaudDataset": 0.35,
    "NewsTopicClassificationDataset": 0.03,
    "NewsKeywordDataset": 0.03,
    "WscDataset": 1.0,
    'ZnoDataset': 0.85,
}

golden_ds_class = golden_ds.filter(lambda x: x['dataloader_name'] in sampling_fractions)
golden_ds_class = custom_sample(golden_ds_class, sampling_fractions)
golden_ds_class = golden_ds_class.shuffle(seed=42)

golden_ds_class = golden_ds_class.shuffle(seed=42).select(range(1000))

value_counts(golden_ds_class['dataloader_name'])

Counter({'UaSqaudDataset': 295, 'NewsTopicClassificationDataset': 276, 'NewsKeywordDataset': 241, 'ZnoDataset': 176, 'WscDataset': 12})


In [13]:
sampling_fractions = {
    "UaGecDataset": 0.55,
    "NerDataset": 1.0,
    'MovaInstPosDataset': 0.55,
}

golden_ds_tag = golden_ds.filter(lambda x: x['dataloader_name'] in sampling_fractions)
golden_ds_tag = custom_sample(golden_ds_tag, sampling_fractions)
golden_ds_tag = golden_ds_tag.shuffle(seed=42)

golden_ds_tag = golden_ds_tag.shuffle(seed=42).select(range(1000))
value_counts(golden_ds_tag['dataloader_name'])

Counter({'UaGecDataset': 485, 'MovaInstPosDataset': 471, 'NerDataset': 44})


In [14]:
import os
from openai import OpenAI, AsyncOpenAI
from huggingface_hub import AsyncInferenceClient

openai_client = AsyncOpenAI(api_key=os.environ['OPENAI_API_KEY'])

models = {
    'microsoft/Phi-3.5-mini-instruct': AsyncInferenceClient(
        base_url="https://jo64cpqkvayl7daj.us-east-1.aws.endpoints.huggingface.cloud"),
    # 'ostapbodnar/Phi3-mini-4k-instruct-UA': InferenceClient(base_url="https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud"),
    'ostapbodnar/Phi3.5-mini-instruct-UA-qlora': AsyncInferenceClient(
        base_url="https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud"),
    'ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial': AsyncInferenceClient(
        base_url="https://kg2x3u556icag7u2.eu-west-1.aws.endpoints.huggingface.cloud"),
    'ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed': AsyncInferenceClient(
        base_url="https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud"),
    'ft:gpt-4o-mini-2024-07-18:personal::A7fOmagX': openai_client,
    'gpt-4o-mini': openai_client,
}


In [15]:
compl = await openai_client.chat.completions.create(

    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Давай відповіді на запитання. \n Input: Яка столиця України?\n Output:"},
    ],
        model="gpt-4o",
    max_tokens=2096,
    # top_k=30,
    top_p=0.9,
    temperature=0.2,
    # repetition_penalty=1.02,
    stop=["\nUser:", "<|endoftext|>", "</s>"],
)
compl

ChatCompletion(id='chatcmpl-A7igP6oezD6bFdqBjx9UCbEIPdkdd', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Київ є столицею України.', refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1726403421, model='gpt-4o-2024-05-13', object='chat.completion', service_tier=None, system_fingerprint='fp_25624ae3a5', usage=CompletionUsage(completion_tokens=9, prompt_tokens=39, total_tokens=48, completion_tokens_details={'reasoning_tokens': 0}))

In [16]:
from difflib import SequenceMatcher
from datasets import Dataset
from tqdm.notebook import tqdm_notebook
import asyncio


async def _eval_model(model_name, client: AsyncOpenAI | AsyncInferenceClient, dataset, scoring_func, max_tokens,
                      concurrency_limit=10):
    print(f"Evaluation of {model_name}")

    semaphore = asyncio.Semaphore(concurrency_limit)  # Limit concurrent tasks
    results = [None] * len(dataset)  # Pre-allocate results list

    # Create a tqdm progress bar
    progress_bar = tqdm_notebook(total=len(dataset), desc=f'Evaluating {model_name}')

    async def process_row(index, row):
        async with semaphore:
            input_request = f'{row["instruct"]} \n Input: {row["input"]}\n Output:'
            messages = [
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": input_request},
            ]

            try:
                response = await client.chat.completions.create(
                    model=model_name,
                    messages=messages,
                    max_tokens=max_tokens,
                    top_p=0.9,
                    temperature=0.2,
                    frequency_penalty=1.7,
                    stop=["\nUser:", "<|endoftext|>", "</s>", "\n"],
                )
                generated_text = response.choices[0].message.content
            except Exception as e:
                print(e)
                generated_text = None
                input_request = None

            try:
                accuracy_score = scoring_func(row, generated_text)
            except Exception as e:
                print(e)
                accuracy_score = None

            # Store results with proper ordering
            results[index] = {
                "model": model_name,
                "input": input_request,
                "output": row["output"],
                "generated_output": generated_text,
                "dataset_type": row['dataloader_name'],
                "accuracy": accuracy_score
            }

            progress_bar.update(1)

    if 'gpt' in model_name:
        for i, row in enumerate(dataset):
            await process_row(i, row)
    else:
        tasks = [process_row(i, row) for i, row in enumerate(dataset)]
        await asyncio.gather(*tasks)

    progress_bar.close()
    df = pl.DataFrame(results)
    return df


async def perform_eval_async(dataset: Dataset, scoring_func, max_tokens=2096):
    tasks = [
        _eval_model(model_name, client, dataset, scoring_func, max_tokens)
        for model_name, client in models.items()
    ]
    results = await asyncio.gather(*tasks)
    return pl.concat(results)


def get_score_grammar_func(*args, **kwargs):
    comparator = TaggedTextComparison(*args, **(dict(max_position_deviation=2, ignore_additional_tags=False) | kwargs))

    def score_grammar(row, generated_text):
        reference_text = TaggedText(row["output"])
        proposed_text = TaggedText(generated_text)
        return comparator.compute_accuracy_score(reference_text, proposed_text)

    return score_grammar


def score_q_and_a_func(row, generated_text):
    reference_text = row["output"]
    similarity_ratio = SequenceMatcher(None, reference_text, generated_text).ratio()
    return similarity_ratio

In [17]:
def calc_stats(df):
    accuracy_df = df.group_by(['model', 'dataset_type']).agg([
            (pl.col('accuracy').sum() / pl.len()).alias('accuracy')
        ])
    pivot_df = (
        accuracy_df.pivot(
            values='accuracy',
            index='model',
            columns='dataset_type'
        )
    )
    
    col_means = pivot_df.select(pl.all().exclude('model')).mean_horizontal()
    pivot_df = pivot_df.with_columns(mean=col_means)
    
    row_means = pivot_df.select(pl.all().exclude('model')).mean()
    row_means = row_means.with_columns(model=pl.lit('mean')).select(pivot_df.columns)
    return pl.concat([pivot_df, row_means])

In [18]:
models.keys()

dict_keys(['microsoft/Phi-3.5-mini-instruct', 'ostapbodnar/Phi3.5-mini-instruct-UA-qlora', 'ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial', 'ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed', 'ft:gpt-4o-mini-2024-07-18:personal::A7fOmagX', 'gpt-4o-mini'])

In [20]:
artificial_results = await perform_eval_async(artificial_ds.shuffle(seed=42).select(range(100)),
                                              get_score_grammar_func())
artificial_results.write_csv("artificial_scores.csv")
artificial_results

Evaluation of microsoft/Phi-3.5-mini-instruct


Evaluating microsoft/Phi-3.5-mini-instruct:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-qlora


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-qlora:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial:   0%|          | 0/100 [00:00<?, ?it/s…

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of ft:gpt-4o-mini-2024-07-18:personal::A7fOmagX


Evaluating ft:gpt-4o-mini-2024-07-18:personal::A7fOmagX:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of gpt-4o-mini


Evaluating gpt-4o-mini:   0%|          | 0/100 [00:00<?, ?it/s]

422, message='Unprocessable Entity', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>
422, message='Unprocessable Entity', url='https://kg2x3u556icag7u2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>
422, message='Unprocessable Entity', url='https://jo64cpqkvayl7daj.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>
422, message='Unprocessable Entity', url='https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>


model,input,output,generated_output,dataset_type,accuracy
str,str,str,str,str,f64
"""microsoft/Phi-3.5-mini-instruc…","""Використай наведені нижче інст…","""<p t=""X"">уничтожены</p> <p t=""…","""<g ed=""G/Case"" et=""Тип помилки…","""PapersDataset""",0.005952
"""microsoft/Phi-3.5-mini-instruc…","""Використай наведені нижче інст…","""<p t=""ADJ"">Оперативная</p> <p …","""<g ed=""G/Case"" et=""Тип помилки…","""PapersDataset""",0.004587
"""microsoft/Phi-3.5-mini-instruc…","""Використай наведені нижче інст…","""<p t=""NOUN"">Року</p> <p t=""ADJ…","""<g ed=""G/Case"" et=""Тип помилки…","""PapersDataset""",0.0
"""microsoft/Phi-3.5-mini-instruc…","""Використай наведені нижче інст…","""<g ed=""."" et=""G/Case""><p t=""PU…","""<g ed=""G/Date"" et=""Тип помилки…","""PapersDataset""",0.0
"""microsoft/Phi-3.5-mini-instruc…",,"""<p t=""X"">Ɍɚɤɢɦ</p> <p t=""X"">ɱɢ…",,"""PapersDataset""",
…,…,…,…,…,…
"""gpt-4o-mini""","""Використай наведені нижче інст…","""<p t=""X"">Symptoms</p> <p t=""X""…","""```xml""","""PapersDataset""",0.0
"""gpt-4o-mini""","""Використай наведені нижче інст…","""<p t=""X"">He</p> <p t=""X"">is</p…","""```xml""","""PapersDataset""",0.0
"""gpt-4o-mini""","""Використай наведені нижче інст…","""<p t=""ADP"">В</p> <p t=""X"">боль…","""```xml""","""PapersDataset""",0.0
"""gpt-4o-mini""","""Використай наведені нижче інст…","""<p t=""X"">Keywords</p><p t=""PUN…","""```xml""","""PapersDataset""",0.0


In [21]:
calc_stats(artificial_results)

  accuracy_df.pivot(


model,PapersDataset,UbertextV2Dataset,mean
str,f64,f64,f64
"""ft:gpt-4o-mini-2024-07-18:pers…",0.010919,0.00567,0.008295
"""ostapbodnar/Phi3.5-mini-instru…",0.171621,0.230448,0.201034
"""ostapbodnar/Phi3.5-mini-instru…",0.0,0.000118,5.9e-05
"""ostapbodnar/Phi3.5-mini-instru…",0.168541,0.206307,0.187424
"""microsoft/Phi-3.5-mini-instruc…",0.001642,0.000764,0.001203
"""gpt-4o-mini""",0.000641,0.000656,0.000649
"""mean""",0.058894,0.073994,0.066444


In [22]:
golden_grammar_results = await perform_eval_async(golden_ds_tag.shuffle(seed=42).select(range(100)),
                                                  get_score_grammar_func())
golden_grammar_results.write_csv("golden_grammar_results_scores.csv")
golden_grammar_results

Evaluation of microsoft/Phi-3.5-mini-instruct


Evaluating microsoft/Phi-3.5-mini-instruct:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-qlora


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-qlora:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial:   0%|          | 0/100 [00:00<?, ?it/s…

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of ft:gpt-4o-mini-2024-07-18:personal::A7fOmagX


Evaluating ft:gpt-4o-mini-2024-07-18:personal::A7fOmagX:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of gpt-4o-mini


Evaluating gpt-4o-mini:   0%|          | 0/100 [00:00<?, ?it/s]

500, message='Internal Server Error', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>
500, message='Internal Server Error', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>
500, message='Internal Server Error', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>
500, message='Internal Server Error', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>
500, message='Internal Server Error', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
`text` must be string, not <class 'NoneType'>
500, message='Internal Server Error', url='https://it7s4lcu4sy306c2.eu-west-1.aws.endpoints.huggingface.cloud/v1/ch

model,input,output,generated_output,dataset_type,accuracy
str,str,str,str,str,f64
"""microsoft/Phi-3.5-mini-instruc…","""Визнач частину мови для кожног…","""<p t=""VERB"">Заглянула</p> <p t…","""`<p t=""VERB"">Заглянула</p> <p …","""MovaInstPosDataset""",0.833333
"""microsoft/Phi-3.5-mini-instruc…","""Виправ граматичні помилки в по…",""" Варто згадати, що цей період …","""<g ed=""Prep"" et=""G/Prep"">супро…","""UaGecDataset""",0.0
"""microsoft/Phi-3.5-mini-instruc…","""Виправ граматичні помилки в по…","""Льоня відірвав очі від стелі і…","""<g ed=""D/Prep"" et=""Grammar"">Ло…","""UaGecDataset""",0.0
"""microsoft/Phi-3.5-mini-instruc…","""Визнач частину мови для кожног…","""<p t=""PART"">Не</p> <p t=""NOUN""…","""`<p t=""PREP"">Не</p> <p t=""PART…","""MovaInstPosDataset""",0.5
"""microsoft/Phi-3.5-mini-instruc…","""Виправ граматичні помилки в по…","""Якщо <g ed=""в заняттях"" et=""G/…","""<g ed=""Punctuation"" et=""Punctu…","""UaGecDataset""",0.0
…,…,…,…,…,…
"""gpt-4o-mini""","""Визнач частину мови для кожног…","""<p t=""DET"">Один</p> <p t=""ADP""…","""<p t=""NUMR"">Один</p> <p t=""PRE…","""MovaInstPosDataset""",0.454545
"""gpt-4o-mini""","""Визнач частину мови для кожног…","""<p t=""CCONJ"">Але</p> <p t=""PRO…","""<p t=""CONJ"">Але</p> <p t=""PRON…","""MovaInstPosDataset""",0.875
"""gpt-4o-mini""","""Ось інструкція з використанням…","""<n t=""ORG"">Міністерство економ…","""<n t=""ORG"">Міністерство економ…","""NerDataset""",0.184211
"""gpt-4o-mini""","""Виправ граматичні помилки в по…","""О, я трохи виріс після двадцят…","""О, я <g ed=""виріс"" et=""G/Tense…","""UaGecDataset""",0.0


In [23]:
calc_stats(golden_grammar_results)

  accuracy_df.pivot(


model,MovaInstPosDataset,NerDataset,UaGecDataset,mean
str,f64,f64,f64,f64
"""ostapbodnar/Phi3.5-mini-instru…",0.0,0.0,0.0,0.0
"""ft:gpt-4o-mini-2024-07-18:pers…",0.434864,0.052632,0.186708,0.224734
"""ostapbodnar/Phi3.5-mini-instru…",0.860283,0.052632,0.117307,0.343407
"""ostapbodnar/Phi3.5-mini-instru…",0.504781,0.0,0.0,0.16826
"""microsoft/Phi-3.5-mini-instruc…",0.349037,0.013158,0.004743,0.122313
"""gpt-4o-mini""",0.474848,0.092105,0.02681,0.197921
"""mean""",0.437302,0.035088,0.055928,0.176106


In [24]:
golden_results = await perform_eval_async(golden_ds_class.shuffle(seed=42).select(range(100)), score_q_and_a_func, max_tokens=128)
golden_results.write_csv("golden_q_and_a_scores.csv")
golden_results

Evaluation of microsoft/Phi-3.5-mini-instruct


Evaluating microsoft/Phi-3.5-mini-instruct:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-qlora


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-qlora:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-artificial:   0%|          | 0/100 [00:00<?, ?it/s…

Evaluation of ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed


Evaluating ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of ft:gpt-4o-mini-2024-07-18:personal::A7fOmagX


Evaluating ft:gpt-4o-mini-2024-07-18:personal::A7fOmagX:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluation of gpt-4o-mini


Evaluating gpt-4o-mini:   0%|          | 0/100 [00:00<?, ?it/s]

500, message='Internal Server Error', url='https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'NoneType' has no len()
500, message='Internal Server Error', url='https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'NoneType' has no len()
500, message='Internal Server Error', url='https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'NoneType' has no len()
500, message='Internal Server Error', url='https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'NoneType' has no len()
500, message='Internal Server Error', url='https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'NoneType' has no len()
500, message='Internal Server Error', url='https://jnovl897uj1glqgn.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions'
object of type 'Non

model,input,output,generated_output,dataset_type,accuracy
str,str,str,str,str,f64
"""microsoft/Phi-3.5-mini-instruc…","""Визнач, які слова є найбільш з…","""Основні ключові слова: ['Росія…","""Найбільш значущі слова у текст…","""NewsKeywordDataset""",0.277778
"""microsoft/Phi-3.5-mini-instruc…","""Розглянь варіанти відповідей н…","""В""","""Г) підтвердити думку про Вікіп…","""ZnoDataset""",0.022727
"""microsoft/Phi-3.5-mini-instruc…","""Класифікуй новину, виходячи з …","""технології""","""Тематика: Sony PlayStation 5; …","""NewsTopicClassificationDataset""",0.162602
"""microsoft/Phi-3.5-mini-instruc…","""Базуючись на контенті тексту в…","""політика""","""Ця контент тексту відноситься …","""NewsTopicClassificationDataset""",0.0
"""microsoft/Phi-3.5-mini-instruc…","""Аналізуй текст новини і вибери…","""політика""","""Політична новина - цей жанр на…","""NewsTopicClassificationDataset""",0.0
…,…,…,…,…,…
"""gpt-4o-mini""","""Проаналізуй варіанти відповіде…","""В""","""Щоб вибрати правильну відповід…","""ZnoDataset""",0.013514
"""gpt-4o-mini""","""Проаналізуй текст і визнач клю…","""Основні ключові слова: ['Техно…","""Ключові елементи тексту:""","""NewsKeywordDataset""",0.297872
"""gpt-4o-mini""","""Визнач правильний варіант відп…","""Б""","""Правильний варіант відповіді н…","""ZnoDataset""",0.0
"""gpt-4o-mini""","""На основі змісту новини, визна…","""політика""","""Жанр новини: військова хроніка…","""NewsTopicClassificationDataset""",0.126984


In [25]:
calc_stats(golden_results)

  accuracy_df.pivot(


model,NewsTopicClassificationDataset,ZnoDataset,WscDataset,UaSqaudDataset,NewsKeywordDataset,mean
str,f64,f64,f64,f64,f64,f64
"""ostapbodnar/Phi3.5-mini-instru…",0.064487,0.0,0.19925,0.039385,0.102118,0.081048
"""microsoft/Phi-3.5-mini-instruc…",0.139156,0.007717,0.20775,0.12722,0.259782,0.148325
"""gpt-4o-mini""",0.209568,0.002928,0.340902,0.256273,0.215815,0.205097
"""ostapbodnar/Phi3.5-mini-instru…",0.085033,0.00057,0.048569,0.106253,0.068835,0.061852
"""ostapbodnar/Phi3.5-mini-instru…",0.67557,0.222222,0.116667,0.550507,0.538239,0.420641
"""ft:gpt-4o-mini-2024-07-18:pers…",0.915344,0.444444,0.960784,0.618957,0.622217,0.712349
"""mean""",0.348193,0.11298,0.31232,0.283099,0.301168,0.271552


python main.py     --tasks=mmlu_uk --limit=100     --model_args pretrained=microsoft/Phi-3.5-mini-instruct     --device=cuda

python main.py     --tasks=mmlu_uk --limit=100     --model_args pretrained=ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed-16bit     --device=cuda

export HF_DATASETS_TRUST_REMOTE_CODE=true