In [1]:
import openai

from fp_dataset_artifacts.utils import init_openai
from fp_dataset_artifacts.anli import map_finetune, get_response
from datasets import list_datasets, load_dataset, list_metrics, load_metric, concatenate_datasets

init_openai()

data = load_dataset('snli')
data

Reusing dataset snli (/home/x/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})

In [9]:
test = data['test'].shuffle(0).select(list(range(1000))).map(map_finetune)
test

Loading cached shuffled indices for dataset at /home/x/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-72f0a9fb81bb4de9.arrow
Loading cached processed dataset at /home/x/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-6aa7b7d22baa7237.arrow


Dataset({
    features: ['premise', 'hypothesis', 'label', 'prompt', 'completion'],
    num_rows: 1000
})

In [3]:
model = 'curie:ft-user-5hzndcnnszukksvrzrlnjn8l-2021-12-05-03-26-14'

def map_response(x):
    try:
        response = get_response(x['prompt'], model)
        return {
            'response': response['choices'][0]['text']
        }
    except Exception as e:
        print(e)
        return {
            'response': None
        }

In [4]:
map_response(test[0])

{'response': 'Entailment'}

In [10]:
test = test.remove_columns(['premise', 'hypothesis', 'label'])
test

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 1000
})

In [11]:
responses = test.map(map_response)
responses

  0%|          | 0/1000 [00:00<?, ?ex/s]

Dataset({
    features: ['prompt', 'completion', 'response'],
    num_rows: 1000
})

In [12]:
responses.to_json('../results/anli_snli_responses.jsonl')

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

212186

In [13]:
f1_metric = load_metric('f1')
acc_metric = load_metric('accuracy')

In [14]:
def label2int(x):
    return {
        'Entailment': 0,
        'Neutral': 1,
        'Contradiction': 2
    }[x]

In [15]:
def map_refs_and_preds(x):
    return {
        'references': label2int(x['completion'].strip()),
        'predictions': label2int(x['response'])
    }

results = responses.map(map_refs_and_preds)
results

  0%|          | 0/1000 [00:00<?, ?ex/s]

Dataset({
    features: ['prompt', 'completion', 'response', 'references', 'predictions'],
    num_rows: 1000
})

In [16]:
f1_metric.compute(references=results['references'], predictions=results['predictions'], average='weighted')

{'f1': 0.7000549183627798}

In [17]:
acc_metric.compute(references=results['references'], predictions=results['predictions'])

{'accuracy': 0.706}