In [42]:
import openai

from fp_dataset_artifacts.utils import init_openai
from fp_dataset_artifacts.boolq import format_question, format_answer
from datasets import list_datasets, load_dataset, list_metrics, load_metric, concatenate_datasets

# Configure the OpenAI client through the project helper.
# NOTE(review): presumably this sets the API key/credentials — confirm in fp_dataset_artifacts.utils.
init_openai()

# Load the 'anli' dataset; the result is a DatasetDict with per-round splits.
data = load_dataset('anli')
# Bare last expression so the notebook renders the split/feature summary.
data

Reusing dataset anli (/home/x/.cache/huggingface/datasets/anli/plain_text/0.1.0/aabce88453b06dff21c201855ea83283bab0390bff746deadb30b65695755c0b)


  0%|          | 0/9 [00:00<?, ?it/s]

DatasetDict({
    train_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 16946
    })
    dev_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 45460
    })
    dev_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 100459
    })
    dev_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 12

In [8]:
# Show the first round-1 training record to inspect the raw field layout.
data['train_r1'][0]

{'uid': '0fd0abfb-659e-4453-b196-c3a64d2d8267',
 'premise': 'The Parma trolleybus system (Italian: "Rete filoviaria di Parma" ) forms part of the public transport network of the city and "comune" of Parma, in the region of Emilia-Romagna, northern Italy. In operation since 1953, the system presently comprises four urban routes.',
 'hypothesis': 'The trolleybus system has over 2 urban routes',
 'label': 0,
 'reason': ''}

In [9]:
# Show the first round-2 training record for comparison with round 1.
data['train_r2'][0]

{'uid': '245475c4-6225-4f5c-b945-e13e0e988fc8',
 'premise': 'Topi Raja Sweety Roja is a 1996 Telugu comedy film, produced by Dr. A. Vijayalakshmi on Sri Sai Madhavi Productions banner and directed by Dr. N. Siva Prasad. Starring Rajendra Prasad, Roja in the lead roles and music also composed by "Hero" Rajendra Prasad.',
 'hypothesis': 'The lead role was played by Rajendra Prasad',
 'label': 0,
 'reason': ''}

In [10]:
# Show the first round-3 training record (its 'reason' field is populated in this example).
data['train_r3'][0]

{'uid': '2093cfb3-a15f-4282-81e3-0cb793ffd0d7',
 'premise': 'TOKYO, Dec 18 (Reuters) - Japan’s Shionogi & Co said on Tuesday that it has applied to health regulators in the United States, Canada and Europe for approval of its HIV drug Dolutegravir. Shionogi developed Dolutegravir with a Viiv Healthcare, an AIDS drug joint venture between GlaxoSmithKline and Pfizer, in exchange for its rights to the drug.',
 'hypothesis': 'The article was written on December 18th.',
 'label': 0,
 'reason': 'TOKYO, Dec 18 (Reuters) is when the article was written as it states in the first words of the sentence'}

In [32]:
def int2label(i):
    """Translate an integer class id into its label name.

    The mapping is 0 -> 'Entailment', 1 -> 'Neutral', 2 -> 'Contradiction'.

    Args:
        i: Integer class id; must be 0, 1 or 2.

    Returns:
        The label name as a string.

    Raises:
        IndexError: If ``i`` is outside 0..2. Negative ids are rejected
            explicitly; plain sequence indexing would otherwise wrap around
            (e.g. -1 would silently become 'Contradiction').
    """
    labels = ('Entailment', 'Neutral', 'Contradiction')
    if i < 0:
        # Guard against Python's negative indexing turning a bad/unlabeled
        # id into a valid-looking class.
        raise IndexError(f"label id out of range: {i}")
    return labels[i]
int2label(0)

'Entailment'

In [33]:
def map_finetune_train(x):
    """Turn one example into a prompt/completion pair for fine-tuning.

    Args:
        x: Mapping with 'premise', 'hypothesis' and integer 'label' entries.

    Returns:
        A dict with two string entries: 'prompt' (premise and hypothesis
        followed by an open "Label: " slot) and 'completion' (the label name
        terminated by a newline).
    """
    # NOTE(review): the prompt ends with a trailing space and the completion
    # has no leading space; OpenAI's fine-tuning data guidelines suggest the
    # opposite arrangement — confirm before regenerating the training files.
    prompt_text = f"Premise: {x['premise']}\n\nHypothesis: {x['hypothesis']}\n\nLabel: "
    completion_text = int2label(x['label']) + '\n'
    return {'prompt': prompt_text, 'completion': completion_text}
map_finetune_train(data['train_r1'][0])

{'prompt': 'Premise: The Parma trolleybus system (Italian: "Rete filoviaria di Parma" ) forms part of the public transport network of the city and "comune" of Parma, in the region of Emilia-Romagna, northern Italy. In operation since 1953, the system presently comprises four urban routes.\n\nHypothesis: The trolleybus system has over 2 urban routes\n\nLabel: ',
 'completion': 'Entailment\n'}

In [38]:
# Build round-1 prompt/completion records; the raw columns are dropped in the
# same map() pass so only 'prompt' and 'completion' remain.
train_r1 = data['train_r1'].map(
    map_finetune_train,
    remove_columns=['uid', 'reason', 'premise', 'hypothesis', 'label'],
)

Loading cached processed dataset at /home/x/.cache/huggingface/datasets/anli/plain_text/0.1.0/aabce88453b06dff21c201855ea83283bab0390bff746deadb30b65695755c0b/cache-0abd26387ee4a689.arrow


In [39]:
# Build round-2 prompt/completion records; the raw columns are dropped in the
# same map() pass so only 'prompt' and 'completion' remain.
train_r2 = data['train_r2'].map(
    map_finetune_train,
    remove_columns=['uid', 'reason', 'premise', 'hypothesis', 'label'],
)

  0%|          | 0/45460 [00:00<?, ?ex/s]

In [47]:
# Build round-3 prompt/completion records; the raw columns are dropped in the
# same map() pass so only 'prompt' and 'completion' remain.
train_r3 = data['train_r3'].map(
    map_finetune_train,
    remove_columns=['uid', 'reason', 'premise', 'hypothesis', 'label'],
)

  0%|          | 0/100459 [00:00<?, ?ex/s]

In [49]:
# Stack the three per-round training sets into a single dataset.
train = concatenate_datasets([train_r1, train_r2, train_r3])
# Export to a JSON Lines file; as the cell's last expression, the call's return value is displayed.
train.to_json('anli_finetune_train.jsonl')

Creating json from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

74546999

In [50]:
# This training dataset will definitely go over the token limit.
# Let's just use round 1 for fine tuning for now.

In [51]:
# Export only the round-1 training set to its own JSON Lines file.
train_r1.to_json('anli_finetune_train_r1.jsonl')

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

8042647

In [53]:
# Build round-1 dev prompt/completion records (raw columns dropped in the same
# map() pass), then export them to a JSON Lines file.
dev_r1 = data['dev_r1'].map(
    map_finetune_train,
    remove_columns=['uid', 'reason', 'premise', 'hypothesis', 'label'],
)
dev_r1.to_json('anli_finetune_dev_r1.jsonl')

  0%|          | 0/1000 [00:00<?, ?ex/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

475195

In [54]:
# Upload the round-1 training file for fine-tuning. The context manager makes
# sure the file handle is closed once the upload call returns, instead of
# leaking it for the lifetime of the kernel.
with open('anli_finetune_train_r1.jsonl') as train_r1_file:
    train_r1_upload = openai.File.create(file=train_r1_file, purpose='fine-tune')
# Display the API response (it contains the file id needed to start a fine-tune).
train_r1_upload

<File file id=file-FkO4WnaoNWQjODh96npgZ6Kr at 0x7f67643eccc0> JSON: {
  "bytes": 8042647,
  "created_at": 1638672457,
  "filename": "anli_finetune_train_r1.jsonl",
  "id": "file-FkO4WnaoNWQjODh96npgZ6Kr",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

In [55]:
# Upload the round-1 dev file for fine-tuning. The context manager makes sure
# the file handle is closed once the upload call returns, instead of leaking
# it for the lifetime of the kernel.
with open('anli_finetune_dev_r1.jsonl') as dev_r1_file:
    dev_r1_upload = openai.File.create(file=dev_r1_file, purpose='fine-tune')
# Display the API response (it contains the file id needed to start a fine-tune).
dev_r1_upload

<File file id=file-cZYCbTSEyYvGpQ1lW2kB41pg at 0x7f66ff96b2c0> JSON: {
  "bytes": 475195,
  "created_at": 1638672465,
  "filename": "anli_finetune_dev_r1.jsonl",
  "id": "file-cZYCbTSEyYvGpQ1lW2kB41pg",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

In [56]:
# Start a fine-tune of the base 'curie' model on the round-1 files.
# The file ids below are hard-coded copies of the ids returned by the two
# upload cells; they must be replaced by hand if the files are re-uploaded.
openai.FineTune.create(
    training_file='file-FkO4WnaoNWQjODh96npgZ6Kr',  # anli_finetune_train_r1.jsonl
    validation_file='file-cZYCbTSEyYvGpQ1lW2kB41pg',  # anli_finetune_dev_r1.jsonl
    model='curie',
    n_epochs=4,
    # Request classification metrics on the validation file; three classes
    # matches the three label names produced by int2label.
    compute_classification_metrics=True,
    classification_n_classes=3
)

<FineTune fine-tune id=ft-uYZ4vdMlvmoLVuycheBpvM7U at 0x7f66ff988bd0> JSON: {
  "created_at": 1638672534,
  "events": [
    {
      "created_at": 1638672534,
      "level": "info",
      "message": "Created fine-tune: ft-uYZ4vdMlvmoLVuycheBpvM7U",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "classification_n_classes": 3,
    "compute_classification_metrics": true,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.1,
    "use_packing": null
  },
  "id": "ft-uYZ4vdMlvmoLVuycheBpvM7U",
  "model": "curie",
  "object": "fine-tune",
  "organization_id": "org-5AE307Eg4rc5EAoEA2S2bwkH",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 8042647,
      "created_at": 1638672457,
      "filename": "anli_finetune_train_r1.jsonl",
      "id": "file-FkO4WnaoNWQjODh96npgZ6Kr",
      "object": "file",
      "purpose": "fine-tune",
      "status": "processe