#Train a fine-tuning model specialized for Q&A

In [1]:
!pip install --upgrade openai --quiet

from google.colab import userdata
OPEN_AI_KEY=userdata.get('opeaikey4o')

from openai import OpenAI
import matplotlib.pyplot as plt
import time

## Set the API key and model name
MODEL="gpt-3.5-turbo"
client = OpenAI(api_key=OPEN_AI_KEY)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.4/327.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m993.8 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

We will add hard adversarial examples as well, which will be based either on semantically similar sections, or neighbouring sections, originating from the same article.



In [2]:
import pandas as pd
df = pd.read_csv('olympics_qa_lg.csv')
olympics_search_fileid = "file-c3shd8wqF3vSCKaukW4Jr1TT"
df.head()

Unnamed: 0,title,heading,content,tokens,context,questions,answers
0,2020 Summer Olympics,Summary,"The 2020 Summer Olympics, officially the Games...",623,2020 Summer Olympics\nSummary\n\nThe 2020 Summ...,1.What were some of the new events introduced ...,1.Some of the new events introduced in existin...
1,2016 Winter Youth Olympics,Summary,The 2016 Winter Youth Olympics (Norwegian: Oly...,152,2016 Winter Youth Olympics\nSummary\n\nThe 201...,1.What was significant about the 2016 Winter Y...,1.The 2016 Winter Youth Olympics held in Lille...
2,2016 Winter Youth Olympics,Host selection,Lillehammer was the only city to bid for the g...,129,2016 Winter Youth Olympics\nHost selection\n\n...,1.What was the process for selecting the host ...,1.The process for selecting the host city for ...
3,Olympic Games ceremony,Summary,The Olympic Games ceremonies of the Ancient Ol...,271,Olympic Games ceremony\nSummary\n\nThe Olympic...,1.What are some elements of the modern Olympic...,1.Some elements of the modern Olympic Games ce...
4,Olympic Games ceremony,Ancient forerunners,"The Ancient Games, held in Greece from ca. 776...",247,Olympic Games ceremony\nAncient forerunners\n\...,1.What were some of the elements involved in t...,1.Some of the elements involved in the victory...


Split the sections into a training and testing set



In [3]:
from sklearn.model_selection import train_test_split
train_df,test_df=train_test_split(df,test_size=0.2,random_state=42)
len(train_df) ,len(test_df)

(607, 152)

we check that the separator we intend to use isn't present within the contexts



In [4]:
df.context.str.contains('->').sum()

0

#Create the fine-tuning datasets for Q&A


The fine-tuning dataset is created in the following way. For every corresponding question, answer and context pair we create:


We apply the same process of dataset creation for both the discriminator, and the Q&A answering model. We apply the process separately for the training and testing set, to ensure that the examples from the training set don't feature within the test set.




##    Find similar contexts to the given context using the search file


Create a dataset for fine tuning the OpenAI model; either for a discriminator model,
    or a model specializing in Q&A, where it says if no relevant context is found.

    Parameters
    ----------
    df: pd.DataFrame
        The dataframe containing the question, answer and context pairs
    discriminator: bool
        Whether to create a dataset for the discriminator
    n_negative: int
        The number of random negative samples to add (using a random context)
    add_related: bool
        Whether to add the related contexts to the correct context. These are hard negative examples

    Returns
    -------
    pd.DataFrame
        The dataframe containing the prompts and completions, ready for fine-tuning

In [5]:


def create_fine_tuning_dataset(df, discriminator=False, n_negative=1, add_related=False):

    rows = []
    for i, row in df.iterrows():
        for q, a in zip(("1." + row.questions).split('\n'), ("1." + row.answers).split('\n')):
            if len(q) >10 and len(a) >10:
                if discriminator:
                    rows.append({"prompt":f"{row.context}\nQuestion: {q[2:].strip()}\n Related:", "completion":f" yes"})
                else:
                    rows.append({"prompt":f"{row.context}\nQuestion: {q[2:].strip()}\nAnswer:", "completion":f" {a[2:].strip()}"})

    for i, row in df.iterrows():
        for q in ("1." + row.questions).split('\n'):
            if len(q) >10:
                for j in range(n_negative + (2 if add_related else 0)):
                    random_context = ""
                    if j == 0 and add_related:
                        # add the related contexts based on originating from the same wikipedia page
                        subset = df[(df.title == row.title) & (df.context != row.context)]

                        if len(subset) < 1:
                            continue
                        random_context = subset.sample(1).iloc[0].context
                    else:
                        while True:
                            # add random context, which isn't the correct context
                            random_context = df.sample(1).iloc[0].context
                            if random_context != row.context:
                                break
                    if discriminator:
                        rows.append({"prompt":f"{random_context}\nQuestion: {q[2:].strip()}\n Related:", "completion":f" no"})
                    else:
                        rows.append({"prompt":f"{random_context}\nQuestion: {q[2:].strip()}\nAnswer:", "completion":f" No appropriate context found to answer the question."})

    return pd.DataFrame(rows)

We apply the same process of dataset creation for both the discriminator, and the Q&A answering model. We apply the process separately for the training and testing set, to ensure that the examples from the training set don't feature within the test set.

In [6]:
for name, is_disc in [('discriminator', True), ('qa', False)]:
    for train_test, dt in [('train', train_df), ('test', test_df)]:
        ft = create_fine_tuning_dataset(dt, discriminator=is_disc, n_negative=1, add_related=True)
        ft.to_json(f'{name}_{train_test}.jsonl', orient='records', lines=True)

We formatted the data according to the recommendations from the fine-tuning tool, which is available using

openai tools fine_tunes.prepare_data -f qa_train.jsonl

We highly recommend that you use this tool, which suggests improvements in your data formatting for fine-tuning.

###Submit the datasets for fine-tuning

In [8]:
# !openai tools fine_tunes.prepare_data -f discriminator_train.jsonl  -q

In [9]:
# !openai tools fine_tunes.prepare_data -f discriminator_test.jsonl  -q

In [10]:
!openai tools fine_tunes.prepare_data -f qa_train.jsonl  -q

Analyzing...

- Your file contains 2157 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- There are 5 duplicated prompt-completion sets. These are rows: [1470, 1846, 1922, 2033, 2044]
- All prompts end with suffix `\nAnswer:`

Based on the analysis we will perform the following actions:
- [Recommended] Remove 5 duplicate rows [Y/n]: Y
- [Recommended] Would you like to split into training and validation set? [Y/n]: Y


Your data will be written to a new JSONL file. Proceed [Y/n]: Y

Wrote modified files to `qa_train_prepared_train.jsonl` and `qa_train_prepared_valid.jsonl`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "qa_train_prepared_train.jsonl" 

In [None]:
!openai tools fine_tunes.prepare_data -f qa_test.jsonl  -q

Analyzing...

- Your file contains 493 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- There are 2 duplicated prompt-completion sets. These are rows: [328, 339]
- All prompts end with suffix `\nAnswer:`

Based on the analysis we will perform the following actions:
- [Recommended] Remove 2 duplicate rows [Y/n]: Y
- [Recommended] Would you like to split into training and validation set? [Y/n]: Y


Your data will be written to a new JSONL file. Proceed [Y/n]: Y

Wrote modified files to `qa_test_prepared_train.jsonl` and `qa_test_prepared_valid.jsonl`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "qa_test_prepared_train.jsonl" -v "qa_test_prepared_val

In [12]:
train_file=client.files.create(file=open("qa_train_prepared_train.jsonl", "rb"),purpose='fine-tune')
valid_file=client.files.create(file=open("qa_train_prepared_valid.jsonl", "rb"),purpose='fine-tune')


In [13]:
fine_tuning_job=client.fine_tuning.jobs.create(training_file=train_file.id,validation_file=valid_file.id,model="babbage-002")
print(fine_tuning_job)

FineTuningJob(id='ftjob-bcQU9znvtHYxpsSI6eJIMXfy', created_at=1719132722, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='babbage-002', object='fine_tuning.job', organization_id='org-PRq3tYU2rFVBkygKcRdWRjwB', result_files=[], seed=226339052, status='validating_files', trained_tokens=None, training_file='file-yEqk2YT8VtNcl79BTEo7VaYO', validation_file='file-4UzjflJmC6kDRkXqf6hQAdGp', estimated_finish=None, integrations=[], user_provided_suffix=None)


In [15]:
retrieved_jobs=client.fine_tuning.jobs.retrieve(fine_tuning_job.id)
status=retrieved_jobs.status
print(status)

running


In [16]:
while True:
  time.sleep(10)
  retrieved_jobs=client.fine_tuning.jobs.retrieve(fine_tuning_job.id)
  status=retrieved_jobs.status
  print(status)
  if(status=='succeeded'):
    break

running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
succeeded


##Using the fine-tuned models

In [18]:
fine_tune_results = client.fine_tuning.jobs.retrieve(fine_tuning_job.id)
ft_qa_model = fine_tune_results.fine_tuned_model
ft_qa_model

'ft:babbage-002:personal::9dDP4miq'

In [25]:
def apply_ft_qa_answer(context,question,answering_model):

  print("heloo")
  prompt=f"{context}\nQuestion:{question}\nAnswer:"
  result=client.chat.completions.create(model=answering_model,
                                        prompt=prompt,top_p=1, n=1, stop=['.','\n'])

In [26]:
apply_ft_qa_answer("The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.",
                    "What was the first human-made object in space?", ft_qa_model)

heloo


TypeError: Missing required arguments; Expected either ('messages' and 'model') or ('messages', 'model' and 'stream') arguments to be given

In [27]:
test = pd.read_json('qa_train_prepared_valid.jsonl', lines=True)
test.head()

Unnamed: 0,prompt,completion
0,Tanzania at the 2016 Summer Olympics\nBackgrou...,1.The most successful event for Tanzanian ath...
1,Boxing at the 2016 Summer Olympics – Men's lig...,1.**Answer:** The medals for the men's light ...
2,Archery at the 2016 Summer Olympics – Women's ...,1.The competition format for the women's team...
3,Tennis at the 2016 Summer Olympics – Women's s...,1.The 2016 Summer Olympics marked the 13th ap...
4,Rugby sevens at the 2016 Summer Olympics – Wom...,1.The Great Britain rugby sevens team qualifi...
