In [98]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import openai
import json
import ijson
import os
import tiktoken
from collections import defaultdict

# Read the API key from the environment instead of hardcoding it in the notebook.
# Falls back to an empty string (the previous hardcoded value) when the
# OPENAI_API_KEY variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# NOTE(review): the cells below use the pre-1.0 openai interface
# (openai.File.create, openai.ChatCompletion.create). In that interface the
# endpoint override is presumably `openai.api_base`, while `base_url` is the
# openai>=1.0 name — confirm that this assignment has the intended effect.
openai.base_url = "https://api.vsegpt.ru:6070/v1/"

In [66]:
def convert_conversation(question, answer, system_message):
    """Build one chat-format record from a question/answer pair.

    Args:
        question: Text placed in the "user" message.
        answer: Text placed in the "assistant" message.
        system_message: Optional system prompt; when falsy, no "system"
            message is emitted.

    Returns:
        dict: ``{"messages": [...]}`` holding the optional system message
        followed by the user message and the assistant message, in that order.
    """
    # The system message, when present, always comes first.
    system_part = (
        [{"role": "system", "content": system_message}] if system_message else []
    )
    dialogue = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer},
    ]
    return {"messages": system_part + dialogue}


In [44]:
# Load the three answerable-question splits (test, train, valid — read in that order).
df_test, df_train, df_valid = map(
    pd.read_csv,
    (
        'archive/n_annotated_wd_data_test_answerable.csv',
        'archive/n_annotated_wd_data_train_answerable.csv',
        'archive/n_annotated_wd_data_valid_answerable.csv',
    ),
)


In [None]:
# System prompt attached to every training record.
system_message = "You are a question answering system. Answer shortly in 1-3 words"

# Only the first 200 questions are used: zip stops as soon as the sliced
# question column is exhausted, so the answer column is truncated to match.
dataset = [
    convert_conversation(question, answer, system_message)
    for question, answer in zip(df_train['q'][:200], df_train['e2'])
]

dataset

In [None]:
# System prompt attached to every validation record.
system_message = "You are a question answering system. Answer shortly in 1-3 words"

# Only the first 200 questions are used: zip stops as soon as the sliced
# question column is exhausted, so the answer column is truncated to match.
validation = [
    convert_conversation(question, answer, system_message)
    for question, answer in zip(df_valid['q'][:200], df_valid['e2'])
]

validation

In [67]:
def save_to_jsonl(conversations, file_path):
    """Write conversations to ``file_path`` in JSON Lines format.

    Each conversation is serialised with ``json.dumps`` onto its own line.
    An existing file is overwritten.

    Args:
        conversations: Iterable of JSON-serialisable objects.
        file_path: Destination path. Missing parent directories are created,
            so the call also works when e.g. ``data/`` does not exist yet.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        # open(..., 'w') does not create directories by itself.
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        for conversation in conversations:
            file.write(json.dumps(conversation) + '\n')

In [48]:
# Destination JSONL files for the sqwd fine-tuning records.
validation_file_name = 'data/fine_tuning_sqwd_valid.jsonl'
training_file_name = 'data/fine_tuning_sqwd_train.jsonl'

In [87]:
# Write the training records first, then the validation records.
for records, target_path in (
    (dataset, training_file_name),
    (validation, validation_file_name),
):
    save_to_jsonl(records, target_path)

In [50]:
# Upload both JSONL files for fine-tuning. The files are opened in `with`
# blocks so the handles are closed once each upload call has returned.
with open(training_file_name, "rb") as training_file:
    training_response = openai.File.create(file=training_file, purpose="fine-tune")
training_file_id = training_response["id"]

with open(validation_file_name, "rb") as validation_file:
    validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
validation_file_id = validation_response["id"]

print("Training file id:", training_file_id)
print("Validation file id:", validation_file_id)

Training file id: file-fvifTG3NWaEo9BaRh3u9XOlN
Validation file id: file-BZSy9DzZEFrVyGD9ug1fANDr


In [51]:
# Suffix passed to the fine-tuning job (echoed back as "user_provided_suffix").
suffix_name = "sqwd-test"

# Start the fine-tuning job on the previously uploaded training/validation files.
response = openai.FineTuningJob.create(
    model="gpt-3.5-turbo",
    suffix=suffix_name,
    training_file=training_file_id,
    validation_file=validation_file_id,
)
job_id = response["id"]  # used by the event-listing cell

print(response)

{
  "object": "fine_tuning.job",
  "id": "ftjob-RfhzDqjrbZNKXTARaGLJBt0i",
  "model": "gpt-3.5-turbo-0125",
  "created_at": 1714330339,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-5h4sKaWGPhS5fugds0H2RjQA",
  "result_files": [],
  "status": "validating_files",
  "validation_file": "file-BZSy9DzZEFrVyGD9ug1fANDr",
  "training_file": "file-fvifTG3NWaEo9BaRh3u9XOlN",
  "hyperparameters": {
    "n_epochs": "auto",
    "batch_size": "auto",
    "learning_rate_multiplier": "auto"
  },
  "trained_tokens": null,
  "error": {},
  "user_provided_suffix": "sqwd-test",
  "seed": 718846123,
  "estimated_finish": null,
  "integrations": []
}


In [58]:
# Fetch up to 20 events of the fine-tuning job identified by job_id.
response = openai.FineTuningJob.list_events(id=job_id, limit=20)

# Flip the list in place before printing
# (presumably the API lists the newest event first — confirm).
events = response["data"]
events.reverse()

for message in (event["message"] for event in events):
    print(message)

Step 585/600: training loss=0.00
Step 586/600: training loss=0.00
Step 587/600: training loss=0.00
Step 588/600: training loss=0.87
Step 589/600: training loss=0.79
Step 590/600: training loss=0.00, validation loss=0.00
Step 591/600: training loss=0.00
Step 592/600: training loss=0.00
Step 593/600: training loss=0.00
Step 594/600: training loss=1.12
Step 595/600: training loss=2.74
Step 596/600: training loss=1.47
Step 597/600: training loss=0.00
Step 598/600: training loss=0.00
Step 599/600: training loss=0.00
Step 600/600: training loss=0.05, validation loss=0.00, full validation loss=1.22
Checkpoint created at step 200 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:sqwd-test:9J4JdRJI:ckpt-step-200
Checkpoint created at step 400 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:sqwd-test:9J4JdblX:ckpt-step-400
New fine-tuned model created: ft:gpt-3.5-turbo-0125:personal:sqwd-test:9J4Je1dk
The job has successfully completed


In [None]:
# Evaluate the model on the first 200 test questions. An answer counts as
# correct when the lower-cased gold answer is a substring of the lower-cased
# model response.

# Instruction prepended to every question (loop-invariant, so defined once).
prompt = "Answer shortly in 1-3 words"
# Alternative Russian prompts that were tried (translated):
#   "Answer as briefly as possible, 1-3 words"
#   "Answer the question as briefly and concisely as possible. Give the answer
#    in the nominative case, and do not forget about punctuation."

i = 0
true_answers = 0
answers = []
# zip stops once the sliced question column is exhausted, i.e. after 200 rows.
for question, answer in zip(df_test['q'][:200], df_test['e2']):
    i += 1
    messages = [{"role": "user", "content": prompt + ' ' + question}]

    request_failed = False
    try:
        # NOTE(review): the model id is blank here — presumably the fine-tuned
        # model id has to be filled in before running; confirm.
        response = openai.ChatCompletion.create(
            model="", messages=messages, temperature=0, max_tokens=500
        )
        response = response["choices"][0]["message"]["content"].lower()
    except Exception as e:
        # Best effort: report the failure and keep evaluating the remaining questions.
        print(e)
        response = "Error"
        request_failed = True

    correct_answer = answer.lower()
    # A failed request must never be scored against the "Error" placeholder,
    # and an empty gold answer would match every response.
    is_correct = (
        not request_failed
        and bool(correct_answer)
        and correct_answer in response
    )

    answers.append({
        "question": question,
        "answer": response,
        "correct_answer": correct_answer,
        "is_correct": is_correct
    })
    if is_correct:
        true_answers += 1
    print(i, true_answers)

    # Rewrite the results file after every question so partial progress survives
    # an interrupted run.
    with open("questions_answers_tmp.json", "w") as json_file:
        json_file.write(json.dumps(answers, indent=4))

In [None]:
def create_files(training_file_name, training_file_id=None):
    """Upload the training and validation JSONL files for fine-tuning.

    Args:
        training_file_name: Path of the training file to upload.
        training_file_id: Ignored. Kept (now optional) only so that existing
            two-argument calls keep working; the id is always taken from the
            upload response.

    Returns:
        tuple: ``(training_file_id, validation_file_id)`` taken from the
        ``"id"`` field of the two upload responses.

    Note:
        The validation path is read from the notebook-level variable
        ``validation_file_name``.
    """
    # `with` closes each file handle once its upload call has returned.
    with open(training_file_name, "rb") as training_file:
        training_response = openai.File.create(file=training_file, purpose="fine-tune")
    training_file_id = training_response["id"]

    with open(validation_file_name, "rb") as validation_file:
        validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
    validation_file_id = validation_response["id"]

    print("Training file id:", training_file_id)
    print("Validation file id:", validation_file_id)

    return training_file_id, validation_file_id

In [86]:
# System prompt attached to every record.
system_message = "You are a question answering system. Answer shortly in 1-3 words"


def _mintaka_answer(item):
    """Return the first gold answer of an item as a string ("" when there is none).

    Entity answers use the English label of the first entity; every other
    answer type uses ``str()`` of the first answer value. A missing or empty
    answer list yields "" for both kinds.
    """
    gold = item["answer"]["answer"]
    if not gold:
        return ""
    if item["answer"]["answerType"] == "entity":
        return gold[0]["label"]['en']
    return str(gold[0])


def _load_mintaka_records(file_path, system_message, limit=500):
    """Stream at most ``limit`` items from a JSON array file into chat records."""
    records = []
    with open(file_path, 'r') as file:
        # range() comes first so zip stops after `limit` items without pulling
        # an extra element from the ijson stream.
        for _, item in zip(range(limit), ijson.items(file, 'item')):
            records.append(
                convert_conversation(item["question"], _mintaka_answer(item), system_message)
            )
    return records


dataset = _load_mintaka_records("mintaka_train.json", system_message)
validation = _load_mintaka_records("mintaka_valid.json", system_message)

In [88]:
# Point the output paths at the Mintaka files, then write both splits
# (training records first, validation records second).
validation_file_name = 'data/fine_tuning_mintaka_valid.jsonl'
training_file_name = 'data/fine_tuning_mintaka_train.jsonl'

for records, target_path in (
    (dataset, training_file_name),
    (validation, validation_file_name),
):
    save_to_jsonl(records, target_path)

In [91]:
# Upload both JSONL files for fine-tuning. The files are opened in `with`
# blocks so the handles are closed once each upload call has returned.
with open(training_file_name, "rb") as training_file:
    training_response = openai.File.create(file=training_file, purpose="fine-tune")
training_file_id = training_response["id"]

with open(validation_file_name, "rb") as validation_file:
    validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
validation_file_id = validation_response["id"]

print("Training file id:", training_file_id)
print("Validation file id:", validation_file_id)

Training file id: file-eAmkINfLKNDapjfOjRCfn3xw
Validation file id: file-fmfZaaQqF3SUQaOMIxa7ZMAK


In [92]:
# Suffix passed to the fine-tuning job (echoed back as "user_provided_suffix").
suffix_name = "mintaka-test"

# Start the fine-tuning job on the previously uploaded training/validation files.
response = openai.FineTuningJob.create(
    model="gpt-3.5-turbo",
    suffix=suffix_name,
    training_file=training_file_id,
    validation_file=validation_file_id,
)
job_id = response["id"]

print(response)

{
  "object": "fine_tuning.job",
  "id": "ftjob-ngk4j5sQd8k4Sy1ZKh2s8aTO",
  "model": "gpt-3.5-turbo-0125",
  "created_at": 1714369101,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-5h4sKaWGPhS5fugds0H2RjQA",
  "result_files": [],
  "status": "validating_files",
  "validation_file": "file-fmfZaaQqF3SUQaOMIxa7ZMAK",
  "training_file": "file-eAmkINfLKNDapjfOjRCfn3xw",
  "hyperparameters": {
    "n_epochs": "auto",
    "batch_size": "auto",
    "learning_rate_multiplier": "auto"
  },
  "trained_tokens": null,
  "error": {},
  "user_provided_suffix": "mintaka-test",
  "seed": 998119998,
  "estimated_finish": null,
  "integrations": []
}


In [None]:
# Evaluate the model on every item of the Mintaka test file. An answer counts
# as correct when the lower-cased gold answer is a substring of the lower-cased
# model response.

# Instruction prepended to every question (loop-invariant, so defined once).
prompt = "Answer shortly in 1-3 words. If answer is a number, print digits"
# Alternative Russian prompt that was tried (translated):
#   "Answer the question as briefly and concisely as possible. Give the answer
#    in the nominative case, and do not forget about punctuation."

file_path = "mintaka_test.json"
i = 0
true_answers = 0
answers = []
with open(file_path, 'r') as file:
    for item in ijson.items(file, 'item'):
        i += 1
        question = item["question"]

        # First gold answer as a string: the English label for entity answers,
        # str() of the value otherwise, "" when no answer is listed.
        gold = item["answer"]["answer"]
        if not gold:
            answer = ""
        elif item["answer"]["answerType"] == "entity":
            answer = gold[0]["label"]['en']
        else:
            answer = str(gold[0])

        messages = [{"role": "user", "content": prompt + ' ' + question}]
        try:
            # NOTE(review): the model id is blank here — presumably the
            # fine-tuned model id has to be filled in before running; confirm.
            response_big = openai.ChatCompletion.create(
                model="", messages=messages, temperature=0, max_tokens=500
            )
            response = response_big.choices[0].message.content.lower()
        except Exception as e:
            # Best effort: report the failure and keep evaluating the remaining items.
            print(e)
            response = ""

        correct_answer = answer.lower()
        # An empty gold answer is a substring of every response, so it must not
        # be scored as a match.
        is_correct = bool(correct_answer) and correct_answer in response

        answers.append({
            "question": question,
            "answer": response,
            "correct_answer": correct_answer,
            "is_correct": is_correct
        })
        if is_correct:
            true_answers += 1
        print(i, true_answers)

        # Rewrite the results file after every item so partial progress survives
        # an interrupted run.
        with open("questions_answers_mintaka_chatGpt_ft.json", "w") as json_file:
            json_file.write(json.dumps(answers, indent=4))