In [98]:
import json
import os
from collections import defaultdict
from itertools import islice

import ijson
import numpy as np # linear algebra
import openai
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tiktoken

# Take the API key from the environment so it never has to be committed with
# the notebook; falls back to the original empty string when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# NOTE(review): the cells below use the pre-1.0 call style (openai.File.create,
# openai.ChatCompletion.create). On that client the endpoint override is
# presumably `openai.api_base`, not `base_url` — confirm this line has the intended effect.
openai.base_url = "https://api.vsegpt.ru:6070/v1/"

In [66]:
def convert_conversation(question, answer, system_message):
    """Wrap one question/answer pair in a chat-style ``{"messages": [...]}`` record.

    Args:
        question: Content stored as the ``user`` turn.
        answer: Content stored as the ``assistant`` turn.
        system_message: Content for a leading ``system`` turn; the turn is
            left out entirely when this value is falsy (e.g. ``""`` or ``None``).

    Returns:
        dict: A single-key dictionary whose ``"messages"`` list holds the
        turns in system (optional), user, assistant order.
    """
    system_turns = (
        [{"role": "system", "content": system_message}] if system_message else []
    )
    dialogue_turns = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer},
    ]
    return {"messages": system_turns + dialogue_turns}


In [44]:
# Load the three "answerable" CSV splits in one pass; the files are read in
# test, train, valid order and bound to the matching frame names.
df_test, df_train, df_valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        'archive/n_annotated_wd_data_test_answerable.csv',
        'archive/n_annotated_wd_data_train_answerable.csv',
        'archive/n_annotated_wd_data_valid_answerable.csv',
    )
)


In [None]:
system_message = "You are a question answering system. Answer shortly in 1-3 words"

# One chat record per row: the first 200 values of the 'q' column are paired
# positionally with the 'e2' column (zip stops after the shorter, sliced side).
dataset = [
    convert_conversation(train_question, train_answer, system_message)
    for train_question, train_answer in zip(df_train['q'][:200], df_train['e2'])
]

dataset

In [None]:
system_message = "You are a question answering system. Answer shortly in 1-3 words"

# Same construction as the training records, applied to the validation frame:
# the first 200 'q' values paired positionally with the 'e2' column.
validation = [
    convert_conversation(valid_question, valid_answer, system_message)
    for valid_question, valid_answer in zip(df_valid['q'][:200], df_valid['e2'])
]

validation

In [67]:
def save_to_jsonl(conversations, file_path):
    """Write ``conversations`` to ``file_path`` as JSON Lines (one object per line).

    Args:
        conversations: Iterable of JSON-serialisable objects; each one is
            dumped with ``json.dumps`` onto its own line.
        file_path: Destination path. An existing file is overwritten, and
            missing parent directories are created first so that targets
            such as ``data/...jsonl`` work in a fresh checkout.

    Raises:
        TypeError: If an element cannot be serialised by ``json.dumps``.
        OSError: If the directory or file cannot be created or written.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # json.dumps escapes non-ASCII by default, so the explicit encoding only
    # removes the dependency on the platform default; the bytes are unchanged.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(
            json.dumps(conversation) + '\n' for conversation in conversations
        )

In [48]:
# Output locations of the JSONL files built from the CSV train/valid splits.
training_file_name, validation_file_name = (
    'data/fine_tuning_sqwd_train.jsonl',
    'data/fine_tuning_sqwd_valid.jsonl',
)

In [87]:
# Write the training records first, then the validation records, each to its own JSONL file.
for conversations, target_path in (
    (dataset, training_file_name),
    (validation, validation_file_name),
):
    save_to_jsonl(conversations, target_path)

In [50]:
# Upload both JSONL files for fine-tuning. Each file is opened in a `with`
# block so its handle is closed as soon as the upload call returns
# (the bare open(...) calls used before were never closed).
with open(training_file_name, "rb") as training_file:
    training_response = openai.File.create(file=training_file, purpose="fine-tune")
training_file_id = training_response["id"]

with open(validation_file_name, "rb") as validation_file:
    validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
validation_file_id = validation_response["id"]

# The ids are what the fine-tuning job is created from.
print("Training file id:", training_file_id)
print("Validation file id:", validation_file_id)

Training file id: file-fvifTG3NWaEo9BaRh3u9XOlN
Validation file id: file-BZSy9DzZEFrVyGD9ug1fANDr


In [51]:
suffix_name = "sqwd-test"

# Collect the job parameters first, then submit them in a single call.
fine_tune_request = dict(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-3.5-turbo",
    suffix=suffix_name,
)
response = openai.FineTuningJob.create(**fine_tune_request)

# Keep the job id: the event-listing cell looks the job up by it.
job_id = response["id"]

print(response)

{
  "object": "fine_tuning.job",
  "id": "ftjob-RfhzDqjrbZNKXTARaGLJBt0i",
  "model": "gpt-3.5-turbo-0125",
  "created_at": 1714330339,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-5h4sKaWGPhS5fugds0H2RjQA",
  "result_files": [],
  "status": "validating_files",
  "validation_file": "file-BZSy9DzZEFrVyGD9ug1fANDr",
  "training_file": "file-fvifTG3NWaEo9BaRh3u9XOlN",
  "hyperparameters": {
    "n_epochs": "auto",
    "batch_size": "auto",
    "learning_rate_multiplier": "auto"
  },
  "trained_tokens": null,
  "error": {},
  "user_provided_suffix": "sqwd-test",
  "seed": 718846123,
  "estimated_finish": null,
  "integrations": []
}


In [58]:
# Fetch up to 20 events of the fine-tuning job.
response = openai.FineTuningJob.list_events(id=job_id, limit=20)

# Reverse the returned list in place before printing; in the saved cell output
# this yields ascending step order ending with the completion message.
events = response["data"]
events.reverse()

for message in (event["message"] for event in events):
    print(message)

Step 585/600: training loss=0.00
Step 586/600: training loss=0.00
Step 587/600: training loss=0.00
Step 588/600: training loss=0.87
Step 589/600: training loss=0.79
Step 590/600: training loss=0.00, validation loss=0.00
Step 591/600: training loss=0.00
Step 592/600: training loss=0.00
Step 593/600: training loss=0.00
Step 594/600: training loss=1.12
Step 595/600: training loss=2.74
Step 596/600: training loss=1.47
Step 597/600: training loss=0.00
Step 598/600: training loss=0.00
Step 599/600: training loss=0.00
Step 600/600: training loss=0.05, validation loss=0.00, full validation loss=1.22
Checkpoint created at step 200 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:sqwd-test:9J4JdRJI:ckpt-step-200
Checkpoint created at step 400 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:sqwd-test:9J4JdblX:ckpt-step-400
New fine-tuned model created: ft:gpt-3.5-turbo-0125:personal:sqwd-test:9J4Je1dk
The job has successfully completed


In [60]:
# Evaluate the model on the first 200 test rows: a hit is counted when the
# lower-cased gold answer occurs as a substring of the lower-cased reply.
prompt = "Answer shortly in 1-3 words"
# Earlier experiments used Russian-language prompts meaning
# "Answer as briefly as possible, 1-3 words" and
# "Answer the question as briefly and concisely as possible. Give the answer
# in the nominative case, and do not forget about punctuation."

i = 0  # stays 0 if the test frame is empty
true_answers = 0
answers = []
# Same slicing idiom as the dataset-building cells: zip stops after 200 rows.
for i, (question, answer) in enumerate(zip(df_test['q'][:200], df_test['e2']), start=1):
    messages = [{"role": "user", "content": prompt + ' ' + question}]
    request_failed = False
    try:
        # NOTE(review): the model name is empty in the saved notebook —
        # presumably redacted; set it to the fine-tuned model id before running.
        response = openai.ChatCompletion.create(
            model="", messages=messages, temperature=0, max_tokens=500
        )
        response = response["choices"][0]["message"]["content"].lower()
    except Exception as e:
        # Best effort: log the failure and keep going with a placeholder reply.
        print(e)
        response = "Error"
        request_failed = True

    expected = answer.lower()
    # A failed request must never score: the "Error" placeholder would otherwise
    # substring-match short gold answers. An empty gold answer is excluded too,
    # because "" is a substring of every reply.
    is_correct = (not request_failed) and bool(expected) and expected in response

    answers.append({
        "question": question,
        "answer": response,
        "correct_answer": expected,
        "is_correct": is_correct
    })
    if is_correct:
        true_answers += 1
    print(i, true_answers)

    # Rewrite the results file after every question so partial results
    # survive an interrupted run.
    with open("questions_answers_tmp.json", "w") as json_file:
        json_file.write(json.dumps(answers, indent=4))

1 0
2 0
3 0
4 0
5 1
6 1
7 1
8 1
9 1
10 2
11 3
12 4
13 4
14 4
15 4
16 4
17 5
18 6
19 7
20 7
21 7
22 8
23 8
24 8
25 8
26 9
27 9
28 10
29 11
30 12
31 12
32 13
33 14
34 14
35 14
36 14
37 15
38 15
39 16
40 17
41 18
42 19
43 19
44 20
45 20
46 21
47 22
48 22
49 22
50 22
51 22
52 22
53 22
54 22
55 23
56 23
57 23
58 23
59 23
60 24
61 24
62 24
63 25
64 26
65 26
66 27
67 27
68 28
69 29
70 30
71 31
72 31
73 31
74 32
75 32
76 33
77 33
78 34
79 34
80 35
81 35
82 35
83 35
84 36
85 36
86 36
87 36
88 36
89 36
90 37
91 38
92 38
93 39
94 40
95 40
96 40
97 40
98 40
99 41
100 41
101 41
102 42
103 43
104 43
105 43
106 43
107 43
108 44
109 45
110 46
111 46
112 46
113 46
114 46
115 47
116 47
117 47
118 48
119 48
120 48
121 48
122 49
123 50
124 50
125 50
126 50
127 51
128 52
129 52
130 53
131 54
132 54
133 54
134 54
135 55
136 56
137 57
138 58
139 58
140 58
141 59
142 59
143 60
144 60
145 61
146 61
147 62
148 62
149 62
150 63
151 64
152 65
153 65
154 65
155 66
156 66
157 66
158 66
159 66
160 67
161 68
162 68
1

In [None]:
def create_files(training_file_name, training_file_id):
    """Upload the training and validation JSONL files for fine-tuning.

    Args:
        training_file_name: Path of the training JSONL file to upload.
        training_file_id: Ignored — it is overwritten with the id returned by
            the upload. Kept only so existing calls keep working.

    Returns:
        tuple: ``(training_file_id, validation_file_id)`` as returned by the
        two uploads (previously the ids were only printed and then lost).

    Note:
        The validation path is not a parameter; it is read from the
        module-level ``validation_file_name`` variable, which must be set
        before calling.
    """
    # `with` closes each handle once its upload call has returned.
    with open(training_file_name, "rb") as training_file:
        training_response = openai.File.create(file=training_file, purpose="fine-tune")
    training_file_id = training_response["id"]

    with open(validation_file_name, "rb") as validation_file:
        validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
    validation_file_id = validation_response["id"]

    print("Training file id:", training_file_id)
    print("Validation file id:", validation_file_id)

    return training_file_id, validation_file_id

In [86]:
system_message = "You are a question answering system. Answer shortly in 1-3 words"


def _mintaka_answer(item):
    """Return the first gold answer of one parsed item as text ("" when there is none).

    For items whose ``answerType`` is ``"entity"`` the English label of the
    first answer is used; for every other type the first answer is passed
    through ``str``. A missing or empty answer list yields ``""`` for both kinds.
    """
    candidates = item["answer"]["answer"]
    if not candidates:
        return ""
    first = candidates[0]
    if item["answer"]["answerType"] == "entity":
        return first["label"]['en']
    return str(first)


def _mintaka_records(file_path, system_message, limit=500):
    """Build chat records from the first ``limit`` items of a JSON array file.

    The file is streamed with ``ijson`` so it is never loaded whole; it is
    opened in binary mode, which is the input ijson is designed for.
    """
    with open(file_path, 'rb') as file:
        return [
            convert_conversation(item["question"], _mintaka_answer(item), system_message)
            for item in islice(ijson.items(file, 'item'), limit)
        ]


# Up to 500 records per split, built by the same code path for both files.
dataset = _mintaka_records("mintaka_train.json", system_message)
validation = _mintaka_records("mintaka_valid.json", system_message)

In [88]:
# Point the shared file-name variables at the Mintaka outputs, then write
# the training records followed by the validation records.
training_file_name, validation_file_name = (
    'data/fine_tuning_mintaka_train.jsonl',
    'data/fine_tuning_mintaka_valid.jsonl',
)
for conversations, target_path in (
    (dataset, training_file_name),
    (validation, validation_file_name),
):
    save_to_jsonl(conversations, target_path)

In [91]:
# Upload both JSONL files for fine-tuning. Each file is opened in a `with`
# block so its handle is closed as soon as the upload call returns
# (the bare open(...) calls used before were never closed).
with open(training_file_name, "rb") as training_file:
    training_response = openai.File.create(file=training_file, purpose="fine-tune")
training_file_id = training_response["id"]

with open(validation_file_name, "rb") as validation_file:
    validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
validation_file_id = validation_response["id"]

# The ids are what the fine-tuning job is created from.
print("Training file id:", training_file_id)
print("Validation file id:", validation_file_id)

Training file id: file-eAmkINfLKNDapjfOjRCfn3xw
Validation file id: file-fmfZaaQqF3SUQaOMIxa7ZMAK


In [92]:
suffix_name = "mintaka-test"

# Collect the job parameters first, then submit them in a single call.
fine_tune_request = dict(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-3.5-turbo",
    suffix=suffix_name,
)
response = openai.FineTuningJob.create(**fine_tune_request)

# Keep the id of the newly created job for later look-ups.
job_id = response["id"]

print(response)

{
  "object": "fine_tuning.job",
  "id": "ftjob-ngk4j5sQd8k4Sy1ZKh2s8aTO",
  "model": "gpt-3.5-turbo-0125",
  "created_at": 1714369101,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-5h4sKaWGPhS5fugds0H2RjQA",
  "result_files": [],
  "status": "validating_files",
  "validation_file": "file-fmfZaaQqF3SUQaOMIxa7ZMAK",
  "training_file": "file-eAmkINfLKNDapjfOjRCfn3xw",
  "hyperparameters": {
    "n_epochs": "auto",
    "batch_size": "auto",
    "learning_rate_multiplier": "auto"
  },
  "trained_tokens": null,
  "error": {},
  "user_provided_suffix": "mintaka-test",
  "seed": 998119998,
  "estimated_finish": null,
  "integrations": []
}


In [97]:
# Evaluate the model on every item of the test file: a hit is counted when the
# lower-cased gold answer occurs as a substring of the lower-cased reply.
prompt = "Answer shortly in 1-3 words. If answer is a number, print digits"
# An earlier experiment used a Russian-language prompt meaning
# "Answer the question as briefly and concisely as possible. Give the answer
# in the nominative case, and do not forget about punctuation."

i = 0  # stays 0 if the file holds no items
true_answers = 0
answers = []

file_path = "mintaka_test.json"
# Binary mode: ijson is designed to consume bytes and decodes the JSON itself.
with open(file_path, 'rb') as file:
    for i, item in enumerate(ijson.items(file, 'item'), start=1):
        question = item["question"]

        # First gold answer as text: the English label for entity answers,
        # str() of the raw value otherwise, "" when the list is missing or empty.
        gold_answers = item["answer"]["answer"]
        if not gold_answers:
            answer = ""
        elif item["answer"]["answerType"] == "entity":
            answer = gold_answers[0]["label"]['en']
        else:
            answer = str(gold_answers[0])

        messages = [{"role": "user", "content": prompt + ' ' + question}]
        try:
            # NOTE(review): the model name is empty in the saved notebook —
            # presumably redacted; set it to the fine-tuned model id before running.
            response_big = openai.ChatCompletion.create(
                model="", messages=messages, temperature=0, max_tokens=500
            )
            response = response_big.choices[0].message.content.lower()
        except Exception as e:
            # Best effort: log the failure and score the item against an empty reply.
            print(e)
            response = ""

        expected = answer.lower()
        # "" is a substring of every string, so an item without a gold answer
        # used to be counted as correct; require a non-empty gold answer.
        is_correct = bool(expected) and expected in response

        answers.append({
            "question": question,
            "answer": response,
            "correct_answer": expected,
            "is_correct": is_correct
        })
        if is_correct:
            true_answers += 1
        print(i, true_answers)

        # Rewrite the results file after every item so partial results
        # survive an interrupted run.
        with open("questions_answers_mintaka_chatGpt_ft.json", "w") as json_file:
            json_file.write(json.dumps(answers, indent=4))

1 1
2 2
3 3
4 4
5 5
6 6
7 6
8 7
9 8
10 9
11 10
12 11
13 12
14 13
15 13
16 14
17 14
18 15
19 16
20 16
21 16
22 16
23 17
24 17
25 17
26 18
27 19
28 19
29 20
30 20
31 20
32 21
33 22
34 22
35 23
36 24
37 25
38 26
39 26
40 27
41 27
42 28
43 29
44 30
45 31
46 32
47 33
48 34
49 35
50 35
51 35
52 36
53 37
54 37
55 38
56 39
57 39
58 40
59 40
60 40
61 40
62 41
63 42
64 43
65 44
66 45
67 46
68 47
69 47
70 48
71 49
72 50
73 51
74 52
75 53
76 53
77 54
78 55
79 56
80 57
81 57
82 57
83 57
84 57
85 58
86 59
87 59
88 60
89 61
90 62
91 63
92 63
93 64
94 64
95 64
96 64
97 65
98 65
99 65
100 65
101 66
102 67
103 67
104 67
105 67
106 68
107 68
108 68
109 69
110 69
111 69
112 70
113 71
114 71
115 72
116 72
117 72
118 72
119 73
120 73
121 74
122 75
123 76
124 77
125 78
126 78
127 79
128 80
129 80
130 81
131 82
132 83
133 84
134 84
135 84
136 85
137 86
138 87
139 88
140 89
141 89
142 90
143 90
144 90
145 90
146 91
147 92
148 93
149 93
150 94
151 94
152 94
153 94
154 94
155 94
156 95
157 95
158 95
159 95
160 9

KeyboardInterrupt: 