## JSONL データ　保存

In [8]:
# 
from openai import OpenAI
import json
import time
import re
import os

class CreateFineTuningData:
    """Build prompt/completion fine-tuning records from a text document.

    The document is split into paragraphs. For each paragraph the chat model
    is queried, and its reply is stored as the record's ``prompt`` while the
    paragraph itself is stored as the ``completion``. Records accumulate in
    ``self.data`` and can be written out as JSON Lines.
    """

    # Retry policy for request timeouts in generate_question().
    MAX_RETRIES = 3
    RETRY_DELAY = 5  # seconds to wait between attempts

    def __init__(self, api_key):
        """Store the API key and start with an empty record list.

        Args:
            api_key: OpenAI API key. ``None`` lets the OpenAI client fall back
                to the ``OPENAI_API_KEY`` environment variable.
        """
        self.api_key = api_key
        self.data = []
        # Created lazily so the class can be used for the pure
        # text-processing steps without any API configuration.
        self._client = None

    def read_document(self, file_path):
        """Return the full text of ``file_path``, decoded as UTF-8."""
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def split_into_paragraphs(self, content):
        """Split ``content`` on blank lines and drop empty paragraphs.

        Returns:
            A list of paragraphs with surrounding whitespace stripped.
        """
        paragraphs = content.split('\n\n')
        return [p.strip() for p in paragraphs if p.strip()]

    def _build_prompt(self, paragraph, title):
        """Return the user prompt with ``title`` and ``paragraph`` filled in."""
        # Only the first and last pieces are f-strings: the JSON example in
        # the middle contains literal braces that must not be interpolated.
        return (
            f"The title of this document is: {title}. "
            "From the attached document, create at least 10 QA data "
            "(Q: prompt, A: completion) to fine-tune the chatgpt: "
            "here is an example output. "
            '{"prompt":"<prompt text>", "completion":"<ideal generated text>"} '
            "The content to be described is as follows. "
            "'Overview', 'Quick start', 'Preparation', 'Usage examples', "
            "'How to use', 'Process flow', 'Advanced settings', "
            "'Functional details', 'Limitations', 'FAQs (if any) Use as is.:"
            f"\n\n{paragraph}"
        )

    def generate_question(self, paragraph, title):
        """Ask the chat model for text about ``paragraph`` and return it.

        Args:
            paragraph: Paragraph of the source document to send to the model.
            title: Document title inserted into the prompt.

        Returns:
            The content of the first choice in the chat completion response.

        Raises:
            openai.APITimeoutError: If every attempt times out.
        """
        # Local import: the file only imports ``OpenAI`` at the top level.
        from openai import APITimeoutError

        if self._client is None:
            self._client = OpenAI(api_key=self.api_key)
        prompt = self._build_prompt(paragraph, title)

        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                response = self._client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {"role": "system",
                         "content": "You are a professional python developer, a helpful assistant and good at chatgpt APIs."},
                        {"role": "user", "content": prompt},
                    ],
                    # NOTE(review): 50 tokens is very little for a prompt that
                    # asks for at least 10 QA pairs — confirm the intended
                    # output length.
                    max_tokens=50,
                    n=1,
                    stop=None,
                    temperature=0.5
                )
            except APITimeoutError:
                if attempt == self.MAX_RETRIES:
                    print("Max retries reached. Exiting.")
                    raise
                print(f"Timeout occurred. Retrying in {self.RETRY_DELAY} seconds...")
                time.sleep(self.RETRY_DELAY)
            else:
                return response.choices[0].message.content

    def create_fine_tuning_data(self, content, title, max_items=20):
        """Append one prompt/completion record per paragraph to ``self.data``.

        Args:
            content: Full document text.
            title: Document title passed on to ``generate_question``.
            max_items: Maximum number of paragraphs to process (default 20).
        """
        paragraphs = self.split_into_paragraphs(content)
        for paragraph in paragraphs[:max_items]:
            question = self.generate_question(paragraph, title)
            self.data.append({"prompt": question, "completion": paragraph})

    def save_to_jsonl(self, filename):
        """Write ``self.data`` to ``filename`` as UTF-8 JSON Lines."""
        with open(filename, 'w', encoding='utf-8') as f:
            for entry in self.data:
                # ensure_ascii=False keeps non-ASCII text readable instead of
                # escaping it as \uXXXX sequences.
                f.write(json.dumps(entry, ensure_ascii=False) + "\n")
        print(f"Data saved to {filename}")

def main():
    """Generate fine-tuning records from the input document and save them as JSONL."""
    source_path = './document/02_vision.txt'  # document to read
    target_path = 'fine_tuning_data.jsonl'  # JSONL file to write
    doc_title = "GPT-4 Vision Capabilities"  # title passed along with each paragraph

    # The API key is taken from the environment and handed to the builder.
    builder = CreateFineTuningData(os.getenv('OPENAI_API_KEY'))
    text = builder.read_document(source_path)
    builder.create_fine_tuning_data(text, doc_title)
    builder.save_to_jsonl(target_path)

if __name__ == "__main__":
    main()

Data saved to fine_tuning_data.jsonl


In [None]:
# Count the number of tokens



## {prompt:XXX, completion: YYY}

In [None]:
# 文字列に含まれる token 数
# model: text-embedding-3-small
import tiktoken
import pprint

# Load the generated JSONL file as a single string.
# NOTE(review): in the visible cells `data_l` is never passed to
# num_tokens_from_string (the call below uses a literal string) — confirm
# whether the file contents were meant to be counted.
with open('fine_tuning_data.jsonl', 'r') as fp:
    data_l = fp.read()

def num_tokens_from_string(data_l: str, encoding_name: str) -> int:
    """Count the tokens produced for ``data_l`` by the named tiktoken encoding."""
    tokens = tiktoken.get_encoding(encoding_name).encode(data_l)
    return len(tokens)

num_tokens_from_string("tiktoken is great!", "cl100k_base")


In [None]:
# Obtain an embedding for the generated JSONL file.
from openai import OpenAI

model_embedding = "text-embedding-3-small"
# The file is read as one string, so the whole file is sent as a single input.
with open('fine_tuning_data.jsonl', 'r') as fp:
    data_jsonl = fp.read()

client = OpenAI()

# NOTE(review): the entire file is embedded as one input rather than one
# record at a time — confirm this is intended and that the file stays within
# the model's input token limit.
response = client.embeddings.create(
    input=data_jsonl,  # "Your text string goes here",
    model=model_embedding
)

# Print the embedding vector of the first (and only) input.
print(response.data[0].embedding)

#### token count
| **Encoding name** | **OpenAI models** |
|-------------------|-------------------|
| cl100k_base       | gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large |
| p50k_base         | Codex models, text-davinci-002, text-davinci-003 |
| r50k_base (or gpt2) | GPT-3 models like davinci |


In [None]:
import pprint

# NOTE(review): `model` is assigned but not used in this cell, and the
# encoding table above lists cl100k_base (not o200k_base) for
# text-embedding-3-small — confirm which encoding is intended.
model='text-embedding-3-small'
import tiktoken
# Load the o200k_base encoding by name and check that encode/decode round-trips.
enc = tiktoken.get_encoding("o200k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4o")

In [19]:
import tiktoken
# Load the o200k_base encoding by name and check that encode/decode round-trips.
enc = tiktoken.get_encoding("o200k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# To get the tokeniser corresponding to a specific model in the OpenAI API:
# (the recorded output of the next cell shows this resolves to o200k_base)
enc = tiktoken.encoding_for_model("gpt-4o-mini")

In [20]:
enc

<Encoding 'o200k_base'>