In [1]:
import instructor
import json
import random
from pydantic import BaseModel, Field
from enum import Enum
from typing import List

class AnswerEnum(str, Enum):
    A = 'A'
    B = 'B'
    C = 'C'
    D = 'D'
    
class Selective_QA(BaseModel):
    question: str
    A: str
    B: str
    C: str
    D: str
    answer: AnswerEnum
        
class QAS(BaseModel):
    qa: List[Selective_QA]

In [2]:
choices = json.dumps(instructor.openai_schema(QAS).openai_schema, indent=4)
print(choices)

{
    "name": "QAS",
    "description": "Correctly extracted `QAS` with all the required parameters with correct types",
    "parameters": {
        "$defs": {
            "AnswerEnum": {
                "enum": [
                    "A",
                    "B",
                    "C",
                    "D"
                ],
                "type": "string"
            },
            "Selective_QA": {
                "properties": {
                    "question": {
                        "type": "string"
                    },
                    "A": {
                        "type": "string"
                    },
                    "B": {
                        "type": "string"
                    },
                    "C": {
                        "type": "string"
                    },
                    "D": {
                        "type": "string"
                    },
                    "answer": {
                        "$ref": "#/$defs/AnswerEnum"
           

In [3]:
from glob import glob

files = glob('/home/husein/ssd3/wikipedia-data/qa-*.jsonl')
files

['/home/husein/ssd3/wikipedia-data/qa-majalahsains.jsonl',
 '/home/husein/ssd3/wikipedia-data/qa-ms-wikipedia.jsonl',
 '/home/husein/ssd3/wikipedia-data/qa-dewanbahasa-jdbp.jsonl']

In [4]:
data = []
for f in files:
    with open(f) as fopen:
        for l in fopen:
            l = json.loads(l)
            d = {
                'name': 'QAS',
                'arguments': {
                    'qa': l['qa']['qa']
                }
            }
            data.append((l['paragraph'], d))

                
len(data)

13358

In [5]:
templates = [
    'tukar ke JSON berdasarkan schema {schema}, teks `{text}`',
    'text `{text}`, convert to JSON using schema {schema}',
    'teks: {text}\n\ntukar ke JSON using schema {schema}',
    'convert to JSON using schema {schema}\n\ntext: {text}',
    '{text}\n\nJSON berdasarkan schema {schema}',
    'JSON berdasarkan schema {schema}\n\n{text}',
]

In [6]:
chat = []
for d in data:
    chat.append({
        'prompt_input': None,
        'input': random.choice(templates).format(schema = choices, text = d[0]),
        'output': json.dumps(d[1]),
    })

In [7]:
chat[0]

{'prompt_input': None,
 'input': 'convert to JSON using schema {\n    "name": "QAS",\n    "description": "Correctly extracted `QAS` with all the required parameters with correct types",\n    "parameters": {\n        "$defs": {\n            "AnswerEnum": {\n                "enum": [\n                    "A",\n                    "B",\n                    "C",\n                    "D"\n                ],\n                "type": "string"\n            },\n            "Selective_QA": {\n                "properties": {\n                    "question": {\n                        "type": "string"\n                    },\n                    "A": {\n                        "type": "string"\n                    },\n                    "B": {\n                        "type": "string"\n                    },\n                    "C": {\n                        "type": "string"\n                    },\n                    "D": {\n                        "type": "string"\n                    },\n  

In [8]:
def generate_and_tokenize_prompt(row):
    texts = ['<s>']

    if 'function_call' in row:
        t = row['function_call']
        texts.append(f'\n[FUNCTIONCALL]\n{t}\n')

    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        splitted = row['input'].split('<bot>:')
        for i in range(len(splitted) - 1):
            if i == 0:
                human = splitted[i].replace('<manusia>:', '')
            else:
                try:
                    human = splitted[i].split('<manusia>:')[1]
                except:
                    continue
            bot = splitted[i + 1].split('<manusia>:')[0]
            inputs.append(human.strip())
            outputs.append(bot.strip())
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    for u, a in zip(inputs, outputs):
        texts.append(f'[INST] {u.strip()} [/INST] {a.strip()}</s> ')

    prompt = ''.join(texts)
    return {'text': prompt}

In [12]:
print(generate_and_tokenize_prompt(chat[3])['text'])

<s>[INST] convert to JSON using schema {
    "name": "QAS",
    "description": "Correctly extracted `QAS` with all the required parameters with correct types",
    "parameters": {
        "$defs": {
            "AnswerEnum": {
                "enum": [
                    "A",
                    "B",
                    "C",
                    "D"
                ],
                "type": "string"
            },
            "Selective_QA": {
                "properties": {
                    "question": {
                        "type": "string"
                    },
                    "A": {
                        "type": "string"
                    },
                    "B": {
                        "type": "string"
                    },
                    "C": {
                        "type": "string"
                    },
                    "D": {
                        "type": "string"
                    },
                    "answer": {
                        "

In [11]:
import json

with open('prepared-function-call-qa-choice.jsonl', 'w') as fopen:
    for l in chat:
        fopen.write(f'{json.dumps(l)}\n')