In [1]:
import instructor
import json
import random
from pydantic import BaseModel, Field
from typing import List

# One knowledge-graph relation expressed as a (subject, predicate, object) triple.
# NOTE(review): documented with `#` comments rather than a docstring because a
# class docstring would presumably surface as a `description` in the generated
# JSON schema and change the prompts built from it -- confirm before adding one.
class Triplet(BaseModel):
    # Entity the relation starts from.
    subject: str 
    # Relation label linking subject to object.
    predicate: str
    # Entity the relation points to (field name intentionally mirrors the schema key).
    object: str
        
# Top-level function-call payload: the full list of triples extracted from one text.
# NOTE(review): kept docstring-free on purpose; a docstring would presumably
# replace the default `description` in the schema emitted by instructor -- verify.
class KG(BaseModel):
    # Every relation found in the text, each as a `Triplet`.
    triplets: List[Triplet]

In [2]:
# Ask instructor for the function-calling schema derived from `KG`, then
# pretty-print it. The resulting string `kg` is what gets substituted into the
# `{schema}` slot of the prompt templates further down.
kg_schema = instructor.openai_schema(KG).openai_schema
kg = json.dumps(kg_schema, indent=4)
print(kg)

{
    "name": "KG",
    "description": "Correctly extracted `KG` with all the required parameters with correct types",
    "parameters": {
        "$defs": {
            "Triplet": {
                "properties": {
                    "subject": {
                        "type": "string"
                    },
                    "predicate": {
                        "type": "string"
                    },
                    "object": {
                        "type": "string"
                    }
                },
                "required": [
                    "subject",
                    "predicate",
                    "object"
                ],
                "type": "object"
            }
        },
        "properties": {
            "triplets": {
                "items": {
                    "$ref": "#/$defs/Triplet"
                },
                "type": "array"
            }
        },
        "required": [
            "triplets"
        ],
        "type": "obj

In [3]:
def _kg_function_call(relations):
    """Turn a list of extracted relations into a `KG` function-call payload.

    Args:
        relations: Iterable of dicts carrying 'head', 'type' and 'tail', or
            None / empty when nothing was extracted for the text.

    Returns:
        dict | None: ``{'name': 'KG', 'arguments': {'triplets': [...]}}``, or
        None when there are no relations or when any relation has an empty
        (or null) head, type or tail -- one bad relation rejects the whole text.
    """
    if not relations:
        return None

    triplets = []
    for relation in relations:
        head, predicate, tail = relation['head'], relation['type'], relation['tail']
        # Truthiness instead of len(): also rejects null fields rather than
        # raising TypeError on them.
        if not head or not predicate or not tail:
            return None
        triplets.append({
            'subject': head,
            'predicate': predicate,
            'object': tail,
        })

    return {
        'name': 'KG',
        'arguments': {
            'triplets': triplets
        }
    }


# Collect (text, function-call payload) pairs from every article record.
# Each record may contribute its title, its description and any number of
# body rows, where a body row is a (text, relations) pair.
data = []
with open('/home/husein/ssd1/ctranslate2/kg-astroawani.translated.jsonl', encoding='utf-8') as fopen:
    for line in fopen:
        record = json.loads(line)

        # Title and description share the same layout: `<field>` holds the
        # text and `<field>_kg_ms` holds its relations.
        for field in ('title', 'description'):
            call = _kg_function_call(record.get(f'{field}_kg_ms'))
            if call is not None:
                data.append((record[field], call))

        for row in record.get('body_kg_ms') or []:
            call = _kg_function_call(row[1])
            if call is not None:
                data.append((row[0], call))

len(data)

104689

In [4]:
# Prompt phrasings (mixed Malay / English) used to vary the instruction text.
# Each template has exactly two `str.format` placeholders: `{schema}` for the
# JSON schema string and `{text}` for the passage to convert. One template is
# picked at random per example when the chat rows are built.
templates = [
    'tukar ke JSON berdasarkan schema {schema}, teks `{text}`',
    'text `{text}`, convert to JSON using schema {schema}',
    'teks: {text}\n\ntukar ke JSON using schema {schema}',
    'convert to JSON using schema {schema}\n\ntext: {text}',
    '{text}\n\nJSON berdasarkan schema {schema}',
    'JSON berdasarkan schema {schema}\n\n{text}',
]

In [5]:
# Build one instruction-tuning row per (text, function-call payload) pair.
# A dedicated seeded generator keeps the template assignment identical across
# kernel restarts (Restart & Run All) without touching the global `random`
# state used elsewhere.
template_rng = random.Random(0)

chat = [
    {
        # No system / prefix prompt for these examples.
        'prompt_input': None,
        'input': template_rng.choice(templates).format(schema=kg, text=text),
        # Target is the serialized function call the model should emit.
        'output': json.dumps(call),
    }
    for text, call in data
]

In [6]:
# Sanity check: display the first prepared row (prompt with schema + JSON target).
chat[0]

{'prompt_input': None,
 'input': 'convert to JSON using schema {\n    "name": "KG",\n    "description": "Correctly extracted `KG` with all the required parameters with correct types",\n    "parameters": {\n        "$defs": {\n            "Triplet": {\n                "properties": {\n                    "subject": {\n                        "type": "string"\n                    },\n                    "predicate": {\n                        "type": "string"\n                    },\n                    "object": {\n                        "type": "string"\n                    }\n                },\n                "required": [\n                    "subject",\n                    "predicate",\n                    "object"\n                ],\n                "type": "object"\n            }\n        },\n        "properties": {\n            "triplets": {\n                "items": {\n                    "$ref": "#/$defs/Triplet"\n                },\n                "type": "array"\n       

In [7]:
def generate_and_tokenize_prompt(row):
    """Render one example as a ``<s>[INST] ... [/INST] ...</s>`` prompt string.

    Despite the name, no tokenization happens here; only the text is built.

    Args:
        row: Mapping with an 'input' string and an 'output' value. When
            'output' is None and 'input' contains '<bot>:' markers, 'input' is
            parsed as a multi-turn transcript made of '<manusia>:' (human) and
            '<bot>:' turns. Otherwise 'input' / 'output' form a single turn.
            An optional 'function_call' entry is rendered in a
            '[FUNCTIONCALL]' header placed before the turns.

    Returns:
        dict: ``{'text': prompt}`` with every turn concatenated.
    """
    texts = ['<s>']

    if 'function_call' in row:
        t = row['function_call']
        texts.append(f'\n[FUNCTIONCALL]\n{t}\n')

    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        segments = row['input'].split('<bot>:')
        # Segment i holds the human text that precedes bot reply i + 1.
        for i in range(len(segments) - 1):
            if i == 0:
                human = segments[i].replace('<manusia>:', '')
            else:
                parts = segments[i].split('<manusia>:')
                if len(parts) < 2:
                    # Two bot replies in a row with no human turn between
                    # them: drop this pair. An explicit check replaces the
                    # former bare `except:` so unrelated errors are not hidden.
                    continue
                human = parts[1]
            bot = segments[i + 1].split('<manusia>:')[0]
            inputs.append(human.strip())
            outputs.append(bot.strip())
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    for u, a in zip(inputs, outputs):
        texts.append(f'[INST] {u.strip()} [/INST] {a.strip()}</s> ')

    prompt = ''.join(texts)
    return {'text': prompt}

In [9]:
# Render one row through the prompt builder to eyeball the final training text.
print(generate_and_tokenize_prompt(chat[2])['text'])

<s>[INST] JSON berdasarkan schema {
    "name": "KG",
    "description": "Correctly extracted `KG` with all the required parameters with correct types",
    "parameters": {
        "$defs": {
            "Triplet": {
                "properties": {
                    "subject": {
                        "type": "string"
                    },
                    "predicate": {
                        "type": "string"
                    },
                    "object": {
                        "type": "string"
                    }
                },
                "required": [
                    "subject",
                    "predicate",
                    "object"
                ],
                "type": "object"
            }
        },
        "properties": {
            "triplets": {
                "items": {
                    "$ref": "#/$defs/Triplet"
                },
                "type": "array"
            }
        },
        "required": [
            "triplet

In [10]:
import json

# Persist the prepared rows as JSON Lines: one serialized example per line.
with open('prepared-function-call-kg.jsonl', 'w') as out_file:
    for example in chat:
        serialized = json.dumps(example)
        out_file.write(serialized + '\n')