In [None]:
# !wget https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2/resolve/main/glaive-function-calling-v2.json

In [None]:
import pandas as pd
import openai
import instructor
import json
import os
import malaya
from pydantic import BaseModel, Field
from typing import List
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from unidecode import unidecode
import re
import random

minimum_len = 15

def simple_cleaning(string):
    return re.sub(r'[ ]+', ' ', unidecode(string).replace('\n', ' ').replace('--', ' ').replace('/', ' ')).strip()

In [None]:
instructor.patch()

In [4]:
class ConversationObject(BaseModel):
    role: str
    content: str = Field('...', description = 'if the role is `assistant`, it must convert user content into function parameter format `<functioncall> {"name", "arguments"}`')
        
class JsonObject(BaseModel):
    function_call: str
    conversations: List[ConversationObject]

In [5]:
openai.api_type = "azure"
openai.api_base = "https://husein-ai10-aiservices-2093329748.openai.azure.com/"
openai.api_version = "2023-07-01-preview"
openai.api_key = ""

engine = 'gpt-35-turbo'

In [6]:
with open('glaive-function-calling-v2.json') as fopen:
    data = json.load(fopen)
    
all_data = []
for d in data:
    function_call = None
    try:
        function_call = d['system'].split('required -')[1]
    except Exception as e:
        continue
    
    conversations = []
    splitted = d['chat'].split('ASSISTANT:')
    conversations.append({
        'role': 'user',
        'content': splitted[0].split('USER:')[1].strip(),
    })
    for s in splitted[1:]:
        s = s.replace('<|endoftext|>', '')
        if 'USER:' in s:
            s = s.split('USER:')
            conversations.append({
                'role': 'assistant',
                'content': s[0].strip(),
            })
            conversations.append({
                'role': 'user',
                'content': s[1].strip(),
            })
        else:
            conversations.append({
                'role': 'assistant',
                'content': s.strip(),
            })
        
    filtered_message = []
    
    for i in range(len(conversations)):
        if conversations[i]['role'] == 'assistant' and '<functioncall>' in conversations[i]['content']:
            if conversations[i-1]['role'] == 'user':
                message = conversations[i-1]['content'].strip() 
            else:
                message = conversations[i-2]['content'].strip() 
            filtered_message.append({
                'role': 'user',
                'content': message ,
            })
            filtered_message.append({
                'role': 'assistant',
                'content':  conversations[i]['content'].split('FUNCTION RESPONSE')[0].strip() ,
            })
            
    if len(filtered_message) > 0:
        all_data.append({
            'function_call':function_call,
            'conversations': filtered_message
        })

In [8]:
def predict(prompt, max_tokens = 1024):
    messages=[
        {"role": "user", "content": prompt},
       
    ]
    try:
        response = openai.ChatCompletion.create(
        engine=engine,
        messages=messages,
        temperature=1.0,
        max_tokens=max_tokens,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
        )
        return response
    except Exception as e:
        print(e)
        return None
    
def predict_instruct(text):
    r = openai.ChatCompletion.create(
        engine=engine,
        response_model=JsonObject,
        messages=[
            {"role": "user", "content": text},
        ],
        temperature=1.0,
        max_tokens=1024,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return r

In [9]:
!mkdir function-call

mkdir: cannot create directory ‘function-call’: File exists


In [10]:
instructions = []
for i in range(len(all_data)):
    d = {
        'function_call': all_data[i]['function_call'].strip(),
        'conversations': all_data[i]['conversations'],
    }
    text = f"""translate to malay,
    ```
    {d}
    ```
    """.strip()
    instructions.append(text)
    
len(instructions)

63218

In [11]:
def generate(instruction, index):
    filename = f'function-call/{index}.json'
    if os.path.exists(filename):
        return
    
    try:
        e = None
        for _ in range(3):
            try:
                output = predict(instruction)
                e = eval(output.choices[0]['message']['content'].replace('```', ''))
                break
            except:
                pass
        
        if e is None:
            return
        
        text = f"""Generate complex Malay function call prompts inspired by the given example. If translation is challenging, use Malaysian terms for variety. Maintain the JSON format, altering only key contents. Follow instructions precisely, avoiding randomness.

Code snippet for inspiration:
```
{e}
```
""".strip()
        rs = []
        for _ in range(3):
            try:
                r = predict_instruct(text)
                rs.append(r.dict())
            except Exception as e__:
                print('nested', e__)
                pass
        
        d = {
            'e': e,
            'rs': rs
        }
        with open(filename, 'w') as fopen:
            json.dump(d, fopen)
    except Exception as e_:
        print(e_)
        pass

In [12]:
generate(instructions[2], 2)

In [None]:
from tqdm import tqdm

max_worker = 2
for i in tqdm(range((len(instructions) // 6) * 0, (len(instructions) // 6) * 1, max_worker)):
    b = instructions[i: i + max_worker]
    b = [(ins, i + no) for no, ins in enumerate(b)]
    
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(generate, f[0], f[1]): f for f in b}

        for future in as_completed(futures):
            future.result()

  0%|                                                  | 0/5268 [00:00<?, ?it/s]/tmp/ipykernel_3644185/2968651273.py:30: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  rs.append(r.dict())
  0%|                                       | 5/5268 [01:09<19:17:07, 13.19s/it]