# Convert the unformatted Glaive dataset to ShareGPT


In [2]:
%load_ext autoreload
%autoreload 2
import lilac as ll

# Point Lilac at the local project directory used by this notebook.
ll.set_project_dir('./data')

# Identifiers for the dataset, named once so the three calls below cannot drift apart.
HF_DATASET_ID = 'glaiveai/glaive-function-calling-v2'
DATASET_NAMESPACE = 'local'
DATASET_NAME = 'glaive-function-calling-v2'

# Import from HuggingFace only when the project does not already have the dataset,
# so re-running this cell does not repeat the import.
dataset_exists = ll.has_dataset(DATASET_NAMESPACE, DATASET_NAME)
if not dataset_exists:
  ll.from_huggingface(HF_DATASET_ID, DATASET_NAMESPACE, DATASET_NAME)

ds = ll.get_dataset(DATASET_NAMESPACE, DATASET_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import re

GLAIVE_ROLES = ['USER', 'ASSISTANT', 'FUNCTION RESPONSE']
GLAIVE_TO_SHAREGPT_ROLE = {
  'SYSTEM': 'system',
  'USER': 'human',
  'ASSISTANT': 'gpt',
  'FUNCTION RESPONSE': 'tool',
}


# The split regex is a role, plus semicolon and space. For example
# "USER: " or "FUNCTION RESPONSE: ".
split_re = re.compile(r'({}): '.format('|'.join(GLAIVE_ROLES)))


def _parse_chat(row: dict):
  system_prompt = row.get('system')
  # Remove "SYSTEM: " from the beginning of the prompt.
  if system_prompt:
    system_prompt = system_prompt.removeprefix('SYSTEM: ')

  chat = row['chat']
  # Split chat by split_res, and remove empty strings.
  chats = [s.strip() for s in split_re.split(chat) if s]

  # results look like:
  # ['USER', 'Can you book a flight for me from New York to London?', 'ASSISTANT', '...']
  # We now want it to be a dictionary of {'from': 'user', 'value': 'Can you book a flight...'}
  chats = [
    {'from': GLAIVE_TO_SHAREGPT_ROLE[role], 'value': value}
    for role, value in zip(chats[::2], chats[1::2])
  ]

  if system_prompt:
    chats = [{'from': GLAIVE_TO_SHAREGPT_ROLE['SYSTEM'], 'value': system_prompt}] + chats

  return chats


# Run the converter over every row and store the result under the 'conversations' path.
# NOTE(review): overwrite=True presumably replaces output from a previous run — confirm.
res = ds.map(
  _parse_chat,
  output_path='conversations',
  overwrite=True,
)

[local/glaive-function-calling-v2][1 shards] map "_parse_chat" to "('conversations',)": 100%|██████████| 112960/112960 [00:06<00:00, 16609.62it/s]


Wrote map output to conversations-00000-of-00001.parquet


In [7]:
import pprint

# Spot-check the conversion: fetch a single row's 'conversations' value and pretty-print it.
preview_rows = ds.select_rows(['conversations'], limit=1)
first_row = next(preview_rows)
pprint.pprint(first_row)

{'conversations': [{'from': 'system',
                    'value': 'You are a helpful assistant with access to the '
                             'following functions. Use them if required -\n'
                             '{\n'
                             '    "name": "calculate_median",\n'
                             '    "description": "Calculate the median of a '
                             'list of numbers",\n'
                             '    "parameters": {\n'
                             '        "type": "object",\n'
                             '        "properties": {\n'
                             '            "numbers": {\n'
                             '                "type": "array",\n'
                             '                "items": {\n'
                             '                    "type": "number"\n'
                             '                },\n'
                             '                "description": "The list of '
                          

In [8]:
# Destination repository on the HuggingFace hub for the converted dataset.
HUB_REPO_ID = 'lilacai/glaive-function-calling-v2-sharegpt'

# Export the Lilac dataset to a HuggingFace dataset object, then upload it.
hf_ds = ds.to_huggingface()
hf_ds.push_to_hub(HUB_REPO_ID)

Generating train split: 112960 examples [00:03, 36913.50 examples/s]
Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 89.98ba/s]
Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 85.01ba/s]s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:10<00:00,  5.15s/it]
Deleting unused files from dataset repository: 100%|██████████| 2/2 [00:01<00:00,  1.33it/s]
Downloading metadata: 100%|██████████| 2.83k/2.83k [00:00<00:00, 13.0MB/s]
