# transform

> common transformations for LLM data

In [None]:
#| default_exp transform

In [None]:
#|export
import os, copy, json
import openai, langsmith
from typing import List, Callable
from random import shuffle
from collections import defaultdict

from pydantic import BaseModel
from langchain.adapters import openai as adapt
from langchain.load import load
from fastcore.foundation import L
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

In [None]:
#|exports
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def chat(**kwargs):
    "Forward `kwargs` to `openai.ChatCompletion.create`; the call is retried with random exponential backoff (1-60s) for up to 6 attempts."
    completion = openai.ChatCompletion.create(**kwargs)
    return completion

In [None]:
#|export
def rephrase(sentence, max_tokens=100, temperature=0.95):
    "Ask `gpt-4` to reword `sentence` as one short sentence and return the stripped reply. Useful for data augmentation for finetuning."
    # The API key is read from the environment on every call (KeyError if unset).
    openai.api_key = os.environ['OPENAI_API_KEY']
    prompt = f"Rephrase the following sentence in one short sentence: {sentence}"
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = chat(
        model="gpt-4",
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    reply = completion.choices[0]['message']['content']
    return reply.strip()

In [None]:
#|eval: false
# Sample instruction used to demonstrate `rephrase`; the reworded sentence is printed below.
_phrase = 'Write an email to example@gmail.com asking if we can move the appointment to friday'
print(rephrase(_phrase))

Email example@gmail.com to inquire about rescheduling the appointment to Friday.


In [None]:
#|export
def _gen_name():
    "Ask `gpt-3.5-turbo` for a first and last name; the reply is returned stripped, with every '.' removed."
    openai.api_key = os.environ['OPENAI_API_KEY']
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Imagine a full name for a person. Only return a first and last name."},
    ]
    # High temperature and a 4-token cap: varied, short replies.
    completion = chat(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=4,
        temperature=1.9,
    )
    raw_name = completion.choices[0]['message']['content']
    return raw_name.strip().replace('.', '')

def gen_name():
    "Generate a random name of at most 18 characters, asking `_gen_name` again (with no attempt limit) until one is short enough."
    candidate = _gen_name()
    while len(candidate) > 18:
        candidate = _gen_name()
    return candidate

In [None]:
#|eval:false
# Print three generated names to show the kind of output `gen_name` returns.
for i in range(3):
    print(gen_name())

Julian Michaels
Elsa Cormack
Edward Lopez


In [None]:
#|export
def fetch_run_componets(run_id:str):
    """Return the `inputs`, `output` and `funcs` for a run.

    The run is read with a `langsmith.Client`. `inputs` is the list of the run's
    input messages and `output` is the message of its first generation, each
    converted to an OpenAI-style dict. `funcs` is taken from the run's
    `invocation_params["functions"]`. A run record that lacks any of these
    keys raises `KeyError`.
    """
    client = langsmith.Client()
    run = client.read_run(run_id)
    output = adapt.convert_message_to_dict(load(run.outputs['generations'][0]['message']))
    inputs = [adapt.convert_message_to_dict(load(m)) for m in run.inputs['messages']]
    for inp in inputs:
        # A function-call message with no text should not carry a `content` entry.
        # `pop` instead of `del`: the `.get` test is also true when the key is
        # already absent, and `del` would raise KeyError in that case.
        if 'function_call' in inp and inp.get('content') is None:
            inp.pop('content', None)
    funcs = run.extra['invocation_params']["functions"]
    return inputs, output, funcs

In [None]:
# Run id and fetched components reused as fixtures by the test cells further down.
_run_id = '59080971-8786-4849-be88-898d3ffc2b45'
_inputs, _output, _funcs = fetch_run_componets(_run_id)

In [None]:
#|exports
class RunData(BaseModel):
    "Key components of a run from LangSmith"
    inputs:List[dict]  # message dicts given to the model
    output:dict        # message dict produced by the model
    funcs:List[dict]   # function schemas that accompanied the call
    run_id:str         # id of the run these components belong to

In [None]:
#|exports
def _collate(cbdata:RunData, 
             callback:Callable[[RunData], RunData]=None) -> dict:
    """Build a fine-tuning example from `cbdata`, optionally transformed by `callback`.

    When `callback` is given it is called with `cbdata` and its return value
    replaces `cbdata`. The result has two keys: `functions` (the run's `funcs`)
    and `messages` (the `inputs` followed by the `output`).
    """
    if callback is not None: cbdata = callback(cbdata)
    return {"functions": cbdata.funcs,
            "messages": [*cbdata.inputs, cbdata.output]}

In [None]:
#|exports
def collate(run_id:str, callback:Callable[[RunData], RunData]=None) -> dict:
    "Fetch the run `run_id`, wrap its components in a `RunData`, optionally transform it with `callback`, and return the fine-tuning example."
    inputs, output, funcs = fetch_run_componets(run_id)
    rundata = RunData(run_id=run_id, inputs=inputs, output=output, funcs=funcs)
    return _collate(rundata, callback)

In [None]:
# Without a callback, the collated example must still expose both top-level keys.
_tst_run = collate(_run_id)
assert 'functions' in _tst_run and 'messages' in _tst_run

## Example Transformation Callbacks

In [None]:
#| export
def _sub_name_in_func(funcs, name):
    """Replace 'Unit Test' with `name` in the `email-campaign-creator` function.

    Only the `body` parameter's description of the first schema named
    `email-campaign-creator` is rewritten. The schema is modified in place and
    the same `funcs` object is returned. Schemas without a `name` key are
    skipped; if no schema matches, `funcs` is returned untouched.
    """
    for func in funcs or ():
        if func.get('name') != 'email-campaign-creator':
            continue
        body = func['parameters']['properties']['body']
        body['description'] = body['description'].replace('Unit Test', name)
        break  # only the first matching schema is rewritten
    return funcs

In [None]:
#|hide
# The email function's body description should mention the substituted name.
_testfuncs = _sub_name_in_func(_funcs, name='Hamel Husain')
assert 'Hamel' in L(_testfuncs).filter(lambda x: x['name'] == 'email-campaign-creator')[0]['parameters']['properties']['body']['description']

In [None]:
#| export
def _sub_name_in_output(output, name):
    """Substitute `name` for `[Your Name]` in the output's `content`.

    `output` is modified in place and returned. A message whose `content` is
    missing or not a string (e.g. a function-call message with `content` of
    None) has no text to rewrite and is returned unchanged.
    """
    content = output.get('content')
    if isinstance(content, str):
        output['content'] = content.replace('[Your Name]', name)
    return output

In [None]:
#|hide
# After substitution the output text should contain the given name.
_out = _sub_name_in_output(output=_output, name='Hamel')
assert 'Hamel' in _out['content']

In [None]:
#| export
def _reword_input(inputs):
    "Return a deep copy of `inputs` in which the first `user` message (if any) has its content rephrased; `inputs` itself is left untouched."
    reworded = copy.deepcopy(inputs)
    first_user = next((msg for msg in reworded if msg['role'] == 'user'), None)
    if first_user is not None:
        first_user['content'] = rephrase(first_user['content'])
        print(f"rephrased input as: {first_user['content']}")
    return reworded

In [None]:
#|hide
# The rephrased copy should differ from the original (assumes index 1 holds the first user message).
_tst_inp = _reword_input(_inputs)
assert _tst_inp[1]['content'] != _inputs[1]['content']

rephrased input as: Implement email marketing for 2430 Victory Park Lane, Dallas TX.


In [None]:
#|exports
def tsfm_nm_rephrase(rundata:RunData, 
                     name=None) -> RunData:
    """A callback for `collate` that substitutes names and rephrases the language model input.

    When `name` is None a random one is generated with `gen_name`. The returned
    `RunData` keeps the original `run_id`. Note that `rundata.output` and
    `rundata.funcs` are passed to helpers that edit them in place.
    """
    if name is None:
        name = gen_name()
    print(f'Substituting name: {name}')
    return RunData(
        inputs=_reword_input(rundata.inputs),               # copy with the first user message rephrased
        output=_sub_name_in_output(rundata.output, name),   # `[Your Name]` -> `name` in the model output
        funcs=_sub_name_in_func(rundata.funcs, name),       # 'Unit Test' -> `name` in the email function schema
        run_id=rundata.run_id,
    )

In the example below, the `[Your Name]` placeholder in the output is replaced with a generated name, and the first user message is rephrased:

In [None]:
_orig = collate(_run_id)
_tsfm = collate(_run_id, tsfm_nm_rephrase)
# Look at the final message's text: `in` on the message dict itself would only test its keys.
assert 'Your Name' not in _tsfm['messages'][-1]['content']  # verify that `[Your Name]` is not present
assert _orig['messages'][1]['content'] != _tsfm['messages'][1]['content']  # make sure the user message is different after rephrasing

Substituting name: Desmond Price
rephrased input as: Implement email marketing for 2430 Victory Park Lane, Dallas TX.


### Write to jsonl

OpenAI fine-tuning expects training data as JSONL files (one JSON object per line).

In [None]:
#|export
def write_to_jsonl(data_list:List[dict], filename:str):
    """
    Writes a list of dictionaries to a .jsonl file, one JSON object per line, in random order.

    The shuffle is done on a copy, so the caller's `data_list` keeps its order.
    The file is written as UTF-8 and any existing file is overwritten.

    Parameters:
    - data_list (list of dicts): The data to be written.
    - filename (str): The name of the output file.
    """
    rows = list(data_list)  # copy so the shuffle below does not reorder the caller's list
    shuffle(rows)
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{json.dumps(row)}\n" for row in rows)

In [None]:
#|eval:false
# _runs = ['59080971-8786-4849-be88-898d3ffc2b45', '8cd7deed-9547-4a07-ac01-55e9513ca1cd']
# _tsfm_runs = [collate(rid, tsfm_nm_rephrase) for rid in _runs];
# write_to_jsonl(_tsfm_runs, '_data/test_data.jsonl');

### Validate jsonl

In [None]:
#|export
def validate_jsonl(fname):
    """Check a fine-tuning .jsonl file for format errors and print a report.

    Code is modified from https://cookbook.openai.com/examples/chat_finetuning_data_prep, but updated for function calling.

    Each non-blank line of `fname` is parsed as JSON. Every example must be a
    dict with a non-empty `messages` list containing at least one `assistant`
    message. Every message needs a `role` plus `content` or `function_call`,
    may only use the keys `role`, `content`, `name` and `function_call`, and
    must have a recognized role. Error counts are printed per category;
    nothing is returned. Malformed JSON raises `json.JSONDecodeError`.
    """
    allowed_keys = ("role", "content", "name", "function_call")
    allowed_roles = ("system", "user", "assistant", "function")

    # Load the dataset; blank lines (e.g. a trailing newline) are not examples
    with open(fname, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f if line.strip()]

    # Initial dataset stats
    print("Num examples:", len(dataset))

    # Format error checks
    format_errors = defaultdict(int)

    for i, ex in enumerate(dataset):
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        for im, message in enumerate(messages):
            if "role" not in message or ("content" not in message and 'function_call' not in message):
                format_errors["message_missing_key"] += 1

            # Report exactly the keys that failed the check, using the same allow-list
            unknown_keys = [k for k in message if k not in allowed_keys]
            if unknown_keys:
                format_errors["message_unrecognized_key"] += 1
                print(f'message_unrecognized_key {unknown_keys} in row:{i} message {im}')

            if message.get("role", None) not in allowed_roles:
                format_errors["unrecognized_role"] += 1
                print(f'unrecognized_role {message.get("role", None)} in row:{i} message {im}')

            # Text content is optional only when the message is a function call
            content = message.get("content", None)
            if (not content or not isinstance(content, str)) and 'function_call' not in message:
                format_errors["missing_content"] += 1
                print(f'missing_content in row:{i} message {im}')

        if not any(message.get("role", None) == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1

    if format_errors:
        print("Found errors:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
    else:
        print("No errors found")

In [None]:
#|eval: false
# Validate the sample file; the report is printed below.
validate_jsonl('_data/test_data.jsonl')

Num examples: 2
No errors found


In [None]:
#| hide
# Run nbdev's export step for this notebook project.
import nbdev; nbdev.nbdev_export()