# transform

> common transformations for LLM data

In [None]:
#| default_exp transform

In [None]:
#|export
import os, copy, json
import openai, langsmith
from typing import List, Callable
from random import shuffle
from collections import defaultdict

from langfree.runs import _temp_env_var, Client, _ischatopenai
from pydantic import BaseModel
from langchain.adapters import openai as adapt
from langchain.load import load
from fastcore.foundation import L
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff


In [None]:
from nbdev.showdoc import show_doc

In [None]:
#|export
# Module-level OpenAI client; `chat`, `rephrase` and `_gen_name` below all set its `api_key` from the environment before use.
client = openai.OpenAI()

In [None]:
#|exports
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def chat(**kwargs):
    "Forward `kwargs` to `client.chat.completions.create`, retrying with random exponential backoff (up to 6 attempts)."
    # Runs inside the retried function, so the key is re-read from the environment on every attempt.
    client.api_key = os.environ['OPENAI_API_KEY']
    completions = client.chat.completions
    return completions.create(**kwargs)

In [None]:
#|export
def rephrase(sentence, max_tokens=100, temperature=0.95, model="gpt-4"):
    """
    Rephrase a sentence. Useful for data augmentation for finetuning.

    Parameters:
    - sentence (str): The text to rephrase.
    - max_tokens (int): Forwarded to the chat completion request as `max_tokens`.
    - temperature (float): Forwarded to the chat completion request as `temperature`.
    - model (str): Name of the chat model to use.

    Returns the content of the first choice with surrounding whitespace stripped.
    """
    # `chat` sets the key as well; reading it here raises `KeyError` for an unset
    # variable before the retry-wrapped call is entered.
    client.api_key = os.environ['OPENAI_API_KEY']
    response = chat(
        temperature=temperature,
        max_tokens=max_tokens,
        model=model,
        messages=[
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": f"Rephrase the following sentence in one short sentence: {sentence}"}
        ]
    )
    return response.choices[0].message.content.strip()

In [None]:
#|eval: false
_phrase = 'Write an email to example@gmail.com asking if we can move the appointment to friday'
print(rephrase(_phrase))

Compose an email to example@gmail.com inquiring about rescheduling the appointment to Friday.


In [None]:
#|export
def _gen_name():
    "Ask `gpt-3.5-turbo` (through `chat`) for one invented full name; the reply is stripped and all periods are removed."
    client.api_key = os.environ['OPENAI_API_KEY']
    prompt = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Imagine a full name for a person. Only return a first and last name."},
    ]
    # NOTE(review): the high temperature is presumably for variety and the 4-token cap to keep the reply short — confirm.
    reply = chat(model="gpt-3.5-turbo", messages=prompt, temperature=1.9, max_tokens=4)
    raw_name = reply.choices[0].message.content
    return raw_name.strip().replace('.', '')

def gen_name(max_len=18):
    "Generate a random name of at most `max_len` characters, useful for data augmentation and privacy."
    # Keep asking for a new name until one fits within the length limit.
    while True:
        nm = _gen_name()
        if len(nm) <= max_len:
            return nm

In [None]:
#|eval:false
for i in range(3):
    print(gen_name())

Alexis Chang
Ann Douglais
Camilla Castillo


In [None]:
#|hide
tmp_env = {'LANGCHAIN_API_KEY': os.environ['LANGCHAIN_API_KEY_PUB'], 'LANGSMITH_PROJECT_ID': os.environ['LANGCHAIN_PROJECT_ID_PUB']}

In [None]:
#|hide
_tst_run_id = '59080971-8786-4849-be88-898d3ffc2b45'
# Use a dedicated name here: rebinding `client` would replace the OpenAI client
# that `chat` relies on for the rest of the notebook session.
_ls_client = langsmith.Client()
run = _ls_client.read_run(_tst_run_id)
msg = run.outputs['generations'][0]['message']
assert load(msg)

In [None]:
#|export
def fetch_run_componets(run_id:str):
    """
    Return the `inputs`, `output` and `funcs` for a run of type `ChatOpenAI`.

    Parameters:
    - run_id (str): The id of the LangSmith run to read.

    Returns a tuple `(inputs, output, funcs)`: the run's input messages and its first
    generation converted to message dicts, plus the `functions` entry of the run's
    invocation params (an empty list when absent).
    """
    client = langsmith.Client()
    run = client.read_run(run_id)
    # NOTE(review): the return value is unused — presumably this raises for runs that are not `ChatOpenAI`; confirm in `langfree.runs`.
    _ischatopenai(run)
    output = adapt.convert_message_to_dict(load(run.outputs['generations'][0]['message']))
    inputs = [adapt.convert_message_to_dict(load(m)) for m in run.inputs['messages']]
    params = run.extra['invocation_params']

    for inp in inputs:
        # Function-call messages with no text content should not carry a `content` key.
        # `pop` with a default avoids a KeyError when the key is already absent.
        if 'function_call' in inp and inp.get('content') is None:
            inp.pop('content', None)
    funcs = params.get("functions", [])
    return inputs, output, funcs

In [None]:
_tst_run_id = '1863d76e-1462-489a-a8a7-e0404239fe47'

with _temp_env_var(tmp_env):  #context manager that has specific environment vars for testing                    
    _inp, _out, _funcs = fetch_run_componets(_tst_run_id)

print(f"""first input:
{_inp[0]} 

output:
{_out} 

functions:
{_funcs}""")

first input:
{'role': 'system', 'content': "You are a helpful documentation Q&A assistant, trained to answer questions from LangSmith's documentation. LangChain is a framework for building applications using large language models.\nThe current time is 2023-09-05 16:49:07.308007.\n\nRelevant documents will be retrieved in the following messages."} 

output:
{'role': 'assistant', 'content': "Currently, LangSmith does not support project migration between organizations. However, you can manually imitate this process by reading and writing runs and datasets using the SDK. Here's an example of exporting runs:\n\n1. Read the runs from the source organization using the SDK.\n2. Write the runs to the destination organization using the SDK.\n\nBy following this process, you can transfer your runs from one organization to another. However, it may be faster to create a new project within your destination organization and start fresh.\n\nIf you have any further questions or need assistance, please

In [None]:
#|hide
_run_id = '59080971-8786-4849-be88-898d3ffc2b45'
_inputs, _output, _funcs = fetch_run_componets(_run_id)

In [None]:
#|exports
class RunData(BaseModel):
    "Key components of a run from LangSmith"
    inputs:List[dict]  # input messages of the run, as message dicts
    output:dict        # the run's output message, as a message dict
    funcs:List[dict]   # function schemas from the run's invocation params (may be empty)
    run_id:str         # id of the run this data was read from

    @classmethod
    def from_run_id(cls, run_id:str):
        "Create a `RunData` object from a run id."
        inputs, output, funcs = fetch_run_componets(run_id)
        return cls(inputs=inputs, output=output, funcs=funcs, run_id=run_id)

    def to_msg_dict(self):
        "Transform the instance into a dict in the format that can be used for OpenAI fine-tuning."
        # The output message is appended after the inputs as the final message.
        msgs = self.inputs + [self.output]
        return {"functions": self.funcs,
                "messages": msgs}

    def to_json(self):
        "The json version of `to_msg_dict`."
        return json.dumps(self.to_msg_dict())

    @property
    def flat_input(self):
        "The input to the LLM in markdown."
        return self._flatten_data(self.inputs)

    @property
    def flat_output(self):
        "The output of the LLM in markdown."
        return self._flatten_data([self.output])

    @classmethod
    def _flatten_data(cls, data):
        "Produce a flattened view of the data (a list of message dicts) as human readable Markdown."
        parts = []
        for item in data:
            # Heading: the role, annotated for function calls and function results.
            role = item['role']
            if role == 'assistant' and 'function_call' in item:
                role += ' - function call'
            if role == 'function':
                role += ' - results'
            parts.append(f"### {role.title()}\n\n")

            content = item.get('content', '')
            if content:
                parts.append(content + "\n")
            elif 'function_call' in item:
                call = item['function_call']
                func_name = call['name']
                raw_args = call['arguments']
                # The arguments may be malformed JSON or not a JSON object;
                # in that case show them verbatim instead of failing the whole view.
                try:
                    args = json.loads(raw_args)
                except (TypeError, ValueError):
                    args = None
                if isinstance(args, dict):
                    formatted_args = ', '.join(f"{k}={v}" for k, v in args.items())
                else:
                    formatted_args = raw_args
                parts.append(f"{func_name}({formatted_args})\n")
            parts.append("\n")
        return "".join(parts)

In [None]:
show_doc(RunData.from_run_id, title_level=4)

---

[source](https://github.com/parlance-labs/langfree/blob/main/langfree/transform.py#L98){target="_blank" style="float:right; font-size:smaller"}

#### RunData.from_run_id

>      RunData.from_run_id (run_id:str)

Create a `RunData` object from a run id.

In [None]:
with _temp_env_var(tmp_env): #context manager that has specific environment vars for testing
    rd = RunData.from_run_id(_tst_run_id)

print(f'Run {rd.run_id} has {len(rd.inputs)} inputs.')
print(f'Run {rd.run_id} output:\n{rd.output}')

Run 1863d76e-1462-489a-a8a7-e0404239fe47 has 3 inputs.
Run 1863d76e-1462-489a-a8a7-e0404239fe47 output:
{'role': 'assistant', 'content': "Currently, LangSmith does not support project migration between organizations. However, you can manually imitate this process by reading and writing runs and datasets using the SDK. Here's an example of exporting runs:\n\n1. Read the runs from the source organization using the SDK.\n2. Write the runs to the destination organization using the SDK.\n\nBy following this process, you can transfer your runs from one organization to another. However, it may be faster to create a new project within your destination organization and start fresh.\n\nIf you have any further questions or need assistance, please reach out to us at support@langchain.dev."}


In [None]:
show_doc(RunData.to_msg_dict, title_level=4)

---

[source](https://github.com/parlance-labs/langfree/blob/main/langfree/transform.py#L103){target="_blank" style="float:right; font-size:smaller"}

#### RunData.to_msg_dict

>      RunData.to_msg_dict ()

Transform the instance into a dict in the format that can be used for OpenAI fine-tuning.

In [None]:
rd.to_msg_dict()['messages'][-2:]

[{'role': 'user',
  'content': 'How do I move my project between organizations?'},
 {'role': 'assistant',
  'content': "Currently, LangSmith does not support project migration between organizations. However, you can manually imitate this process by reading and writing runs and datasets using the SDK. Here's an example of exporting runs:\n\n1. Read the runs from the source organization using the SDK.\n2. Write the runs to the destination organization using the SDK.\n\nBy following this process, you can transfer your runs from one organization to another. However, it may be faster to create a new project within your destination organization and start fresh.\n\nIf you have any further questions or need assistance, please reach out to us at support@langchain.dev."}]

In [None]:
show_doc(RunData.to_json, title_level=4)

---

[source](https://github.com/parlance-labs/langfree/blob/main/langfree/transform.py#L109){target="_blank" style="float:right; font-size:smaller"}

#### RunData.to_json

>      RunData.to_json ()

The json version of `to_msg_dict`.

In [None]:
rd.to_json()[:100]

'{"functions": [], "messages": [{"role": "system", "content": "You are a helpful documentation Q&A as'

The properties `flat_input` and `flat_output` allow you to view the input to the LLM and the output in a human readable format (markdown):

In [None]:
show_doc(RunData.flat_input, title_level=4)

---

[source](https://github.com/parlance-labs/langfree/blob/main/langfree/transform.py#L114){target="_blank" style="float:right; font-size:smaller"}

#### RunData.flat_input

>      RunData.flat_input ()

The input to the LLM in markdown.

In [None]:
print(rd.flat_input[:400])

### System

You are a helpful documentation Q&A assistant, trained to answer questions from LangSmith's documentation. LangChain is a framework for building applications using large language models.
The current time is 2023-09-05 16:49:07.308007.

Relevant documents will be retrieved in the following messages.

### System



Skip to main content

 **🦜️🛠️ LangSmith Docs**Python DocsJS/TS Docs

Sear


In [None]:
show_doc(RunData.flat_output, title_level=4)

---

[source](https://github.com/parlance-labs/langfree/blob/main/langfree/transform.py#L119){target="_blank" style="float:right; font-size:smaller"}

#### RunData.flat_output

>      RunData.flat_output ()

The output of the LLM in markdown.

In [None]:
print(rd.flat_output)

### Assistant

Currently, LangSmith does not support project migration between organizations. However, you can manually imitate this process by reading and writing runs and datasets using the SDK. Here's an example of exporting runs:

1. Read the runs from the source organization using the SDK.
2. Write the runs to the destination organization using the SDK.

By following this process, you can transfer your runs from one organization to another. However, it may be faster to create a new project within your destination organization and start fresh.

If you have any further questions or need assistance, please reach out to us at support@langchain.dev.




## Transformations

`tsfm_nm_rephrase` does the following transformations to runs:

- Substitutes a random name in various parts of the chat conversation in a consistent way (data augmentation)
- Rephrases the human input (data augmentation)

In [None]:
#| export
def _sub_name_in_func(funcs, name):
    "Substitute 'Unit Test' for `name` in the `email-campaign-creator` function"
    emailfunc = L(funcs).filter(lambda x: x['name'] == 'email-campaign-creator')
    if emailfunc:
        func = emailfunc[0]
        desc = func['parameters']['properties']['body']['description']
        new_desc = desc.replace('Unit Test', name)
        func['parameters']['properties']['body']['description'] = new_desc
    return funcs

In [None]:
#|hide
_testfuncs = _sub_name_in_func(_funcs, name='Hamel Husain')
assert 'Hamel' in L(_testfuncs).filter(lambda x: x['name'] == 'email-campaign-creator')[0]['parameters']['properties']['body']['description']

In [None]:
#| export
def _sub_name_in_output(output, name):
    "Subtitute `[Your Name]` with `name` in the output."
    output['content'] = output['content'].replace('[Your Name]', name)
    return output

In [None]:
#|hide
_out = _sub_name_in_output(output=_output, name='Hamel')
assert 'Hamel' in _out['content']

In [None]:
#| export
def reword_input(inputs):
    "Return a deep copy of `inputs` in which the first `user` message has been rephrased."
    reworded = copy.deepcopy(inputs)
    # Lazily find the first user message; later messages are not inspected.
    first_user = next((msg for msg in reworded if msg['role'] == 'user'), None)
    if first_user is not None:
        first_user['content'] = rephrase(first_user['content'])
        print(f"rephrased input as: {first_user['content']}")
    return reworded

In [None]:
#|hide
#|eval:false
_tst_inp = reword_input(_inputs)
assert _tst_inp[1]['content'] != _inputs[1]['content']

In [None]:
#|exports
def tsfm_nm_rephrase(rundata:RunData, name=None) -> RunData:
    "Substitutes names in functions & outputs and rephrases the language model input. Returns a new `RunData`; `rundata` is left unmodified."
    if name is None: name=gen_name()                    # generates a random name to be used to substitute a boilerplate name
    print(f'Substituting name: {name}')
    inputs = reword_input(rundata.inputs)               # rephrases the input to the language model (works on a deep copy)
    # The substitution helpers edit their argument in place, so hand them copies.
    # Otherwise the placeholders in `rundata` would be consumed by the first call and
    # a second augmentation of the same run could not substitute a different name.
    output = _sub_name_in_output(copy.deepcopy(rundata.output), name)  # substitutes the template `[Your Name]` with `name` in the output of the language model
    funcs = _sub_name_in_func(copy.deepcopy(rundata.funcs), name)      # substitutes the placeholder `Unit Test` with `name` in a function schema description
    return RunData(inputs=inputs, output=output, funcs=funcs, run_id=rundata.run_id)

In the example below, `[Your Name]` is substituted with an actual name:

In [None]:
# _rundata = RunData.from_run_id(_run_id)
# _orig = _rundata.to_msg_dict()
# _tsfm = tsfm_nm_rephrase(_rundata).to_msg_dict()

# assert 'Your Name' not in _tsfm['messages'][-1]  # verify that `[Your Name]` is not present
# assert _orig['messages'][1]['content'] != _tsfm['messages'][1]['content']  # make sure the message is different after substitution

In [None]:
show_doc(tsfm_nm_rephrase);

## Preparing `.jsonl` files

[OpenAI fine-tuning](https://platform.openai.com/docs/guides/fine-tuning) takes `.jsonl` files.

In [None]:
#|export
def write_to_jsonl(data_list:List[RunData], filename:str):
    """
    Writes a list of `RunData` to a .jsonl file, one JSON document per line, in random order.
    
    Parameters:
    - data_list (list of `RunData`): The data to be written. The list itself is not modified.
    - filename (str): The name of the output file.
    """
    # Shuffle a copy so the caller's list keeps its order.
    entries = list(data_list)
    shuffle(entries)
    # utf-8 matches the encoding `validate_jsonl` reads the file with.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{entry.to_json()}\n" for entry in entries)

In [None]:
#|eval:false
_rids = ['59080971-8786-4849-be88-898d3ffc2b45', '8cd7deed-9547-4a07-ac01-55e9513ca1cd']
_tsfm_runs = [tsfm_nm_rephrase(RunData.from_run_id(rid)) for rid in _rids]
write_to_jsonl(_tsfm_runs, '_data/test_data.jsonl');

Substituting name: Jake Wilson
rephrased input as: Implement email marketing for 2430 Victory Park Lane, Dallas TX.
Substituting name: Kimberly Ramirez
rephrased input as: Arrange a team-building event for the upcoming month.


It can save you time to validate jsonl files prior to uploading them.

In [None]:
#|export
def validate_jsonl(fname):
    """
    Check a `.jsonl` fine-tuning file for format errors and print a summary.

    Code is modified from https://cookbook.openai.com/examples/chat_finetuning_data_prep, but updated for function calling.

    Parameters:
    - fname (str): Path of the `.jsonl` file to validate (read as utf-8).

    Prints the number of examples, a line for each unrecognized key, unrecognized role or
    missing content, and finally the error counts (or "No errors found"). Returns nothing.
    """
    allowed_keys = ("role", "content", "name", "function_call")
    allowed_roles = ("system", "user", "assistant", "function")

    # Load the dataset; blank lines (e.g. a trailing newline) are skipped rather than parsed.
    with open(fname, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f if line.strip()]

    # Initial dataset stats
    print("Num examples:", len(dataset))
        
    # Format error checks
    format_errors = defaultdict(int)

    for i, ex in enumerate(dataset):
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        for im, message in enumerate(messages):
            # A message needs a role plus either text content or a function call.
            if "role" not in message or ("content" not in message and 'function_call' not in message):
                format_errors["message_missing_key"] += 1

            # Report exactly the keys that failed the check (same allow-list for both).
            unknown_keys = [k for k in message if k not in allowed_keys]
            if unknown_keys:
                format_errors["message_unrecognized_key"] += 1
                print(f'message_unrecognized_key {unknown_keys} in row:{i} message {im}')

            if message.get("role", None) not in allowed_roles:
                format_errors["unrecognized_role"] += 1
                print(f'unrecognized_role {message.get("role", None)} in row:{i} message {im}')

            # Empty or non-string content is only acceptable on function-call messages.
            content = message.get("content", None)
            if (not content or not isinstance(content, str)) and 'function_call' not in message:
                format_errors["missing_content"] += 1
                print(f'missing_content in row:{i} message {im}')

        if not any(message.get("role", None) == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1

    if format_errors:
        print("Found errors:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
    else:
        print("No errors found")

In [None]:
#|eval: false
validate_jsonl('_data/test_data.jsonl')

Num examples: 2
No errors found


In [None]:
#|hide
import nbdev; nbdev.nbdev_export()