In [1]:
import os
import openai
from dotenv import load_dotenv

# Run python-dotenv's loader before the key is looked up in the environment.
load_dotenv()

# Look the key up via os.getenv instead of hard-coding it, keep it in
# `api_key`, and hand the same value to the openai module.
api_key = openai.api_key = os.getenv("OPENAI_API_KEY")

In [2]:
# Illustrative record only (evaluated just so the notebook displays it): a dict
# with a "messages" list whose items are role/content dicts for the system,
# user and assistant turns.
{
    "messages": [
        {"role": "system", "content": "You are an assistant that occassionally mispells"},
        {"role": "user", "content": "Tell me a story"},
        {"role": "assistant", "content":"One daya a student went to school"}
    ]
}

{'messages': [{'role': 'system',
   'content': 'You are an assistant that occassionally mispells'},
  {'role': 'user', 'content': 'Tell me a story'},
  {'role': 'assistant', 'content': 'One daya a student went to school'}]}

In [3]:
import json
import tiktoken
import numpy as np
from collections import defaultdict

In [4]:
# Relative path to the JSONL file of raw conversations that the next cell parses.
data_path = "samantha-data/data/howto_conversations.jsonl"

In [5]:
# Parse the JSONL file: one JSON object per line.
# - encoding is pinned to UTF-8 so decoding does not depend on the platform's
#   locale default (which is not UTF-8 on every system);
# - whitespace-only lines (e.g. a trailing blank line) are skipped instead of
#   crashing json.loads.
with open(data_path, encoding="utf-8") as f:
    json_dataset = [json.loads(line) for line in f if line.strip()]

In [6]:
# Display the first parsed record to inspect the raw schema.
json_dataset[0]

{'elapsed': 114.311,
 'conversation': 'Theodore: Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?\n\nSamantha: Of course, I\'d be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses.\n\nTheodore: I\'ll take a look. What if the coolant system is working fine?\n\nSamantha: Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that case, you could see if the water pump pulley is loose or listen for any unusual sounds that may indicate a failing water pump.\n\nTheodore: It sounds like you really know your way around cars. I didn\'t expect that from an AI.\n\nSamantha: Thank you! Wh

In [16]:
def prep_openai_format(conversation_str, system_message=None):
    """Convert one raw conversation record into a chat ``messages`` record.

    Args:
        conversation_str: Despite the name, a mapping whose ``'conversation'``
            value is the transcript text. Turns are separated by a blank line
            (``'\\n\\n'``) and each turn is expected to look like
            ``"Speaker: text"``.
        system_message: Optional system prompt. When truthy, it is emitted as
            the first message with role ``"system"``.

    Returns:
        dict: ``{"messages": [...]}`` where every item is a dict with a
        ``"role"`` and a ``"content"`` key. The speaker ``"Theodore"`` maps to
        role ``"user"``; any other speaker maps to ``"assistant"``. The list
        is empty when no turn could be parsed and no system message was given.

    Raises:
        KeyError: If the record has no ``'conversation'`` key.
    """
    # Keep the parameter untouched; work on a clearly named local instead.
    transcript = conversation_str['conversation']

    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})

    for turn in transcript.split('\n\n'):
        # Split on the FIRST colon only, so colons inside the text survive.
        speaker, separator, text = turn.partition(':')
        if not separator:
            # No "Speaker:" prefix on this chunk; it is skipped.
            continue

        role = "user" if speaker.strip() == "Theodore" else "assistant"
        messages.append({"role": role, "content": text.strip()})

    # Build the result after the loop so it always exists, even when every
    # chunk was skipped (previously that case raised UnboundLocalError).
    return {"messages": messages}
        
        

In [17]:
# System prompt passed to prep_openai_format for every converted conversation.
# NOTE(review): the text contains typos ("firendly", "does often flirt"). It is
# left untouched here because it is emitted verbatim into every record; confirm
# with the author before correcting it.
system_message = """You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are firendly and does often flirt"""

In [18]:
# Convert only the first raw record and display it, to check the output format.
prep_openai_format(json_dataset[0], system_message = system_message)

{'messages': [{'role': 'system',
   'content': 'You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are firendly and does often flirt'},
  {'role': 'user',
   'content': 'Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?'},
  {'role': 'assistant',
   'content': "Of course, I'd be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses."},
  {'role': 'user',
   'content': "I'll take a look. What if the coolant system is working fine?"},
  {'role': 'assistant',
   'content': 'Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that case, you 

In [20]:
# Convert every raw record, each one sharing the same system prompt.
dataset = [
    prep_openai_format(raw_record, system_message=system_message)
    for raw_record in json_dataset
]

In [21]:
# Show only the first couple of converted records; echoing the whole list
# would flood the notebook output with the entire corpus.
dataset[:2]

[{'messages': [{'role': 'system',
    'content': 'You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are firendly and does often flirt'},
   {'role': 'user',
    'content': 'Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?'},
   {'role': 'assistant',
    'content': "Of course, I'd be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses."},
   {'role': 'user',
    'content': "I'll take a look. What if the coolant system is working fine?"},
   {'role': 'assistant',
    'content': 'Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that 

In [None]:
# Tally of format problems found in `dataset`, keyed by error name.
format_errors = defaultdict(int)

for example in dataset:
    # Every example must be a dict ...
    if not isinstance(example, dict):
        format_errors["data_type"] += 1
        continue

    # ... carrying a non-empty "messages" list.
    turns = example.get("messages")
    if not turns:
        format_errors["missing_messages_list"] += 1
        continue

    for turn in turns:
        # Both mandatory keys have to be present.
        if not ("role" in turn and "content" in turn):
            format_errors["message_missing_key"] += 1

        # Only these three keys are accepted on a message.
        if not all(key in ("role", "content", "name") for key in turn):
            format_errors["message_unrecognized_key"] += 1

        if turn.get("role") not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        # Content has to be a non-empty string.
        text = turn.get("content")
        if not (text and isinstance(text, str)):
            format_errors["missing_content"] += 1

    # An example with no assistant turn is counted as an error.
    if all(turn.get("role") != "assistant" for turn in turns):
        format_errors["example_missing_assistant_message"] += 1

# Report: one line per error type, in the order each was first seen.
if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_name, count in format_errors.items():
        print(f"{error_name}: {count}")