In [8]:
from datasets import load_dataset
import pandas as pd
import json
from tqdm import tqdm
tqdm.pandas()
import os

In [9]:
conversations_dir = "conversations"

# Load every JSON file in the conversations directory.
# - Filenames are sorted because os.listdir() returns entries in arbitrary order;
#   sorting keeps the order of `conversations` stable across runs and machines.
# - Each file is opened in a `with` block so its handle is closed right after parsing.
conversations = []
for filename in sorted(os.listdir(conversations_dir)):
    if not filename.endswith('.json'):
        continue
    with open(os.path.join(conversations_dir, filename)) as conversation_file:
        conversations.append(json.load(conversation_file))


In [11]:
# Keep only the user turns of each conversation, in their original order.
# The first message of every conversation is dropped before filtering by role.
user_conversations_data = [
    [turn['content'] for turn in messages[1:] if turn['role'] == 'user']
    for messages in conversations
]



In [16]:
# Inspect one extracted conversation: a list of user message strings.
user_conversations_data[2]

["Here are some top news headlines as of today:\n\n    Phil Donahue, influential TV talk show host, dead at 88 - ABC News\nBlinken says Israel accepts Gaza proposal, urges Hamas to do same - Reuters\nTrump insults Harris' intelligence, discusses election outcome, release of medical records - CBS News\n\n    Please answer the question below: \n    What's new news?\n    ",
 'Who are you?',
 "We'll see you on the other side.",
 "Tell me what you're doing.",
 'Using host libthread_db library "/lib/aarch64-linux-gnu/libthread_db.so.1". 0x00007ffee60e8764 in __GI___wait4',
 "Current weather data:\n\n        Location: New York, United States\n        T: 16.13 C\n        Humidity: 76%\n        Rain Prob: 0%\n        Cloud Cover: 4%\n\n        Question:\n        What's the weather outside?\n\n        Answer:\n        "]

This is an example conversation, with the correct format. 

From here, we can follow the same process as generate-chat-dataset.ipynb to create a dataset of preference comparisons. In this case, we'll create a dataset with the same user queries, the more advanced system prompt, and using a larger model.

Let's take a look at what that "accepted" conversation looks like. 

In [17]:
import json 
import os
from dotenv import load_dotenv
load_dotenv()

from together import Together
from groq import Groq

from tqdm import tqdm
from time import sleep
from concurrent.futures import ThreadPoolExecutor, as_completed


# Provider switch: True -> Together, False -> Groq.
# Each branch picks that provider's model identifier and builds its client,
# reading the API key from the environment (populated by load_dotenv()).
use_together = True

if not use_together:
    model = "llama-3.1-8b-instant"
    client = Groq(api_key=os.getenv('GROQ_API_KEY'))
else:
    model = 'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo'
    client = Together(api_key=os.getenv('TOGETHER_API_KEY'))

In [18]:
# Base name of the system prompt; also used as the prefix of the output filenames.
system_prompt_f = 'picard-system'
system_prompt_path = f'prompts/{system_prompt_f}.md'

# Read the whole markdown prompt into a single string.
with open(system_prompt_path) as prompt_file:
    system_prompt = prompt_file.read()


In [19]:
# Path of the JSON file that stores the generated conversations.
# '/' in the model id is replaced with '-' so the name stays a single path component.
f = f"{system_prompt_f}-preference-{model.replace('/', '-')}.json"

In [22]:
assistant_dataset = []

# Resume from a previous run if the output file already exists.
if os.path.exists(f):
    with open(f, 'r') as file:
        assistant_dataset = json.load(file)
    print(f'Loaded {len(assistant_dataset)} conversations from file')


def create_conversation(prompts):
    """Generate one full chat for a list of user prompts.

    Starts from the system prompt, then for every prompt appends the user
    message, requests a completion for the conversation so far, and appends
    the assistant's reply.

    Args:
        prompts: User message strings, in conversation order.

    Returns:
        The list of message dicts (system first, then alternating
        user/assistant), or None if any request raised. A partially built
        conversation is discarded in that case.
    """
    conversation = [{'role': 'system', 'content': system_prompt}]
    try:
        for prompt in prompts:
            # Pause before every request (presumably to stay under the provider's rate limit).
            sleep(1)
            conversation.append({'role': 'user', 'content': prompt})
            response = client.chat.completions.create(
                model=model,
                messages=conversation,
            )
            conversation.append({'role': 'assistant', 'content': response.choices[0].message.content})
        return conversation
    except Exception as e:
        # Deliberate best-effort: report the failure and skip this conversation
        # so one bad request does not abort the whole batch.
        print(f'Skipping conversation after error: {e}')
        return None


def _user_contents(conversation):
    """Return the contents of a conversation's user messages, in order."""
    return [message['content'] for message in conversation if message['role'] == 'user']


# Work out which prompt lists still need a conversation.
# Results are appended in completion order and failed conversations are skipped,
# so the saved dataset is NOT an ordered prefix of user_conversations_data.
# Slicing by len(assistant_dataset) would skip the wrong prompts on resume;
# match saved conversations to prompt lists by their user messages instead.
completed_prompts = [_user_contents(conversation) for conversation in assistant_dataset]
pending_prompts = []
for prompts in user_conversations_data:
    if prompts in completed_prompts:
        # Consume one match so repeated identical prompt lists are each generated once.
        completed_prompts.remove(prompts)
    else:
        pending_prompts.append(prompts)

with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(create_conversation, prompts) for prompts in pending_prompts]
    for future in tqdm(as_completed(futures), total=len(futures)):
        result = future.result()
        if result:
            assistant_dataset.append(result)

            # Checkpoint after every new conversation so an interrupted run can resume.
            with open(f, 'w') as file:
                json.dump(assistant_dataset, file, indent=4)

100%|██████████| 13/13 [00:46<00:00,  3.55s/it]


In [24]:
# We now have the preferred outputs and the original dataset. All that is left is to "align" the two ;)

In [26]:
# First, give the preferred conversations the original (short) system prompt,
# replacing the longer prompt they were generated with.
for preferred_conversation in assistant_dataset:
    system_message = preferred_conversation[0]
    system_message['content'] = "You are Pi-Card, the Raspberry Pi voice assistant."

In [34]:
# Next, reload the original conversations; these become the "rejected" side of each pair.
# Filenames are sorted for a deterministic order (os.listdir() order is arbitrary),
# and each file is opened in a `with` block so its handle is closed after parsing.
original_conversations = []
for filename in sorted(os.listdir(conversations_dir)):
    if not filename.endswith('.json'):
        continue
    with open(os.path.join(conversations_dir, filename)) as conversation_file:
        original_conversations.append(json.load(conversation_file))

# Normalise the first message of every original conversation to the short system prompt.
for conversation in original_conversations:
    conversation[0]['content'] = "You are Pi-Card, the Raspberry Pi voice assistant."
    conversation[0]['role'] = 'system'

In [39]:
aligned_dataset = []

# Pair every preferred conversation with exactly one original ("rejected") conversation.
# Each original is used at most once and the search stops at the first match.
# Without this, repeated prompt lists would be cross-paired (N x N entries)
# and one rejected conversation could be reused for several preferred ones.
used_original_indices = set()

for conversation in assistant_dataset:
    for original_index, original_conversation in enumerate(original_conversations):
        if original_index in used_original_indices:
            continue
        # Same length is required so that messages can be compared position by position.
        if len(conversation) != len(original_conversation):
            continue
        # If all user messages are the same, then we have a match.
        user_messages_match = all(
            message['content'] == original_conversation[i]['content']
            for i, message in enumerate(conversation)
            if message['role'] == 'user'
        )
        if user_messages_match:
            aligned_dataset.append({'preferred': conversation, 'rejected': original_conversation})
            used_original_indices.add(original_index)
            break


In [43]:
# Inspect the preferred side of one aligned pair.
aligned_dataset[2]['preferred']

[{'role': 'system',
  'content': 'You are Pi-Card, the Raspberry Pi voice assistant.'},
 {'role': 'user', 'content': 'Can you send a message?'},
 {'role': 'assistant',
  'content': "I can send messages, but I'm a voice assistant, so I don't have a traditional messaging platform. However, I can use the Raspberry Pi's speaker to play a message or send a text message to a phone number if you provide it."},
 {'role': 'user', 'content': 'What is where it all?'},
 {'role': 'assistant',
  'content': 'I think you meant to ask "Where is it all?" Unfortunately, I don\'t have any information about a specific "it" you\'re referring to. Could you please provide more context or clarify what you\'re asking about?'},
 {'role': 'user', 'content': '- Call me a joke.'},
 {'role': 'assistant',
  'content': "You want me to call you a joke? I'm not sure what that means. Are you asking me to tell you a joke?"},
 {'role': 'user',
  'content': "Current weather data:\n\n        Location: New York, United States

In [45]:
# Inspect the rejected side of the same aligned pair, for comparison.
aligned_dataset[2]['rejected']

[{'role': 'system',
  'content': 'You are Pi-Card, the Raspberry Pi voice assistant.'},
 {'role': 'user', 'content': 'Can you send a message?'},
 {'role': 'assistant',
  'content': '"I\'m doing great. How\'s your day going?"'},
 {'role': 'user', 'content': 'What is where it all?'},
 {'role': 'assistant',
  'content': 'I\'m Pi-Card, a voice assistant hosted on a Raspberry Pi 5. I\'m here to help with questions and tasks, and I\'m run by a combination of machine learning and Python code. The "where it all" part refers to the fact that I\'ve been trained on a wide range of topics and can provide information on various subjects.'},
 {'role': 'user', 'content': '- Call me a joke.'},
 {'role': 'assistant',
  'content': 'That\'s an insulting title! However, I\'d like to propose a more considerate one: "Hey, where\'s my knowledge cutoff? I\'m a fact-checking voice assistant hosted on a Raspberry Pi 5."'},
 {'role': 'user',
  'content': "Current weather data:\n\n        Location: New York, Unit

In [47]:
# Persist the preferred/rejected pairs as pretty-printed JSON.
# Note: this rebinds `f` from the generation output path to the aligned-dataset path.
f = f"{system_prompt_f}-aligned-{model.replace('/', '-')}.json"
with open(f, 'w') as file:
    file.write(json.dumps(aligned_dataset, indent=4))