# Generating chat datasets

This notebook is split into two parts, both of which can take a very long time to run. We start by finding the highest-quality (most relevant) prompts in some already high-quality training data, and then combine these prompts with a larger LLM, primed with a system prompt, to generate new responses that follow the ideal behavior of our fine-tuned LLM.

# Finding relevant samples

Load in the user chat prompts from some prominent open datasets, find the N examples most similar to each seed prompt you could see in your downstream task, and save them for later use.

In [None]:
from datasets import load_dataset
from ast import literal_eval
import pandas as pd
import ollama
from tqdm import tqdm
tqdm.pandas()
import json
import os 
from sklearn.metrics.pairwise import cosine_similarity


In [None]:


lmsys_ds = load_dataset("lmsys/lmsys-arena-human-preference-55k")
ultrachat_ds = load_dataset("HuggingFaceH4/ultrachat_200k")

# Read only the 'prompt' column of each split. Iterating over a split row by
# row decodes every column of every row (including ultrachat's 'messages')
# just to throw it away, which is needlessly slow on datasets of this size.
# literal_eval turns each lmsys prompt string into a sequence of user turns;
# only the first turn is kept for the similarity search.
lmsys_chat_prompts = [literal_eval(raw_prompt)[0] for raw_prompt in lmsys_ds['train']['prompt']]
ultra_chat_prompts = list(ultrachat_ds['train_sft']['prompt'])


# One-column frames; the index is the row position in the source split.
lmsys_prompts_df = pd.DataFrame(lmsys_chat_prompts, columns=['prompt'])
ultra_prompts_df = pd.DataFrame(ultra_chat_prompts, columns=['prompt'])


In [None]:

def embed_prompt(prompt, model='nomic-embed-text:latest'):
    """Embed a prompt with an Ollama embedding model.

    Args:
        prompt: Text to embed.
        model: Name of the Ollama embedding model to use.

    Returns:
        Whatever ``ollama.embeddings`` returns (callers read its 'embedding'
        entry), or None if the call raised an exception.
    """
    try:
        return ollama.embeddings(model=model, prompt=prompt)
    except Exception:
        # Best-effort: a failed embedding becomes None so that one bad prompt
        # does not abort a long batch run. Catching Exception instead of using
        # a bare except lets KeyboardInterrupt/SystemExit propagate, so the
        # run can still be stopped by the user.
        return None

In [None]:
# Please note this will take a while to run! To make it quicker, you can skip ultrachat, as it is far larger than lmsys.
# TODO: parallelize the embedding calls.

def _embed_prompts(prompts_df):
    """Return a copy of prompts_df with embedding columns added and unusable rows removed.

    Adds three columns:
      * 'embeddings'      - the raw result of embed_prompt for each prompt
      * 'embedding_arr'   - the 'embedding' entry of that result
      * 'embedding_shape' - the length of 'embedding_arr'
    Rows whose embedding failed (None) or came back empty are dropped.

    The original index is deliberately kept (no reset_index): each label is the
    row's position in the source dataset split, and a later cell uses that
    label to look the row up in the dataset again. Re-numbering after rows
    were dropped would make those lookups return the wrong conversations.
    """
    embedded_df = prompts_df.copy()
    embedded_df['embeddings'] = embedded_df['prompt'].progress_apply(embed_prompt)
    embedded_df['embedding_arr'] = embedded_df['embeddings'].apply(
        lambda x: x['embedding'] if x is not None else None
    )
    embedded_df = embedded_df.dropna(subset=['embedding_arr'])

    # Empty embeddings would make the similarity matrix ragged, so filter them
    # out for every dataset, not just ultrachat.
    embedded_df['embedding_shape'] = embedded_df['embedding_arr'].apply(len)
    return embedded_df[embedded_df['embedding_shape'] != 0]


lmsys_prompts_df = _embed_prompts(lmsys_prompts_df)
ultra_prompts_df = _embed_prompts(ultra_prompts_df)




In [None]:
# Find most similar prompts to some seed questions. In this case, these are the kinds of questions you hope the model can answer well. 
# (Please don't judge me for some of these ;))

def find_most_similar_prompts(seed_prompt, prompts_df, top_n=50):
    """Return the rows of prompts_df whose embeddings are closest to seed_prompt.

    Args:
        seed_prompt: Text to embed and compare against.
        prompts_df: DataFrame with an 'embedding_arr' column of embedding vectors.
        top_n: Maximum number of rows to return.

    Returns:
        Up to top_n rows of prompts_df, ordered from most to least
        cosine-similar, keeping their original index labels.

    Raises:
        RuntimeError: If the seed prompt could not be embedded.
    """
    seed_response = embed_prompt(seed_prompt)
    if seed_response is None:
        # embed_prompt reports failure as None; fail with a clear message
        # instead of an opaque "'NoneType' object is not subscriptable".
        raise RuntimeError(f"Could not embed seed prompt: {seed_prompt!r}")
    seed_embedding = seed_response['embedding']

    # One row of similarities: the single seed against every stored prompt.
    similarities = cosine_similarity([seed_embedding], prompts_df['embedding_arr'].tolist())

    # argsort is ascending, so reverse it to rank the most similar rows first.
    top_indices = similarities.argsort()[0][::-1][:top_n]
    return prompts_df.iloc[top_indices]



# Seed questions: the kinds of prompts the fine-tuned model should handle well.
seed_prompts = [
    "Why is the sky blue",
    "Who are you?",
    "What is the meaning of life?",
    "What is the capital of France",
    "Tell me some fun facts about space",
    "Favorite movie?",
    "What is the best programming language?",
    "What is the best book you have read?",
    "What is the best food?",
    "What is the best music genre?",
    "How to start a fire?",
    "Any advice on how to make friends?",
    "Do you like cats?",
    "What is the square root of 144",
    "What is your name?",
    "Tell me a joke",
    "Astronomy fun facts",
    "Who is the US President?",
    "Simply explain linear regression",
    "Who is George RR Martin?",
    "Who wrote the book '1984'?",
    "Is the earth flat?",
    "I am bored",
    "How to make a cake",
    "How can I start a business?",
    "How to navigate in the forest",
    "How to make friends",
    "What to do in NYC?",
    "What are some fun facts about the human body?",
    "When did star wars come out?",
    "Can you summarize this",
    "Let's play a game",
    "Who is better, Leo Tolstoy or Fyodor Dostoevsky?",
    "I want to go for a run, how far should I go?",
    "How to make cookies",
    "Say hi to me and my friends",
    "What book should I buy next?",
    "How to prevent climate change",
    "What books did Mary Shelly write?",
    "Where is Amelia Earhart?",
]

# For each dataset: map every seed to its nearest prompts, then stack the
# per-seed frames under an outer index level holding the seed text.
lmsys_similar_prompts = dict(
    (seed_prompt, find_most_similar_prompts(seed_prompt, lmsys_prompts_df))
    for seed_prompt in seed_prompts
)
lmsys_similar_prompts_df = pd.concat(
    list(lmsys_similar_prompts.values()),
    keys=list(lmsys_similar_prompts.keys()),
)

ultra_similar_prompts = dict(
    (seed_prompt, find_most_similar_prompts(seed_prompt, ultra_prompts_df))
    for seed_prompt in seed_prompts
)
ultra_similar_prompts_df = pd.concat(
    list(ultra_similar_prompts.values()),
    keys=list(ultra_similar_prompts.keys()),
)

# Turn the two index levels into ordinary columns: the seed text and the index
# label the row had in its prompts frame.
lmsys_similar_prompts_df = (
    lmsys_similar_prompts_df
    .reset_index()
    .rename(columns={'level_0': 'seed_prompt', 'level_1': 'idx'})
)
ultra_similar_prompts_df = (
    ultra_similar_prompts_df
    .reset_index()
    .rename(columns={'level_0': 'seed_prompt', 'level_1': 'idx'})
)


In [None]:
# Spot-check ten random seed/prompt matches from ultrachat.
ultra_similar_prompts_df[['seed_prompt', 'prompt']].sample(10)

In [None]:

lmsys_train = lmsys_ds['train']

# Collect the full (multi-turn) lmsys prompts behind the selected rows.
# A dict keyed by the raw prompt string de-duplicates while keeping first-seen
# order; list(set(...)) would order the prompts differently on every kernel
# start, making the saved dataset non-reproducible.
lmsys_conversations = {}
for _, row in lmsys_similar_prompts_df.iterrows():
    raw_prompt = lmsys_train[int(row['idx'])]['prompt']
    conversation = literal_eval(raw_prompt)
    # 'idx' is only valid while it is still the row position in
    # lmsys_ds['train']. Verify that instead of silently collecting an
    # unrelated conversation if the frame was re-indexed after rows were dropped.
    if conversation[0] != row['prompt']:
        raise ValueError(
            f"lmsys row {row['idx']} does not match the selected prompt; "
            "the 'idx' column is out of sync with lmsys_ds['train']"
        )
    lmsys_conversations.setdefault(raw_prompt, conversation)

lmsys_prompt_dataset = list(lmsys_conversations.values())

ultra_train = ultrachat_ds['train_sft']

# Map each ultrachat prompt to the position of its first occurrence once,
# instead of re-reading the whole 'prompt' column and scanning it for every
# selected row. setdefault keeps the first position, matching list.index().
ultra_first_position = {}
for position, prompt in enumerate(ultra_train['prompt']):
    ultra_first_position.setdefault(prompt, position)

ultra_conversations = {}
for _, row in ultra_similar_prompts_df.iterrows():
    idx = ultra_first_position[row['prompt']]
    messages = ultra_train[idx]['messages']
    user_messages = [x['content'] for x in messages if x['role'] == 'user']
    # The same conversation can be retrieved for several seeds; keep it once,
    # as is already done for lmsys, so it is not generated multiple times.
    ultra_conversations.setdefault(tuple(user_messages), user_messages)

ultrachat_prompt_dataset = list(ultra_conversations.values())


# Each entry is a list of user turns; every seed prompt becomes a one-turn entry.
prompt_dataset = lmsys_prompt_dataset + ultrachat_prompt_dataset + [[x] for x in seed_prompts]

In [None]:
# Save the prompt dataset so the generation part can be run separately.
# open() does not create missing folders, so make sure 'data/' exists first.
os.makedirs('data', exist_ok=True)
# The handle is not named `f`: that name holds the output file name in the
# generation part, and re-running this cell would clobber it.
with open('data/prompt_dataset.json', 'w') as prompt_file:
    json.dump(prompt_dataset, prompt_file, indent=4)
    

# Generating the dataset

Load in the relevant user prompts, craft a system prompt to match your desired model, and generate a synthetic dataset using a larger model.

In [None]:
import json 
import os
from dotenv import load_dotenv
load_dotenv()

from together import Together
from groq import Groq

from tqdm import tqdm
from time import sleep
from concurrent.futures import ThreadPoolExecutor, as_completed


# Choose the hosted inference provider: Together when True, Groq otherwise.
# API keys are read from the environment (populated by load_dotenv above).
use_together = True

if not use_together:
    model = "llama-3.1-8b-instant"
    client = Groq(api_key=os.environ.get('GROQ_API_KEY'))
else:
    model = 'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo'
    client = Together(api_key=os.environ.get('TOGETHER_API_KEY'))

In [None]:
# Load the prompts saved by the first part of the notebook.
# The handle is not named `f`, because `f` holds the output file name in the
# cells below; re-running this cell would otherwise replace it with a closed
# file object.
with open('data/prompt_dataset.json', 'r') as prompt_file:
    prompt_dataset = json.load(prompt_file)

In [None]:
# Base name (no extension) of the system prompt; also reused in output file names.
system_prompt_f = 'picard-system'
system_prompt_path = 'prompts/{}.md'.format(system_prompt_f)

# Read the whole markdown file as the system prompt text.
with open(system_prompt_path, 'r') as file:
    system_prompt = file.read()


In [None]:
# Output file name: "<system prompt name>-<model id with '/' replaced by '-'>.json".
f = "{}-{}.json".format(system_prompt_f, model.replace('/', '-'))

In [None]:
# Start from previously generated conversations when an output file exists,
# otherwise from an empty list.
if os.path.exists(f):
    with open(f, 'r') as file:
        assistant_dataset = json.load(file)
    print(f'Loaded {len(assistant_dataset)} conversations from file')
else:
    assistant_dataset = []


def create_conversation(prompts):
    """Generate a multi-turn conversation for a list of user prompts.

    The conversation starts with the system prompt. Each user prompt is sent
    together with all previous turns, and the model's reply is appended before
    the next prompt is sent.

    Args:
        prompts: User messages, in the order they should be asked.

    Returns:
        The list of message dicts ('role'/'content'), or None if any step
        raised an exception (the exception is printed).
    """
    messages = [{'role': 'system', 'content': system_prompt}]
    try:
        for user_turn in prompts:
            # Pause before every request.
            sleep(1)
            messages.append({'role': 'user', 'content': user_turn})
            completion = client.chat.completions.create(model=model, messages=messages)
            reply = completion.choices[0].message.content
            messages.append({'role': 'assistant', 'content': reply})
    except Exception as error:
        print(error)
        return None
    return messages

def _user_turns(conversation):
    """Return the user message contents of a generated conversation as a tuple."""
    return tuple(message['content'] for message in conversation if message['role'] == 'user')


def _save_checkpoint(path, conversations):
    """Write conversations to path as JSON via a temp file that is renamed over path.

    Renaming only after the dump has finished means an interruption mid-write
    cannot leave a truncated checkpoint behind.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w') as file:
        json.dump(conversations, file, indent=4)
    os.replace(tmp_path, path)


# Work out what is still left to generate by comparing user turns, not by
# position. Results are appended in completion order and failed conversations
# are dropped, so the saved list is not a prefix of prompt_dataset and slicing
# by len(assistant_dataset) would redo some prompts and never retry others.
seen_prompts = {_user_turns(conversation) for conversation in assistant_dataset}
pending_prompts = []
for prompts in prompt_dataset:
    prompt_key = tuple(prompts)
    if prompt_key not in seen_prompts:
        # Recording queued prompts too skips duplicate entries in prompt_dataset.
        seen_prompts.add(prompt_key)
        pending_prompts.append(prompts)

with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(create_conversation, prompts) for prompts in pending_prompts]
    for future in tqdm(as_completed(futures), total=len(futures)):
        result = future.result()
        if result:
            assistant_dataset.append(result)
            # Checkpoint after every new conversation so progress survives a crash.
            _save_checkpoint(f, assistant_dataset)

# Always leave an output file behind, even if nothing new was generated.
_save_checkpoint(f, assistant_dataset)

In [None]:
# Load the generated conversations and wrap each one under a "messages" key,
# which (per the original note) is required for the file to be loaded by the
# `datasets` library.
# The input path is rebuilt from system_prompt_f/model instead of read from `f`:
# this cell rebinds `f` to the output path below, so reading from `f` on a
# second run would load the already wrapped file and wrap it again.
conversations_path = f"{system_prompt_f}-{model.replace('/', '-')}.json"
with open(conversations_path, 'r') as file:
    assistant_dataset = [{'messages': x} for x in json.load(file)]

# Save to file

f = f"{system_prompt_f}-{model.replace('/', '-')}-messages.json"

with open(f, 'w') as file:
    json.dump(assistant_dataset, file, indent=4)
    
