## Generate prompts based on the Pile dataset

In [2]:
from dataclasses import dataclass
from datetime import datetime
from os.path import exists
import json
import numpy as np
from groq import Groq
from taker.texts import prepare
from tqdm import tqdm

# Random integer in [0, 10_000_000) stored as the default ``seed`` field of
# GenPrompts. An int literal is used for the upper bound because NumPy
# documents ``high`` as an int, not a float such as ``1e7``.
seed = np.random.randint(0, 10_000_000)
# Timestamp taken once at start-up; used in this run's output file name.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Write prompts to a file
def write_to_file(data):
    """Append ``data`` as one JSON line to this run's two prompt files.

    The record goes to ``./results/{current_time}_eloise_prompts.jsonl`` and
    to ``./results/latest_eloise_prompts.jsonl``. On the first call of a run
    (the timestamped file does not exist yet) both files are started from
    scratch, so the "latest" file only ever holds records from one run.

    Args:
        data: JSON-serialisable object to store.

    Raises:
        TypeError: if ``data`` cannot be serialised by ``json.dumps``.
        OSError: if the files cannot be created or written.
    """
    import os  # local import: only ``exists`` is imported at the top of this cell

    filename = f"./results/{current_time}_eloise_prompts.jsonl"
    filename_latest = "./results/latest_eloise_prompts.jsonl"

    # Without this, the very first open() fails on a fresh checkout.
    os.makedirs("./results", exist_ok=True)

    # Decide once, before anything is written: "w" truncates both files on the
    # first record of a run, "a" appends on every later record.
    mode = "a" if exists(filename) else "w"
    line = json.dumps(data) + "\n"
    for _filename in (filename, filename_latest):
        with open(_filename, mode) as file:
            file.write(line)

# Settings plus the most recent result; one shared instance is updated in place
@dataclass
class GenPrompts:
    """Configuration and latest output of the prompt-generation run.

    Attributes:
        model: model name sent with each chat-completion request.
        seed: value of the module-level ``seed`` when the class was defined.
        max_new_tokens: token limit sent with each chat-completion request.
        dataset_repo: dataset identifier handed to ``prepare``.
        index: position of the dataset entry currently being processed.
        completion: text returned for that entry.
    """

    model: str = "llama-3.1-70b-versatile"
    seed: int = seed
    max_new_tokens: int = 420
    dataset_repo: str = "pile"
    index: int = 0
    completion: str = ""


cfg = GenPrompts()

# Initialize Groq client.
# No key is passed explicitly: the SDK falls back to the GROQ_API_KEY
# environment variable, which keeps the secret out of the source file.
# NOTE(review): the key that was previously hard-coded here was stored in
# plain text and should be revoked and replaced.
client = Groq()

def get_completion(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the module-level ``client`` and uses the model
    name and token limit stored in ``cfg``.

    Args:
        prompt: text placed in the ``content`` field of the user message.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    reply = client.chat.completions.create(
        messages=[user_message],
        model=cfg.model,
        max_tokens=cfg.max_new_tokens,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content

# Write a function for getting structured prompts
def get_prompt(__text):
    """Build the instruction prompt that asks for a question about ``__text``.

    At most the first 10000 characters of the text are embedded, followed by
    a separator and the fixed instruction block.

    Args:
        __text: source text the generated question should resemble.

    Returns:
        The complete prompt string.
    """
    # The last instruction used to read "Do not NOT write", a double negative
    # that inverted the request; it now states the intended prohibition.
    return f"""
{__text[:10000]}

---

Please write a structured question that asks to write something that resembles
the content written above. Do not give overly specific names, but do give details
on how the text should be structured. Keep it short

Do NOT write "Sure, here it is" or similar. Just write the question.
"""

# Load the dataset
dataset, label, _ = prepare(cfg.dataset_repo)
# Number of leading dataset entries to pass over without querying the model
num_to_skip = 0

for i, data in enumerate(dataset):
    if i >= num_to_skip:
        text = data[label]
        prompt = get_prompt(text)
        # cfg is reused for every entry: update it, then store a snapshot
        cfg.index = i
        cfg.completion = get_completion(prompt)
        write_to_file(vars(cfg))

KeyboardInterrupt: 

## Generate model outputs from the prompts above

In [3]:
import os
import json
from groq import Groq
from tqdm import tqdm
from datetime import datetime

# Initialize Groq client.
# No key is passed explicitly: the SDK falls back to the GROQ_API_KEY
# environment variable, which keeps the secret out of the source file.
# NOTE(review): the key that was previously hard-coded here was stored in
# plain text and should be revoked and replaced.
client = Groq()

# Timestamp taken once at start-up; used in this run's output file name.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Function to read prompts from latest_eloise_prompts.jsonl
def read_prompts(path="./results/latest_eloise_prompts.jsonl"):
    """Yield one decoded JSON object per non-empty line of a JSONL file.

    Args:
        path: file to read; defaults to the "latest" prompt file.

    Yields:
        The object decoded from each line, in file order.

    Raises:
        FileNotFoundError: if ``path`` does not exist (raised on first iteration).
        json.JSONDecodeError: if a non-empty line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            # Blank lines (e.g. a trailing newline) are not records; skip them
            # instead of letting json.loads fail on an empty string.
            if line.strip():
                yield json.loads(line)

# Function to format input for Gemma 2 9B
def format_input(prompt):
    """Wrap ``prompt`` in Gemma's chat markup, ending at the start of the reply.

    Gemma's template names the two roles ``user`` and ``model``; the reply
    turn is therefore opened with ``<start_of_turn>model`` (the previous
    ``assistant`` label is not a role the Gemma template uses).

    Args:
        prompt: the user's message text.

    Returns:
        The templated string, to which the model's response can be appended.
    """
    return f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"

# Function to write generations to files
def write_generation(data):
    """Append ``data`` as one JSON line to this run's two generation files.

    The record goes to ``./results/{current_time}_groq_generations.jsonl``
    and to ``./results/latest_groq_generations.jsonl``. On the first call of
    a run (the timestamped file does not exist yet) both files are started
    from scratch, so the "latest" file only ever holds records from one run.

    Args:
        data: JSON-serialisable object to store.

    Raises:
        TypeError: if ``data`` cannot be serialised to JSON.
        OSError: if the files cannot be created or written.
    """
    run_file = f"./results/{current_time}_groq_generations.jsonl"
    latest_file = "./results/latest_groq_generations.jsonl"

    # Without this, the very first open() fails on a fresh checkout.
    os.makedirs("./results", exist_ok=True)

    # Decide once, before anything is written: "w" truncates both files on the
    # first record of a run, "a" appends on every later record.
    mode = "a" if os.path.exists(run_file) else "w"
    for target in (run_file, latest_file):
        with open(target, mode) as file:
            json.dump(data, file)
            file.write("\n")

# Generate outputs for each prompt
for prompt_data in tqdm(read_prompts()):
    # The "completion" stored by the prompt-generation step is this step's input
    input_prompt = prompt_data["completion"]
    user_turn = {"role": "user", "content": input_prompt}

    # The chat API receives the raw prompt; the templated text is only stored
    chat_completion = client.chat.completions.create(
        messages=[user_turn],
        model="gemma2-9b-it",
    )
    response = chat_completion.choices[0].message.content

    formatted_input = format_input(input_prompt)
    formatted_full_text = formatted_input + response

    output_data = dict(
        input=input_prompt,
        output=response,
        formatted_input=formatted_input,
        formatted_full_text=formatted_full_text,
    )
    write_generation(output_data)

print(f"Generations saved to ./results/{current_time}_groq_generations.jsonl and ./results/latest_groq_generations.jsonl")

0it [00:00, ?it/s]

100it [03:38,  2.18s/it]

Generations saved to ./results/2024-10-10_10-08-56_groq_generations.jsonl and ./results/latest_groq_generations.jsonl



