## Generate prompts based on the Pile dataset

In [6]:
from dataclasses import dataclass
from datetime import datetime
from os.path import exists
import json
import numpy as np
from huggingface_hub import InferenceClient
from taker.texts import prepare
from tqdm import tqdm

seed = np.random.randint(0, 1e7)
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


# write prompts to a file
def write_to_file(data):
    """Append ``data`` as one JSON line to this run's prompt files.

    Two files under ``./results`` receive the record: one named after
    ``current_time`` and ``latest_prompts.jsonl``. On the first write of a run
    (the timestamped file does not exist yet) both files are started fresh,
    which discards whatever an earlier run left in ``latest_prompts.jsonl``.

    Args:
        data: JSON-serialisable object to store.

    Raises:
        TypeError: If ``data`` cannot be encoded by ``json.dumps``.
    """
    from os import makedirs  # local import so the block carries its own dependency

    results_dir = "./results"
    filename = f"{results_dir}/{current_time}_prompts.jsonl"
    filename_latest = f"{results_dir}/latest_prompts.jsonl"

    # Serialise first: a record that cannot be encoded must not truncate or
    # half-update the output files.
    line = json.dumps(data) + "\n"

    # The directory is not guaranteed to exist on a fresh checkout.
    makedirs(results_dir, exist_ok=True)

    # "w" on the first write of the run resets both files; later writes append.
    mode = "a" if exists(filename) else "w"
    for _filename in (filename, filename_latest):
        with open(_filename, mode, encoding="utf-8") as file:
            file.write(line)

# Store current data here
@dataclass
class GenPrompts:
    """Settings for the prompt-writing run plus the record being processed."""

    # Alternative checkpoint kept for reference: "meta-llama/Meta-Llama-3.1-70B"
    model: str = "meta-llama/Meta-Llama-3.1-70B-Instruct"  # handed to InferenceClient
    seed: int = seed  # module-level value drawn once for the run
    max_new_tokens: int = 420  # sent as max_tokens by get_completion
    dataset_repo: str = "pile"  # argument given to prepare()
    index: int = 0  # dataset position of the current item
    completion: str = ""  # replaced by the get_completion result for each item


cfg = GenPrompts()


# Write a function for getting written completions
client = InferenceClient(cfg.model)


def get_completion(prompt):
    """Send ``prompt`` to the chat model as a single user message.

    Args:
        prompt: Text placed in the ``content`` of the only message.

    Returns:
        Whatever ``client.chat_completion`` returns for the non-streaming call.
    """
    user_turn = {"role": "user", "content": prompt}
    return client.chat_completion(
        messages=[user_turn],
        stream=False,
        max_tokens=cfg.max_new_tokens,
        seed=cfg.seed,
    )

# Write a function for getting structured prompts
def get_prompt(__text, max_chars=10000):
    """Wrap a source text in the instruction that asks for a matching question.

    Args:
        __text: Source document; only its first ``max_chars`` characters are
            embedded in the prompt.
        max_chars: Upper bound on how much of ``__text`` is included.

    Returns:
        The prompt string: the (truncated) text, a ``---`` separator and the
        instruction paragraph.
    """
    # The final instruction used to read "Do not NOT write", a double negative
    # that literally asked the model for the unwanted preamble.
    return f"""
{__text[:max_chars]}

---

Please write a structured question that asks to write something that resembles
the content written above. Do not give overly specific names, but do give details
on how the text should be structured. Keep it short

Do NOT write "Sure, here it is" or similar. Just write the question.
"""

# Load the dataset
dataset, label, _ = prepare(cfg.dataset_repo)
num_to_skip = 36  # items with a smaller index are passed over

for i, data in enumerate(dataset):
    if i >= num_to_skip:
        text = data[label]
        prompt = get_prompt(text)
        cfg.index = i
        cfg.completion = get_completion(prompt)
        # vars(cfg) is the instance attribute dict, so every field is stored
        write_to_file(vars(cfg))

## Generate model outputs from the prompts written above

In [1]:
from dataclasses import dataclass
from datetime import datetime
from os.path import exists
import json
import numpy as np
from huggingface_hub import InferenceClient
from tqdm import tqdm

seed = np.random.randint(low=0, high=1e7)
current_time = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"


@dataclass
class GenPrompts:
    """Settings for the generation run that consumes the stored prompts."""

    model: str = "microsoft/phi-3-mini-4k-instruct"  # handed to InferenceClient
    seed: int = seed  # module-level value drawn once for the run
    max_new_tokens: int = 420  # generation length limit used by get_completion
    dataset_repo: str = "pile"
    index: int = 0
    completion: str = ""


cfg = GenPrompts()

# Update the client
client = InferenceClient(cfg.model)


def get_completion(prompt):
    """Continue ``prompt`` with the text-generation endpoint of ``cfg.model``.

    Args:
        prompt: Raw text for the model to continue.

    Returns:
        The result of the non-streaming ``client.text_generation`` call;
        ``return_full_text=True`` requests the prompt in front of the newly
        generated text.
    """
    return client.text_generation(
        prompt,
        max_new_tokens=cfg.max_new_tokens,
        return_full_text=True,
        stream=False,
        # Forward the configured seed, as the prompt-writing get_completion
        # does, so sampled generations can be reproduced.
        seed=cfg.seed,
    )

# Function to read prompts from latest_prompts.jsonl
def read_prompts(path="./results/latest_prompts.jsonl"):
    """Yield one decoded JSON record per non-blank line of a JSONL file.

    Args:
        path: File to read; defaults to the latest prompt file.

    Yields:
        The object decoded from each line, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist (raised on first iteration).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            # Blank or whitespace-only lines (e.g. a trailing newline) carry no record.
            if not line.strip():
                continue
            yield json.loads(line)

# Function to write generations to files
def write_generation(data):
    """Append ``data`` as one JSON line to this run's generation files.

    Two files under ``./results`` receive the record: one named after
    ``current_time`` and ``latest_phi3_generations.jsonl``. On the first write
    of a run (the timestamped file does not exist yet) both files are started
    fresh, which discards what an earlier run left in the "latest" file.

    Args:
        data: JSON-serialisable object to store.

    Raises:
        TypeError: If ``data`` cannot be encoded by ``json.dumps``.
    """
    from os import makedirs  # local import so the block carries its own dependency

    results_dir = "./results"
    filename = f"{results_dir}/{current_time}_phi3_generations.jsonl"
    filename_latest = f"{results_dir}/latest_phi3_generations.jsonl"

    # Serialise first so a bad record cannot truncate or half-update the files.
    line = json.dumps(data) + "\n"

    # The directory is not guaranteed to exist on a fresh checkout.
    makedirs(results_dir, exist_ok=True)

    # "w" on the first write of the run resets both files; later writes append.
    mode = "a" if exists(filename) else "w"
    for target in (filename, filename_latest):
        with open(target, mode, encoding="utf-8") as file:
            file.write(line)

# Generate outputs for each prompt
for prompt_data in tqdm(read_prompts()):
    input_prompt = prompt_data["completion"]["choices"][0]["message"]["content"]
    full_prompt = f"Human: {input_prompt}\n\nAssistant: "

    generation = get_completion(full_prompt)

    # Get only the output part of the text. Stripping the exact prompt is the
    # reliable path; splitting on the marker alone would cut the output at any
    # later "Assistant: " and fail outright when the marker is missing.
    marker = "Assistant: "
    if generation.startswith(full_prompt):
        output_text = generation[len(full_prompt):]
    elif marker in generation:
        # Prompt not echoed verbatim: keep everything after the first marker.
        output_text = generation.split(marker, 1)[1]
    else:
        # No prompt echo at all: the whole response is the model output.
        output_text = generation

    output_data = {
        "input": input_prompt,
        "output": output_text,
        "full_text": generation
    }

    write_generation(output_data)

print(f"Generations saved to ./results/{current_time}_phi3_generations.jsonl and ./results/latest_phi3_generations.jsonl")

0it [00:08, ?it/s]


Exception: Stop