In [None]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face Hub identifier of the instruction-tuned checkpoint used for
# all poem generation below.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Tokenizer and model are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Weights are requested in bfloat16; `device_map="auto"` leaves the placement
# of the layers to the library instead of pinning them to one device.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

# Author list without a header row; columns are assigned explicitly as
# last name ("Nachname"), first name ("Vorname") and "count".
csv_file = "author_count.csv"
df = pd.read_csv(
    csv_file,
    header=None,
    names=["Nachname", "Vorname", "count"],
    on_bad_lines="skip",  # malformed rows are dropped instead of aborting
    # The data appears to have a space after each comma: without this flag
    # the first name is read as " Friedrich" and that leading space ends up
    # in the prompt sent to the model.
    skipinitialspace=True,
)

# Only the first 20 rows of the file are processed.
autoren = df[["Nachname", "Vorname"]].head(20)

# Directory that receives one text file per generated poem.
output_dir = "author_poems"
os.makedirs(output_dir, exist_ok=True)

def create_messages(vorname, nachname):
    """Build the two-message chat prompt asking for a poem in an author's style.

    Args:
        vorname: The author's first name, inserted verbatim into the request.
        nachname: The author's last name, inserted verbatim into the request.

    Returns:
        A list with a system message followed by a user message, each a dict
        with the keys "role" and "content".
    """
    system_message = {
        "role": "system",
        "content": "Du bist ein Chatbot, der nur Gedichtsanfragen annimmt. Du formulierst als Antwort auf jede Anfrage ein Gedicht, nicht mehr und nicht weniger",
    }
    user_message = {
        "role": "user",
        "content": f"Schreibe mir ein Gedicht im Stil von '{vorname} {nachname}'",
    }
    return [system_message, user_message]

# Token ids treated as end-of-generation markers: the tokenizer's regular
# EOS id plus the id of the "<|eot_id|>" token.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

# For every selected author, request five poems from the model and write each
# one to its own UTF-8 text file inside `output_dir`.
for _, row in autoren.iterrows():
    # Strip surrounding whitespace: the CSV fields can carry a leading space
    # (e.g. " Friedrich"), which would otherwise leak into the prompt and log.
    nachname = str(row["Nachname"]).strip()
    vorname = str(row["Vorname"]).strip()

    # The prompt is identical for all five requests of one author, so it is
    # built and tokenized once instead of once per request.
    messages = create_messages(vorname, nachname)
    user_request = messages[1]["content"]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    # A single, unpadded prompt: every position is a real token. Passing the
    # mask explicitly avoids the "attention mask ... not set" warning.
    attention_mask = torch.ones_like(input_ids)
    prompt_length = input_ids.shape[-1]

    for i in range(1, 6):  # five requests per author
        print(f"Anfrage {i} für {vorname} {nachname}: {user_request}")

        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=256,
            # Stop on either the regular EOS token or "<|eot_id|>"; the
            # latter ends an assistant turn in the Llama 3 chat format.
            eos_token_id=terminators,
            # Same value the library falls back to, set explicitly.
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )
        # Drop the prompt tokens so only the newly generated text is decoded.
        response = outputs[0][prompt_length:]
        poem = tokenizer.decode(response, skip_special_tokens=True)

        # NOTE(review): the file name is keyed on the last name only, so two
        # authors sharing a last name overwrite each other's files. Kept as-is
        # in case downstream code relies on this naming scheme.
        filename = os.path.join(output_dir, f"{nachname}_fake_-_{i}.txt")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(poem)

        print(f"Poem {i} for {vorname} {nachname} saved to {filename}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 24.08it/s]
Some parameters are on the meta device device because they were offloaded to the cpu and disk.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Anfrage 1 für  Friedrich Rückert: Schreibe mir ein Gedicht im Stil von ' Friedrich Rückert'


KeyboardInterrupt: 