In [None]:
!pip install -q transformers datasets accelerate

In [None]:
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
import pandas as pd
import random
import re
from datasets import Dataset

# Path of the poem CSV that feeds the training set.
csv_path = "/content/PoemDataset.csv"
df = pd.read_csv(csv_path)

# Show which columns the file provides.
print(df.columns)

# Keep only the non-null poem bodies, as plain Python strings.
poem_column = df["Poem"].dropna()
poems_raw = poem_column.astype(str).tolist()

def extract_content_words(text, min_len=4):
    """Return the distinct lowercase words of ``text`` in first-seen order.

    A word is a run of ASCII letters and apostrophes; words shorter than
    ``min_len`` characters are dropped.
    """
    seen = set()
    ordered = []
    for match in re.finditer(r"[A-Za-z']+", text.lower()):
        word = match.group()
        if len(word) < min_len or word in seen:
            continue
        seen.add(word)
        ordered.append(word)
    return ordered

# Seed once so the sampled prompt words (and any subsampling below) are repeatable.
random.seed(1337)
train_texts = []

for raw_poem in poems_raw:
    poem = raw_poem.strip()
    # Only poems between 40 and 800 characters (inclusive) are used.
    if not (40 <= len(poem) <= 800):
        continue

    vocab = extract_content_words(poem)
    if len(vocab) >= 4:
        # Four distinct words taken from the poem itself become the constraint.
        words_str = ", ".join(random.sample(vocab, 4))
        train_texts.append(
            f"Words: {words_str}\n"
            "Write a short, coherent lyrical poem using ALL of these words.\n"
            "Poem:\n"
            + poem
        )

print("samples:", len(train_texts))

# Cap the training set size; subsample at random when there are more examples.
max_examples = 8000
if len(train_texts) > max_examples:
    train_texts = random.sample(train_texts, max_examples)
    print("train data len:", len(train_texts))

dataset = Dataset.from_dict({"text": train_texts})
dataset


Index(['Title', 'Poem', 'Poet', 'Genre'], dtype='object')
samples: 4545


Dataset({
    features: ['text'],
    num_rows: 4545
})

In [None]:
# Display the first training example (prompt header followed by the poem text).
dataset[0]

{'text': 'Words: alonewith, around, where, seemedto\nWrite a short, coherent lyrical poem using ALL of these words.\nPoem:\nWandered tonight through a cityas ruined as a body with brokenribs and a bared heart. Looked for you there with cookies in my pocket, searched for a sigh, for movement in demolished streets and alleys. Tonightsince I’d forgotten for a moment where you are,I searched for you with hope in my bones.But no matter how I lured you with my voiceand my eyes, walls of debris grew up steadily around you, cellars seemedto creep around you. I remained alonewith those cookies in my pocketand kept calling and walking.'}

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)

In [None]:
model_name = "gpt2"

# Choose the compute device up front: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reuse the end-of-text token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained causal LM and move it to the chosen device.
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

print("Using device:", device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda


In [None]:
# Upper bound on tokens kept per training example; longer texts are truncated.
max_length = 256


def tokenize_fn(examples):
    """Tokenize the ``"text"`` entries of a batch, truncating to ``max_length`` tokens."""
    batch_texts = examples["text"]
    encoded = tokenizer(batch_texts, truncation=True, max_length=max_length)
    return encoded

# Tokenize the whole dataset in batches and drop the raw "text" column so only
# the tokenizer outputs remain.
tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"],
)

# Collator for language modelling with masked-LM disabled (mlm=False).
# NOTE(review): pad_token was set to eos_token when the tokenizer was loaded, and
# the training texts are built without an explicit end-of-text token. Verify
# whether the model can learn where a poem ends -- the sample generations in
# this notebook stop mid-sentence.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Per-device batch of 2 with 4 accumulation steps -> 8 sequences per optimizer
# step on each device. fp16 is switched on only when CUDA is available.
training_args = TrainingArguments(
    output_dir="constraint-poet",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_strategy="epoch",
    report_to="none",
)

# No eval_dataset is passed: this run only trains and logs the training loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,
)


Map:   0%|          | 0/4545 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

# Save model and tokenizer side by side in "constraint-poet-ft"; the generation
# cells later load both from that directory.
trainer.save_model("constraint-poet-ft")
tokenizer.save_pretrained("constraint-poet-ft")
print("Saved fine-tuned model.")

Step,Training Loss
50,4.284
100,3.977
150,3.8961
200,3.9738
250,3.919
300,3.8724
350,3.8558
400,3.8396
450,3.8887
500,3.9281


Saved fine-tuned model.


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
model_dir = "constraint-poet-ft"

# Choose the compute device up front: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the fine-tuned tokenizer and weights from the saved directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

# Move to the device and switch to evaluation mode for generation.
model.to(device)
model.eval()

print("Using device:", device)

Using device: cuda


In [None]:
def format_into_lines(text, max_words_per_line=9):
    """Re-wrap ``text`` into newline-separated lines.

    Whitespace is collapsed. A line is closed as soon as it holds
    ``max_words_per_line`` words or its latest word ends with one of
    ``. ! ? ; :``.
    """
    terminators = (".", "!", "?", ";", ":")
    finished = []
    pending = []
    for token in text.split():
        pending.append(token)
        if token.endswith(terminators) or len(pending) >= max_words_per_line:
            finished.append(" ".join(pending))
            pending = []
    # Flush whatever is left after the last line break.
    if pending:
        finished.append(" ".join(pending))
    return "\n".join(finished)


def generate_constrained_poem(
    words,
    extra_prompt="",
    max_new_tokens=80,
    temperature=0.8,
    top_k=20,
    repetition_penalty=1.05,
    max_tries=3,
):
    """Sample a poem that tries to contain every requested word.

    Uses the module-level ``model`` and ``tokenizer`` and formats the result
    with ``format_into_lines``.

    Args:
        words: A string of words separated by whitespace and/or commas, or an
            iterable of words (each item is converted with ``str`` and
            stripped; empty items are dropped).
        extra_prompt: Optional instruction line placed between the base
            instruction and ``"Poem:"``.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        max_tries: Maximum number of sampling attempts.

    Returns:
        The first sample in which every requested word occurs as a whole word
        (case-insensitive), formatted into lines. If no attempt satisfies the
        constraint, the attempt missing the fewest words is returned (the
        latest one on ties). With ``max_tries < 1`` the result is ``""``.
    """
    import re  # local import so the function also works when only the generation cells are run

    if isinstance(words, str):
        # Accept "a b c" as well as "a, b, c".
        words_list = [w for w in re.split(r"[,\s]+", words) if w]
    else:
        words_list = [str(w).strip() for w in words if str(w).strip()]

    # Whole-word, case-insensitive matchers. A plain substring test would count
    # "rain" as present in "train" or "sun" in "sunk".
    matchers = [
        re.compile(r"(?<!\w)" + re.escape(w.lower()) + r"(?!\w)")
        for w in words_list
    ]

    # NOTE(review): the training prompt built earlier in this notebook does not
    # contain the "Each sentence ..." sentence; confirm the mismatch is intended.
    base_instruction = (
        "Write a short, coherent lyrical poem using ALL of these words. Each sentence should contain no more than 4 words.\n"
    )

    words_str = ", ".join(words_list)
    extra_line = (extra_prompt.strip() + "\n") if extra_prompt else ""
    prompt = f"Words: {words_str}\n" + base_instruction + extra_line + "Poem:\n"

    # The prompt is identical for every attempt, so encode it only once.
    model_device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model_device) for k, v in inputs.items()}
    prompt_len = inputs["input_ids"].shape[-1]

    best_raw = ""
    best_missing = None

    for _attempt in range(max_tries):
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_k=top_k,
                repetition_penalty=repetition_penalty,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Drop the prompt tokens; keep only the newly generated continuation.
        raw = tokenizer.decode(outputs[0, prompt_len:], skip_special_tokens=True)
        text_lower = raw.lower()
        n_missing = sum(1 for pattern in matchers if not pattern.search(text_lower))

        if n_missing == 0:
            return format_into_lines(raw)

        # Remember the closest miss so the fallback is the best attempt seen.
        if best_missing is None or n_missing <= best_missing:
            best_raw, best_missing = raw, n_missing

    return format_into_lines(best_raw)


In [None]:
# Demo: request a poem containing four words, with an extra style instruction.
poem = generate_constrained_poem(
    "autumn rain sun love",
    extra_prompt="The style should be calm and nostalgic.",
)

print(" POEM ")
print(poem)


 POEM 
We are the birds.
We eat our own food in the autumn.This is
what makes us birds who are our friends.They do
not eat the leaves, nor the fruit,nor even the
leaves themselves.
We are the birds who eat ourselves’s clothing in
the autumn.Their clothes have become a kindled in their
bodies, as a kindling that we take for


In [None]:
!zip -r constraint-poet-ft.zip /content/constraint-poet-ft/

  adding: content/constraint-poet-ft/ (stored 0%)
  adding: content/constraint-poet-ft/generation_config.json (deflated 24%)
  adding: content/constraint-poet-ft/merges.txt (deflated 53%)
  adding: content/constraint-poet-ft/tokenizer.json (deflated 82%)
  adding: content/constraint-poet-ft/tokenizer_config.json (deflated 54%)
  adding: content/constraint-poet-ft/special_tokens_map.json (deflated 60%)
  adding: content/constraint-poet-ft/config.json (deflated 51%)
  adding: content/constraint-poet-ft/vocab.json (deflated 59%)
  adding: content/constraint-poet-ft/training_args.bin (deflated 53%)
  adding: content/constraint-poet-ft/model.safetensors (deflated 7%)


In [None]:
!pip install -q "transformers>=4.40.0" datasets accelerate

In [None]:
import pandas as pd
import random
import re
from datasets import Dataset

# Path of the poem CSV that feeds the training set.
csv_path = "/content/PoemDataset.csv"
df = pd.read_csv(csv_path)

# Keep only the non-null poem bodies, as plain Python strings.
poem_column = df["Poem"].dropna()
poems_raw = poem_column.astype(str).tolist()

def extract_content_words(text, min_len=4):
    """Collect the unique lowercase words of ``text``, preserving first-seen order.

    Words are runs of ASCII letters/apostrophes; anything shorter than
    ``min_len`` characters is ignored.
    """
    unique = {}
    for candidate in re.findall(r"[A-Za-z']+", text.lower()):
        if len(candidate) >= min_len:
            # dict keys keep insertion order, which gives an ordered de-duplication
            unique.setdefault(candidate, None)
    return list(unique)

# Fixed seed so the prompt words drawn below are repeatable.
random.seed(1337)
train_texts = []

for poem_text in (raw.strip() for raw in poems_raw):
    # Discard poems that are very short or very long (in characters).
    too_short = len(poem_text) < 40
    too_long = len(poem_text) > 800
    if too_short or too_long:
        continue

    candidates = extract_content_words(poem_text)
    if len(candidates) < 4:
        continue

    # Four words sampled from the poem become the constraint shown in the prompt.
    words_str = ", ".join(random.sample(candidates, 4))
    header = (
        f"Words: {words_str}\n"
        "Write a short, coherent lyrical poem using ALL of these words.\n"
        "Poem:\n"
    )
    train_texts.append(header + poem_text)

print("train data len:", len(train_texts))

# Cap the number of examples, subsampling at random when over the limit.
max_examples = 8000
train_texts = (
    random.sample(train_texts, max_examples)
    if len(train_texts) > max_examples
    else train_texts
)

dataset = Dataset.from_dict({"text": train_texts})
dataset


train data len: 4545


Dataset({
    features: ['text'],
    num_rows: 4545
})

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)

In [None]:
model_name = "Qwen/Qwen2-0.5B"

# Choose the compute device up front: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the end-of-text token only when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained causal LM and move it to the chosen device.
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

print("Using device:", device)

# Upper bound on tokens kept per training example; longer texts are truncated.
max_length = 256


def tokenize_fn(examples):
    """Run the tokenizer over a batch's ``"text"`` field with truncation at ``max_length``."""
    tokenizer_kwargs = {"truncation": True, "max_length": max_length}
    return tokenizer(examples["text"], **tokenizer_kwargs)

# Tokenize the whole dataset in batches and drop the raw "text" column so only
# the tokenizer outputs remain.
tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"],
)

# Collator for language modelling with masked-LM disabled (mlm=False).
# NOTE(review): the training texts are built without an explicit end-of-text
# token; verify whether the model can learn where a poem ends -- the sample
# generation in this notebook stops mid-sentence.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Per-device batch of 1 with 8 accumulation steps -> 8 sequences per optimizer
# step on each device. fp16 is switched on only when CUDA is available.
training_args = TrainingArguments(
    output_dir="qwen2-poet",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_strategy="epoch",
    report_to="none",
)

# No eval_dataset is passed: this run only trains and logs the training loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,
)

# Run fine-tuning with the trainer configured above.
trainer.train()

# Save model and tokenizer side by side in "qwen2-poet-ft" so both can be
# reloaded from one directory.
trainer.save_model("qwen2-poet-ft")
tokenizer.save_pretrained("qwen2-poet-ft")
print("Saved fine-tuned model to qwen2-poet-ft")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Using device: cuda


Map:   0%|          | 0/4545 [00:00<?, ? examples/s]

Step,Training Loss
50,4.1924
100,4.122
150,4.0979
200,4.1082
250,4.1264
300,4.0872
350,4.0256
400,4.046
450,4.0778
500,4.0981


Saved fine-tuned model to qwen2-poet-ft


In [None]:
#for generation

!pip install -q gdown

file_id = "1vXZL7K6hVk4FCpivu6fkfWs-1_Yi1dAX"
output_name = "qwen2-poet-ft.zip"

!gdown --id $file_id -O $output_name

!ls -lh


Downloading...
From (original): https://drive.google.com/uc?id=1vXZL7K6hVk4FCpivu6fkfWs-1_Yi1dAX
From (redirected): https://drive.google.com/uc?id=1vXZL7K6hVk4FCpivu6fkfWs-1_Yi1dAX&confirm=t&uuid=3a6455fb-0a65-4d56-82b7-0de3d0d08b30
To: /content/qwen2-poet-ft.zip
100% 1.84G/1.84G [00:52<00:00, 34.9MB/s]
total 1.8G
-rw-r--r-- 1 root root 1.8G Nov 17 00:48 qwen2-poet-ft.zip
drwxr-xr-x 1 root root 4.0K Nov 12 14:30 sample_data


In [None]:
#for generation

!unzip /content/qwen2-poet-ft.zip

Archive:  /content/qwen2-poet-ft.zip
   creating: content/qwen2-poet-ft/
  inflating: content/qwen2-poet-ft/generation_config.json  
  inflating: content/qwen2-poet-ft/merges.txt  
  inflating: content/qwen2-poet-ft/tokenizer.json  
  inflating: content/qwen2-poet-ft/added_tokens.json  
  inflating: content/qwen2-poet-ft/tokenizer_config.json  
  inflating: content/qwen2-poet-ft/special_tokens_map.json  
  inflating: content/qwen2-poet-ft/config.json  
  inflating: content/qwen2-poet-ft/vocab.json  
  inflating: content/qwen2-poet-ft/chat_template.jinja  
  inflating: content/qwen2-poet-ft/training_args.bin  
  inflating: content/qwen2-poet-ft/model.safetensors  


In [None]:
#for generation

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Relative path: resolved against the current working directory.
model_dir = "content/qwen2-poet-ft"

# Choose the compute device up front: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the fine-tuned tokenizer and weights from the extracted directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

# Move to the device and switch to evaluation mode for generation.
model.to(device)
model.eval()

print("Using device:", device)

Using device: cpu


In [None]:
#for generation

def format_into_lines(text, max_words_per_line=9):
    """Break ``text`` into lines joined by newlines.

    Words are taken by splitting on whitespace. The current line ends when it
    reaches ``max_words_per_line`` words or when a word's final character is
    one of ``. ! ? ; :``.
    """
    line_enders = ".!?;:"
    chunks = [[]]
    for word in text.split():
        chunks[-1].append(word)
        # split() never yields empty strings, so word[-1] is always valid
        if word[-1] in line_enders or len(chunks[-1]) >= max_words_per_line:
            chunks.append([])
    return "\n".join(" ".join(chunk) for chunk in chunks if chunk)


def generate_qwen_poem(
    words,
    extra_prompt="",
    max_new_tokens=80,
    temperature=0.8,
    top_k=40,
    repetition_penalty=1.05,
    n_candidates=10,
    max_rounds=2,
):
    """Sample batches of candidate poems and return one that uses every word.

    Uses the module-level ``model`` and ``tokenizer`` and formats the result
    with ``format_into_lines``.

    Args:
        words: A string of words separated by whitespace and/or commas, or an
            iterable of words (each item is converted with ``str`` and
            stripped; empty items are dropped).
        extra_prompt: Optional instruction line placed between the base
            instruction and ``"Poem:"``.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        n_candidates: Number of sequences sampled per round
            (``num_return_sequences``).
        max_rounds: Maximum number of sampling rounds.

    Returns:
        The first candidate in which every requested word occurs as a whole
        word (case-insensitive), formatted into lines. If no candidate
        satisfies the constraint, the candidate missing the fewest words is
        returned (the earliest one on ties). ``"(generation failed)"`` when no
        candidate was produced at all.

    Raises:
        ValueError: If ``words`` contains no usable word.
    """
    import re  # local import: the generation cells import only transformers and torch

    if isinstance(words, str):
        # Accept "a b c" as well as "a, b, c".
        words_list = [w for w in re.split(r"[,\s]+", words) if w]
    else:
        words_list = [str(w).strip() for w in words if str(w).strip()]

    if not words_list:
        raise ValueError("Needs at least one word.")

    # Whole-word, case-insensitive matchers. A plain substring test would count
    # "rain" as present in "train" or "sad" in "saddle".
    matchers = [
        re.compile(r"(?<!\w)" + re.escape(w.lower()) + r"(?!\w)")
        for w in words_list
    ]
    words_str = ", ".join(words_list)

    base_instruction = "Write a short, coherent lyrical poem using ALL of these words.\n"
    extra_line = (extra_prompt.strip() + "\n") if extra_prompt else ""
    prompt = f"Words: {words_str}\n" + base_instruction + extra_line + "Poem:\n"

    # The prompt is identical for every round, so encode it only once.
    model_device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model_device) for k, v in inputs.items()}
    prompt_len = inputs["input_ids"].shape[-1]

    best_raw = None
    best_missing = None

    for _round in range(max_rounds):
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_k=top_k,
                repetition_penalty=repetition_penalty,
                num_return_sequences=n_candidates,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        for i in range(n_candidates):
            # Drop the prompt tokens; keep only the generated continuation.
            raw = tokenizer.decode(outputs[i, prompt_len:], skip_special_tokens=True)
            text_lower = raw.lower()
            n_missing = sum(1 for pattern in matchers if not pattern.search(text_lower))

            if n_missing == 0:
                return format_into_lines(raw)

            # Track the closest miss so the fallback really is the best candidate.
            if best_missing is None or n_missing < best_missing:
                best_raw, best_missing = raw, n_missing

    if best_raw is not None:
        return format_into_lines(best_raw)
    return "(generation failed)"

In [None]:
#for generation

# Demo: request a poem containing four words; up to 2 rounds of 10 sampled
# candidates each are generated until one satisfies the word check.
poem = generate_qwen_poem(
    words="autumn rain friend sad",
    max_new_tokens=80,
    temperature=0.8,
    top_k=40,
    repetition_penalty=1.05,
    n_candidates=10,
    max_rounds=2,
)

print("GENERATED POEM")
print(poem)


GENERATED POEM
When I was seven years old,I was lost in
a train.It was autumn.The rain was on the roofs.Sad
songs were sung.My friends were sad.They put their arms
around me—My father was sad, too—And said, “We’ll get
you home.”The train pulled up and I climbed down.Someone
gave me an apple.He said, “I’ll buy you another
one.”I
