# Generación de dataset usando GPT (baseline)

Este notebook genera textos sintéticos al estilo de Shakespeare utilizando un modelo GPT.
Los textos se utilizan como baseline en el benchmark comparativo
contra modelos locales fine-tuneados.

## 1. Imports.

In [None]:
import os
import random
import re
from pathlib import Path
from openai import OpenAI

## 2. Configuración.

In [None]:
# Folder that receives the generated texts. It is created immediately,
# including missing parents, and an already-existing folder is not an error.
OUTPUT_DIR = Path("/content/drive/MyDrive/StoryWriter/Data/Benchmark_data/gpt_shakespeare_like")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Word-count bounds (used as the defaults of cut_to_word_range) and the
# number of completion requests to issue.
MIN_WORDS, MAX_WORDS = 150, 300
N_SAMPLES = 200

# Seed Python's global RNG. NOTE(review): no call into `random` is visible in
# this file, so the seed does not influence the GPT outputs - confirm intent.
random.seed(42)

# Client built with SDK defaults (presumably picks up the API key from the
# environment - confirm), followed by the model name sent with each request.
client = OpenAI()
GPT_MODEL = "gpt-4.1-mini"

## 3. Helpers.

In [None]:
def clean_text(text: str) -> str:
    """Normalize blank lines and trim the ends of *text*.

    Every run of two or more consecutive newline characters is replaced by
    exactly two newlines, then leading and trailing whitespace is stripped.
    """
    return re.sub(r"\n{2,}", "\n\n", text).strip()

def count_words(text: str) -> int:
    """Return how many maximal runs of word characters *text* contains.

    A "word" here is whatever the regex word class matches (letters, digits
    and underscore), so punctuation splits tokens: "don't" counts as two.
    """
    return sum(1 for _ in re.finditer(r"\w+", text))

def cut_to_word_range(text: str, min_w=MIN_WORDS, max_w=MAX_WORDS):
    """Reject texts that are too short and truncate ones that are too long.

    The text is split on whitespace. If it has fewer than ``min_w`` tokens,
    ``None`` is returned; otherwise the first ``max_w`` tokens are returned
    joined by single spaces (original line breaks and spacing are not kept).
    """
    tokens = text.split()
    too_short = len(tokens) < min_w
    return None if too_short else " ".join(tokens[:max_w])

## 4. Prompt y generación.

In [None]:
# Instruction sent verbatim as the single user message of every request.
# The word bounds are interpolated from MIN_WORDS / MAX_WORDS so the prompt
# cannot drift from the limits that cut_to_word_range enforces by default.
PROMPT = f"""
Write a single paragraph between {MIN_WORDS} and {MAX_WORDS} words in the style of
Shakespeare's stories. The paragraph must be original,
not copied, and self-contained.
"""

def generate_gpt_samples(n_samples):
    """Request paragraphs from the GPT model and keep the usable ones.

    One chat completion is requested per sample, with ``PROMPT`` as the only
    user message. Each reply is normalized with ``clean_text`` and passed
    through ``cut_to_word_range``; replies that are empty, missing, or
    rejected by ``cut_to_word_range`` are skipped.

    Args:
        n_samples: Number of completion requests to send.

    Returns:
        The accepted paragraphs in generation order. The list may contain
        fewer than ``n_samples`` items because rejected replies are not
        retried.
    """
    paragraphs = []
    for i in range(n_samples):
        print(f"[GPT] {i+1}/{n_samples}")
        resp = client.chat.completions.create(
            model=GPT_MODEL,
            messages=[{"role": "user", "content": PROMPT}],
            temperature=0.9,
            max_tokens=450,
        )
        # The API declares ``content`` as nullable; skip a missing or empty
        # reply instead of letting ``clean_text`` raise TypeError on None and
        # abort the run (which would discard every sample collected so far).
        content = resp.choices[0].message.content
        if not content:
            continue
        text = cut_to_word_range(clean_text(content))
        if text:
            paragraphs.append(text)
    return paragraphs

## 5. Guardado.

In [None]:
# Run the whole generation pass first; nothing is written until it returns.
samples = generate_gpt_samples(N_SAMPLES)

# Write each accepted sample to its own UTF-8 file, using a five-digit
# zero-padded index that starts at 00000.
for i, txt in enumerate(samples, start=0):
    path = OUTPUT_DIR.joinpath(f"gpt_{i:05d}.txt")
    path.write_text(txt, encoding="utf-8")

print(f"Guardados {len(samples)} textos en {OUTPUT_DIR}")