In [None]:
# ============================================
# 1. PATHS AND PROMPTS
# ============================================
# All three locations point into a mounted Google Drive; edit them for your setup.
BASE_MODEL_DIR = "/content/drive/MyDrive/StoryWriter/Modelo_FineTuning/mistral-7b-instruct-v0.3"   # CHANGE ME: base model folder
LORA_DIR       = "/content/drive/MyDrive/StoryWriter/Modelo_FineTuning/mistral-finetuneado(lora)"   # CHANGE ME: LoRA adapter folder
OUTPUT_DIR     = "/content/drive/MyDrive/StoryWriter/Data/Benchmark_data/mistral_finetune"        # CHANGE ME: where generated .txt files go

# Minimal instruction. The leading newline and the 4-space indentation are part
# of the string that is sent to the model.
BASIC_PROMPT  = ("""
    Write a single paragraph between 150 and 300 words in the style of
    Shakespeare's stories. The paragraph must be original,
    not copied, and self-contained.
    """)
# More detailed instruction. The apostrophe in "Shakespeare’s" is a single
# typographic character (U+2019); it had been stored as mis-encoded bytes.
BETTER_PROMPT = ("""
You are an expert writer imitating William Shakespeare.
Write one single self-contained paragraph between 150 and 300 words in Early Modern English,
in the style of Shakespeare’s plays and sonnets. The paragraph must be original, not copied,
and should use iambic or quasi-iambic rhythm, archaic pronouns (thee, thou, thy), and
elevated metaphors.
Avoid copying any real Shakespeare sentences; the text must be entirely new.
"""
)

In [None]:
# ============================================
# 2. LOAD TOKENIZER AND MODELS
# ============================================
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Make sure the folder that will receive the generated texts exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

device = "cuda" if torch.cuda.is_available() else "cpu"


tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_DIR, trust_remote_code=True)
# Fall back to the EOS token when the tokenizer ships without a padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = 512

# Base model: half precision with automatic device placement on GPU,
# full precision on CPU.
model_base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_DIR,
    # `torch_dtype` triggers a deprecation warning in the installed transformers
    # version ("Use `dtype` instead"), so the new keyword is used here.
    dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto" if device == "cuda" else None,
)
if device == "cpu":
    model_base.to(device)
# The trailing semicolon keeps the notebook from printing the whole module tree.
model_base.eval();



`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): MistralRMSNorm((4096,)

In [None]:
# Base model + LoRA adapter loaded from LORA_DIR.
# NOTE(review): PEFT is documented to attach adapter layers to the wrapped model
# object itself, so `model_base` may no longer act as an untouched baseline
# after this call - confirm before benchmarking it separately.
model_lora = PeftModel.from_pretrained(model_base, LORA_DIR)
model_lora.eval()

# Models to run, keyed by the tag that ends up in the output file names.
models = dict(lora=model_lora)

# Prompt variants, keyed by the tag that ends up in the output file names.
prompts = dict(basic=BASIC_PROMPT, better=BETTER_PROMPT)


In [None]:
# Sanity check: compare the base model object with the LoRA-wrapped one.
models_compare_equal = model_base == model_lora
print(models_compare_equal)

False


In [None]:
# ============================================
# 3. GENERATION FUNCTION
# ============================================
def generate_text(model, prompt, max_new_tokens=700, seed=None, return_full_text=True):
    """Sample one completion for `prompt` from `model` and return it as text.

    Uses the notebook-level `tokenizer` for encoding and decoding.

    Args:
        model: Object exposing `.device` and `.generate(...)` (the notebook
            passes the LoRA-wrapped causal LM).
        prompt: Raw prompt string; it is tokenized as-is, no chat template is applied.
        max_new_tokens: Upper bound on the number of generated tokens.
        seed: When not None, seeds torch (and every CUDA device, if available)
            before sampling so the draw can be repeated.
        return_full_text: When True (default, the historical behaviour) the
            decoded text contains the prompt followed by the continuation.
            When False only the tokens after the prompt are decoded, so callers
            do not have to cut the prompt off by character count afterwards.

    Returns:
        The decoded string with special tokens removed.
    """
    if seed is not None:
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.9,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    token_ids = out[0]
    if not return_full_text:
        # The returned sequence starts with the prompt tokens; dropping them by
        # token count avoids depending on how the prompt decodes back to text.
        prompt_token_count = inputs["input_ids"].shape[1]
        token_ids = token_ids[prompt_token_count:]
    return tokenizer.decode(token_ids, skip_special_tokens=True)


In [None]:
# ============================================
# 4. GENERATE THE TEXTS AND SAVE THEM
#    len(models) x len(prompts) x n_samples_per_combo files in total
# ============================================
n_samples_per_combo = 20
max_new_tokens = 700  # raise this if you want longer texts

for model_name in models:
    model = models[model_name]
    for prompt_name in prompts:
        prompt = prompts[prompt_name]
        print(f"=== Generando para modelo={model_name}, prompt={prompt_name} ===")
        # Seeds run 1000, 1001, ... so each sample differs; the same seed
        # sequence is reused for every model/prompt combination.
        for i, seed in enumerate(range(1000, 1000 + n_samples_per_combo)):
            text = generate_text(model, prompt, max_new_tokens=max_new_tokens, seed=seed)

            # File name pattern: <model tag>_<prompt tag>_<two-digit sample index>.txt
            filename = f"{model_name}_{prompt_name}_{i:02d}.txt"
            out_path = os.path.join(OUTPUT_DIR, filename)
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(text)

            print(f"Guardado: {filename}")


=== Generando para modelo=lora, prompt=basic ===
Guardado: lora_basic_00.txt
Guardado: lora_basic_01.txt
Guardado: lora_basic_02.txt
Guardado: lora_basic_03.txt
Guardado: lora_basic_04.txt
Guardado: lora_basic_05.txt
Guardado: lora_basic_06.txt
Guardado: lora_basic_07.txt
Guardado: lora_basic_08.txt
Guardado: lora_basic_09.txt
Guardado: lora_basic_10.txt
Guardado: lora_basic_11.txt
Guardado: lora_basic_12.txt
Guardado: lora_basic_13.txt
Guardado: lora_basic_14.txt
Guardado: lora_basic_15.txt
Guardado: lora_basic_16.txt
Guardado: lora_basic_17.txt
Guardado: lora_basic_18.txt
Guardado: lora_basic_19.txt
=== Generando para modelo=lora, prompt=better ===
Guardado: lora_better_00.txt
Guardado: lora_better_01.txt
Guardado: lora_better_02.txt
Guardado: lora_better_03.txt
Guardado: lora_better_04.txt
Guardado: lora_better_05.txt
Guardado: lora_better_06.txt
Guardado: lora_better_07.txt
Guardado: lora_better_08.txt
Guardado: lora_better_09.txt
Guardado: lora_better_10.txt
Guardado: lora_better_

In [None]:
# Prompt definitions repeated here so the cleanup cells below can refer to them.
# NOTE(review): unlike the section-1 definition, this BETTER_PROMPT has blank
# lines around its middle paragraph. The recorded len() output and the cleanup
# offset are both 426 - re-check them against whichever variant was actually
# used to generate the files.
BASIC_PROMPT  = ("""
    Write a single paragraph between 150 and 300 words in the style of
    Shakespeare's stories. The paragraph must be original,
    not copied, and self-contained.
    """)
# The apostrophe in "Shakespeare’s" is a single typographic character (U+2019);
# it had been stored as mis-encoded bytes.
BETTER_PROMPT = ("""
You are an expert writer imitating William Shakespeare.

Write one single self-contained paragraph between 150 and 300 words in Early Modern English,
in the style of Shakespeare’s plays and sonnets. The paragraph must be original, not copied,
and should use iambic or quasi-iambic rhythm, archaic pronouns (thee, thou, thy), and
elevated metaphors.

Avoid copying any real Shakespeare sentences; the text must be entirely new.
"""
)

In [None]:
# Print the character length of BETTER_PROMPT.
prompt_char_count = len(BETTER_PROMPT)
print(prompt_char_count)

426


In [None]:
import os
from pathlib import Path

INPUT_DIR = "/content/drive/MyDrive/StoryWriter/Data/Benchmark_data/mistral_finetune_prompt_pro"   # folder holding the .txt files


def strip_prompt_prefix(text, prompt):
    """Return `text` without a leading copy of `prompt`.

    The prompt is matched word by word and any run of whitespace in `text` is
    accepted between words, so differences in blank lines or indentation do not
    prevent a match. If `text` does not begin with the prompt's words it is
    returned unchanged.
    """
    pos = 0
    for word in prompt.split():
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if not text.startswith(word, pos):
            return text
        pos += len(word)
    # Also consume the prompt's own trailing whitespace when it is present
    # verbatim, so an exact echo is cut at exactly len(prompt) characters.
    trailing = prompt[len(prompt.rstrip()):]
    if trailing and text.startswith(trailing, pos):
        pos += len(trailing)
    return text[pos:]


# Remove the echoed prompt only when it really is at the start of the file.
# Checking for it, instead of cutting a fixed number of characters, keeps this
# cell safe to re-run: files that were already cleaned are left alone.
for path in sorted(Path(INPUT_DIR).glob("*.txt")):
    text = path.read_text(encoding="utf-8")
    cleaned = strip_prompt_prefix(text, BETTER_PROMPT)

    if cleaned == text:
        print(f"No leading prompt found, left untouched: {path.name}")
        continue

    path.write_text(cleaned, encoding="utf-8")


In [None]:
import os
from pathlib import Path

INPUT_DIR = "/content/drive/MyDrive/StoryWriter/Data/Benchmark_data/mistral_finetune"   # folder holding the .txt files

# This folder is the same path as OUTPUT_DIR, where the generation loop writes
# files for both prompt variants, so each file is checked against both prompts.
KNOWN_PROMPTS = (BASIC_PROMPT, BETTER_PROMPT)


def strip_prompt_prefix(text, prompt):
    """Return `text` without a leading copy of `prompt`.

    The prompt is matched word by word and any run of whitespace in `text` is
    accepted between words, so differences in blank lines or indentation do not
    prevent a match. If `text` does not begin with the prompt's words it is
    returned unchanged.
    """
    pos = 0
    for word in prompt.split():
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if not text.startswith(word, pos):
            return text
        pos += len(word)
    # Also consume the prompt's own trailing whitespace when it is present
    # verbatim, so an exact echo is cut at exactly len(prompt) characters.
    trailing = prompt[len(prompt.rstrip()):]
    if trailing and text.startswith(trailing, pos):
        pos += len(trailing)
    return text[pos:]


# Remove the echoed prompt only when it really is at the start of the file.
# Checking for it, instead of cutting a fixed number of characters, keeps this
# cell safe to re-run: files that were already cleaned are left alone.
for path in sorted(Path(INPUT_DIR).glob("*.txt")):
    text = path.read_text(encoding="utf-8")

    cleaned = text
    for known_prompt in KNOWN_PROMPTS:
        cleaned = strip_prompt_prefix(text, known_prompt)
        if cleaned != text:
            break

    if cleaned == text:
        print(f"No leading prompt found, left untouched: {path.name}")
        continue

    path.write_text(cleaned, encoding="utf-8")
