In [None]:
# Google Drive folder with the local copy of the base model; passed to
# AutoTokenizer / AutoModelForCausalLM.from_pretrained in a later cell.
MODEL_DIR = "/content/drive/MyDrive/StoryWriter/Modelo_FineTuning/mistral-7b-instruct-v0.3"

In [None]:
# List the contents of MODEL_DIR with human-readable file sizes.
!ls -lh "$MODEL_DIR"

total 28G
-rw------- 1 root root  601 Dec  5 15:58 config.json
-rw------- 1 root root  14G Dec  5 17:22 consolidated.safetensors
-rw------- 1 root root  116 Dec  5 15:58 generation_config.json
-rw------- 1 root root 4.7G Dec  5 16:02 model-00001-of-00003.safetensors
-rw------- 1 root root 4.7G Dec  5 16:02 model-00002-of-00003.safetensors
-rw------- 1 root root 4.3G Dec  5 16:02 model-00003-of-00003.safetensors
-rw------- 1 root root  24K Dec  5 15:58 model.safetensors.index.json
-rw------- 1 root root  202 Dec  5 15:58 params.json
-rw------- 1 root root 7.8K Dec  5 15:58 README.md
-rw------- 1 root root  414 Dec  5 15:58 special_tokens_map.json
-rw------- 1 root root 138K Dec  5 15:58 tokenizer_config.json
-rw------- 1 root root 1.9M Dec  5 15:58 tokenizer.json
-rw------- 1 root root 574K Dec  5 15:58 tokenizer.model
-rw------- 1 root root 574K Dec  5 15:58 tokenizer.model.v3


In [None]:
# ============================================================
# 0. CHEQUEAR GPU E INSTALAR LIBRERÍAS
# ============================================================

!nvidia-smi

!pip install -q "transformers>=4.45.0" "datasets>=3.0.0" "accelerate>=1.0.0" \
               "peft>=0.13.0" "trl>=0.9.0" bitsandbytes



Tue Dec  9 13:36:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   40C    P8             12W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# ============================================================
# 1. IMPORTS BÁSICOS
# ============================================================

import os
import re
import json
import textwrap
from pathlib import Path

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model


# Report the installed torch version and whether a CUDA device is visible.
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


Torch: 2.9.0+cu126
CUDA available: True


In [None]:
from pathlib import Path
from datasets import Dataset

TEXT_DIR = "/content/drive/MyDrive/StoryWriter/Data/Training_data/redactor_train"  # same path as above

# Every .txt file in TEXT_DIR, in sorted order, is one candidate training example.
paths = sorted(Path(TEXT_DIR).glob("*.txt"))
print("Archivos encontrados:", len(paths))

# Read each file as UTF-8 (undecodable bytes are dropped), trim surrounding
# whitespace, and keep only the files that still contain text.
stripped_texts = (
    src.read_text(encoding="utf-8", errors="ignore").strip() for src in paths
)
records = [{"text": body} for body in stripped_texts if body]

# Single-column ("text") dataset, followed by a preview of the first example.
dataset = Dataset.from_list(records)
print(dataset)
print(dataset[0]["text"][:500])


Archivos encontrados: 2395
Dataset({
    features: ['text'],
    num_rows: 2395
})
MARGARET. By my troth ’s but a night-gown in respect of yours: cloth o’ gold, and cuts, and laced with silver, set with pearls, down sleeves, side sleeves, and skirts round, underborne with a bluish tinsel; but for a fine, quaint, graceful, and excellent fashion, yours is worth ten on’t. HERO. God give me joy to wear it! for my heart is exceeding heavy. MARGARET. ’Twill be heavier soon by the weight of a man. HERO. Fie upon thee! art not ashamed? MARGARET. Of what, lady? of speaking honourably? 


In [None]:
# ============================================================
# 6. LOAD mistralai/Mistral-7B-Instruct-v0.3 AND SET UP LoRA
#    - The base weights are loaded in 4-bit (NF4) through bitsandbytes,
#      and LoRA adapters are attached on top of them.
# ============================================================

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
if tokenizer.pad_token is None:
    # No pad token configured: reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

tokenizer.model_max_length = 512

# One dtype for the 4-bit compute path AND the non-quantized modules. The
# original mixed float16 (model dtype) with bfloat16 (bnb compute dtype and the
# bf16 flag of the training config); keep them aligned.
compute_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    quantization_config=bnb_config,
    dtype=compute_dtype,
    device_map="auto",
    trust_remote_code=True,
)

print("Modelo cargado en 4-bit.")

# LoRA configuration. The base model above is 4-bit quantized, so the adapters
# are trained on top of quantized weights.
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Attention and MLP projection modules that receive LoRA adapters.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# From here on `model` is the PEFT-wrapped model, not the bare base model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Modelo cargado en 4-bit.
trainable params: 83,886,080 || all params: 7,331,909,632 || trainable%: 1.1441


In [None]:
# Overrides the model_max_length of 512 that was set when the tokenizer was loaded.
tokenizer.model_max_length = 1024

In [None]:
# ============================================================
# 7. CONFIGURE SFTTrainer (TRL) FOR LoRA FINE-TUNING
# ============================================================

# Maximum number of tokens per (packed) training sequence. It is passed to
# SFTConfig below; previously the variable was defined but never used.
max_seq_length = 1024

train_config = SFTConfig(
    output_dir="/content/drive/MyDrive/StoryWriter/Modelo_FineTuning/mistral-finetuneado(lora)",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 1 x 8 sequences per device per optimizer step
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=10,
    save_strategy="epoch",
    bf16=torch.cuda.is_available(),
    max_length=max_seq_length,
    packing=True,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    report_to="none",
)


trainer = SFTTrainer(
    model=model,
    args=train_config,
    train_dataset=dataset,
    processing_class=tokenizer,
)


# Pull one tokenized batch as a sanity check on the sequence length.
batch = next(iter(trainer.get_train_dataloader()))
print("Shape input_ids:", batch["input_ids"].shape)




Adding EOS to train dataset:   0%|          | 0/2395 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2395 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/2395 [00:00<?, ? examples/s]

Shape input_ids: torch.Size([1, 553])


In [None]:
# ============================================================
# 8. TRAINING
# ============================================================

# Run the fine-tuning loop configured on the trainer.
trainer.train()

# Save only the LoRA weights (adapters)
trainer.model.save_pretrained("/content/drive/MyDrive/StoryWriter/Modelo_FineTuning/mistral-finetuneado(lora)")


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
10,2.5664
20,2.4658
30,2.4428
40,2.4781
50,2.4224
60,2.3036
70,2.3855
80,2.3267
90,2.3066
100,2.304


In [None]:
# Save the trainer's model a second time, into a separate "finetune2" folder.
trainer.model.save_pretrained("/content/drive/MyDrive/StoryWriter/Modelo_FineTuning/finetune2")

In [None]:
# Detailed prompt variant (explicit style constraints).
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
PROMPT_PRO = """
You are an expert writer imitating William Shakespeare.

Write one single self-contained paragraph between 150 and 300 words in Early Modern English,
in the style of Shakespeare’s plays and sonnets. The paragraph must be original, not copied,
and should use iambic or quasi-iambic rhythm, archaic pronouns (thee, thou, thy), and
elevated metaphors.

Avoid copying any real Shakespeare sentences; the text must be entirely new.
"""

# Short prompt passed to generate_text() in the comparison cells below.
# The leading newline and the 4-space indentation are part of the string.
PROMPT =     """
    Write a single paragraph between 150 and 300 words in the style of
    Shakespeare's stories. The paragraph must be original,
    not copied, and self-contained.
    """

In [None]:
# Folder the LoRA adapters were saved to after training; read back by
# PeftModel.from_pretrained in a later cell.
LORA_DIR = "/content/drive/MyDrive/StoryWriter/Modelo_FineTuning/mistral-finetuneado(lora)"

In [None]:
# Display the device the model currently lives on.
model.device

device(type='cuda', index=0)

In [None]:
# Placeholder value ("path/or/name/of/your/fine-tuned model"): set it to a real
# path or model name before calling load_author_model().
AUTHOR_MODEL_NAME = "ruta/o/nombre/de/tu/modelo_finetuneado"

def load_author_model():
    """Build a text-generation pipeline for the model at AUTHOR_MODEL_NAME.

    Loads the tokenizer and the causal LM from AUTHOR_MODEL_NAME and wraps them
    in a transformers "text-generation" pipeline, placed on GPU 0 when CUDA is
    available and on the CPU (-1) otherwise.

    Returns:
        The transformers pipeline object.
    """
    # `pipeline` is not part of the notebook's top-level transformers imports,
    # so import it here; without this the call below raises NameError.
    from transformers import pipeline

    device = 0 if torch.cuda.is_available() else -1
    tok = AutoTokenizer.from_pretrained(AUTHOR_MODEL_NAME)
    mdl = AutoModelForCausalLM.from_pretrained(AUTHOR_MODEL_NAME)
    gen_pipe = pipeline(
        "text-generation",
        model=mdl,
        tokenizer=tok,
        device=device,
    )
    return gen_pipe

# Base instruction sent to the fine-tuned model by
# generate_Shakespeare_like_finetuned(). The closing delimiter previously had a
# fourth quote, which made the whole cell a SyntaxError.
PROMPT_REDACTOR_BASE = (
    """
    Write a single paragraph between 150 and 300 words in the style of
    Shakespeare's stories. The paragraph must be original,
    not copied, and self-contained.
    """
)

def generate_Shakespeare_like_finetuned(n_samples: int, gen_pipe) -> list[str]:
    """Generate up to `n_samples` paragraphs with the fine-tuned pipeline.

    Each iteration samples one completion for PROMPT_REDACTOR_BASE, removes the
    echoed prompt, and passes the text through clean_text() and
    cut_to_word_range(). Samples for which cut_to_word_range() returns None are
    skipped, so the result may hold fewer than `n_samples` items.

    Args:
        n_samples: Number of generation attempts.
        gen_pipe: Callable text-generation pipeline; its result is indexed as
            `[0]["generated_text"]`.

    Returns:
        List of accepted paragraphs.
    """
    paragraphs = []
    for i in range(n_samples):
        print(f"[FINETUNED] Generando párrafo {i+1}/{n_samples} ...")
        out = gen_pipe(
            PROMPT_REDACTOR_BASE,
            max_new_tokens=450,
            do_sample=True,
            temperature=0.9,
            top_p=0.95,
            num_return_sequences=1,
        )[0]["generated_text"]
        # Drop the prompt only when the output really starts with it; slicing
        # unconditionally would cut into the generated text otherwise.
        if out.startswith(PROMPT_REDACTOR_BASE):
            out = out[len(PROMPT_REDACTOR_BASE):]
        text = out.strip()
        text = clean_text(text)
        text = cut_to_word_range(text)
        if text is None:
            continue
        paragraphs.append(text)
    return paragraphs

In [None]:
def generate_text(model, prompt, max_new_tokens=400):
    """Sample text from `model` for `prompt` and return the decoded output.

    Uses the notebook-level `tokenizer`. The first returned sequence is decoded
    in full with special tokens removed, so the result includes the prompt text
    followed by the continuation.

    Args:
        model: Model exposing `.device`, `.generate(...)` and the usual
            `training` / `eval()` / `train()` module interface.
        prompt: Prompt string to tokenize and continue.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded text of the first generated sequence.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Sample in eval mode so dropout layers (e.g. LoRA dropout) are inactive
    # even if the model was left in training mode; restore the mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.8,
                top_p=0.9,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=False
            )
    finally:
        model.train(was_training)
    return tokenizer.decode(out[0], skip_special_tokens=True)

print("=== MODELO ORIGINAL ===")
# `model` was wrapped by get_peft_model() earlier in the notebook, so its LoRA
# adapters must be switched off to sample from the untouched base weights.
with model.disable_adapter():
    print(generate_text(model, PROMPT))

=== MODELO ORIGINAL ===

    Write a single paragraph between 150 and 300 words in the style of
    Shakespeare's stories. The paragraph must be original,
    not copied, and self-contained.
    --------------------------------------------------------------

In the realm of Elveria, where the golden sun kissth the emerald hills at dawn, there dwelt a humble knight named Sir Gareth of Locksley. A man of honor, yet one with a mischievous heart, he was known far and wide for his daring deeds and cunning wit. However, beneath this facade of valor lay a secret yearning, a longing for something more profound, a love that could eclipse the brightest stars. One fateful day, as Sir Gareth rode through the verdant forests, he encountered a damsel in distress, fair Lady Isolde of the Silver Eyes. Their eyes met, and time seemed to pause, as if fate itself had conspired to bring them together. From that moment, a bond was formed, a bond stronger than steel, and Sir Gareth knew he had found his des

In [None]:
def generate_Shakespeare_like_gpt(n_samples: int, prompt) -> list[str]:
    """Generate up to `n_samples` paragraphs for `prompt` with `gpt_pipe`.

    Each iteration samples one completion, removes the echoed prompt, and passes
    the text through clean_text() and cut_to_word_range(). Samples for which
    cut_to_word_range() returns None are skipped, so the result may hold fewer
    than `n_samples` items.

    NOTE(review): `gpt_pipe`, `clean_text` and `cut_to_word_range` are not
    defined in the visible part of the notebook — they must exist in the kernel.

    Args:
        n_samples: Number of generation attempts.
        prompt: Prompt string sent to the pipeline.

    Returns:
        List of accepted paragraphs.
    """
    paragraphs = []
    for i in range(n_samples):
        print(f"[HF-GPT] Generando párrafo {i+1}/{n_samples} ...")
        out = gpt_pipe(
            prompt,
            max_new_tokens=450,
            do_sample=True,
            temperature=0.9,
            top_p=0.95,
            num_return_sequences=1,
        )[0]["generated_text"]

        # The model often returns prompt + continuation. Strip the prompt that
        # was actually sent (the original sliced by an unrelated, undefined
        # constant), and only when the output really starts with it.
        if out.startswith(prompt):
            out = out[len(prompt):]
        text = out.strip()
        text = clean_text(text)
        text = cut_to_word_range(text)
        if text is None:
            continue
        paragraphs.append(text)
    return paragraphs


In [None]:
from peft import PeftModel

# Name under which the saved adapter is registered on the existing PEFT model.
SAVED_ADAPTER_NAME = "shakespeare"

if isinstance(model, PeftModel):
    # `model` already carries LoRA layers from get_peft_model(); wrapping it in
    # a second PeftModel would stack one PEFT wrapper on top of another. Load
    # the saved weights as an extra named adapter on the existing wrapper
    # instead (skipped on re-runs, when the adapter is already registered).
    if SAVED_ADAPTER_NAME not in model.peft_config:
        model.load_adapter(LORA_DIR, adapter_name=SAVED_ADAPTER_NAME)
    model.set_adapter(SAVED_ADAPTER_NAME)
    model_lora = model
else:
    # Plain base model: attach the adapters saved in LORA_DIR.
    model_lora = PeftModel.from_pretrained(
        model,
        LORA_DIR,
    )
model_lora.eval()

print("=== MODELO + LORA (shakespeare) ===")
print(generate_text(model_lora, PROMPT))




=== MODELO + LORA (CONAN DOYLE) ===
It was a cold evening in Baker Street when Dr. Watson noticed something unusual. The street lamps were casting their dim light through the foggy window of the consulting room, where they could see Sherlock Holmes and his companion discussing the details of the case before them.

Holmes was sitting in his armchair, lost in thought as he gazed at a map spread out on the table between them. He had been silent for some time, deep in contemplation. “It’s the missing piece,” he muttered to himself.

Watson looked at him questioningly. “What is it, my dear friend?” he asked, concerned.

“The location of the murder weapon,” replied Holmes, snapping back to reality with a start. “I have all the other pieces of this puzzle, but without knowing where that final piece was used, I can’t solve the crime.”

He stood up and paced around the room, a determined look on his face. “There must be some clue somewhere – something I missed during


# Task
Shuffle the `chunks` list randomly, then split it into an 80% training set and a 20% validation set. Define a base path in Google Drive, create directories for the training and validation data, and save the respective chunks to text files in these directories. Finally, verify that the files have been saved correctly by listing the contents of the Google Drive directories.

## Shuffle and Split Chunks

### Subtask:
Shuffle the `chunks` list randomly and then split it into an 80% training set and a 20% validation set.


**Reasoning**:
To shuffle and split the chunks, I will first import the `random` module, then shuffle the existing `chunks` list, and finally split it into training and validation sets based on the specified ratio.



In [None]:
import random

# Fixed seed so the same train/validation split is produced on every run.
SPLIT_SEED = 42

# Create a shuffled copy of the chunks list (the original list is left untouched).
shuffled_chunks = random.Random(SPLIT_SEED).sample(chunks, len(chunks))

# Calculate the split point for 80% training data
split_point = int(0.8 * len(shuffled_chunks))

# Split into training and validation sets
train_chunks = shuffled_chunks[:split_point]
val_chunks = shuffled_chunks[split_point:]

print(f"Total chunks: {len(chunks)}")
print(f"Training chunks: {len(train_chunks)}")
print(f"Validation chunks: {len(val_chunks)}")

Total chunks: 3046
Training chunks: 2436
Validation chunks: 610


**Reasoning**:
Now that the chunks are split, I will define the Google Drive base path and create the directories for the training and validation datasets to prepare for saving the data.



In [None]:
import os

# Root folder in Google Drive under which both data folders live.
DRIVE_BASE_PATH = "/content/drive/MyDrive/StoryWriter/Data"

# Target folders for the training and the validation chunks.
TRAIN_DATA_DIR = os.path.join(DRIVE_BASE_PATH, "redactor_train")
VAL_DATA_DIR = os.path.join(DRIVE_BASE_PATH, "casificator_train")

# Make sure both folders exist; already-existing folders are left as they are.
for data_dir in (TRAIN_DATA_DIR, VAL_DATA_DIR):
    os.makedirs(data_dir, exist_ok=True)

print(f"Training data directory: {TRAIN_DATA_DIR}")
print(f"Validation data directory: {VAL_DATA_DIR}")


Training data directory: /content/drive/MyDrive/StoryWriter/Data/redactor_train
Validation data directory: /content/drive/MyDrive/StoryWriter/Data/casificator_train


**Reasoning**:
I will save the training and validation chunks to separate text files within their respective Google Drive directories. The cell below writes each chunk as-is; the `PROMPT_PREFIX` is prepended in a later cell that rewrites the same files.



In [None]:
def _write_chunks(chunk_list, out_dir):
    """Write each chunk to `out_dir` as chunk_NNNN.txt (UTF-8), numbered from 0."""
    for idx, body in enumerate(chunk_list):
        target = os.path.join(out_dir, f"chunk_{idx:04d}.txt")
        with open(target, "w", encoding="utf-8") as out_file:
            out_file.write(body)


# Training chunks first, then validation chunks, each into its own folder.
_write_chunks(train_chunks, TRAIN_DATA_DIR)
print(f"Saved {len(train_chunks)} training chunks to {TRAIN_DATA_DIR}")

_write_chunks(val_chunks, VAL_DATA_DIR)
print(f"Saved {len(val_chunks)} validation chunks to {VAL_DATA_DIR}")

Saved 2436 training chunks to /content/drive/MyDrive/StoryWriter/Data/redactor_train
Saved 610 validation chunks to /content/drive/MyDrive/StoryWriter/Data/casificator_train


**Reasoning**:
The previous code failed because `PROMPT_PREFIX` was not defined in the execution scope. The cell below rewrites the chunk files with `PROMPT_PREFIX` prepended to each chunk, so `PROMPT_PREFIX` must be defined in the session before it is run.



In [None]:
# NOTE(review): PROMPT_PREFIX is not defined in this cell; it must already
# exist in the kernel — confirm where it comes from.
def _write_prefixed_chunks(chunk_list, out_dir):
    """Write PROMPT_PREFIX, a blank line and the chunk to chunk_NNNN.txt in `out_dir`."""
    for idx, body in enumerate(chunk_list):
        target = os.path.join(out_dir, f"chunk_{idx:04d}.txt")
        with open(target, "w", encoding="utf-8") as out_file:
            out_file.write(PROMPT_PREFIX + "\n\n" + body)


# Training chunks first, then validation chunks, each into its own folder.
_write_prefixed_chunks(train_chunks, TRAIN_DATA_DIR)
print(f"Saved {len(train_chunks)} training chunks to {TRAIN_DATA_DIR}")

_write_prefixed_chunks(val_chunks, VAL_DATA_DIR)
print(f"Saved {len(val_chunks)} validation chunks to {VAL_DATA_DIR}")

Saved 2436 training chunks to /content/drive/MyDrive/StoryWriter/mistral-v0.3-7b-conandoyle-data/train
Saved 610 validation chunks to /content/drive/MyDrive/StoryWriter/mistral-v0.3-7b-conandoyle-data/validation


**Reasoning**:
To confirm that the training and validation chunks have been successfully saved, I will list the contents of the respective Google Drive directories.

