In [None]:
# ============================================================
# 1. IMPORTS BÁSICOS
# ============================================================

import os
import re
import json
import textwrap
from pathlib import Path
import random


In [None]:
# ============================================================
# 2. DESCARGAR CORPUS DE shakespeare (INGLÉS)
#    (desde Project Gutenberg, dominio público)
# ============================================================

os.makedirs("/content/raw_grueso", exist_ok=True)

# Todas las obras de shakespeare
GUTENBERG_URLS = [
  # AVENTURA
    # 1. Moby Dick
    "https://www.gutenberg.org/cache/epub/2701/pg2701.txt",
    # 2. The Adventures of Tom Sawyer
    "https://www.gutenberg.org/cache/epub/74/pg74.txt",
    # 3. The Wonderful Wizard
    "https://www.gutenberg.org/cache/epub/55/pg55.txt",
  # CIENCIA FICCIÓN
    # 4. A Journey to the Centre of the Earth
    "https://www.gutenberg.org/cache/epub/18857/pg18857.txt",
    # 5. The Time Machine
    "https://www.gutenberg.org/cache/epub/35/pg35.txt",
    # 6. The War of the Worlds
    "https://www.gutenberg.org/cache/epub/36/pg36.txt",
  # FANTASÍA
    # 7. Alice's Adventures in Wonderland
    "https://www.gutenberg.org/cache/epub/11/pg11.txt",
    # 8. Anne of Green Gables
    "https://www.gutenberg.org/cache/epub/45/pg45.txt",
    # 9. Peter Pan
    "https://www.gutenberg.org/cache/epub/16/pg16.txt",
  # ROMANCE
    # 10. Price and Prejudice
    "https://www.gutenberg.org/cache/epub/1342/pg1342.txt",
    # 11. Romeo an Juliet
    "https://www.gutenberg.org/cache/epub/1513/pg1513.txt",
    # 12. A Room with a View
    "https://www.gutenberg.org/cache/epub/2641/pg2641.txt",
  # TERROR
    # 13. Dracula
    "https://www.gutenberg.org/cache/epub/345/pg345.txt",
    # 14. Frankenstein
    "https://www.gutenberg.org/cache/epub/84/pg84.txt",
    # 15. The Strange Case of Dr. Jekyll and Mr. Hyde
    "https://www.gutenberg.org/cache/epub/43/pg43.txt"
    ]

for url in GUTENBERG_URLS:
    fname = url.split("/")[-1]
    out_path = Path("/content/raw_grueso") / fname
    if out_path.exists():
        print("Ya existe:", out_path)
        continue
    !wget -q "{url}" -O "{out_path}"
    print("Descargado:", out_path)


Descargado: /content/raw_grueso/pg2701.txt
Descargado: /content/raw_grueso/pg74.txt
Descargado: /content/raw_grueso/pg55.txt
Descargado: /content/raw_grueso/pg18857.txt
Descargado: /content/raw_grueso/pg35.txt
Descargado: /content/raw_grueso/pg36.txt
Descargado: /content/raw_grueso/pg11.txt
Descargado: /content/raw_grueso/pg45.txt
Descargado: /content/raw_grueso/pg16.txt
Descargado: /content/raw_grueso/pg1342.txt
Descargado: /content/raw_grueso/pg1513.txt
Descargado: /content/raw_grueso/pg2641.txt
Descargado: /content/raw_grueso/pg345.txt
Descargado: /content/raw_grueso/pg84.txt
Descargado: /content/raw_grueso/pg43.txt


Después de la descarga borré a mano el principio y el final de cada archivo, directamente en el .txt, porque contenían texto de licencia y derechos de autor de Project Gutenberg.

In [None]:
# Make sure the folder that load_and_clean_all() reads by default exists.
# presumably the hand-trimmed .txt files are placed here — TODO confirm
Path("/content/raw_fino").mkdir(parents=True, exist_ok=True)

In [None]:
# ============================================================
# 3. LIMPIEZA AUTOMÁTICA DEL TEXTO DE GUTENBERG
#    - Quitar licencia/cabecera/pie
#    - Normalizar espacios
#    - Dejar sólo el cuerpo de la obra
# ============================================================

# Marker lines Project Gutenberg places around the body of a book, e.g.
# "*** START OF THE PROJECT GUTENBERG EBOOK MOBY DICK ***".
_GUTENBERG_START_RE = re.compile(
    r"^\*{3}\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK[^\n]*$",
    re.IGNORECASE | re.MULTILINE,
)
_GUTENBERG_END_RE = re.compile(
    r"^\*{3}\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK[^\n]*$",
    re.IGNORECASE | re.MULTILINE,
)


def clean_gutenberg_text(text: str) -> str:
    """
    Clean the raw text of a Project Gutenberg book.

    Steps, in order:
    - Normalize every line ending to "\\n".
    - Drop everything up to and including the "*** START OF ..." marker
      line and everything from the "*** END OF ..." marker line onwards.
      Text without those markers (for example a file already trimmed by
      hand) is left untouched by this step.
    - Collapse runs of spaces/tabs into one space and remove trailing
      spaces, so whitespace-only lines become truly empty lines.
    - Collapse three or more consecutive newlines into exactly two.
    - Strip leading and trailing whitespace.

    Args:
        text: Raw contents of one book.

    Returns:
        The cleaned body of the book.
    """
    # Normalize line endings first so the marker regexes see "\n" only.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Remove the license header, if its marker line is still present.
    start_marker = _GUTENBERG_START_RE.search(text)
    if start_marker:
        text = text[start_marker.end():]

    # Remove the license footer, if its marker line is still present.
    end_marker = _GUTENBERG_END_RE.search(text)
    if end_marker:
        text = text[:end_marker.start()]

    # Collapse runs of spaces and tabs into a single space.
    text = re.sub(r"[ \t]+", " ", text)

    # A line holding only spaces would otherwise survive as " " and hide the
    # blank-line paragraph break from the steps that follow.
    text = re.sub(r" +\n", "\n", text)

    # Keep at most one blank line between paragraphs.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


def load_and_clean_all(input_dir="/content/raw_fino", max_chunks_per_book=100, seed=None) -> list:
    """
    Build the corpus as a list of text chunks sampled from every book.

    Every ``*.txt`` file in ``input_dir`` is visited in sorted order,
    cleaned with ``clean_gutenberg_text``, split with
    ``split_into_paragraphs`` and grouped with
    ``make_chunks_from_paragraphs``. A random selection of at most
    ``max_chunks_per_book`` chunks per file is then kept, so that no single
    book dominates the corpus.

    Note: ``split_into_paragraphs`` and ``make_chunks_from_paragraphs`` must
    already be defined in the kernel when this function is called.

    Args:
        input_dir: Folder holding the text files to process.
        max_chunks_per_book: Upper bound on the chunks kept from each file.
        seed: Optional seed for a private random generator, to make the
            selection reproducible. ``None`` uses the global ``random`` state.

    Returns:
        A list of chunk strings (not a single concatenated string).
    """
    # A private generator keeps a seeded run from touching global state.
    rng = random.Random(seed) if seed is not None else random

    corpus_parts = []
    for path in sorted(Path(input_dir).glob("*.txt")):
        # "utf-8-sig" also drops a leading byte-order mark when one is
        # present; undecodable bytes are skipped instead of raising.
        with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
            raw = f.read()
        clean = clean_gutenberg_text(raw)
        paragraphs = split_into_paragraphs(clean)
        chunks = make_chunks_from_paragraphs(paragraphs)
        # Sampling without replacement both shuffles and caps the selection.
        sample_size = min(max_chunks_per_book, len(chunks))
        corpus_parts.extend(rng.sample(chunks, sample_size))
    return corpus_parts


# Build the corpus: a list of text chunks sampled from every cleaned book.
full_corpus = load_and_clean_all()

# len() of the list counts chunks, not characters, so report both figures
# under accurate labels.
print("Chunks totales en el corpus limpio:", len(full_corpus))
print("Tamaño total corpus limpio (caracteres):", sum(len(chunk) for chunk in full_corpus))

full_corpus[1]  # quick look at one chunk


Tamaño total corpus limpio (caracteres): 134


"PRIDE. I am Pride. I disdain to have any parents. I am like to Ovid's flea; I can creep into every corner of a wench; sometimes, like a perriwig, I sit upon her brow; or, like a fan of feathers, I kiss her lips; indeed, I do--what do I not? But, fie, what a scent is here! I'll not speak another word, except the ground were perfumed, and covered with cloth of arras. FAUSTUS. What art thou, the second? COVETOUSNESS. I am Covetousness, begotten of an old churl, in an old leathern bag: and, might I have my wish, I would desire that this house and all the people in it were turned to gold, that I might lock you up in my good chest: O, my sweet gold! FAUSTUS. What art thou, the third? WRATH. I am Wrath. I had neither father nor mother: I leapt out of a lion's mouth when I was scarce half-an-hour old; and ever since I have run up and down the world with this case[106] of rapiers, wounding myself when I had nobody to fight withal. I was born in hell; and look to it, for some of you shall be my

In [None]:
# NOTE(review): load_and_clean_all() returns a list of strings, which would
# make full_corpus[0][0] a single character; the recorded output suggests
# this cell ran against a different state — confirm whether
# len(full_corpus[0]) was intended.
first_entry = full_corpus[0]
print(len(first_entry[0]))

50


In [None]:
# ============================================================
# 4. CHUNKS ESTILÍSTICOS
#    - Dividir en párrafos y luego en trozos ~512-1024 tokens
#    - Aquí aproximamos por número de palabras
# ============================================================

def split_into_paragraphs(text: str):
    """
    Split text into paragraphs separated by one or more blank lines.

    A line that holds only spaces or tabs counts as blank, so it separates
    paragraphs just like an empty line does. Each paragraph is stripped and
    empty pieces are discarded.

    Args:
        text: Text using "\\n" line endings.

    Returns:
        List of non-empty paragraph strings, in their original order.
    """
    # "\n\s*\n" spans any run of empty or whitespace-only lines.
    pieces = re.split(r"\n\s*\n", text)
    return [piece.strip() for piece in pieces if piece.strip()]


def make_chunks_from_paragraphs(paragraphs, min_words=150, max_words=400):
    """
    Group paragraphs into chunks of between min_words and max_words words.

    Paragraphs are appended to the current chunk until adding the next one
    would exceed ``max_words``; the chunk is then closed and a new one is
    started. A paragraph that is longer than ``max_words`` on its own is cut
    into consecutive pieces of at most ``max_words`` words, so no chunk ever
    exceeds the limit. Any chunk that ends up shorter than ``min_words``
    (not only the last one) is discarded.

    Args:
        paragraphs: Iterable of paragraph strings.
        min_words: Minimum number of words a chunk needs to be kept.
        max_words: Maximum number of words allowed in a chunk.

    Returns:
        List of chunk strings, each with its words joined by single spaces.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    chunks = []
    buffer = []  # words of the chunk currently being assembled

    def flush():
        # Close the current chunk, keeping it only if it is long enough.
        if len(buffer) >= min_words:
            chunks.append(" ".join(buffer))
        buffer.clear()

    for para in paragraphs:
        words = para.split()
        # A normal paragraph yields a single piece; an oversized one is cut
        # into windows of max_words words.
        for start in range(0, len(words), max_words):
            piece = words[start:start + max_words]
            if buffer and len(buffer) + len(piece) > max_words:
                flush()
            buffer.extend(piece)

    # Whatever remains after the last paragraph forms the final chunk.
    flush()
    return chunks


#paragraphs = split_into_paragraphs(full_corpus)
#chunks = make_chunks_from_paragraphs(paragraphs, min_words=150, max_words=350)
#
#print("Párrafos:", len(paragraphs))
#print("Chunks finales:", len(chunks))
#print("Ejemplo de chunk:\n")
#print(textwrap.fill(chunks[0][:800], width=90))


# Task
Shuffle the `chunks` list randomly, then split it into an 80% training set and a 20% validation set. Define a base path in Google Drive, create directories for the training and validation data, and save the respective chunks to text files in these directories. Finally, verify that the files have been saved correctly by listing the contents of the Google Drive directories.

## Shuffle and Split Chunks

### Subtask:
Shuffle the `chunks` list randomly and then split it into an 80% training set and a 20% validation set.


**Reasoning**:
To shuffle and split the chunks, I will first import the `random` module, then shuffle the existing `chunks` list, and finally split it into training and validation sets based on the specified ratio.



In [None]:
import random

# `full_corpus` is the list of chunks built by load_and_clean_all(); the
# standalone `chunks` variable only exists in a commented-out cell, so
# reading it fails on a fresh kernel.
# random.sample with the full length returns a shuffled copy and leaves the
# original list untouched.
shuffled_chunks = random.sample(full_corpus, len(full_corpus))

# 80% of the shuffled chunks go to training, the remaining 20% to validation.
split_index = int(len(shuffled_chunks) * 0.8)
train_chunks = shuffled_chunks[:split_index]
val_chunks = shuffled_chunks[split_index:]

print(f"Train chunks: {len(train_chunks)} | Validation chunks: {len(val_chunks)}")

**Reasoning**:
With the chunks shuffled, I will define the Google Drive base path and create the directory for the training data, so that the output location exists before any data is saved.



In [None]:
import os

# Root folder for the project's data on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — confirm.
DRIVE_BASE_PATH = "/content/drive/MyDrive/StoryWriter/Data"

# Only the training folder is defined here; no validation folder is created.
TRAIN_DATA_DIR = os.path.join(DRIVE_BASE_PATH, "redactor_train")

# Create the folder (and any missing parents); do nothing if it exists.
Path(TRAIN_DATA_DIR).mkdir(parents=True, exist_ok=True)

print("Training data directory: " + TRAIN_DATA_DIR)


Training data directory: /content/drive/MyDrive/StoryWriter/Data/redactor_train
Validation data directory: /content/drive/MyDrive/StoryWriter/Data/casificator_train


**Reasoning**:
I will save every chunk in `full_corpus` as its own text file (`chunk_0000.txt`, `chunk_0001.txt`, …) inside the `negativos_finos` folder of the classifier training data on Google Drive. The chunks are written as-is, without any prefix.



In [None]:
# Destination folder for the chunks; defined once so the write loop and the
# summary message cannot drift apart.
NEGATIVE_CHUNKS_DIR = "/content/drive/MyDrive/StoryWriter/Data/Training_data/clasificator_train/negativos_finos"

# open(..., "w") does not create missing folders, so make sure it exists.
os.makedirs(NEGATIVE_CHUNKS_DIR, exist_ok=True)

for i, chunk in enumerate(full_corpus):
    # One file per chunk, zero-padded so the files sort in numeric order.
    file_path = os.path.join(NEGATIVE_CHUNKS_DIR, f"chunk_{i:04d}.txt")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(chunk)
print(f"Saved {len(full_corpus)} validation chunks to {NEGATIVE_CHUNKS_DIR}")

Saved 134 validation chunks to /content/drive/MyDrive/StoryWriter/Data/Training_data/clasificator_train/negativos_finos
