<a href="https://colab.research.google.com/github/michael-j-england/AustenGPT/blob/main/notebooks/austen_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')

def prepare_environment():
  os.environ["BASE_DIR"] = "/content/drive/MyDrive/AustenGPT"
  os.environ["RAW_TEXT_DIR"] = os.path.join(os.environ["BASE_DIR"], "raw_text")
  os.environ["CLEANED_TEXT_DIR"] = os.path.join(os.environ["BASE_DIR"], "cleaned_texts")
  os.environ["TOKENIZED_TEXT_DIR"] = os.path.join(os.environ["BASE_DIR"], "tokenized_texts")
  os.environ["MODEL_DIR"] = os.path.join(os.environ["BASE_DIR"], "model")

Mounted at /content/drive


In [2]:
!pip install datasets accelerate -U

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.

In [3]:
import os
import requests
from tqdm import tqdm

# Jane Austen's six main movels and their Gutenberg IDs
austen_books = {
    "Pride and Prejudice": 1342,
    "Sense and Sensibility": 161,
    "Emma": 158,
    "Mansfield Park": 141,
    "Northanger Abbey": 121,
    "Persuasion": 105,
}

prepare_environment()
RAW_TEXT_DIR = os.getenv("RAW_TEXT_DIR")
os.makedirs(RAW_TEXT_DIR, exist_ok=True)

# Function to download the raw text from Project Gutenberg
def get_austen_text(book_id):
    url = f"https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching book {book_id}: {e}")
        return None

# Download texts and save them locally
def download_and_save_books(austen_books, output_dir):
    austen_texts = []

    for title, book_id in tqdm(austen_books.items(), desc="Downloading Austen Texts"):
        text = get_austen_text(book_id)
        if text:
            austen_texts.append({"title": title, "text": text})

    for book in austen_texts:
        title = book['title']
        text = book['text']
        filename = f"{title}.txt"
        filepath = os.path.join(output_dir, filename)

        try:
            with open(filepath, "w", encoding="utf-8", newline='') as f:
                f.write(text)
            print(f"{title} saved to: {filepath}")
        except Exception as e:
            print(f"Error saving {title}: {e}")


if __name__ == "__main__":
    download_and_save_books(austen_books, RAW_TEXT_DIR)


Downloading Austen Texts: 100%|██████████| 6/6 [00:01<00:00,  3.60it/s]

Pride and Prejudice saved to: /content/drive/MyDrive/AustenGPT/raw_text/Pride and Prejudice.txt
Sense and Sensibility saved to: /content/drive/MyDrive/AustenGPT/raw_text/Sense and Sensibility.txt
Emma saved to: /content/drive/MyDrive/AustenGPT/raw_text/Emma.txt
Mansfield Park saved to: /content/drive/MyDrive/AustenGPT/raw_text/Mansfield Park.txt
Northanger Abbey saved to: /content/drive/MyDrive/AustenGPT/raw_text/Northanger Abbey.txt
Persuasion saved to: /content/drive/MyDrive/AustenGPT/raw_text/Persuasion.txt





In [4]:
import re
import os

prepare_environment()
RAW_TEXT_DIR = os.getenv("RAW_TEXT_DIR")
CLEANED_TEXT_DIR= os.getenv("CLEANED_TEXT_DIR")
os.makedirs(CLEANED_TEXT_DIR, exist_ok=True)

def normalize_line_endings(text):
    normalized_text = re.sub(r"\r\n|\r|\n", "\n", text)
    return normalized_text

def clean_before_chapter_1(text):
    target = r"((\*\*\* START OF )|(The Project Gutenberg eBook))[\s\S]*?(Chapter|CHAPTER) (I|1)(\.\]|\.|)\n\n"
    cleaned_text = re.sub(target, r"Chapter I\n\n", text, flags=re.DOTALL)
    return cleaned_text

def remove_illustrations(text):
    text = re.sub(r"\[_Copyright 1894 by George Allen._\]", "", text)
    text = re.sub(r"\[Illustration(:?\s*[^\]]*)?\]", "", text)
    return text

def clean_after_gutenberg_end(text):
    cleaned_text = re.sub(r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*", "", text, flags=re.DOTALL)
    return cleaned_text

def remove_extra_newlines(text):
    return re.sub(r"(?<!\n)\n(?!\n)", " ", text)
    # return re.sub(r"(?<=\S)\n(?=\S)", " ", text)

def remove_multiple_newlines(text):
    return re.sub(r"\n{3,}", "\n\n", text)

def clean_text(text):
    text = normalize_line_endings(text)
    text = clean_before_chapter_1(text)
    text = remove_illustrations(text)
    text = clean_after_gutenberg_end(text)
    text = remove_extra_newlines(text)
    text = remove_multiple_newlines(text)
    return text

def clean_and_save_texts():
    for filename in os.listdir(RAW_TEXT_DIR):
        filepath = os.path.join(RAW_TEXT_DIR, filename)

        if os.path.isfile(filepath):
            with open(filepath, "r", encoding="utf-8", newline='') as f:
                raw_text = f.read()

            cleaned_text = clean_text(raw_text)

            cleaned_filepath = os.path.join(CLEANED_TEXT_DIR, filename)
            with open(cleaned_filepath, "w", encoding="utf-8", newline='') as f:
                f.write(cleaned_text)

            print(f"Cleaned {filename} saved to: {cleaned_filepath}")

if __name__ == "__main__":
    clean_and_save_texts();

Cleaned Pride and Prejudice.txt saved to: /content/drive/MyDrive/AustenGPT/cleaned_texts/Pride and Prejudice.txt
Cleaned Sense and Sensibility.txt saved to: /content/drive/MyDrive/AustenGPT/cleaned_texts/Sense and Sensibility.txt
Cleaned Emma.txt saved to: /content/drive/MyDrive/AustenGPT/cleaned_texts/Emma.txt
Cleaned Mansfield Park.txt saved to: /content/drive/MyDrive/AustenGPT/cleaned_texts/Mansfield Park.txt
Cleaned Northanger Abbey.txt saved to: /content/drive/MyDrive/AustenGPT/cleaned_texts/Northanger Abbey.txt
Cleaned Persuasion.txt saved to: /content/drive/MyDrive/AustenGPT/cleaned_texts/Persuasion.txt


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_name="gpt2"):
    return AutoModelForCausalLM.from_pretrained(model_name)

def load_tokenizer(model_name="gpt2"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

In [6]:
import os
from datasets import load_dataset, DatasetDict

def tokenize_and_save():
    prepare_environment()
    CLEANED_TEXT_DIR = os.getenv("CLEANED_TEXT_DIR")
    TOKENIZED_TEXT_DIR = os.getenv("TOKENIZED_TEXT_DIR")
    os.makedirs(TOKENIZED_TEXT_DIR, exist_ok=True)

    data_files = {"train": os.path.join(CLEANED_TEXT_DIR, "*.txt")}
    dataset = load_dataset("text", data_files=data_files)

    tokenizer = load_tokenizer()

    def tokenize_function(examples):
        outputs = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,  # Ensures uniform input length
        )
        outputs["labels"] = outputs["input_ids"].copy()
        return outputs

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    split_ratio = 0.1
    split = tokenized_dataset["train"].train_test_split(test_size=split_ratio)
    split_dataset = DatasetDict({"train": split["train"], "test": split["test"]})
    split_dataset.save_to_disk(TOKENIZED_TEXT_DIR)

    print(f"Tokenized dataset saved to {TOKENIZED_TEXT_DIR}")

if __name__ == "__main__":
    tokenize_and_save()

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/20576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18518 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2058 [00:00<?, ? examples/s]

Tokenized dataset saved to /content/drive/MyDrive/AustenGPT/tokenized_texts


In [4]:
import os
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import torch

def train():
    prepare_environment()
    TOKENIZED_TEXT_DIR = os.getenv("TOKENIZED_TEXT_DIR")
    MODEL_DIR = os.getenv("MODEL_DIR")
    os.makedirs(MODEL_DIR, exist_ok=True)

    tokenized_datasets = load_from_disk(TOKENIZED_TEXT_DIR)

    # Select training and evaluation sets (use .select(range(x)) for subsets)
    train_dataset = tokenized_datasets["train"].select(range(1000))
    eval_dataset = tokenized_datasets["test"].select(range(200))

    # Disable Weights & Biases logging
    os.environ["WANDB_DISABLED"] = "true"

    model = load_model()
    tokenizer = load_tokenizer()

    # Training arguments
    training_args = TrainingArguments(
        output_dir=MODEL_DIR,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1, # 5
        weight_decay=0.01,
        logging_dir="./logs",
        save_total_limit=2,
        learning_rate=5e-5,
        gradient_accumulation_steps=2,
        fp16=torch.cuda.is_available(),  # Enables mixed-precision training if GPU available
        logging_steps=100,
    )

    # Early stopping callback
    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=2,
        early_stopping_threshold=0.02,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[early_stopping],
    )

    # Train the model
    trainer.train()

    # Save model and tokenizer
    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    print(f"Model saved to {MODEL_DIR}")

if __name__ == "__main__":
    train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
0,No log,0.308679


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Model saved to /content/drive/MyDrive/AustenGPT/model


In [5]:
import os
from transformers import pipeline

# Load fine-tuned model
prepare_environment()
MODEL_DIR = os.getenv("MODEL_DIR")
generator = pipeline("text-generation", model=MODEL_DIR)

# Generate text
prompt = "It is a truth universally acknowledged"
output = generator(
    prompt,
    max_length=300,
    num_return_sequences=1,
    temperature=0.4,  # Adjusts randomness
    top_p=0.5,        # Nucleus sampling
    repetition_penalty=1.2,  # Penalizes repetitive output
    do_sample=True,  # Enables sampling instead of greedy decoding
)

# Print result
print(output[0]["generated_text"])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


It is a truth universally acknowledged that the whole of this country, and all parts thereof in it; but I am not sure how much more so than you are. You have not only made me feel as if my life depended upon your being here to-day on account for some of its inconveniences which may befall us both—I do believe there will always exist an occasion when we shall be able either by any means or opportunity whatsoever,—to go home again without having been at least once acquainted with each other before our arrival."
