In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
import time
import datetime
import nltk
import re
import tiktoken

In [2]:
# hyperparameters
batch_size = 128
block_size = 32  # spatial extent of the model for its context
max_iters = 5000  # number of training iterations
eval_interval = 10  # frequency of printing training stats
learning_rate = 3e-4
device = torch.device('mps')
eval_iters = 200  # number of iterations to evaluate the model
n_embd = 600 
n_layers = 6
dropout = 0.2
n_head = 6
data_filepath = 'combined.txt'
train_val_split = 0.95

In [3]:

def tokenize(text):
    # Use regular expression to tokenize the text
    tokens = re.findall(r'\b\w+\b|\s|\S', text)
    return tokens

# Load data
with open(data_filepath, 'r', encoding='utf-8') as f:
    text = f.read()

# Tokenize into words
words = tokenize(text) # list of words
vocab = list(set(words))
vocab_size = len(vocab)
print(f' - Number of words: {len(words):,}')
print(vocab[:50])
print(f' - Vocabulary size: {vocab_size:,}')
stoi = {word: i for i, word in enumerate(vocab)}  # string to int
itos = {i: word for i, word in enumerate(vocab)}  # int to string


def encode(s): return [stoi[word] for word in s]
def decode(l): return ' '.join([itos[i] for i in l])


# Split into train and validation
data = torch.tensor(encode(words), dtype=torch.long)
n = int(train_val_split * len(data))
train_data = data[:n]
val_data = data[n:]
print(f' - Length of train data: {len(train_data):,}')
print(f' - Length of val data: {len(val_data):,}')


def get_batch(split, batch_size, block_size):
    """
    Get a batch of data for training or validation.

    Parameters:
    split (str): The split to get the data from. Can be 'train' or 'val'.
    batch_size (int): The batch size.
    block_size (int): The sequence length.

    Returns:
    torch.Tensor, torch.Tensor: The input data (x) and target data (y) as tensors.
    """

    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i: i + block_size] for i in ix])
    y = torch.stack([data[i + 1: i + block_size + 1] for i in ix])
    return x, y






 - Number of words: 2,622,431
['sheer', 'cubicles', 'Rita', 'lads', 'Ooclumency', 'Taped', 'overgrown', 'undertaking', 'raptly', 'kettle', 'enchant', 'snake', 'Valerian', 'ARE', 'blase', 'agonized', 'poise', 'chickens', 'decrees', 'birdcage', 'fcod', '142', 'flourishing', 'inject', 'textbooks', 'water', 'drinking', 'floors', 'peckers', 'shortcomings', 'carnage', 'AN', 'Itovc', 'everything', 'impress', 'BEYOND', 'Clause', 'hater', 'butterfly', 'pulsed', 'recommended', 'licence', 'compartments', 'academic', 'leet', 'hadl', 'trouser', 'creatures', 'mold', 'snow']
 - Vocabulary size: 26,960
 - Length of train data: 2,491,309
 - Length of val data: 131,122
