In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import spacy
import numpy as np

In [52]:
# Set the random seed for reproducibility
torch.manual_seed(1337)

# Load the spaCy model with pre-trained word embeddings ('en_core_web_sm')
nlp = spacy.load('en_core_web_sm')

# Load the text data
with open('cote_v7.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Tokenize the text using spaCy
doc = nlp(text)

In [53]:
#Displays sample of tokenized document
for token in doc[:100]:
    print(token.text)

Stay
up
to
date
On
Light
Novels
by
Downloading
our
mobile
App


Zerobooks
Android


Zerobooks
IOS


Download
all
your
Favorite
Light
Novels


Jnovels.comTable
of
Contents


Character
Gallery


Table
of
Contents
Page


Title
Page


Copyrights
and
Credits


Chapter
1
:
Ryuuen
Kakeru
’s
Soliloquy


Chapter
2
:
The
Sound
of
Footsteps
in
the
Middle
of
Winter


Chapter
3
:
Reunions
and
Farewells


Chapter
4
:
Insanity


Chapter
5
:
Time
to
Settle
Things


Chapter
6
:
Intersecting
Thoughts


Chapter
7
:
What
Ryuuen
Wins
and
Loses


In [54]:
# Create a list of unique words in the text
words = sorted(list(set([token.text for token in doc])))
vocab_size = len(words)

In [55]:
print(words[:100])
print(vocab_size)

['\n', '\n \n', ' ', '!', '(', ')', ',', '-', '.', '1', '10', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '11', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '12', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '13', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '14', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '15', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '16', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '17', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '18', '180']
5517


In [56]:
# Create mappings from words to integers and vice versa
stoi = {word: i for i, word in enumerate(words)}
itos = {i: word for i, word in enumerate(words)}

# Define a function to get word vectors from spaCy
def get_word_vectors_batched(texts, batch_size=32):
    # Tokenize the texts in batches
    docs = list(nlp.pipe(texts, batch_size=batch_size)) # processes in batches
    
    # Find the maximum dimension of word vectors in the batch and avoid dimensional mismatch.
    max_dim = max(max(token.vector.shape[0] for token in doc) for doc in docs)
    
    # Convert and pad word vectors to the maximum dimension for each token
    padded_vectors = []
    for doc in docs:
        for token in doc:
            padded_vector = torch.cat((torch.tensor(token.vector), torch.zeros(max_dim - token.vector.shape[0])))
            padded_vectors.append(padded_vector)
    
    return torch.stack(padded_vectors)

In [57]:
# Split the data into training and validation sets
texts = [token.text for token in doc]
data = get_word_vectors_batched(texts)
n = int(0.9 * len(data))  # 90% of the data for training, the rest for validation
train_data = data[:n]
val_data = data[n:]

In [58]:
# Define a function for loading data batches
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [59]:
# Define a function for estimating loss (used for evaluation)
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [60]:
print(block_size)

32


In [151]:
class Head(nn.Module):
    """ One head of self-attention """
    def __init__(self, head_size, input_features):
        super().__init__()
        self.key = nn.Linear(input_features, head_size, bias=False)
        self.query = nn.Linear(input_features, head_size, bias=False)
        self.value = nn.Linear(input_features, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape  # Unpacks batch size, sequence, length, input feature dimension

        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
                
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)  # Produces score matrix with stable gradients

        # Calculate attention weights and probability values
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        out = wei @ v
        return out

In [152]:
# Define a class for multi-head attention
class MultiHeadAttention(nn.Module):
    """ Multiple heads of self-attention in parallel """
    def __init__(self, num_heads, head_size, input_features, dropout):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, input_features) for _ in range(num_heads)])
        self.proj = nn.Linear(input_features, input_features)  # Adjusted input_features here
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        head_outputs = [head(x) for head in self.heads]
        out = torch.cat(head_outputs, dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [153]:
# Define a class for feed-forward layers
class FeedForward(nn.Module):
    """ A simple linear layer followed by a non-linearity """
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [154]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """
    def __init__(self, n_embd, n_head, dropout):
        super().__init__()
        head_size = n_embd
        self.sa = MultiHeadAttention(n_head, head_size, n_embd, dropout)  # Pass n_embd and dropout here
        self.ffwd = FeedForward(n_embd)  # Assuming you have a FeedForward class defined
        self.ln1 = nn.LayerNorm(n_embd)  # LayerNorm after self-attention
        self.ln2 = nn.LayerNorm(n_embd)  # LayerNorm after feed-forward

    def forward(self, x):
        x_sa = self.sa(x)  # Apply self-attention
        x = x + self.ln1(x_sa)  # Add and normalize 
        x_ffwd = self.ffwd(x)  # Apply feed-forward
        x = x + self.ln2(x_ffwd)  # Add and normalize
        return x

In [172]:
class LanguageModel(nn.Module):
    def __init__(self, n_layer=6, n_embd=96, n_head=1, vocab_size=5517, dropout=0.1):
        super().__init__()
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, x, targets=None):
        # Check the batch size of x
        batch_size_x = x.size(0)

        if targets is None:
            loss = None
        else:
            # Check the batch size of targets
            batch_size_targets = targets.size(0)

            # Rest of your code
            B, T, C = x.shape
            x = self.blocks(x)  # Pass through Transformer blocks
            x = self.ln_f(x)  # Apply layer normalization
            logits = self.lm_head(x)  # Generate logits for next token prediction

            # Check if the batch size matches between logits and targets
            if batch_size_x != batch_size_targets:
                raise ValueError("Mismatched batch size between logits and targets")

            logits = logits.view(batch_size_x * T, -1)
            targets = targets.view(batch_size_x * T)  # Flatten the targets to 1D
            loss = F.cross_entropy(logits, targets)

        return logits, loss

In [173]:
# Create an instance of the LanguageModel
model = LanguageModel()
m = model.to(device)

# Print the number of parameters in the model
print(sum(p.numel() for p in m.parameters()) / 1e6, 'M parameters')

1.204653 M parameters


In [174]:
# Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
for iter in range(max_iters):
    # Every once in a while, evaluate the loss on the train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample a batch of training data
    xb, yb = get_batch('train')

    # Evaluate the loss and perform backpropagation
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Generate text from the trained model
context = torch.zeros((1, 1, n_embd), dtype=torch.float32, device=device)
generated_text = []
for _ in range(2000):
    logits, _ = model(context)
    predicted_token = torch.argmax(logits[:, -1, :], dim=-1)
    generated_text.append(predicted_token.item())
    context = torch.cat([context, model.blocks[-1](model.ln_f(context))[:, -1:, :]], dim=1)

generated_text = [itos[i] for i in generated_text]
generated_text = ' '.join(generated_text)
print(generated_text)

RuntimeError: shape '[512]' is invalid for input of size 49152