## Transformers decoder only (gpt2 like) trained at Lewis Carrolls Alice's Adventures in Wonderland

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
#import lightning as L

"./content/drive/MyDrive/datasets/alice.txt"

'./content/drive/MyDrive/datasets/alice.txt'

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
device

device(type='cuda')

In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1)
        embedding_index = torch.arange(start=0, end=d_model, step=2).float()
        div_term = 1 / torch.tensor(10000.0)**(embedding_index / d_model)
        #print(div_term)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe',pe)

    def forward(self, word_embeddings):
        pe_temp = self.pe[:word_embeddings.size(0), :]
        pe_temp_expanded = pe_temp.unsqueeze(1)
        #print(f"word_embeddings.shape: {word_embeddings.shape}, self.pe.shape: {pe_temp_expanded.shape}, ")
        return word_embeddings + pe_temp_expanded

In [4]:
class Attention(nn.Module):
    def __init__(self, d_model=2):
        super().__init__()
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.row_dim = 0
        self.col_dim = 1

    def forward(self, encodings, mask=None):
        q = self.W_q(encodings)
        k = self.W_k(encodings)
        v = self.W_v(encodings)

        # Ensure k has the same shape as q before transpose
        assert k.shape == q.shape

        # Transpose k to align with q for dot product
        k_transposed = k.transpose(-1, -2)

        # Check shapes
        #print("Shape of q:", q.shape)  # [1, 5, 2]
        #print("Shape of k_transposed:", k_transposed.shape)  # [1, 2, 5]
        sims = torch.matmul(q, k_transposed)
        scaled_sims = sims / torch.tensor(k.size(1)**0.5)

        if mask is not None:
            mask = mask.to(device)
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

        attention_percents = F.softmax(scaled_sims)
        attention_scores = torch.matmul(attention_percents, v)
        return attention_scores

In [5]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model=2,heads=2):
        super().__init__()

        self.W_qs = []
        self.W_ks = []
        self.W_vs = []

        for index in range(heads):
            W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
            W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
            W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

            self.W_qs.append(W_q)
            self.W_ks.append(W_k)
            self.W_vs.append(W_v)

        self.unify_heads = nn.Linear(d_model * heads, d_model)

        self.row_dim = 0
        self.col_dim = 1
        self.heads = heads

    def forward(self, encodings, mask=None):
        attentionscores = []
        #encodings.to(device)
        for index in range(self.heads):
            W_q = self.W_qs[index].to(device)
            W_k = self.W_ks[index].to(device)
            W_v = self.W_vs[index].to(device)

            q = W_q(encodings.to(device))
            k = W_k(encodings.to(device))
            v = W_v(encodings.to(device))

            k_transposed = k.transpose(-1, -2)
            sims = torch.matmul(q, k_transposed)
            scaled_sims = sims / torch.tensor(k.size(1)**0.5)

            if mask is not None:
                mask = mask.to(device)
                scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

            attention_percents = F.softmax(scaled_sims)
            attention_scores = torch.matmul(attention_percents, v)
            attentionscores.append(attention_scores)

        combined_attention_scores = torch.cat(attentionscores, dim=-1)
        combined_output = self.unify_heads(combined_attention_scores)

        return combined_output

In [20]:
class DecoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, num_tokens, using_mask=True):
        super(DecoderBlock, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, heads=num_heads)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.relu = nn.ReLU()
        self.fc_layer = nn.Linear(in_features=d_model, out_features=d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.fc_layer2 = nn.Linear(in_features=d_model, out_features=d_model)
        dropout=0.1
        self.dropout = nn.Dropout(dropout)
        self.using_mask = using_mask

    def forward(self, position_encoded, mask=None):
        if self.using_mask:
            self_attention_values = self.self_attention(position_encoded, mask=mask)
        else:
            self_attention_values = self.self_attention(position_encoded)

        residual_connection_values = position_encoded + self_attention_values
        normalized_values1 = self.layer_norm1(residual_connection_values)

        fc_layer_output_relu = self.relu(self.fc_layer(normalized_values1))
        #fc_layer_output_dropout = self.dropout(fc_layer_output_relu)
        #fc_layer_output = self.fc_layer2(fc_layer_output_dropout)
        #final_output = self.layer_norm2(normalized_values1 + fc_layer_output)
        #fc_layer_output = self.fc_layer2(self.dropout(self.relu(self.fc_layer(normalized_values1))))
        #return final_output
        #fc_layer_output = self.fc_layer(normalized_values1)
        fc_layer_output = self.relu(self.fc_layer(normalized_values1))
        return fc_layer_output

In [21]:
class DecoderOnlyTransformerBlockTransformer(nn.Module):
    def __init__(self, num_tokens, d_model, max_len, using_mask=True):
        super(DecoderOnlyTransformerBlockTransformer, self).__init__()
        self.number_heads = 12
        self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)
        self.pe = PositionalEncoding(d_model=d_model, max_len=max_len)
        self.decoder_block1 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)
        self.decoder_block2 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)
        self.decoder_block3 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)
        #self.decoder_block4 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)
        #self.decoder_block5 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)
        #self.decoder_block6 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)

        self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, token_ids):
        word_embeddings = self.we(token_ids)
        position_encoded = self.pe(word_embeddings)

        if self.decoder_block1.using_mask:
            mask_ones = torch.ones((token_ids.size(dim=1), token_ids.size(dim=1)))
            mask = torch.tril(mask_ones)
            mask = mask == 0
        else:
            mask = None

        output_block1 = self.decoder_block1(position_encoded, mask=mask)
        output_block2 = self.decoder_block2(output_block1, mask=mask)
        output_block3 = self.decoder_block3(output_block2, mask=mask)
        #output_block4 = self.decoder_block4(output_block3, mask=mask)
        #output_block5 = self.decoder_block5(output_block4, mask=mask)
        #output_block6 = self.decoder_block6(output_block5, mask=mask)

        fc_layer_output = self.fc_layer(output_block3)

        return fc_layer_output

In [22]:
class DecoderOnlyTransformer(nn.Module):

    def __init__(self, num_tokens, d_model, max_len, using_mask=True):
        super().__init__()
        self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)
        self.pe = PositionalEncoding(d_model=d_model, max_len=max_len)
        self.self_attention = MultiHeadAttention(d_model=d_model, heads=8)
        self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens)
        self.loss = nn.CrossEntropyLoss()
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.using_mask = using_mask

    def forward(self, token_ids):
        # calculate word embeddings out of the tokens
        word_embeddings = self.we(token_ids)
        # apply positional encoding (using the PositionalEncoder layer) to the word embeddings
        position_encoded = self.pe(word_embeddings)
        # create mask for decoder only transformer so it can not cheat
        if (self.using_mask == True):
            mask_ones = torch.ones((token_ids.size(dim=1), token_ids.size(dim=1)))
            mask = torch.tril(mask_ones)
            mask = mask == 0
            # calculate self attention with the Attention Layer
            self_attention_values = self.self_attention(position_encoded, mask=mask)
        else:
            self_attention_values = self.self_attention(position_encoded)
        # add original position_encoded values to the calculated self attention values (residual connection)
        residual_connection_values = position_encoded + self_attention_values

        normalized_values1 = self.layer_norm1(residual_connection_values)
        fc_layer_output = self.fc_layer(normalized_values1)

        # use the final linear layer to calculate the output probabilities
        #fc_layer_output = self.fc_layer(residual_connection_values)

        return fc_layer_output

In [None]:
import datasets
pretraining_dataset = datasets.load_dataset(
    "upstage/Pretraining_Dataset",
    split="train"
)

In [23]:
import torch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

class Text8Dataset(Dataset):
    def __init__(self, file_path, sequence_length):
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            self.text = f.read()
        self.min_length = 100
        self.text = self.remove_punctuation(self.text)
        self.sequence_length = sequence_length
        print(f'sequence length: {self.sequence_length}')
        self.vocab = self.create_vocabulary(file_path)
        print(f'created vocab size: {len(self.vocab)}')
        for i, (word, count) in enumerate(self.vocab.items()):
            if i >= 10:
                break
            print(f'{word}: {count}')
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}
        words = self.text.split()
        self.text_as_int = [self.word_to_idx[word] for word in words if word in self.word_to_idx]

    def remove_punctuation(self, text):
        # Definiere die zu entfernenden Zeichen
        punctuation = [',', '"', "'", '.', ';', ':', '!', '?', '_', '“', '‘', '(', ')']

        # Ersetze jedes Zeichen in punctuation durch einen leeren String
        for char in punctuation:
            text = text.replace(char, '')

        return text

    def create_vocabulary(self, file_path):
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            text = f.read()
        vocab = defaultdict(int)
        index = 0

        words = text.split(' ')
        for word in words:
            word = word.lower()
            if word not in vocab:
                vocab[word] = index
                index += 1

        # Add special tokens
        vocab['<EOS>'] = index
        index += 1
        vocab['<PAD>'] = index
        return vocab

    def __len__(self):
        return len(self.text_as_int) - self.sequence_length

    def __getitem__(self, idx):
        input_seq = self.text_as_int[idx:idx+self.sequence_length]
        target_seq = self.text_as_int[idx+1:idx+self.sequence_length+1]
        return torch.tensor(input_seq), torch.tensor(target_seq)

# Beispiel wie man das Dataset nutzt
sequence_length = 100  # Länge der Sequenz
#file_path = './datasets/text8.txt'  # Pfad zur Text8-Datei
#file_path = './datasets/alice.txt'  # Pfad zur alice-Datei
#file_path = "/content/drive/MyDrive/datasets/alice.txt"
file_path = './datasets/treatise_of_human_nature.txt'

dataset = Text8Dataset(file_path, sequence_length)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Zugriff auf ein Batch
for input_seq, target_seq in dataloader:
    print(input_seq.shape)  # (batch_size, sequence_length)
    print(target_seq.shape)  # (batch_size, sequence_length)
    break

token_to_id = dataset.vocab
print(len(token_to_id))
id_to_token = dict(map(reversed, token_to_id.items()))
print(len(id_to_token))

sequence length: 100
created vocab size: 7040
the: 0
project: 1
gutenberg: 2
ebook: 3
of: 4
alice's: 5
adventures: 6
in: 7
wonderland
: 8
: 9
torch.Size([32, 100])
torch.Size([32, 100])
7040
7040


In [29]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
max_len = 100
token_to_id = dataset.vocab
#print(token_to_id)
id_to_token = dict(map(reversed, token_to_id.items()))
print(len(token_to_id))

#dimension_model = 768
dimension_model = 256

transformer_model = DecoderOnlyTransformerBlockTransformer(num_tokens=len(token_to_id), d_model=dimension_model, max_len=max_len)
transformer_model.to(device)
optimizer = Adam(transformer_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)
criterion = nn.CrossEntropyLoss()

epochs = 200
for epoch in range(epochs):
    transformer_model.train()
    epoch_loss = 0
    total_loss = 0
    for data in dataloader:
        optimizer.zero_grad()
        input_tokens, labels = data
        input_tokens = input_tokens.to(device)  # Move inputs to GPU if available
        labels = labels.to(device)  # Move labels to GPU if available
        # Debugging: Ausgabe der maximalen und minimalen Werte von input_seq
        #print(f"Input Seq - Max Index: {input_seq.max().item()}, Min Index: {input_seq.min().item()}")
        #print(input_tokens.shape)
        prediction = transformer_model(input_tokens)
        prediction = prediction.view(-1, prediction.size(-1))  # [batch_size * seq_length, num_tokens]
        labels = labels.view(-1)  # [batch_size * seq_length
        loss = criterion(prediction, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * input_tokens.size(0)
        epoch_loss += loss.item()
    scheduler.step(epoch_loss)
    average_loss = total_loss / len(dataloader.dataset)
    print(f"Epoch {epoch}, Train Loss: {average_loss}")

7040


  attention_percents = F.softmax(scaled_sims)


Epoch 0, Train Loss: 5.826795768115005
Epoch 1, Train Loss: 4.922311894898297
Epoch 2, Train Loss: 4.140790677517488
Epoch 3, Train Loss: 3.5592738381173006
Epoch 4, Train Loss: 3.0866622420103837
Epoch 5, Train Loss: 2.726003172904943
Epoch 6, Train Loss: 2.449965819159129
Epoch 7, Train Loss: 2.2485720697992906
Epoch 8, Train Loss: 2.0911445463192027
Epoch 9, Train Loss: 1.9251614667897956
Epoch 10, Train Loss: 1.7946303841034659
Epoch 11, Train Loss: 1.6910913865303996
Epoch 12, Train Loss: 1.590948512895691
Epoch 13, Train Loss: 1.5041507127256493
Epoch 14, Train Loss: 1.4301396057144922
Epoch 15, Train Loss: 1.3600492415239303
Epoch 16, Train Loss: 1.2862502792876203
Epoch 17, Train Loss: 1.2364798078867794
Epoch 18, Train Loss: 1.1698355076220208
Epoch 19, Train Loss: 1.1246915960340198
Epoch 20, Train Loss: 1.0786409505945704
Epoch 21, Train Loss: 1.0353536207998852
Epoch 22, Train Loss: 0.9960346187043662
Epoch 23, Train Loss: 0.965520780092179
Epoch 24, Train Loss: 0.924665947

In [30]:
# Testtexte
test_texts = [
    "The quick brown fox jumps over the lazy", #dog
    "She sells seashells by the", #seashore
    "How much wood would a woodchuck chuck if a woodchuck could chuck", #wood
    "To be or not to be, that is the", #question.
    "All that glitters is not", #gold
    "A journey of a thousand miles begins with a single", #step
    "Beauty is in the eye of the", #beholder
    "Actions speak louder than", #words
    "The early bird catches the", #worm
    "A picture is worth a thousand" #words,
    "Once upon a time, in a land far, far away, there lived a brave ", #knight.
    "The stars in the night sky were bright and beautiful, lighting up the ", #darkness.
    "In the middle of the forest, there was a small, hidden cottage made of ", #gingerbread.
    "He who laughs last laughs ", #longest.
    "Every cloud has a silver ", #lining.
    "It's always darkest before the ", #dawn.
    "When the going gets tough, the tough get", #going.
    "Two heads are better than ", #one.
    "A watched pot never ", #boils.
    "Honesty is the best ", #policy.
    "Alice was not a bit hurt, and she",
    "Alice opened the door and found that",
    "After a while, finding that nothing more happened",
    "Just then her head struck",
    "As she said this she looked down at her", #hands
    "won’t talk about cats or", #hands
    "easy to"
]

def string_to_model_input(input_string):
    # Split the input string into tokens
    tokens = input_string.lower().split()

    model_input = []
    for token in tokens:
        if token in token_to_id:
            model_input.append(token_to_id[token])

    model_input.append(token_to_id['<EOS>'])
    model_input_tensor = torch.tensor(model_input)
    return model_input_tensor

# Schleife zum Testen des Transformers
for text in test_texts:
    model_input_expanded = string_to_model_input(text)
    model_input_expanded = model_input_expanded.to(device)
    model_input_expanded = model_input_expanded.unsqueeze(0)
    input_length = model_input_expanded.size(dim=0)
    predictions = transformer_model(model_input_expanded)

    last_predictions = predictions[-1, :]

    max_index = torch.argmax(last_predictions[-1,:])

    predicted_id = torch.tensor([max_index], device=device)
    predicted_ids = predicted_id

    for id in predicted_ids:
        topk_values, topk_indices = torch.topk(last_predictions[-1,:], k=5)

        # Convert top indices to tokens
        predicted_ids = topk_indices
        possible_tokens = " ["
        for id in predicted_ids:
            possible_tokens += id_to_token[id.item()] + ", "
        possible_tokens += "] "

        #print(f"{id_to_token[id.item()]}")
        print(f"{text} - {id_to_token[id.item()]} - {possible_tokens}")

  attention_percents = F.softmax(scaled_sims)


The quick brown fox jumps over the lazy - could -  [an, all, course, just, could, ] 
She sells seashells by the - about -  [well, all, was, very, about, ] 
How much wood would a woodchuck chuck if a woodchuck could chuck - but -  [herself, thought, is, think, but, ] 
To be or not to be, that is the - to -  [herself, thought, but, must, to, ] 
All that glitters is not - see -  [never, herself, got, eyes, see, ] 
A journey of a thousand miles begins with a single - any -  [don’t, me, these, few, any, ] 
Beauty is in the eye of the - and -  [sat, to, the, of, and, ] 
Actions speak louder than - nothing -  [that’s, thought, to, would, nothing, ] 
The early bird catches the - herself -  [nothing, one, the, any, herself, ] 
A picture is worth a thousandOnce upon a time, in a land far, far away, there lived a brave  - would -  [her, very, the, off, would, ] 
The stars in the night sky were bright and beautiful, lighting up the  - much -  [herself, talking, it, was, much, ] 
In the middle of t