<a href="https://colab.research.google.com/github/pinkfloyed/Transformer_Scratch_Code/blob/main/Transformer_From_Scratch_encoding_decoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import torch, torch.nn as nn, torch.optim as optim, torch.utils.data as data, math, copy

In [7]:
from numpy.ma.core import indices
from torch.ao.nn.quantized import ReLU6
from torch.nn import ReLU
from transformers import pipeline

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()

        self.dimension_model = d_model
        self.num_heads = num_heads
        self.dimension_head = d_model // num_heads

        self.weight_query = nn.Linear(d_model, d_model)
        self.weight_key = nn.Linear(d_model, d_model)
        self.weight_value = nn.Linear(d_model, d_model)
        self.weight_output = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, query, key, value, mask = None):
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.dimension_head)
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
        attention_probs = torch.softmax(attention_scores, dim = -1)
        output = torch.matmul(attention_probs, value)
        return output

    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.dimension_head).transpose(1,2)

    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1,2).contiguous().view(batch_size, seq_length, self.dimension_model)

    def forward(self, query, key, value, mask = None):
        query = self.split_heads(self.weight_query(query))
        key = self.split_heads(self.weight_key(key))
        value = self.split_heads(self.weight_value(value))

        attention_output = self.scaled_dot_product_attention(query, key, value, mask)
        output = self.weight_output(self.combine_heads(attention_output))
        return output

In [9]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
       super().__init__()

       pe = torch.zeros(max_seq_length, d_model)
       position = torch.arange(0, max_seq_length, dtype = torch.float).unsqueeze(1)
       div_term = torch.exp(torch.arange(0, d_model, 2).float()* -(math.log(10000.0)/d_model))

       pe[:, ::2] = torch.sin(position*div_term)
       pe[:, 1::2] = torch.cos(position * div_term)

       self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [11]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [12]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_out, enc_out, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [13]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList(
            [EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList(
            [DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask


    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)

        return output


    def generate(self, src, start_token, max_length, temperature=1.0, top_k=None):
        self.eval()
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)

        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        generated = torch.tensor([[start_token]], dtype=torch.long)

        for _ in range(max_length - 1):
            tgt_mask = (generated != 0).unsqueeze(1).unsqueeze(2)
            seq_length = generated.size(1)
            nopeak_mask = (1 - torch.triu(torch.ones(seq_length, seq_length), diagonal=1)).bool()
            tgt_mask = tgt_mask & nopeak_mask

            tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(generated)))
            dec_output = tgt_embedded
            for dec_layer in self.decoder_layers:
                dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

            logits = self.fc(dec_output[:, -1, :]) / temperature
            if top_k is not None:
                top_k = min(top_k, logits.size(-1))
                values, indices = torch.topk(logits, top_k)
                logits = torch.full_like(logits, float('-inf')).scatter(-1, indices, values)

            probs = nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            generated = torch.cat([generated, next_token], dim=1)

            if len(generated[0]) == max_length:
                break
        return generated

In [14]:
with open('/content/input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [15]:
chars = sorted(list(set(text)))
v_size = len(chars)

In [16]:
src_vocab_size = v_size
tgt_vocab_size = v_size
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length =100
dropout = 0.1

In [17]:
string_to_int = {c: i for i, c in enumerate(chars)}
int_to_string = {i: c for i, c in enumerate(chars)}

In [18]:
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda i: ''.join(int_to_string[c.item()] for c in i)

In [19]:
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)
context_with_given_input = torch.tensor([encode('hello')], dtype = torch.long)
src = torch.tensor([[1]])
generated = transformer.generate(
    src = context_with_given_input,
    start_token = 2,
    max_length = max_seq_length,
    temperature = 0.7,
    top_k = None
)
print(decode(generated[0]))

!PÑe,VuZQ:;HimU[ï&kTI”ÑYn#rQWe
y%fïI’ éq“72'aN&'’E—L5Y3G,]ap‘/ñ!uÑKé8N‘q%_mO!Ñu“huYÑGV1QA‘iXO,VuWPGO


In [20]:
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))

In [21]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0065, betas=(0.9, 0.98), eps=1e-9)
transformer.train()

Transformer(
  (encoder_embedding): Embedding(95, 512)
  (decoder_embedding): Embedding(95, 512)
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (weight_query): Linear(in_features=512, out_features=512, bias=True)
        (weight_key): Linear(in_features=512, out_features=512, bias=True)
        (weight_value): Linear(in_features=512, out_features=512, bias=True)
        (weight_output): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(

In [22]:
for epoch in range(5):
    optimizer.zero_grad()
    output = transformer(src_data, tgt_data[:, :-1])
    loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:,1:].contiguous().view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch : {epoch+1}, Loss : {loss.item()}")

generated = transformer.generate(
    src=src,
    start_token =2,
    max_length= max_seq_length,
    temperature = 0.7,
    top_k = None
)
print(generated)

Epoch : 1, Loss : 4.7122063636779785
Epoch : 2, Loss : 5.265438079833984
Epoch : 3, Loss : 5.3804545402526855
Epoch : 4, Loss : 5.289546489715576
Epoch : 5, Loss : 5.362061023712158
tensor([[ 2, 64, 69,  2,  7, 92,  4, 52, 92, 92, 40, 69,  4, 15, 37, 79, 19, 90,
         37, 44, 80,  4, 42, 70, 52,  4,  4, 90, 18, 92,  4, 71, 38, 34,  1, 90,
         79, 87, 69, 42,  4, 87, 22,  4, 42, 31,  4, 89, 71, 92, 62, 90, 34, 86,
         92, 38,  4, 19, 71, 29, 33, 92, 27, 70,  4, 53, 88, 21, 73, 52, 70, 87,
         66, 34, 44, 25, 90, 42, 34, 40, 89, 88, 37, 92, 79,  7, 10,  4, 31,  7,
         43,  7,  3,  4, 80, 92, 38, 80,  4, 92]])


In [23]:
decoded_output = decode(generated[0])
print(decoded_output)

!gl!&”#X””Ll#/Iv3’IPw#NmX##’2”#nJF ’vñlN#ñ6#NC#‘n”e’Fï”J#3nAE”;m#Y—5pXmñiFP9’NFL‘—I”v&)#C&O&"#w”Jw#”


In [24]:
def generate_text(prompt, method='nucleus', max_length=100, temperature=0.7, top_k=50, top_p=0.9, num_beams=3):
    generator = pipeline("text-generation", model="gpt2")

    if method == 'beam':
        output = generator(prompt, max_length=max_length, num_return_sequences=1, num_beams=num_beams, early_stopping=True)
    elif method == 'top_k':
        output = generator(prompt, max_length=max_length, do_sample=True, top_k=top_k, temperature=temperature)
    elif method == 'nucleus':
        output = generator(prompt, max_length=max_length, do_sample=True, top_p=top_p, temperature=temperature)
    else:
        raise ValueError("Invalid decoding method. Choose from 'beam', 'top_k', or 'nucleus'.")

    return output[0]['generated_text']


In [25]:
if __name__ == "__main__":
    prompt = "Once upon a time"
    print("Beam Search:", generate_text(prompt, method='beam'))
    print("Top-K Sampling:", generate_text(prompt, method='top_k'))
    print("Nucleus Sampling:", generate_text(prompt, method='nucleus'))

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Beam Search: Once upon a time, there was a man in a black suit who wanted to kill everyone on Earth. He was a man in a black suit who wanted to kill everyone on Earth.

This man was the man who was responsible for the destruction of Earth. He was the man who was responsible for the destruction of Earth.

The man was the man who was responsible for the destruction of Earth.

The man was the man who was responsible for the destruction of Earth.




Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Top-K Sampling: Once upon a time, we were talking about the "big three" - the NFL and the NFLPA. The NFL, which is a small league with a relatively small population, is a small league, with only 1,000 people in the NBA, 2,500 in the NFL and 3,100 in the NBA. The NFL is a small league with a relatively small population, with only 1,000 people in the NBA, 2,500 in the NFL and 3,100 in the NBA


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Nucleus Sampling: Once upon a time, we had been given the opportunity to make a decision about the future of the world. But we didn't.

Today, the world is still divided. We are not ready for a democratic society, and we are not ready to create a new one. We are not ready to see the future of our children, our grandchildren, our children's children. We are not ready to see a future of a peaceful world. We are not ready to see the future of our
