In [1]:
import torch
import torch.nn as nn

In [2]:
# Hyperparameters shared by every model component defined in this notebook.
config = dict(
    vocab_size=50257,     # number of distinct token ids
    context_length=1024,  # maximum number of positions per sequence
    d_model=768,          # embedding / hidden width
    num_heads=12,         # attention heads per block
    num_layers=12,        # number of stacked transformer blocks
    drop_rate=0.1,        # dropout probability
    qkv_bias=False,       # whether the query/key/value projections carry a bias
)

In [67]:
class MultiHeadAttention(nn.Module):
    """Causal (masked) multi-head self-attention.

    Args:
        T: Maximum sequence length; sizes the pre-computed causal mask.
        d_model: Width of the input and output embeddings.
        num_heads: Number of attention heads; must divide ``d_model``.
        drop_rate: Dropout probability applied to the attention weights.
        qkv_bias: Whether the query/key/value projections use a bias term.
    """

    def __init__(self, T, d_model, num_heads, drop_rate, qkv_bias=False):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model should be divisible by num_heads"

        self.T = T
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.Wq = nn.Linear(d_model, d_model, bias=qkv_bias)
        self.Wk = nn.Linear(d_model, d_model, bias=qkv_bias)
        self.Wv = nn.Linear(d_model, d_model, bias=qkv_bias)

        self.dropout = nn.Dropout(drop_rate)
        self.out_proj = nn.Linear(d_model, d_model)

        # Build the upper-triangular "future positions" mask once. A buffer
        # follows the module across devices; persistent=False keeps it out of
        # the state_dict so existing checkpoints still load unchanged.
        self.register_buffer(
            "causal_mask",
            torch.triu(torch.ones(T, T), diagonal=1).bool(),
            persistent=False,
        )

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape (B, seq_len, d_model) with seq_len <= T.

        Returns:
            Tensor of shape (B, seq_len, d_model).

        Raises:
            ValueError: If the sequence is longer than the configured ``T``.
        """
        B, seq_len, _ = x.shape
        if seq_len > self.T:
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum context length {self.T}"
            )

        queries = self.Wq(x)   # (B, seq_len, d_model)
        keys = self.Wk(x)      # (B, seq_len, d_model)
        values = self.Wv(x)    # (B, seq_len, d_model)

        # Split d_model into heads, then move the head axis in front of time:
        # (B, seq_len, d_model) -> (B, num_heads, seq_len, head_dim)
        queries = queries.view(B, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(B, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(B, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Raw attention scores
        attention_scores = queries @ keys.transpose(2, 3)   # (B, num_heads, seq_len, seq_len)

        # Hide future positions; the slice lets sequences shorter than T work.
        mask = self.causal_mask[:seq_len, :seq_len]
        masked_attention_scores = attention_scores.masked_fill(mask, -torch.inf)
        attention_weights = torch.softmax(
            masked_attention_scores / self.head_dim ** 0.5, dim=-1
        )   # (B, num_heads, seq_len, seq_len)
        attention_weights = self.dropout(attention_weights)

        # Context vectors per head
        Z = attention_weights @ values   # (B, num_heads, seq_len, head_dim)
        # Put the time axis back in front of the head axis BEFORE flattening;
        # otherwise the view interleaves tokens and heads.
        Z = Z.transpose(1, 2).contiguous().view(B, seq_len, self.d_model)
        Z = self.out_proj(Z)

        return Z

# Shape check: run MultiHeadAttention on one random batch.
B, T = 2, 4096
d_model, num_heads = 4608, 48
drop_rate = 0.5
X = torch.rand(B, T, d_model)
multiheadattention = MultiHeadAttention(
    T,
    d_model,
    num_heads,
    drop_rate,
)
out = multiheadattention(X)
out.shape


torch.Size([2, 4096, 4608])

In [68]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Args:
        emb_dim: Size of the normalized (last) dimension. When omitted, the
            notebook-level ``d_model`` variable is used, which keeps existing
            ``LayerNorm()`` call sites working.
        eps: Constant added to the variance for numerical stability.
    """

    def __init__(self, emb_dim=None, eps=1e-5):
        super(LayerNorm, self).__init__()
        if emb_dim is None:
            # Backward-compatible fallback: relies on a module-level `d_model`
            # being defined at construction time. Prefer passing emb_dim.
            emb_dim = d_model
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last axis, then apply scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # Biased variance (divide by N), computed over the feature axis.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [69]:
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coeff * cubic_term)
        return 0.5 * x * gate

In [70]:
class FeedForwardNetwork(nn.Module):
    """Feed-forward sub-layer: Linear (d_model -> 4*d_model), GELU, Linear (4*d_model -> d_model)."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg['d_model']
        hidden = 4 * width
        # Create the two projections in order so parameter initialization
        # consumes the random stream in the same sequence as before.
        expand = nn.Linear(width, hidden)
        contract = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, GELU(), contract)

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> contract stack."""
        out = self.layers(x)
        return out

In [71]:
# NOTE(review): this cell defines FeedForwardNetwork a second time; an earlier
# cell already declares a class with the same name and layers, and this later
# definition is the one in effect for the cells that follow.
class FeedForwardNetwork(nn.Module):
    """Two-layer MLP with a GELU in between and a 4x wider hidden layer."""

    def __init__(self, cfg):
        super().__init__()
        dim = cfg['d_model']
        stack = [
            nn.Linear(dim, 4 * dim),   # widen
            GELU(),                    # non-linearity
            nn.Linear(4 * dim, dim),   # project back to d_model
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the MLP to ``x`` and return the result."""
        return self.layers(x)

In [72]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each wrapped in a residual.

    ``cfg`` must provide 'context_length', 'd_model', 'num_heads',
    'drop_rate' and 'qkv_bias'.
    """

    def __init__(self, cfg):
        super().__init__()
        # LayerNorm() is built without arguments here, so its width is
        # whatever the notebook-level `d_model` is at construction time.
        self.norm1 = LayerNorm()
        self.norm2 = LayerNorm()
        self.drop_resd = nn.Dropout(cfg['drop_rate'])
        self.ffn = FeedForwardNetwork(cfg)
        attn_kwargs = dict(
            T=cfg['context_length'],
            d_model=cfg['d_model'],
            num_heads=cfg['num_heads'],
            drop_rate=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )
        self.attn = MultiHeadAttention(**attn_kwargs)

    def forward(self, x):
        """Map x of shape (B, T, d_model) to a tensor of the same shape."""
        # Attention sub-layer: normalize, attend, dropout, add the residual.
        attn_out = self.attn(self.norm1(x))
        x = self.drop_resd(attn_out) + x

        # Feed-forward sub-layer, same normalize / transform / dropout / add pattern.
        ffn_out = self.ffn(self.norm2(x))
        x = self.drop_resd(ffn_out) + x

        return x

# Smoke test: push one random batch through a single TransformerBlock and
# compare the input and output shapes.
B = 2
T = config['context_length']
d_model = config['d_model']   # LayerNorm() picks up this module-level name
num_heads = config['num_heads']
X = torch.rand(B, T, d_model)
model = TransformerBlock(config)
out = model(X)

print('X: ', X.shape)
# Report the block's output shape (previously printed X.shape a second time).
print('out: ', out.shape)

X:  torch.Size([2, 1024, 768])
out:  torch.Size([2, 1024, 768])


In [73]:
class GPTModel(nn.Module):
    """GPT-style decoder: embeddings, a stack of transformer blocks, final norm, vocabulary head.

    ``cfg`` must provide 'vocab_size', 'context_length', 'd_model',
    'drop_rate' and 'num_layers', plus whatever TransformerBlock reads.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['d_model']
        vocab = cfg['vocab_size']
        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg['context_length'], width)
        self.dropout = nn.Dropout(cfg['drop_rate'])
        blocks = [TransformerBlock(cfg) for _ in range(cfg['num_layers'])]
        self.transformer_blocks = nn.Sequential(*blocks)
        self.final_norm = LayerNorm()
        self.out_head = nn.Linear(width, vocab, bias=False)

    def forward(self, inputs):
        """Map token ids of shape (B, T) to logits of shape (B, T, vocab_size)."""
        _, seq_len = inputs.shape
        positions = torch.arange(seq_len, device=inputs.device)   # (T,)
        tok_embeds = self.tok_emb(inputs)       # (B, T, d_model)
        pos_embeds = self.pos_emb(positions)    # (T, d_model), broadcast over the batch
        hidden = self.dropout(tok_embeds + pos_embeds)
        hidden = self.transformer_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

# Smoke test: build the full model and run one batch of random token ids.
model = GPTModel(config)
B, T = 2, config['context_length']
X = torch.randint(low=0, high=config['vocab_size'], size=(B, T))
out = model(X)

print('X: ', X.shape)
print('out: ', out.shape)

X:  torch.Size([2, 1024])
out:  torch.Size([2, 1024, 50257])


## Generating Text

This section converts the GPT model's output tensor (logits over the vocabulary) into token ids and then decodes those ids back into text.

In [74]:
def generate_text(model, encoded_input, max_new_tokens, context_size):
    """Greedily extend a batch of token-id sequences.

    Args:
        model: Callable mapping ids of shape (B, T) to logits of shape (B, T, vocab).
        encoded_input: Integer tensor of shape (B, T0) holding the prompt ids.
        max_new_tokens: Number of tokens to append to each sequence.
        context_size: Only the last ``context_size`` ids are fed to the model.

    Returns:
        Tensor of shape (B, T0 + max_new_tokens): the prompt followed by the
        generated ids.
    """
    token_ids = encoded_input
    for _step in range(max_new_tokens):
        # Feed only the most recent window of tokens to the model.
        window = token_ids[:, -context_size:]
        with torch.no_grad():
            all_logits = model(window)
        # Only the final position predicts the next token.
        next_probs = torch.softmax(all_logits[:, -1, :], dim=-1)
        next_token = torch.argmax(next_probs, dim=-1, keepdim=True)   # (B, 1)
        token_ids = torch.cat((token_ids, next_token), dim=1)

    return token_ids

In [77]:
# NOTE(review): this import sits mid-notebook rather than with the imports in the first cell.
import tiktoken

# Tokenizer used to turn text into token ids and back ('gpt2' encoding).
tok = tiktoken.get_encoding('gpt2')

# Prompt for the generation demo: several paragraphs of encyclopedic text about Mars.
text1 = """Mars is the fourth planet from the Sun. The surface of Mars is orange-red because it is covered in iron(III) oxide dust, giving it the nickname "the Red Planet".[21][22] Mars is among the brightest objects in Earth's sky and its high-contrast albedo features have made it a common subject for telescope viewing. It is classified as a terrestrial planet and is the second smallest of the Solar System's planets with a diameter of 6,779 km (4,212 mi). In terms of orbital motion, a Martian solar day (sol) is equal to 24.5 hours and a Martian solar year is equal to 1.88 Earth years (687 Earth days). Mars has two natural satellites that are small and irregular in shape: Phobos and Deimos.
The relatively flat plains in northern parts of Mars strongly contrast with the cratered terrain in southern highlands – this terrain observation is known as the Martian dichotomy. Mars hosts many enormous extinct volcanos (such as Olympus Mons, 21.9 km or 13.6 mi tall) and one of the largest canyons in the Solar System (Valles Marineris, 4,000 km or 2,500 mi long). Geologically, the planet is fairly active with marsquakes trembling underneath the ground, dust devils sweeping across the landscape, and cirrus clouds. Carbon dioxide is substantially present in Mars's polar ice caps and thin atmosphere. During a year, there are large surface temperature swings on the surface between −78.5 °C (−109.3 °F) to 5.7 °C (42.3 °F)[c] similar to Earth's seasons, as both planets have significant axial tilt.
Mars was formed approximately 4.5 billion years ago. During the Noachian period (4.5 to 3.5 billion years ago), Mars's surface was marked by meteor impacts, valley formation, erosion, and the possible presence of water oceans. The Hesperian period (3.5 to 3.3–2.9 billion years ago) was dominated by widespread volcanic activity and flooding that carved immense outflow channels. The Amazonian period, which continues to the present, was marked by the wind as a dominant influence on geological processes. Due to Mars's geological history, the possibility of past or present life on Mars remains of great scientific interest.
Since the late 20th century, Mars has been explored by uncrewed spacecraft and rovers, with the first flyby by the Mariner 4 probe in 1965, the first Mars orbiter by the Mars 2 probe in 1971, and the first landing by the Viking 1 probe in 1976. As of 2023, there are at least 11 active probes orbiting Mars or at the Martian surface. Mars is an attractive target for future human exploration missions, though in the 2020s no such mission is planned.
Scientists have theorized that during the Solar System's formation, Mars was created as the result of a random process of run-away accretion of material from the protoplanetary disk that orbited the Sun. Mars has many distinctive chemical features caused by its position in the Solar System. Elements with comparatively low boiling points, such as chlorine, phosphorus, and sulfur, are much more common on Mars than on Earth; these elements were probably pushed outward by the young Sun's energetic solar wind.[23]
After the formation of the planets, the inner Solar System may have been subjected to the so-called Late Heavy Bombardment. About 60% of the surface of Mars shows a record of impacts from that era,[24][25][26] whereas much of the remaining surface is probably underlain by immense impact basins caused by those events. However, more recent modelling has disputed the existence of the Late Heavy Bombardment.[27] There is evidence of an enormous impact basin in the Northern Hemisphere of Mars, spanning 10,600 by 8,500 kilometres (6,600 by 5,300 mi), or roughly four times the size of the Moon's South Pole–Aitken basin, which would be the largest impact basin yet discovered if confirmed.[28] It has been hypothesized that the basin was formed when Mars was struck by a Pluto-sized body about four billion years ago. The event, thought to be the cause of the Martian hemispheric dichotomy, created the smooth Borealis basin that covers 40% of the planet.[29][30]
A 2023 study shows evidence, based on the orbital inclination of Deimos (a small moon of Mars), that Mars may once have had a ring system 3.5 billion years to 4 billion years ago.[31] This ring system may have been formed from a moon, 20 times more massive than Phobos, orbiting Mars billions of years ago; and Phobos would be a remnant of that ring.[32][33]
The geological history of Mars can be split into many periods, but the following are the three primary periods:[34][35]
Noachian period: Formation of the oldest extant surfaces of Mars, 4.5 to 3.5 billion years ago. Noachian age surfaces are scarred by many large impact craters. The Tharsis bulge, a volcanic upland, is thought to have formed during this period, with extensive flooding by liquid water late in the period. Named after Noachis Terra.[36]
Hesperian period: 3.5 to between 3.3 and 2.9 billion years ago. The Hesperian period is marked by the formation of extensive lava plains. Named after Hesperia Planum.[36]
Amazonian period: between 3.3 and 2.9 billion years ago to the present. Amazonian regions have few meteorite impact craters but are otherwise quite varied. Olympus Mons formed during this period, with lava flows elsewhere on Mars. Named after Amazonis Planitia.[36]
Geological activity is still taking place on Mars. The Athabasca Valles is home to sheet-like lava flows created about 200 million years ago. Water flows in the grabens called the Cerberus Fossae occurred less than 20 million years ago, indicating equally recent volcanic intrusions.[37] The Mars Reconnaissance Orbiter has captured images of avalanches."""
# Tokenize the prompt and add a leading batch axis: (num_tokens,) -> (1, num_tokens).
prompt_ids = tok.encode(text1)
encoded = torch.tensor(prompt_ids).unsqueeze(0)

# Switch to evaluation mode (disables dropout) before generating.
model.eval()
out = generate_text(
    model,
    encoded,
    max_new_tokens=5,
    context_size=config['context_length'],
)
out.shape

torch.Size([1, 1282])

In [76]:
# Drop the batch axis, convert the ids to a plain list, and decode them to text.
generated_ids = out.squeeze(0).tolist()
decoded_text = tok.decode(generated_ids)
decoded_text

'Mars is the fourth planet from the Sun. The surface of Mars is orange-red because it is covered in iron(III) oxide dust, giving it the nickname "the Red Planet".[21][22] Mars is among the brightest objects in Earth\'s sky and its high-contrast albedo features have made it a common subject for telescope viewing. It is classified as a terrestrial planet and is the second smallest of the Solar System\'s planets with a diameter of 6,779 km (4,212 mi). In terms of orbital motion, a Martian solar day (sol) is equal to 24.5 hours and a Martian solar year is equal to 1.88 Earth years (687 Earth days). Mars has two natural satellites that are small and irregular in shape: Phobos and Deimos.\nThe relatively flat plains in northern parts of Mars strongly contrast with the cratered terrain in southern highlands – this terrain observation is known as the Martian dichotomy. Mars hosts many enormous extinct volcanos (such as Olympus Mons, 21.9 km or 13.6 mi tall) and one of the largest canyons in th