In [16]:
import torch

In [3]:
# Read the entire corpus file into memory as a single string.
with open("shakespeare.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [5]:
# Peek at the first 1000 characters to sanity-check the loaded text.
text[:1000]

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [6]:
# Total length of the corpus, in characters.
len(text)

1115394

In [9]:
# Vocabulary: every distinct character that occurs in the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
# Newline-separated: the character list, the characters joined, the count.
print(chars, ''.join(chars), vocab_size, sep='\n')

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [13]:
# Lookup tables between characters and their integer ids (position in `chars`).
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [stoi[c] for c in s]


def decode(i):
    """Return the string spelled by the iterable of integer ids `i`."""
    return ''.join(itos[d] for d in i)

In [15]:
# Round-trip check: show the ids for a sample string, then the decoded text.
print(encode('hii there!'), decode(encode('hii there!')), sep='\n')

[46, 47, 47, 1, 58, 46, 43, 56, 43, 2]
hii there!


In [17]:
# Encode the whole corpus as a tensor of 64-bit integer token ids
# (torch.int64 is the same dtype as torch.long).
data = torch.tensor(encode(text), dtype=torch.int64)

In [19]:
# Show the first 100 entries of the encoded corpus.
data[:100]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [22]:
# Inspect the container type, element dtype and shape of `data`.
type(data), data.dtype, data.shape

(torch.Tensor, torch.int64, torch.Size([1115394]))

In [28]:
# Split point: the first 90% of the entries are used for training,
# the remaining entries are held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [29]:
# Show the shape of the training split.
print(train_data.shape)

torch.Size([1003854])


In [45]:
# Seed PyTorch's global RNG so the sampled batches are reproducible.
# This must be a *call*: assigning `torch.manual_seed = 1337` replaces the
# function with an int, leaves the generator unseeded, and makes any later
# `torch.manual_seed(...)` call fail. If the old assignment was already
# executed in this kernel, restart the kernel before re-running.
torch.manual_seed(1337)
batch_size = 4  # number of sequences sampled per batch
block_size = 8  # number of tokens in each sampled sequence

def get_batch(split, verbose=True):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects
            ``val_data``.
        verbose: When true (the default, which keeps the original output),
            print the input batch's shape, the input batch itself and the
            sampled start offsets. Pass ``False`` to silence this, e.g. when
            calling repeatedly from a training loop.

    Returns:
        A tuple ``(x, y)`` of stacked windows, each with ``batch_size`` rows
        of ``block_size`` entries. ``y`` holds the same windows as ``x``
        shifted one position forward in the source, so ``y[b, t]`` is the
        entry that follows ``x[b, :t + 1]``.
    """
    source = train_data if split == 'train' else val_data
    # randint's upper bound is exclusive, so the largest start offset is
    # len(source) - block_size - 1 and the shifted target window
    # [i + 1, i + 1 + block_size) never runs past the end of the data.
    ix = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i:i + block_size] for i in ix])
    y = torch.stack([source[i + 1:i + 1 + block_size] for i in ix])
    if verbose:
        print(x.shape)
        print(x)
        print(ix)
    return x, y
    
xb, yb = get_batch('train')

# Spell out every (context, next entry) pair contained in the sampled batch:
# for row b and position t, the context is the row's first t + 1 inputs and
# the target is the matching entry of yb.
for b, row in enumerate(xb):
    for t in range(block_size):
        context = row[:t + 1]
        target = yb[b, t]
        print(f"When input is {context.tolist()} the target is {target}")

torch.Size([4, 8])
tensor([[ 1, 61, 56, 43, 58, 41, 46, 43],
        [10,  0, 32, 46, 43, 56, 43,  1],
        [20, 27, 30, 32, 17, 26, 31, 21],
        [57, 47, 50, 60, 43, 56,  1, 57]])
tensor([243712, 149213, 990673, 557150])
When input is [1] the target is 61
When input is [1, 61] the target is 56
When input is [1, 61, 56] the target is 43
When input is [1, 61, 56, 43] the target is 58
When input is [1, 61, 56, 43, 58] the target is 41
When input is [1, 61, 56, 43, 58, 41] the target is 46
When input is [1, 61, 56, 43, 58, 41, 46] the target is 43
When input is [1, 61, 56, 43, 58, 41, 46, 43] the target is 42
When input is [10] the target is 0
When input is [10, 0] the target is 32
When input is [10, 0, 32] the target is 46
When input is [10, 0, 32, 46] the target is 43
When input is [10, 0, 32, 46, 43] the target is 56
When input is [10, 0, 32, 46, 43, 56] the target is 43
When input is [10, 0, 32, 46, 43, 56, 43] the target is 1
When input is [10, 0, 32, 46, 43, 56, 43, 1] the ta

In [49]:
import torch.nn as nn

class BigramModel(nn.Module):
    """Model whose only layer is a ``vocab_size x vocab_size`` embedding table.

    Each input token id selects one row of the table, and that row is returned
    as the ``vocab_size`` scores (logits) for that position.
    """

    def __init__(self, vocab_size):
        """Create the embedding table.

        Args:
            vocab_size: Number of distinct token ids; used as both the number
                of embeddings and the embedding dimension.
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up the logits for every token id in ``idx``.

        Args:
            idx: Integer tensor of token ids.
            targets: Currently unused. Optional so callers that only need
                logits do not have to pass a dummy value; existing
                ``model(idx, targets)`` calls keep working unchanged.

        Returns:
            Tensor of logits with the shape of ``idx`` plus a trailing
            dimension of size ``vocab_size``.
        """
        logits = self.token_embedding_table(idx)
        return logits
    
# Instantiate the model over the corpus vocabulary and run one forward pass
# on the sample batch; the returned logits are displayed as the cell output.
m = BigramModel(vocab_size=vocab_size)
m(idx=xb, targets=yb)

tensor([[[-0.0860,  0.5680, -0.1164,  ...,  0.7800, -0.7668, -0.1259],
         [-0.2464, -0.1141, -0.9482,  ...,  1.3305, -1.3975,  0.6340],
         [ 0.1722,  1.2483, -0.2914,  ..., -0.5909,  0.6979, -1.3956],
         ...,
         [-0.4518,  0.0034, -1.6165,  ...,  0.6285, -0.9934, -0.4570],
         [-0.7593, -1.0071,  0.3124,  ...,  0.0314, -0.3278,  0.2390],
         [-0.3086, -0.0265, -0.0835,  ...,  0.2177,  0.6768, -1.7555]],

        [[-0.1271, -0.4112, -1.3993,  ...,  0.5765, -1.4168, -0.1157],
         [-0.8526,  1.0479,  0.9910,  ...,  0.9044,  1.0173,  1.2960],
         [-0.6925,  1.3320, -0.6304,  ...,  1.4915,  0.3075,  0.6991],
         ...,
         [ 0.1722,  1.2483, -0.2914,  ..., -0.5909,  0.6979, -1.3956],
         [-0.3086, -0.0265, -0.0835,  ...,  0.2177,  0.6768, -1.7555],
         [-0.0860,  0.5680, -0.1164,  ...,  0.7800, -0.7668, -0.1259]],

        [[ 0.6177, -1.9278, -0.4505,  ...,  0.3308, -0.3623, -0.8878],
         [ 0.0205, -0.0395, -0.4427,  ...,  1