In [111]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [112]:
print('length of dataset in characters: ', len(text)) 

length of dataset in characters:  1115389


In [113]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [114]:
# get all characters occurred in dataset
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(ch for ch in chars))
print('vocab size: ', vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size:  65


In [115]:
# create a mapping from character to integer
stoi = {s:i for i, s in enumerate(chars)}
itos = {i:s for i, s in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] # encode: take a string, output a list of integers
decode = lambda l: ''.join(itos[i] for i in l) # decode: take a list, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [116]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115389]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [117]:
# split data into train and validation sets
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [118]:
torch.manual_seed(1337)
batch_size = 4  # how many sequences will we process in parallel
block_size = 8  # what is the maximum context length of for predictions

def get_batch(split):
    # get a small batch of data of inputs x and targets y
    data = train_data if split=='train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y 

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size):    # batch dimension
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context.tolist()}, the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[59, 57,  1, 58, 56, 39, 47, 58],
        [30, 10,  0, 15, 53, 51, 43,  6],
        [34, 21, 26, 15, 17, 26, 32, 21],
        [27, 10,  0, 32, 46, 53, 59,  1]])
targets:
torch.Size([4, 8])
tensor([[57,  1, 58, 56, 39, 47, 58, 53],
        [10,  0, 15, 53, 51, 43,  6,  1],
        [21, 26, 15, 17, 26, 32, 21, 27],
        [10,  0, 32, 46, 53, 59,  1, 42]])
----
when input is [59], the target: 57
when input is [59, 57], the target: 1
when input is [59, 57, 1], the target: 58
when input is [59, 57, 1, 58], the target: 56
when input is [59, 57, 1, 58, 56], the target: 39
when input is [59, 57, 1, 58, 56, 39], the target: 47
when input is [59, 57, 1, 58, 56, 39, 47], the target: 58
when input is [59, 57, 1, 58, 56, 39, 47, 58], the target: 53
when input is [30], the target: 10
when input is [30, 10], the target: 0
when input is [30, 10, 0], the target: 15
when input is [30, 10, 0, 15], the target: 53
when input is [30, 10, 0, 15, 53], the target: 51
when i

In [119]:
print(xb.shape)
print(xb)

torch.Size([4, 8])
tensor([[59, 57,  1, 58, 56, 39, 47, 58],
        [30, 10,  0, 15, 53, 51, 43,  6],
        [34, 21, 26, 15, 17, 26, 32, 21],
        [27, 10,  0, 32, 46, 53, 59,  1]])


In [120]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):
        # idx and targets are both (B, T) tensor of integers
        logits = self.token_embedding_table(idx)    # (4, 8) -> (4, 8, 65)
        
        if targets == None:
            loss = None
        else: 
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)  # idx: (B, T), logits:(B*T, C)
            logits = logits[:, -1, :]
            props = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(props, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)
print(decode(m.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.5242, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [121]:
# Create a pytorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [122]:
batch_size = 32
for step in range(10000):
    # sample from a batch of data
    xb, yb = get_batch('train')
    
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=None)
    loss.backward()
    optimizer.step()
    
    print(loss.item())

4.773647785186768
4.707029819488525
4.7259931564331055
4.68586540222168
4.6611247062683105
4.728892803192139
4.702362060546875
4.737646102905273
4.792877197265625
4.80597448348999
4.752073764801025
4.7641167640686035
4.768823623657227
4.760959625244141
4.716278076171875
4.718400001525879
4.799113750457764
4.662482261657715
4.691165447235107
4.6693010330200195
4.650801658630371
4.608971118927002
4.749551296234131
4.71426248550415
4.669522285461426
4.745335578918457
4.67812442779541
4.654453277587891
4.6910786628723145
4.689185619354248
4.65010404586792
4.616771221160889
4.692744731903076
4.612875938415527
4.746123313903809
4.703676700592041
4.7789154052734375
4.714477062225342
4.631750106811523
4.598884105682373
4.7095255851745605
4.778952598571777
4.575927257537842
4.624885082244873
4.657304763793945
4.615592002868652
4.632367134094238
4.5890350341796875
4.6374616622924805
4.65617036819458
4.642502307891846
4.788652420043945
4.62267541885376
4.537405967712402
4.751932621002197
4.626575

In [124]:
print(decode(m.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=1000)[0].tolist()))



Cengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind ttid?
ig t ouchos tes; st yo hind wotin grotonear 'so it t jod weancotha:
h haybet--s n prids, r loncave w hollular s O:
HIs; ht anjx?

DUThinqunt.

LaZAnde.
athave l.
KEONH:
ARThanco be y,-hedarwnoddy scar t tridesar, wnl'shenous s ls, theresseys
PlorseelapinghiybHen yof GLUCEN the. sE:
I hisgothers je are!-e!
QLYotouciullle's fldrwertho s?
NDan'spererfo cist ripl chys er orlese;
Yo jehof h hecere ek? wf HEThot mowo soaf yoit, ince his, t, f at. fal thetrimy bupof tor atha By!
JOutho f cimimave.
NEDUSt cir sella p wie wede
Ro n apenor f'Y tover witys an sh d w t e w!
CEOntiretoaveEd the we n ck. cung.
ORIsthies hacin benqurd bll, d a r w wistatsowor ath
Fivet blall ang a-I theeancusemee tsce larry t I lag sze t
A hy thit,
n.
Faure ds ppplirn!
meftou ow uring, avewist th;
TENTEMETCo gienco, An he waro whilagoueas s imuror?
Bu ne-ingof acat nd l,
Fothin f by y:
ARDUTA llllld!
AMQUThes med thestw cos wand herf s hafo

In [135]:
a = nn.Linear(20, 30)
c = torch.randn((3,30, 20))
b = a(c)
b.shape

torch.Size([3, 30, 30])