In [1]:
import torch
import torch.nn.functional as F
import matplotlib as plt
%matplotlib inline

In [7]:
#read the dataset and print a few summary facts about it
#a context manager closes the file handle deterministically instead of
#leaving it open until the garbage collector gets to it
with open('names.txt','r') as f:
    words=f.read().splitlines() #one list entry per line of the file
print(len(words)) #number of entries
print(max(len(w) for w in words)) #length of the longest entry
print(words[:5]) #peek at the first five entries

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia']


In [10]:
#character vocabulary plus lookup tables in both directions
chars=sorted(set(''.join(words)))
#string to integer: characters are numbered from 1 in sorted order
stoi=dict(zip(chars,range(1,len(chars)+1)))
stoi['.']=0 #index 0 is reserved for the '.' marker used for the beginning/end
#integer to string: the inverse mapping of stoi
itos=dict((i,s) for s,i in stoi.items())
vocab_size=len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [11]:
#shuffle the words in place, using a fixed seed so the order is repeatable
#NOTE: random.shuffle mutates `words`; re-running this cell shuffles the
#already-shuffled list again, so the resulting order depends on how many
#times the cell has been run since `words` was loaded
import random 
random.seed(42)
random.shuffle(words)

In [12]:
#dataset construction
block_size=8 #context length: number of preceding characters used per example

def build_dataset(words):
    """Turn a list of words into (context, next-character) training tensors.

    Each word is terminated with '.' and every character of the terminated
    word yields one example: the `block_size` indices that precede it
    (left-padded with 0) as the input row, and its own index as the target.
    Character indices are looked up in the module-level `stoi` mapping.

    Prints the shapes of the two tensors and returns them as (X, Y).
    """
    inputs,targets=[],[]
    for word in words:
        #pad on the left so the first character already has a full context
        encoded=[0]*block_size+[stoi[ch] for ch in word+'.']
        for pos in range(block_size,len(encoded)):
            inputs.append(encoded[pos-block_size:pos]) #the preceding window
            targets.append(encoded[pos]) #the character to predict

    X=torch.tensor(inputs)
    Y=torch.tensor(targets)
    print(X.shape,Y.shape)
    return X,Y


#cut points for the split: the first 80% of the words, then the next 10%
n1,n2=(int(frac*len(words)) for frac in (0.8,0.9))

#train (80%), dev (10%) and test (10%) tensors; each call prints its shapes
Xtr,Ytr=build_dataset(words[:n1])
Xdev,Ydev=build_dataset(words[n1:n2])
Xte,Yte=build_dataset(words[n2:])


torch.Size([182625, 8]) torch.Size([182625])
torch.Size([22655, 8]) torch.Size([22655])
torch.Size([22866, 8]) torch.Size([22866])


In [13]:
#print the first 20 training examples as "context ---> next character"
for x,y in zip(Xtr[:20],Ytr[:20]):
    print(''.join([itos[ix] for ix in x.tolist()]),'--->',itos[y.item()])

........ ---> y
.......y ---> u
......yu ---> h
.....yuh ---> e
....yuhe ---> n
...yuhen ---> g
..yuheng ---> .
........ ---> d
.......d ---> i
......di ---> o
.....dio ---> n
....dion ---> d
...diond ---> r
..diondr ---> e
.diondre ---> .
........ ---> x
.......x ---> a
......xa ---> v
.....xav ---> i
....xavi ---> e


In [23]:
#minimal layer classes that closely mirror their torch.nn counterparts
class Linear:
    """Fully connected layer computing ``x @ weight`` plus an optional bias.

    Args:
        fan_in: size of the last dimension of the input.
        fan_out: size of the last dimension of the output.
        bias: when False the layer has no bias term.
    """

    def __init__(self,fan_in,fan_out,bias=True):
        #zero-mean Gaussian weights scaled by 1/sqrt(fan_in) (Kaiming-style
        #scaling); torch.randn is required here because torch.rand draws from
        #[0,1) and would make every weight non-negative
        self.weight=torch.randn((fan_in,fan_out))/fan_in**0.5
        self.bias=torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        """Apply the layer to `x`, cache the result on `self.out`, and return it."""
        self.out=x @ self.weight
        if self.bias is not None:
            self.out+=self.bias
        return self.out

    def parameters(self):
        """Return the trainable tensors: the weight, plus the bias if present."""
        return [self.weight]+([] if self.bias is None else [self.bias])
    

class BatchNorm1d:
    """Batch normalization over the trailing (feature) dimension.

    In training mode the input is normalized with statistics computed from
    the batch itself (over dim 0 for 2-D input, over dims (0, 1) for 3-D
    input) and the running statistics are updated. When `self.training` is
    False the stored running statistics are used instead.

    Args:
        dim: number of features (size of the last input dimension).
        eps: small constant added to the variance for numerical stability.
        momentum: weight given to the current batch statistics when the
            running statistics are updated.
    """

    def __init__(self,dim,eps=1e-5,momentum=0.1):
        self.eps=eps
        self.momentum=momentum
        self.training=True
        #parameters (trained with backprop)
        self.gamma=torch.ones(dim)
        self.beta=torch.zeros(dim)
        #buffers (updated with an exponential moving average, not by backprop)
        self.running_mean=torch.zeros(dim)
        self.running_var=torch.ones(dim)

    def __call__(self,x):
        """Normalize `x`, cache the result on `self.out`, and return it.

        Raises:
            ValueError: in training mode, if `x` is neither 2-D nor 3-D.
        """
        if self.training:
            #reduce over every dimension except the trailing feature one
            if x.ndim==2:
                dim=0
            elif x.ndim==3:
                dim=(0,1)
            else:
                raise ValueError(f"BatchNorm1d expects a 2-D or 3-D input, got {x.ndim}-D")
            xmean=x.mean(dim,keepdim=True) #batch mean
            xvar=x.var(dim,keepdim=True) #batch variance
        else:
            xmean=self.running_mean
            xvar=self.running_var
        xhat=(x-xmean)/torch.sqrt(xvar+self.eps) #normalize to unit variance
        self.out=self.gamma*xhat+self.beta #scale and shift
        #update the buffers
        if self.training:
            with torch.no_grad():
                #exponential moving average of the batch statistics; the new
                #term must be momentum * statistic, not the bare momentum.
                #Because of keepdim=True the buffers take the broadcastable
                #shape of xmean/xvar after the first update.
                self.running_mean=(1-self.momentum)*self.running_mean+self.momentum*xmean
                self.running_var=(1-self.momentum)*self.running_var+self.momentum*xvar
        return self.out

    def parameters(self):
        """Return the trainable tensors: gamma (scale) and beta (shift)."""
        return [self.gamma,self.beta]
    
class Tanh:
    """Element-wise hyperbolic tangent activation with no trainable state."""

    #the method must be spelled __call__ (two leading underscores) for
    #instances to be callable, which Sequential relies on
    def __call__(self,x):
        """Apply tanh to `x`, cache the result on `self.out`, and return it."""
        self.out=torch.tanh(x)
        return self.out

    def parameters(self):
        """Return an empty list: this layer has nothing to train."""
        return []
    

class Embedding:
    """Lookup table mapping integer indices to embedding vectors.

    Args:
        num_embeddings: number of rows in the table (distinct indices).
        embedding_dim: length of each embedding vector.
    """

    def __init__(self,num_embeddings,embedding_dim):
        #initialize the lookup table from a standard normal distribution, as
        #torch.nn.Embedding does; torch.rand would give only values in [0,1)
        self.weight=torch.randn((num_embeddings,embedding_dim))

    def __call__(self,IX):
        """Index the table with integer tensor `IX`.

        The result has the shape of `IX` with a trailing `embedding_dim`
        axis; it is cached on `self.out` and returned so the layer can be
        chained with others.
        """
        self.out=self.weight[IX]
        return self.out

    def parameters(self):
        """Return the trainable tensors: the embedding table."""
        return [self.weight]
    
class FlattenConsecutive:
    """Merge every `n` consecutive positions of a (B, T, C) tensor.

    The result has shape (B, T//n, C*n); when only a single group remains
    the middle axis is dropped, giving (B, C*n).
    """

    def __init__(self,n):
        self.n=n #how many consecutive positions are concatenated together

    def __call__(self,x):
        """Regroup `x`, cache the result on `self.out`, and return it."""
        batch,steps,channels=x.shape #[batch, positions, features]
        merged=x.view(batch,steps//self.n,channels*self.n)
        #a lone remaining group carries no sequence information: drop that axis
        self.out=merged.squeeze(1) if merged.shape[1]==1 else merged
        return self.out

    def parameters(self):
        """Return an empty list: this layer has nothing to train."""
        return []
    

class Sequential:
    """Container that applies a list of layers one after another."""

    def __init__(self,layers):
        self.layers=layers

    def __call__(self,x):
        """Forward pass: feed `x` through each layer in order.

        The final result is cached on `self.out` and returned.
        """
        result=x
        for layer in self.layers:
            result=layer(result)
        self.out=result
        return self.out

    def parameters(self):
        """Return the parameters of all layers as one flat list, in layer order."""
        collected=[]
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

In [24]:
# seed PyTorch's global RNG so the randomly initialised layers are reproducible
torch.manual_seed(42); # trailing ';' suppresses the notebook's echo of the return value


In [27]:
n_embd=24 #dimensionality of each character embedding vector
n_hidden=128 #number of units in every hidden Linear layer

#hierarchical model: each FlattenConsecutive(2) concatenates adjacent pairs
#of positions, so three stages reduce a context of 8 positions to one vector
model=Sequential([
    Embedding(vocab_size,n_embd),
    #stage 1: pairs of character embeddings
    FlattenConsecutive(2),
    Linear(n_embd*2,n_hidden,bias=False),
    BatchNorm1d(n_hidden),
    Tanh(),
    #stage 2: pairs of stage-1 outputs
    FlattenConsecutive(2),
    Linear(n_hidden*2,n_hidden,bias=False),
    BatchNorm1d(n_hidden),
    Tanh(),
    #stage 3: pairs of stage-2 outputs
    FlattenConsecutive(2),
    Linear(n_hidden*2,n_hidden,bias=False),
    BatchNorm1d(n_hidden),
    Tanh(),
    #output layer: one score per vocabulary entry
    Linear(n_hidden,vocab_size),
])

#shrink the output layer's weights so the initial predictions are less confident
with torch.no_grad():
    model.layers[-1].weight*=0.1

parameters=model.parameters()
print(sum(p.nelement() for p in parameters)) #total number of scalar parameters

#turn on gradient tracking for every parameter so backprop can update it
for p in parameters:
    p.requires_grad=True


76579
