In [1]:
import torch
import numpy as np

# Broadcasting 

## pytorch

In [2]:
t_data = torch.tensor([[1, 1, 1],
                  [5, 3, 2],
                  [10, 10, 80]
                 ])

In [3]:
t_data.shape

torch.Size([3, 3])

In [4]:
wrong_norms = t_data.sum(axis=1)
print(wrong_norms)
print(wrong_norms.shape)

tensor([  3,  10, 100])
torch.Size([3])


In [5]:
t_data / wrong_norms

tensor([[0.3333, 0.1000, 0.0100],
        [1.6667, 0.3000, 0.0200],
        [3.3333, 1.0000, 0.8000]])

In [6]:
correct_norms = t_data.sum(axis=1, keepdim=True)
print(correct_norms)
print(correct_norms.shape)

tensor([[  3],
        [ 10],
        [100]])
torch.Size([3, 1])


In [7]:
t_data / correct_norms

tensor([[0.3333, 0.3333, 0.3333],
        [0.5000, 0.3000, 0.2000],
        [0.1000, 0.1000, 0.8000]])

## numpy

In [8]:
n_data = np.array([[1, 1, 1],
                  [5, 3, 2],
                  [10, 10, 80]
                 ])

In [9]:
n_data.shape

(3, 3)

In [10]:
wrong_norms = n_data.sum(axis=1,)
print(wrong_norms)
print(wrong_norms.shape)

[  3  10 100]
(3,)


In [11]:
n_data / wrong_norms

array([[0.33333333, 0.1       , 0.01      ],
       [1.66666667, 0.3       , 0.02      ],
       [3.33333333, 1.        , 0.8       ]])

In [12]:
correct_norms = n_data.sum(axis=1, keepdims=True)
print(correct_norms)
print(correct_norms.shape)

[[  3]
 [ 10]
 [100]]
(3, 1)


In [13]:
n_data / correct_norms

array([[0.33333333, 0.33333333, 0.33333333],
       [0.5       , 0.3       , 0.2       ],
       [0.1       , 0.1       , 0.8       ]])

# Backpropagation

In [14]:
W = torch.tensor([[4., 5, 6],
                  [1, 2, 3]
                 ], requires_grad=True)


In [15]:
W.dtype

torch.float32

In [16]:
x = torch.tensor([[3., 3]])
x.shape

torch.Size([1, 2])

In [17]:
y_pred = x @ W
y_pred

tensor([[15., 21., 27.]], grad_fn=<MmBackward0>)

In [18]:
y_true = torch.tensor([[5, 1, 7]])

In [19]:
loss = ((y_pred - y_true)**2).sum()


In [20]:
loss

tensor(900., grad_fn=<SumBackward0>)

In [21]:
W.grad = None
loss.backward()

In [22]:
W.grad 

tensor([[ 60., 120., 120.],
        [ 60., 120., 120.]])

# Models


## Model with frequency counts 

In [23]:
file_path = '../names.txt'

In [24]:
# reading the names from the file, one name per line
# (the context manager closes the file handle as soon as the content is read;
# splitlines() does not produce a spurious empty name when the file ends with a newline)
with open(file_path, 'r') as names_file:
    names = names_file.read().splitlines()

In [25]:
names[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [26]:
names[:5][:-1]

['emma', 'olivia', 'ava', 'isabella']

In [27]:
# Building the bigrams dataset: X collects the first character of every bigram,
# y collects the character that follows it
X = []
y = []
for name in names:

    # wrapping the name with the delimiter marking both its start and its end
    word = '.' + name + '.'

    # consecutive characters form the bigrams: every character but the last one is an input,
    # every character but the first one is the matching target
    X.extend(word[:-1])
    y.extend(word[1:])


In [28]:
# Vocabulary: every distinct character appearing in X, in sorted order
vocabulary = sorted(set(X))
len(vocabulary)

27

In [29]:
# Lookup tables between characters and their numerical indexes:
# itos maps index -> character, stoi is the inverse mapping (character -> index)
itos = dict(enumerate(vocabulary))
stoi = {ch: idx for idx, ch in itos.items()}

In [30]:
# Building the bigram frequency matrix: C[i, j] counts how many times
# the character with index j follows the character with index i

# one row and one column per vocabulary character, every count starting at zero
vocab_size = len(vocabulary)
C = torch.zeros((vocab_size, vocab_size))

# tallying every (current character, next character) pair of the dataset
for x_i, y_i in zip(X, y):

    # both characters are mapped to their numerical indexes to address the matrix cell
    C[stoi[x_i], stoi[y_i]] += 1

# Add-one smoothing, so that no bigram is left with a zero count
C += 1

In [31]:
# Normalising the frequency matrix so that every row is a probability distribution
row_sums = C.sum(axis=1, keepdim=True)
C = C / row_sums

In [32]:
# Checking that each row sums to 1
C.sum(axis=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [33]:
# Generating names from the learnt distribution

def generate_name(M, generator=None):
    """
    Generates a name, given a bigram probability distribution matrix (M).

    Sampling starts from the word delimiter '.' and, at every step, draws the next
    character from the row of M associated with the current character, until the
    delimiter is drawn again.

    Parameters
    ----------
    M : torch.Tensor
        Matrix indexed as M[current character index, next character index];
        each row is used as the sampling weights for the next character.
    generator : torch.Generator, optional
        Random number generator forwarded to torch.multinomial. Pass a seeded
        generator to obtain reproducible names; when omitted, the global torch
        random state is used (the original behaviour).

    Returns
    -------
    str
        The generated name, without the delimiter characters.

    Notes
    -----
    Relies on the notebook-level dictionaries `stoi` (character -> index)
    and `itos` (index -> character).
    """

    # starting with the word delimiter character
    ch = '.'
    generated = []

    while True:
        # converting to index
        idx = stoi[ch]

        # getting the row of M holding the distribution of the next character
        dist = M[idx]

        # getting a sample from the probability distribution
        idx = torch.multinomial(dist, 1, replacement=True, generator=generator).item()

        # converting index to character
        ch = itos[idx]

        # stopping when delimiter char is reached
        if ch == '.':
            break
        else:
            generated.append(ch)

    return "".join(generated)

for i in range(10):
    print(generate_name(C))

zarr
awigeinnemt
ahiahaki
chrairar
aikemian
ckait
tymiahas
ka
mi
o


## Neural network model

In [34]:
# Initializing model
W = torch.rand((len(vocabulary), len(vocabulary)), requires_grad=True)

In [35]:
W.min(), W.max()

(tensor(0.0005, grad_fn=<MinBackward1>),
 tensor(0.9994, grad_fn=<MaxBackward1>))

In [36]:
# Test:
# Computing the average loss function, one sample at a time
# (reference implementation used to cross-check the vectorized version)

log_loss = 0
for x_i, y_i in zip(X,y):

    # converting characters to index
    x_i_index = stoi[x_i]
    y_i_index = stoi[y_i]

    # converting index to one hot encoding vector
    input_ =  torch.nn.functional.one_hot(torch.tensor(x_i_index), len(vocabulary)).float()

    # multiplying ohe vector with weights matrix to select the row corresponding to the current character
    row = input_ @ W

    # normalizing the row to get a probability distribution (softmax)
    # (the division is done out of place: an in-place `/=` would overwrite the output of exp,
    # which autograd keeps to compute the backward pass, making this loss impossible to backpropagate)
    row = torch.exp(row)
    row = row / row.sum()

    # getting the predicted probability for next character in the training set
    pred_p = row[y_i_index]

    # updating log_loss accumulator
    log_loss += torch.log(pred_p)

# computing the average negative log_loss
avg_neg_log_loss = - log_loss / len(X)
avg_neg_log_loss

tensor(3.3329, grad_fn=<DivBackward0>)

In [37]:
# Converting the datasets to numerical tensors
Xt = torch.tensor([stoi[x_i] for x_i in X])
yt = torch.tensor([stoi[y_i] for y_i in y])
Xt.shape, yt.shape,

(torch.Size([228146]), torch.Size([228146]))

In [38]:
# Applying One Hot encoding to the numerical tensors
Xoh = torch.nn.functional.one_hot(Xt, len(vocabulary)).float()
yoh = torch.nn.functional.one_hot(yt, len(vocabulary)).float()
Xoh.shape, yoh.shape,

(torch.Size([228146, 27]), torch.Size([228146, 27]))

In [39]:
# normalizing each row to get a probability distribution (softmax)
P = torch.exp(W) /  torch.exp(W).sum(axis=1, keepdims=True)
P.sum(axis=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
       grad_fn=<SumBackward1>)

In [40]:
# Computing the log loss in a vectorized way

# Multiplying the input OHE matrix by P, so that every sample of X selects the appropriate row from P
# (the one corresponding to the first character of the bigram)
rows = Xoh @ P

# Selecting for each row the appropriate element
# (the one corresponding to the second character of the bigram)
output = rows[torch.arange(len(yt)), yt ]

# computing the average, negative log loss
anll =  - output.log().mean()

In [41]:
anll

tensor(3.3329, grad_fn=<NegBackward0>)

In [42]:
# backward pass to compute gradients
W.grad = None
anll.backward()

In [43]:
# updating the weights using the gradient
# (in place and under no_grad: W stays the same leaf tensor, so later backward passes keep
# filling W.grad; `W = W - 0.1 * W.grad` would rebind W to a non-leaf result recorded in the graph)
with torch.no_grad():
    W -= 0.1 * W.grad

In [44]:
# Performing several training steps

# Initializing model
W = torch.rand((len(vocabulary), len(vocabulary)), requires_grad=True)

steps = 30
learning_rate = 3

######################################################
# Applying One Hot encoding to the numerical tensors #
######################################################
# (the encoded inputs do not depend on W, so they are computed once, outside the loop)
Xoh = torch.nn.functional.one_hot(Xt, len(vocabulary)).float()

for i in range(steps):

    ##############################################
    # Computing the log loss in a vectorized way #
    ##############################################

    # Multiplying the input OHE matrix by W, so that every sample of X selects the appropriate row from W
    # (the one corresponding to the first character of the bigram)
    rows = Xoh @ W

    # normalizing each row to get a probability distribution (softmax);
    # the exponentials are computed once and reused for numerator and denominator
    exp_rows = torch.exp(rows)
    P = exp_rows / exp_rows.sum(dim=1, keepdim=True)

    # Selecting for each row the appropriate element
    # (the one corresponding to the second character of the bigram)
    output = P[torch.arange(len(yt)), yt]

    # computing the average, negative log loss
    anll = - output.log().mean()
    print(anll)

    ##############################################
    #    backward pass to compute gradients      #
    ##############################################
    W.grad = None
    anll.backward()

    ##############################################
    #   updating the weights using the gradient  #
    ##############################################
    # in-place update under no_grad: W stays a leaf tensor and the update itself is not
    # recorded by autograd (used here instead of writing through W.data)
    with torch.no_grad():
        W -= learning_rate * W.grad


tensor(3.3561, grad_fn=<NegBackward0>)
tensor(3.3388, grad_fn=<NegBackward0>)
tensor(3.3218, grad_fn=<NegBackward0>)
tensor(3.3051, grad_fn=<NegBackward0>)
tensor(3.2888, grad_fn=<NegBackward0>)
tensor(3.2729, grad_fn=<NegBackward0>)
tensor(3.2572, grad_fn=<NegBackward0>)
tensor(3.2419, grad_fn=<NegBackward0>)
tensor(3.2270, grad_fn=<NegBackward0>)
tensor(3.2124, grad_fn=<NegBackward0>)
tensor(3.1981, grad_fn=<NegBackward0>)
tensor(3.1842, grad_fn=<NegBackward0>)
tensor(3.1706, grad_fn=<NegBackward0>)
tensor(3.1573, grad_fn=<NegBackward0>)
tensor(3.1444, grad_fn=<NegBackward0>)
tensor(3.1318, grad_fn=<NegBackward0>)
tensor(3.1196, grad_fn=<NegBackward0>)
tensor(3.1077, grad_fn=<NegBackward0>)
tensor(3.0960, grad_fn=<NegBackward0>)
tensor(3.0847, grad_fn=<NegBackward0>)
tensor(3.0738, grad_fn=<NegBackward0>)
tensor(3.0631, grad_fn=<NegBackward0>)
tensor(3.0527, grad_fn=<NegBackward0>)
tensor(3.0426, grad_fn=<NegBackward0>)
tensor(3.0328, grad_fn=<NegBackward0>)
tensor(3.0232, grad_fn=<N