# Model

In [147]:
from mininlp.transformer import DTransformer
import json
from mininlp.data import Tokenizer, SequenceDataset
import os
import torch


# Checkpoint selection: VERSION is interpolated into the file names of the
# saved hyperparameter JSON and the weight file.
VERSION = 0.1
MODEL_NAME = f'decoder_transformer_v{VERSION}'

# Read the hyperparameters through a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open(f"../models/{MODEL_NAME}.json") as config_file:
    config = json.load(config_file)

# NOTE(review): the ".pkl" extension suggests a pickle file - only load
# tokenizer files from a trusted source.
tokenizer = Tokenizer()
tokenizer.load(os.path.join('../models', 'tokenizer.pkl'))

# Rebuild the network with the saved hyperparameters; the third argument is
# the tokenizer length.
model = DTransformer(
    config['layers'],
    config['embedding_dim'],
    len(tokenizer),
    config['seq_len'],
    config['heads'],
    config['factor'],
    True)  # final positional flag: meaning is defined by DTransformer - TODO confirm

# Restore the trained weights, then move the model to the GPU. The bare
# `model.to(...)` expression stays last so the notebook displays the module tree.
state_dict = torch.load(f"../models/{MODEL_NAME}.pt")
model.load_state_dict(state_dict)
model.to('cuda')

DTransformer(
  (_embedding): Embedding(
    (_token_embedding): Embedding(105, 512)
  )
  (_decoders): ModuleList(
    (0-5): 6 x Decoder(
      (_laynorm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (_laynorm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (_mmha): MultiHeadAttention(
        (_projection): ModuleList(
          (0-2): 3 x Linear(in_features=512, out_features=512, bias=True)
        )
        (_reprojection): Linear(in_features=512, out_features=512, bias=True)
      )
      (_mha): MultiHeadAttention(
        (_projection): ModuleList(
          (0-2): 3 x Linear(in_features=512, out_features=512, bias=True)
        )
        (_reprojection): Linear(in_features=512, out_features=512, bias=True)
      )
      (_ff): FeedForward(
        (_laynorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (_dropout): Dropout(p=0.2, inplace=False)
        (_ff): Sequential(
          (0): Linear(in_features=512, out_features=2048, 

In [148]:
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Reload the tokenizer and build the dataset over the text file.
tokenizer = Tokenizer()
tokenizer.load("../models/tokenizer.pkl")
dataset = SequenceDataset('../data/anna.txt', tokenizer, config['seq_len'], 1)

model.eval()
with torch.no_grad():
    # Named `sample` rather than `input` so the builtin `input()` is not
    # shadowed for the rest of the kernel session.
    sample = dataset[0][0].unsqueeze(0)  # add a leading dimension of size 1
    output = model(sample.to('cuda'))
    # Softmax over the last axis of the final sequence position. No
    # `.detach()` is needed because no graph is recorded under `no_grad`.
    probs = F.softmax(output[0, -1, :], dim=0).cpu()

# One bar per token id, labelled with the decoded token.
plt.figure(figsize=(20, 5))
plt.bar(tokenizer.decode(torch.arange(len(probs))), probs)
plt.xticks(rotation=90)
plt.xlabel("Token")
plt.ylabel("Probability")
plt.title("Predicted token distribution at the last sequence position")
plt.show()

In [149]:
# Decode the first dataset sequence, mark where it ends with "<msk>", then
# append whatever the model generates from that same sequence.
model.eval()
text = [*tokenizer.decode(dataset[0][0]), "<msk>"]
with torch.no_grad():
    prompt = dataset[0][0].unsqueeze(0).to('cuda')
    # 500 is passed straight to `generate`; presumably the number of tokens
    # to produce - confirm against DTransformer.generate.
    generated = model.generate(prompt, 500)
text.extend(tokenizer.decode(generated))

In [150]:
# Drop every "<pad>" token, then print the remaining tokens as one string.
text = list(filter(lambda token: token != "<pad>", text))
print(str.join("", text))

<sos> all,
ridiculous people, who believe that one husband ought to live with the
one wife whom he has lawfully married; that a girl should be innocent, a
woman modest, and a man manly, self-controlled, and strong; that one
ought to bring up one's children, earn one's bread, and pay one's debts;
and various similar absurdities. This was the class of old-fashioned and
ridiculous people. But there was another class of people, the real
people. To this class they all belonged, and in it the great thing was
to be elegant, generous, plucky, gay, to abandon oneself without a blush
to every passion, and to laugh at everything else.

For the first moment only, Vronsky was startled after the impression of
a quite different world that he had brought with him from Moscow. But
immediately as though slipping his feet into old slippers, he dropped
back into the light-hearted, pleasant world he had always lived in.

The coffee was never really made, but spluttered over every one, and
boiled away, doin

In [151]:
# Even numbers from 4 to 20 inclusive (nine elements).
arr = list(range(4, 21, 2))
# A slice whose stop index is past the end is clamped to the list length
# rather than raising an IndexError.
arr[5:40]

[14, 16, 18, 20]

### Gradient Accumulation

In [152]:
import torch 
import torch.nn.functional as F

# Toy regression problem: three samples with three features each, and one
# target value per sample. Written as nested literals so the (3, 3) and
# (3, 1) shapes are visible at a glance.
data = torch.tensor([
    [4.0, 6.0, 8.0],
    [10.0, 12.0, 14.0],
    [16.0, 18.0, 20.0],
])
labels = torch.tensor([[10.0], [20.0], [30.0]])

In [153]:
# Seed the global RNG so the randomly initialised weights - and therefore
# every gradient shown below - are the same on each Restart & Run All.
torch.manual_seed(0)

# NOTE: this rebinds `model`, replacing the transformer loaded earlier in the
# notebook with a tiny 3-input / 1-output linear layer for the gradient demo.
model = torch.nn.Linear(3, 1)
model(data)

tensor([[-2.0627],
        [-3.9297],
        [-5.7967]], grad_fn=<AddmmBackward0>)

Without gradient accumulation (one backward pass over the full batch)

In [154]:
# Clear gradients left over from an earlier run of this cell. `backward()`
# adds into `.grad`, so without this a re-run would inflate the reference
# gradient that the accumulation cells are compared against.
model.zero_grad()

# Reference: mean-squared error over all three samples in one backward pass.
loss = F.mse_loss(model(data), labels)
loss.backward()
model.weight.grad, model.bias.grad

(tensor([[-573.5305, -669.2493, -764.9683]]), tensor([-47.8594]))

With gradient accumulation (reset the gradients, then backpropagate one sample at a time, with each loss scaled by 1/3)

In [155]:
# Restart the accumulation: sample index back to 0 and gradients zeroed.
i = 0
# Build the zeros from the parameters themselves rather than from `.grad`,
# so this cell also works when `.grad` is still None (no backward pass yet).
model.weight.grad = torch.zeros_like(model.weight)
model.bias.grad = torch.zeros_like(model.bias)
model.weight.grad, model.bias.grad

(tensor([[0., 0., 0.]]), tensor([0.]))

In [156]:
# One accumulation step per execution of this cell: backpropagate sample `i`
# with its loss divided by 3 (the number of samples), then advance `i`.
# `backward()` adds into the existing `.grad` tensors.
x_i = data[i]
y_i = labels[i]
loss = F.mse_loss(model(x_i), y_i).div(3)
loss.backward()
i = i + 1
(model.weight.grad, model.bias.grad)

(tensor([[-32.1672, -48.2509, -64.3345]]), tensor([-8.0418]))