In [None]:
%load_ext autoreload
%autoreload 2
import torch
from torch import nn
from positional_encoding import LearnedPositionalEmbedding, SinusoidalPositionalEmbedding
from utils import Embedding
from config import Config as config

## Generate sample dataset

In [None]:
def generate_data(
    generator,
    vocab_size,
    batch_size=8,
    length=32,
    padding_idx=0,
    unk_idx=1,
    bos_idx=2,
    eos_idx=3,
):
    """Build a random, right-padded batch of token ids for smoke tests.

    Every row starts with ``bos_idx``, ends its real content with ``eos_idx``
    and is filled with ``padding_idx`` from its sampled length onwards.

    Args:
        generator: ``torch.Generator`` driving both random draws.
        vocab_size: exclusive upper bound of the sampled token ids (ids are
            drawn from ``[1, vocab_size)``).
        batch_size: number of rows.
        length: number of columns (padded sequence length).
        padding_idx: id written into the padded tail of each row.
        unk_idx: accepted for signature symmetry; not used by the body.
        bos_idx: id written into column 0 of every row.
        eos_idx: id written into the last non-padded column of every row.

    Returns:
        ``(tokens, tensor_length)`` where ``tokens`` has shape
        ``(batch_size, length)`` and ``tensor_length`` has shape
        ``(batch_size,)`` and holds the number of non-padded positions per row.
    """
    # Each row keeps at least half of the columns (and never fewer than one).
    shortest = max(1, length // 2)
    tokens = torch.randint(
        1, vocab_size, size=(batch_size, length), generator=generator, dtype=torch.long
    )
    # One cut-off column per row; everything at or after it becomes padding.
    cutoff = torch.randint(shortest, length, size=(batch_size, 1), generator=generator)
    columns = torch.arange(length)
    is_padding = columns >= cutoff
    tokens.masked_fill_(is_padding, padding_idx)
    tensor_length = (~is_padding).sum(dim=1)

    rows = torch.arange(batch_size)
    tokens[rows, 0] = bos_idx
    tokens[rows, tensor_length - 1] = eos_idx
    return tokens, tensor_length

In [None]:
generator = torch.Generator()
torch.manual_seed(config.seed)
generator.manual_seed(config.seed)


In [None]:
src_tensor, src_length = generate_data(generator, config.encoder.vocab_size, batch_size=config.batch_size,
                                      length=config.seq_length)

In [None]:
src_tensor

In [None]:
src_tensor.ne(0).sum(dim=1), src_length

In [None]:
tgt_tensor, tgt_length = generate_data(generator, 
                                       config.decoder.vocab_size, 
                                       batch_size=config.batch_size,
                                       length=config.seq_length
                                       )

In [None]:
tgt_tensor

In [None]:
tgt_tensor.size()

## Embedding

In [None]:
encoder_pos_embed = LearnedPositionalEmbedding(num_embeddings=config.seq_length,
                                               embedding_dim=config.encoder.embed_dim,
                                               padding_idx=0)


In [None]:
encoder_pos_embed(src_tensor)

In [None]:
encoder_pos_embed(src_tensor).shape

In [None]:
decoder_pos_embed = LearnedPositionalEmbedding(num_embeddings=config.seq_length,
                                               embedding_dim=config.decoder.embed_dim,
                                               padding_idx=0)

In [None]:
decoder_pos_embed(tgt_tensor)

In [None]:
decoder_pos_embed(tgt_tensor).shape

In [None]:
tgt_tensor.size(), src_tensor.size()

In [None]:
enc_pos_embed = SinusoidalPositionalEmbedding(num_embeddings=config.seq_length,
                                               embedding_dim=config.encoder.embed_dim,
                                               padding_idx=0)


In [None]:
enc_pos_embed.weights

In [None]:
enc_pos_embed.weights.dtype

In [None]:
enc_pos_embed = SinusoidalPositionalEmbedding(num_embeddings=config.seq_length,
                                               embedding_dim=config.encoder.embed_dim,
                                               padding_idx=0)
enc_pos_embed = enc_pos_embed.half()

In [None]:
enc_pos_embed.weights.dtype

In [None]:
enc_pos_embed.weights

In [None]:
enc_pos_embed._float_tensor

In [None]:
enc_pos_embed(src_tensor).dtype

In [None]:
enc_pos_embed.weights.dtype

In [None]:
enc_pos_embed = SinusoidalPositionalEmbedding(num_embeddings=config.seq_length,
                                               embedding_dim=config.encoder.embed_dim,
                                               padding_idx=0)
dec_pos_embed = SinusoidalPositionalEmbedding(num_embeddings=config.seq_length,
                                               embedding_dim=config.decoder.embed_dim,
                                               padding_idx=0)

In [None]:
enc_token_embed = Embedding(num_embeddings=config.encoder.vocab_size,
                            embedding_dim=config.encoder.embed_dim,
                            padding_idx=0)
dec_token_embed = Embedding(num_embeddings=config.decoder.vocab_size,
                            embedding_dim=config.decoder.embed_dim,
                            padding_idx=0)

In [None]:
enc_token_embed.weight

## Transformer Encoder

In [None]:
%load_ext autoreload
%autoreload 2
from model import TransformerModel

In [None]:
transformer = TransformerModel()

In [None]:
src_tensor.shape

In [None]:
src_tensor = src_tensor.cuda()
tgt_tensor = tgt_tensor.cuda()
transformer.train()
transformer.cuda()
print()

In [None]:
src_length

In [None]:
out = transformer(src_tensor, src_length, tgt_tensor)

In [None]:
out.size()

In [None]:
out.dtype

In [None]:
out[:, -1:, :].shape

In [None]:
out[0]

In [None]:
transformer.half()

In [None]:
next(transformer.parameters()).dtype

In [None]:
out = transformer(src_tensor, src_length, tgt_tensor)
out.dtype

In [None]:
out

In [None]:
transformer.enc_token_embed.weight.dtype

In [None]:
def generate(src_tensor, src_lengths, tgt_tensor, incremental_state):
    """Greedily decode five steps with the notebook-level ``transformer``.

    The source batch is encoded once, decoding starts from the first column
    of ``tgt_tensor``, and the growing decoder input is printed after every
    step. Nothing is returned.

    Args:
        src_tensor: source token ids passed to ``transformer.encoder``.
        src_lengths: per-row source lengths passed to ``transformer.encoder``.
        tgt_tensor: target token ids; only column 0 is used as the initial
            decoder input.
        incremental_state: state object forwarded unchanged to every
            ``transformer.decoder`` call.
    """
    transformer.eval()
    with torch.no_grad():
        # Encode with the lengths supplied by the caller rather than the
        # notebook-global ``src_length``, so the argument is actually honoured.
        enc_out, enc_key_pad = transformer.encoder(src_tensor, src_lengths)
        inps = tgt_tensor[:, :1]
        for _ in range(5):
            out = transformer.decoder(inps, enc_out, enc_key_pad, incremental_state)
            scores = torch.softmax(out, -1)
            max_idx = scores.argmax(dim=-1)
            # NOTE(review): appending ``max_idx`` as new columns presumably
            # relies on the decoder returning scores for the newest step only
            # when ``incremental_state`` is given -- confirm against the model.
            inps = torch.column_stack([inps, max_idx])
            print(inps)

incremental_state = dict()
generate(src_tensor, src_length, tgt_tensor, incremental_state)

In [None]:
transformer.decoder.embed_pos.weights.shape

## Dataloader

In [None]:
%load_ext autoreload
%autoreload 2
from dataset import LazyParallelDataset
from tokenizer import Tokenizer
import os
from config import Config as config
import torch

In [None]:
dataset_path = config.dataset.path

In [None]:
tokenizer = Tokenizer(vocab_fname=os.path.join(dataset_path, config.train.dataset.vocab_fname, ),
                      bpe_fname=os.path.join(dataset_path, config.train.dataset.bpe_fname,),
                      lang={'src': config.dataset.src.name,
                            'tgt': config.dataset.tgt.name}
                      )
tokenizer

In [None]:
tokenizer.tok2idx

In [None]:
train_src_fname = os.path.join(dataset_path, config.dataset.src.train_fname )
train_tgt_fname = os.path.join(dataset_path, config.dataset.tgt.train_fname  )
train_max_len = config.dataset.src.max_seq_len
train_min_len = config.dataset.src.min_seq_len
train_data = LazyParallelDataset(src_fname=train_src_fname,
                             tgt_fname=train_tgt_fname,
                             tokenizer=tokenizer,
                             min_len=train_min_len,
                             max_len=train_max_len,
                             sort=False,
                             max_size=config.dataset.size
                             )

In [None]:
len(train_data)

In [None]:
train_data.raw_src[0]

In [None]:
train_data.raw_tgt[0]

In [None]:
train_data.raw_src[150]

In [None]:
train_data.raw_tgt[150]

In [None]:
tokenizer.bpe.process_line("This is not taking a stand against the three intellectuals quoted here.")

In [None]:
tokenizer.tokenize("This is not taking a stand against the three intellectuals quoted here.")

In [None]:
[tokenizer.idx2tok[i] for i  in [2,   148,    35,    68,  1511,    17,  6984,   373,     7,   664,
        17338,  1787, 25129,  1875,     5,     3]]

In [None]:
tensor = tokenizer.tokenize("This is not taking a stand against the three intellectuals quoted here.")
tokenizer.detokenize(tensor.numpy())

In [None]:
generator = torch.Generator()

In [None]:
def get_epoch_seeds(generator, epochs):
    """Draw one integer seed per epoch from ``generator``.

    Args:
        generator: ``torch.Generator`` used for the draw.
        epochs: number of seeds to produce.

    Returns:
        A Python list of ``epochs`` ints, each in ``[0, 9_000_000)``.
    """
    upper_bound = int(9e6)
    drawn = torch.randint(0, upper_bound, size=(epochs,), generator=generator)
    return drawn.tolist()
        
seeds = get_epoch_seeds(generator, config.train.epochs)
seeds

In [None]:
train_loader = train_data.get_loader(batch_size=2,
                                     seeds=seeds,
                                     batch_first=True,
                                     shuffle=True, 
                                     batching="bucketing",
                                     batching_opt={'num_buckets': 5},
                                     num_workers=0,
                                     drop_last=True
                                     )

In [None]:
train_loader.sampler.set_epoch(2)

In [None]:
next(iter(train_loader))

In [None]:
# Decode and print the first row of the first batch (source, decoder input,
# decoder target), then stop iterating.
for i, batch in enumerate(train_loader):
    (src_tensor, src_length), (tgt_tensor, tgt_length), (dec_tgt_tensor, dec_tgt_length) = batch
    for shown in (src_tensor, tgt_tensor, dec_tgt_tensor):
        print(tokenizer.detokenize(shown.numpy()[0]))
    print()
    break

In [None]:
src_tensor[0]

In [None]:
tgt_tensor[0]

In [None]:
dec_tgt_tensor[0]

In [None]:
from loss import SoftmaxLoss

In [None]:
x = torch.arange(12).view(4, 3)

In [None]:
y = x.roll(-1, -1)
y

In [None]:
y[:, 0] = 5
x, y

In [None]:
src_tensor, src_length = generate_data(generator, config.encoder.vocab_size, 8)

In [None]:
src_tensor

In [None]:
loss = SoftmaxLoss()

In [None]:
out = torch.randn(src_tensor.size(0), src_tensor.size(1), 32)

In [None]:
src_tensor.size(), out.size()

In [None]:
s = loss( out, src_tensor)

In [None]:
s

In [None]:
s

In [None]:
s.shape

## Model reconstruction

In [None]:
%load_ext autoreload
%autoreload 2
from trainer import Trainer
import torch
# TODO eliminate TODO in step lamb
# TODO total_batch_size = 256 in config
# TODO check initialization of net
# TODO check Qlinear in multi head ent


In [None]:
from utils import Embedding

In [None]:
src = torch.tensor([[2, 4, 5, 0, 0, 0]])

In [None]:
embed = Embedding(num_embeddings=10, embedding_dim=6, padding_idx=0)

In [None]:
for w in embed.weight:
    print(w.requires_grad)

In [None]:
from torch import nn
def fair_Embedding(num_embeddings, embedding_dim, padding_idx):
    """Create an ``nn.Embedding`` with scaled-normal weights and a zero padding row.

    Args:
        num_embeddings: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        padding_idx: index of the row that is zeroed after initialisation.

    Returns:
        The initialised ``nn.Embedding`` module.
    """
    init_std = embedding_dim ** -0.5
    embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    # Re-draw every weight from N(0, embedding_dim ** -0.5) ...
    nn.init.normal_(embedding.weight, mean=0, std=init_std)
    # ... then reset the padding row, which the re-draw above overwrote.
    nn.init.constant_(embedding.weight[padding_idx], 0)
    return embedding


In [None]:
embed = fair_Embedding(num_embeddings=10, embedding_dim=6, padding_idx=0)

In [None]:
for w in embed.weight:
    print(w.requires_grad)

In [None]:
%load_ext autoreload
%autoreload 2
import torch
from config_fairseq import config
from trainer_fairseq import get_epoch_seeds, Trainer as FairSeqTrainer, get_dataloader
from trainer import Trainer
import torch
from utils import Embedding
from torch import nn
def fair_Embedding(num_embeddings, embedding_dim, padding_idx):
    """Create an ``nn.Embedding`` with scaled-normal weights and a zero padding row.

    Args:
        num_embeddings: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        padding_idx: index of the row that is zeroed after initialisation.

    Returns:
        The initialised ``nn.Embedding`` module.
    """
    init_std = embedding_dim ** -0.5
    embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    # Re-draw every weight from N(0, embedding_dim ** -0.5) ...
    nn.init.normal_(embedding.weight, mean=0, std=init_std)
    # ... then reset the padding row, which the re-draw above overwrote.
    nn.init.constant_(embedding.weight[padding_idx], 0)
    return embedding


In [None]:
generator = torch.Generator()
seeds = get_epoch_seeds(generator, config.train.epochs)
train_loader, tokenizer = get_dataloader(seeds)

In [None]:
fair_trainer = FairSeqTrainer(train_loader=train_loader, tokenizer=tokenizer,
                  generator=generator, seeds=seeds)

In [None]:
fair_trainer.model.encoder.embed_tokens

In [None]:
fair_trainer.model.encoder.embed_tokens.weight[0]

In [None]:
fair_trainer.model.encoder.embed_tokens.weight[0].requires_grad

In [None]:
fair_trainer.train()

In [None]:
fair_trainer.model.encoder.embed_tokens.weight[0]

In [None]:
fair_trainer.model.encoder.embed_positions(torch.tensor([[1, 2, 3, 0, 0]]).cuda())

In [None]:
model = fair_trainer.model

In [None]:
encoder = model.encoder

In [None]:
for m in encoder.embed_tokens.parameters():
    print(m.grad)

In [None]:
layer = encoder.layers[0]
layer

In [None]:
layer.quant_noise_block_size

In [None]:
list(layer.final_layer_norm.parameters())

In [None]:
layer.fc1_weight.requires_grad

In [None]:
fair_trainer.train()

In [None]:
for m in layer.parameters():
    if m.grad is None:
        print(m.grad is None, m.size())
        print(type(m))
        print(id(m))

In [None]:
nn.Parameter

In [None]:
for m in layer.self_attn.parameters():
    print(m.grad is None, m.size())

In [None]:
for m in layer.self_attn_layer_norm.parameters():
    print(m.grad is None, m.size())

In [None]:
for m in layer.fc1.parameters():
    print(m.grad is None, m.size())

In [None]:
for m in layer.fc2.parameters():
    print(m.grad is None, m.size())

In [None]:
for m in layer.final_layer_norm.parameters():
    print(m.grad is None, m.size())

In [None]:
for m in layer.activation_dropout_module.parameters():
    print(m.grad is None, m.size())

In [None]:
for buffer in layer.self_attn.buffers():
    print(buffer)

In [None]:
for child in layer.children():
    for param in child.parameters():
        print(param.grad is None)

In [None]:
for k, v in layer.self_attn.__dict__.items():
    print(id(v), id(v) == 140143996381760)

In [None]:
layer.quant_noise_block_size 

In [None]:
trainer = Trainer()
print(len(trainer.train_loader))

In [None]:
# state = torch.load("/mnt/dl/transformer/checkpoint-ep-00.pth")
# trainer.model.load_state_dict(state['model'])

In [None]:
trainer.train()

In [None]:
# for param in trainer.model.parameters():
#     print(param.device)

In [None]:
out = trainer.generate_torch_former(sample=52)

In [None]:
out.squeeze().argmax()

In [None]:
trainer.train(1)

In [None]:
from positional_encoding import PositionalEmbedding
from config import Config as config

In [None]:
embed_pos = PositionalEmbedding(num_embeddings=config.seq_length,
                                                 embedding_dim=config.encoder.embed_dim,
                                                 padding_idx=config.padding_idx)

In [None]:
x = torch.tensor([[2, 3, 4, 5, 0, 0, 0, 0],
                  [2, 4, 5, 3, 2, 5, 2, 0] ])

In [None]:
embed_pos(x).size()

In [None]:
trainer.model.embed_dec_pos.weights.size()

In [None]:
from torch.nn import Transformer

In [None]:
# state = torch.load("/mnt/dl/transformer/checkpoint-ep-00.pth")
# trainer.model.load_state_dict(state['model'])

In [None]:
# NOTE(review): none of the cells above import `time` (its import appears in a
# later cell) or assign `start`, so this cell only works with leftover kernel
# state and will fail under Restart & Run All -- confirm and fix or remove.
time.time() - start

In [None]:
trainer.scheduler.last_epoch

In [None]:
%load_ext autoreload
%autoreload 2
from trainer import Trainer
import time
import torch
# TODO eliminate TODO in step lamb
from positional_encoding import SinusoidalPositionalEmbedding


In [None]:
trainer = Trainer()

In [None]:
trainer.train()

In [None]:
trainer.model.embed_enc_pos(torch.tensor([[1, 2, 3, 0, 0,]]).cuda())

In [None]:
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('seaborn-v0_8')

In [None]:
plt.plot(trainer.lrs)

In [None]:
fig, ax = plt.subplots()
x = np.arange(1, 1001)
y = 1 - np.exp(-(x+100)/250)
ax.plot(x, y)

In [None]:
# 1 - exp(-(t + b) / a)
# t = 0 => lr1 = 1 - exp(-b/a) => log(1 / (1 - lr1)) = b / a
# t = 1000 => lr2 = 1 

In [None]:
lr2 = 1e-3
lr1 = 5e-5
s = 1000
c = 5
ratio = np.log(lr2 / (lr2 - lr1))
a = 1000. / (c - ratio)
b = a * ratio
x = np.arange(1, 1001)
y = lr2 * ( 1 - np.exp(-(x + b) / a ))
fig, ax = plt.subplots()
ax.plot(x, y)

In [None]:
y

In [None]:
y[-1] * 0.99**100

In [None]:
import math
def get_lrs_warmup_stage(lr, warmup_lr, last_epoch, warmup_steps=2000., constant=5):
    """Learning rate at step ``last_epoch`` of an exponential warm-up.

    The schedule is ``lr * (1 - exp(-(last_epoch + b) / a))`` where ``a`` and
    ``b`` are chosen so that the value is ``warmup_lr`` at step 0 and
    ``lr * (1 - exp(-constant))`` at step ``warmup_steps``.

    Args:
        lr: learning rate the schedule approaches.
        warmup_lr: learning rate at step 0.
        last_epoch: step index at which to evaluate the schedule.
        warmup_steps: step at which the schedule reaches
            ``lr * (1 - exp(-constant))``. Defaults to the previously
            hard-coded 2000.
        constant: exponent reached at ``warmup_steps``; larger values end the
            warm-up closer to ``lr``. Defaults to the previously hard-coded 5.

    Returns:
        The learning rate as a float.

    Raises:
        ValueError: raised by ``math.log`` when ``lr / (lr - warmup_lr)`` is
            not positive.
        ZeroDivisionError: when ``lr == warmup_lr`` or ``constant`` equals
            ``log(lr / (lr - warmup_lr))``.
    """
    # ratio == b / a, fixed by requiring the schedule to equal warmup_lr at step 0.
    ratio = math.log(lr / (lr - warmup_lr))
    # a follows from requiring (warmup_steps + b) / a == constant.
    a = warmup_steps / (constant - ratio)
    b = a * ratio
    return lr * (1 - math.exp(-(last_epoch + b) / a))
y = [get_lrs_warmup_stage(1e-3, 1e-5, i) for i in range(1, 1001)]
fig, ax = plt.subplots()
ax.plot(range(1, 1001), y)


In [None]:
y[:4]

In [None]:
y[-1]

In [None]:
lr2 = 3e-3
lr1 = 5e-5
s = 1000
c = 5
ratio = np.log(lr2 / (lr2 - lr1))
a = 1000. / (c - ratio)
b = a * ratio
x = np.arange(1, 1001)
y = lr2 *  (1 - ((x + 1)/10000) ** 0.5)
fig, ax = plt.subplots()
ax.plot(x, y)

In [None]:
x = np.arange(1, 1000)
lr = 1e-3
y = lr *  np.exp(x * np.log(0.9992))
fig, ax = plt.subplots()
ax.plot(x, y)

In [None]:
y[-1]

In [None]:
100/101

In [None]:
0.98**0.4

In [None]:
x = np.arange(0, 1000)
lr = 1e-4
y = lr * ( 1 - (x / 12000) ** 0.9)
fig, ax = plt.subplots()
ax.plot(x, y)

In [None]:
x = np.arange(0, 1000)
lr = 3e-3
y = lr * ( 1 - (x / 12000) ** 0.1)
fig, ax = plt.subplots()
ax.plot(x, y)

In [None]:
y[-1]

In [None]:
%load_ext autoreload
%autoreload 2
import torch
from config_fairseq import config
from trainer_fairseq import get_epoch_seeds, Trainer, get_dataloader

## Fairseq Transformer

In [None]:
generator = torch.Generator()
seeds = get_epoch_seeds(generator, config.train.epochs)
train_loader, tokenizer = get_dataloader(seeds)

In [None]:
# Trainer.reset(trainer)

In [None]:
trainer = Trainer(train_loader=train_loader, tokenizer=tokenizer,
                  generator=generator, seeds=seeds)

In [None]:
trainer.train()

In [None]:
state = torch.load('/mnt/dl/transformer_fairseq/checkpoint-ep-06.pth')
trainer.model.load_state_dict(state['model'])
trainer.optimizer.load_state_dict(state['optimizer'])
trainer.scheduler.load_state_dict(state['scheduler'])

In [None]:
trainer.train(start_epoch=7)

In [None]:
trainer.generate()

In [None]:
trainer.model.net.training

## Training with my model

In [None]:
%load_ext autoreload
%autoreload 2
import torch
from config import Config as cfg
# from trainer import get_epoch_seeds, Trainer, get_dataloader
from trainer_fp16 import get_epoch_seeds, Trainer, get_dataloader

In [None]:
generator = torch.Generator()
seeds = get_epoch_seeds(generator, cfg.train.epochs)
train_loader, tokenizer = get_dataloader(cfg, seeds)

In [None]:
trainer = Trainer(cfg=cfg, train_loader=train_loader, tokenizer=tokenizer,
                  generator=generator, seeds=seeds)

In [None]:
# trainer.train()

In [None]:
import os
checkpoint = os.path.join(cfg.savepath, 'checkpoint99 (copy).pth')
state = torch.load(checkpoint)

In [None]:
state.keys()

In [None]:
model = trainer.model
optimizer = trainer.optimizer
scheduler = trainer.scheduler
model.load_state_dict(state['model'])
optimizer.load_state_dict(state['optimizer'])
scheduler.load_state_dict(state['scheduler'])

src = state['src']
tgt = state['tgt']
dec_tgt = state['dec_tgt']

In [None]:
model.half()

In [None]:
with torch.no_grad():
    out = model(src, tgt)
    loss = trainer.loss_fn(out, dec_tgt)
    print(loss)

In [None]:
state['losses']

In [None]:
out = model(src, tgt)
loss = trainer.loss_fn(out, dec_tgt)
print(loss)

In [None]:
trainer.fp_16opt.step(loss, None, optimizer, True, scheduler, )

In [None]:
%load_ext autoreload
%autoreload 2
import torch
from config import Config as cfg
# from trainer import get_epoch_seeds, Trainer, get_dataloader
# from trainer_fp16 import get_epoch_seeds, Trainer, get_dataloader
from trainer_scaler import get_epoch_seeds, Trainer, get_dataloader

In [None]:
generator = torch.Generator()
seeds = get_epoch_seeds(generator, cfg.train.epochs)
train_loader, tokenizer = get_dataloader(cfg, seeds)

In [None]:
trainer = Trainer(cfg=cfg, train_loader=train_loader, tokenizer=tokenizer,
                  generator=generator, seeds=seeds)

In [None]:
trainer.train()

In [None]:
# trainer.resume(ckp='checkpoint-ep-09.pth')

In [None]:
import torch

In [None]:
input = torch.randn((2, 10))
target = torch.randint(0, 10, (2, ))

In [None]:
target

In [None]:
input

In [None]:
# `reduction` selects the aggregation. The deprecated `reduce` flag is a
# boolean, so passing reduce='sum' is merely truthy and yields the mean.
torch.nn.functional.cross_entropy(input, target, reduction='sum')

In [None]:
# Summed loss (via `reduction`, not the deprecated boolean `reduce`) scaled by 1/10.
torch.nn.functional.cross_entropy(input, target, reduction='sum') / 10

In [None]:
# Uniform per-class weight of 1000 for the 10 classes.
weight = torch.empty(10).fill_(1000.)
# Use `reduction='sum'`: the deprecated boolean `reduce` treats 'sum' as True
# and falls back to the (weight-normalised) mean, hiding the effect of `weight`.
torch.nn.functional.cross_entropy(input,
                                  target,
                                  reduction='sum',
                                  weight=weight
                                  )

In [None]:
weight.shape

In [None]:
trainer.model

In [None]:
model = trainer.model.cuda()

In [None]:
samples = trainer.get_sample(2)

In [None]:
samples['text']['src'][1]

In [None]:
samples['text']['tgt'][1]

In [None]:
samples['text']['dec_tgt'][1]

In [None]:
samples['tensors']

In [None]:
samples['tensors']['src'].size()

In [None]:
samples['tensors']['tgt'].size()

In [None]:
cfg.encoder.vocab_size

In [None]:
s, t = samples['tensors']['src'][:1], samples['tensors']['tgt'][:1]

In [None]:
s = torch.concat([s, s.new_zeros(1, 64-s.size(1))], dim=1)

In [None]:
s.size()

In [None]:
t = torch.concat([t, t.new_zeros(1, 64-t.size(1))], dim=1)

In [None]:
t.size()

In [None]:
s1 = s.clone().expand(256, -1)
t1 = t.clone().expand(256, -1)

In [None]:
model(s1.cuda(), t1.cuda()).size()

In [1]:
%load_ext autoreload
%autoreload 2
import torch
from config import Config as cfg
# from trainer import get_epoch_seeds, Trainer, get_dataloader
# from trainer_fp16 import get_epoch_seeds, Trainer, get_dataloader
from trainer_scaler import get_epoch_seeds, Trainer, get_dataloader

In [2]:
generator = torch.Generator()
seeds = get_epoch_seeds(generator, cfg.train.epochs)
train_loader, tokenizer = get_dataloader(cfg, seeds)


trainer = Trainer(cfg=cfg, train_loader=train_loader, tokenizer=tokenizer,
                generator=generator, seeds=seeds)

Building vocabulary /mnt/dl/Translation/WMT_15/en-fr/vocab.bpe.40000
Vocabulary size 40282
Bucket 0 had length 360175
Bucket 1 had length 1109303
Bucket 2 had length 1136269
Bucket 3 had length 733587
Bucket 4 had length 382284
New samples  3721344
Number of train steps for training is 10902


In [25]:
import os
trainer.model.load_state_dict(torch.load(os.path.join(trainer.savepath, "checkpoint10.pth"))['model'])

<All keys matched successfully>

In [30]:
sample = trainer.get_sample()

Getting sample...


In [31]:
sample

{'tensors': {'src': tensor([[    2,   296,  4895,     8, 18550,     4,   222,   936,    28,    91,
             380,     4,    17,  1393,     8,  2932,   568,  8219,   954,  5812,
           20891,     7,  1108,   312, 19147,     8,  6026,     4,   414,  5688,
             902,     4,    10,     7,   514,     8,     7,   279,   418,     4,
              59,    55, 17131,   711,    59,   106,    81,  4689,     5,     3]]),
  'tgt': tensor([[    2,   117,   380,     4,    34, 11464,    22,  2932,   568,  3206,
            2339, 25286, 22736,   766,     6,     9, 16117,   559,     6,    14,
            1266,     4,    47, 13757,     6,     9,  4800, 27112,    23,  6026,
               4,     6,   220,  5688,   902,    11,    23,   461,     6,    14,
             437,   318,     5,     3,     0,     0,     0,     0,     0,     0,
               0]]),
  'dec_tgt': tensor([[  117,   380,     4,    34, 11464,    22,  2932,   568,  3206,  2339,
           25286, 22736,   766,     6,     9, 161

In [39]:
trainer.translate_random_sample()

Src:  We recall that the hybrid creation of the Peacebuilding Commission as an intergovernmental advisory body by concurrent resolutions of the General Assembly and the Security Council on 20 December 2005 makes it a unique institutional mechanism and the first of its kind in the United Nations.
Tgt:  Nous rappelons qu'ayant été créée par l'adoption simultanée de résolutions de l'Assemblée générale et du Conseil de sécurité le 20 décembre 2005, la Commission de consolidation de la paix est un dispositif institutionnel hors pair et le premier de cette nature au sein de l'ONU.
Starting translation... 

Translation is:  Nous rappelons que la création hybride de la Commission de consolidation de la paix comme organe consultatif intergouvernemental par les résolutions de l'Assemblée générale et du Conseil de sécurité le 20 décembre 2005 constitue un mécanisme institutionnel unique et le premier de son genre dans l'ONU.


In [34]:
trainer.train_loader.dataset[10]

(tensor([    2,  4419,  6213,  2417,     4,    74,    49, 12262,    20,     7,
         19482,  2350,    21,  2433, 10597,  8178, 10357,     4,  9067,  1634,
             4, 36270,   602,  5658,     4, 36587,  3494,    19,   260,    42,
         22064,   390, 12833,     7, 31222,   202,     7, 19482,  2350,     5,
             3]),
 tensor([    2,    64,   263,     6,     9,  1144, 10425,    16,    14,  1811,
            32,    14,  2687,     6,  8355,    21, 24133,     6,  1414, 23240,
          1478,     4,  9291, 18851,     4, 22439,   190,  5658,     4, 29087,
            19,    77,   733,    65,  1120,    24,   290,   766,     6,    14,
           508,    12, 27268,  5759,    32,    14,  2687,     6,  8355,    43,
             3]))

In [2]:
import torch
import os

In [5]:
losses = torch.load("/mnt/dl/transformer_lamb/checkpoint11.pth")

In [6]:
losses['epoch']

2

In [1]:
%load_ext autoreload
%autoreload 2
import torch
from config import Config as cfg
from trainer_fp16 import get_epoch_seeds, Trainer, get_dataloader

In [8]:
x = torch.tensor([2**15 - 1]).half()

In [9]:
x

tensor([32768.], dtype=torch.float16)

In [27]:
2**16

65536

In [46]:
x +  32744

tensor([65504.], dtype=torch.float16)

In [28]:
64768. - 32000

32768.0

In [47]:
hex(65504)

'0xffe0'

In [76]:
torch.tensor(-11111111).half() / 255888

tensor(nan, dtype=torch.float16)

In [61]:
65504/(2**16 - 2)

0.99954222235786

In [56]:
2**15

32768

In [62]:
65504 // 2

32752

In [63]:
32752 // 2

16376

In [64]:
16376 // 2

8188

In [65]:
8188 // 2

4094

In [66]:
4094 // 2

2047

In [None]:
2047 

In [72]:
65504  + 65504

131008

In [74]:
131008 / 2**16

1.9990234375