In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# downlaod deepmind's pretrained language model
# !wget -O deepmind_assets/language_perceiver_io_bytes.pickle https://storage.googleapis.com/perceiver_io/language_perceiver_io_bytes.pickle

In [3]:
command = "--dataset EURLex-4K"

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--project', default='PerceiverIO')
parser.add_argument('--dataset', default='EURLex-4K')
parser.add_argument('--device', type=str, default='cuda:0')

args = parser.parse_args(command.split())

In [4]:
args.expname = args.project
args.maxlen = 2048
args.vocab_size = 262
args.embed_dim = 768
args.num_latents = 256

args.n_epochs = 10
args.xc_lr = 1e-2
args.enc_lr = 2e-5
args.bsz = 32
args.dropout = 0.4
args.warmup = 0.1
args.loss_with_logits = True
args.amp = False
args.eval_interval = 2

In [5]:
from perceiver_io.perceiver_lm import PerceiverLM

import torch
import torch.nn as nn
import transformers

from deepmind_assets import bytes_tokenizer
import numpy as np
from tqdm.notebook import tqdm
import scipy.sparse as sp
# The tokenizer is just UTF-8 encoding (with an offset)
tokenizer = bytes_tokenizer.BytesTokenizer()

In [6]:
encoder = PerceiverLM(vocab_size=args.vocab_size, 
                    max_seq_len=args.maxlen, 
                    embedding_dim=args.embed_dim, 
                    num_latents=args.num_latents, 
                    latent_dim=1280, 
                    qk_out_dim=256, 
                    num_self_attn_per_block=26, 
                    lm=False)

encoder.load_pretrained("deepmind_assets/language_perceiver_io_bytes.pickle")

In [None]:
input_str = "This is an incomplete sentence where some words are missing."
input_tokens = tokenizer.to_int(input_str)

# Mask " missing.". Note that the model performs much better if the masked chunk
# starts with a space.
input_tokens[51:60] = tokenizer.mask_token
print("Tokenized string without masked bytes:")
print(tokenizer.to_string(input_tokens))

#@title Pad and reshape inputs
inputs = input_tokens[None]
input_mask = np.ones_like(inputs)

def pad(max_sequence_length: int, inputs, input_mask):
    input_len = inputs.shape[1]
    assert input_len <= max_sequence_length
    pad_len = max_sequence_length - input_len
    padded_inputs = np.pad(
      inputs,
      pad_width=((0, 0), (0, pad_len)),
      constant_values=tokenizer.pad_token)
    padded_mask = np.pad(
      input_mask,
      pad_width=((0, 0), (0, pad_len)),
      constant_values=0)
    return padded_inputs, padded_mask

inputs, input_mask = pad(args.maxlen, inputs, input_mask)

encoder.eval()
mask = torch.tensor(input_mask)
input_ids = torch.tensor(inputs)
out = encoder.forward(input_ids, mask)

embs = out * mask.unsqueeze(-1) / mask.sum(dim=-1)

logits = torch.matmul(out, encoder.token_embedding.weight.T) + encoder.decoder_token_bias
masked_tokens_predictions = logits[0, 51:60].argmax(dim=-1)
print("Greedy predictions:")
print(masked_tokens_predictions)
print()
print("Predicted string:")
print(tokenizer.to_string(masked_tokens_predictions.cpu().detach().numpy()))

In [7]:
from utils import csr_to_pad_tensor, ToD, read_sparse_mat
from torch.nn.utils.rnn import pad_sequence

class XMLDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels, tokenizer, maxlen):
        self.maxlen = maxlen
        self.input_ids = pad_sequence([torch.LongTensor(tokenizer.to_int(x)[:maxlen]) for x in inputs], batch_first=True, padding_value=0)
        self.input_mask = (self.input_ids != 0).long()
        self.labels = labels
            
    def __getitem__(self, index):
        return index
    
    def get_fts(self, indices, source='point'):
        input_mask = self.input_mask[indices]
        max_batch_seq_len = input_mask.sum(dim=-1).max()
        return {'input_ids': self.input_ids[indices, :max_batch_seq_len], 'input_mask': input_mask[:, :max_batch_seq_len]}
   
    def __len__(self):
        return self.labels.shape[0]
    
class XMLCollator():
    def __init__(self, dataset):
        self.numy = dataset.labels.shape[1]
        self.dataset = dataset
    
    def __call__(self, batch):
        ids = torch.LongTensor(batch)
        batch_data = {'batch_size': torch.LongTensor([len(batch)]),
                      'numy': torch.LongTensor([self.numy]),
                      'y': csr_to_pad_tensor(self.dataset.labels[ids], self.numy),
                      'ids': ids,
                      'xfts': self.dataset.get_fts(ids)
                     }
                
        return batch_data

In [8]:
DATA_DIR = 'Datasets/EURLex-4K'

trnX = [x.strip() for x in open(f'{DATA_DIR}/raw/trn_X.txt').readlines()]
tstX = [x.strip() for x in open(f'{DATA_DIR}/raw/tst_X.txt').readlines()]
trn_X_Y = read_sparse_mat(f'{DATA_DIR}/trn_X_Y.txt', use_xclib=False)
tst_X_Y = read_sparse_mat(f'{DATA_DIR}/tst_X_Y.txt', use_xclib=False)
inv_prop = xc_metrics.compute_inv_propesity(trn_X_Y, 0.55, 1.5)

args.numy = trn_X_Y.shape[1]

15449it [00:00, 103796.22it/s]
3865it [00:00, 124525.55it/s]


NameError: name 'xc_metrics' is not defined

In [9]:
trn_dataset = XMLDataset(trnX, trn_X_Y, tokenizer, args.maxlen)
tst_dataset = XMLDataset(tstX, tst_X_Y, tokenizer, args.maxlen)

In [10]:
trn_loader = torch.utils.data.DataLoader(
    trn_dataset,
    batch_size=4,
    num_workers=2,
    collate_fn=XMLCollator(trn_dataset),
    shuffle=True,
    pin_memory=True)

tst_loader = torch.utils.data.DataLoader(
    tst_dataset,
    batch_size=args.bsz,
    num_workers=2,
    collate_fn=XMLCollator(tst_dataset),
    shuffle=False,
    pin_memory=True)

In [39]:
class Net(nn.Module):
    def __init__(self, encoder, args):
        super().__init__()
        self.encoder = encoder
        self.numy = args.numy
        self.dropout = nn.Dropout(args.dropout)
        self.w = nn.Linear(args.embed_dim, args.numy)
    
    def forward(self, b):
        embs = self.encoder(b['xfts']['input_ids'], b['xfts']['input_mask'])
        mask = b['xfts']['input_mask']
        embs = embs * mask.unsqueeze(-1) / mask.sum(dim=-1).reshape(-1, 1, 1)
        embs = embs.sum(dim=1)
        out = self.w(self.dropout(embs))
        return out
    
    def predict(self, tst_loader, K=100):
        tst_X_Y = tst_loader.dataset.labels
        data = np.zeros((tst_X_Y.shape[0], K))
        inds = np.zeros((tst_X_Y.shape[0], K)).astype(np.int32)
        indptr = np.arange(0, tst_X_Y.shape[0]*K+1, K)
        self.eval()

        with torch.no_grad():
            for b in tqdm(tst_loader, leave=True, desc='Evaluating'):
                b = ToD(b, self.get_device())
                out = self(b)
                top_data, top_inds = torch.topk(out, K)
                data[b['ids'].cpu()] = top_data.detach().cpu().numpy()
                inds[b['ids'].cpu()] = top_inds.detach().cpu().numpy()
                del top_data, top_inds, b, out

        torch.cuda.empty_cache()
        score_mat = sp.csr_matrix((data.ravel(), inds.ravel(), indptr), tst_X_Y.shape)
        
        return score_mat
    
class OvABCELoss(nn.Module):
    def __init__(self, args, reduction='mean'):
        super(OvABCELoss, self).__init__()
        if args.loss_with_logits:
            self.criterion = torch.nn.BCEWithLogitsLoss(reduction=reduction)
        else:
            self.criterion = torch.nn.BCELoss(reduction=reduction)

    def forward(self, model, b):
        out = model(b)
        targets = torch.zeros((out.shape[0], out.shape[1]+1), device=out.device).scatter_(1, b['y']['inds'], 1)[:, :-1]
        loss = self.criterion(out, targets)
        return loss

In [35]:
net = Net(encoder, args)
loss = OvABCELoss(args)

In [48]:
optim_wrap = {
    'xc' : {'class': torch.optim.Adam, 'params': [], 'args': {'lr': args.xc_lr}},
    'enc': {'class': transformers.optimization.AdamW, 'params': [], 
            'args': {'lr': args.enc_lr, 'eps': 1e-06, 'weight_decay': 0.01}}
    }

for n,p in net.named_parameters():
    if n[:8] == 'encoder.': optim_wrap['enc']['params'].append(p)
    else: optim_wrap['xc']['params'].append(p)
        
optims = []
for k, v in optim_wrap.items():
    if len(v['params']) > 0: optims.append(v['class'](v['params'], **v['args']))
        

total_steps = len(trn_loader)*args.n_epochs
schedulers = [transformers.get_linear_schedule_with_warmup(optim, num_warmup_steps=int(args.warmup*total_steps), num_training_steps=total_steps) for optim in optims]

In [36]:
net.eval();

In [14]:
best_ndcg = -100
for epoch in range(args.n_epochs):
    net.train()
    cum_loss = 0; ctr = 0
    t = tqdm(trn_loader, desc='Epoch: 0, Loss: 0.0', leave=True)
          
    for b in t:        
        for optim in optims: optim.zero_grad()
        b = net.ToD(b)
        with torch.cuda.amp.autocast(enabled=args.amp):
            loss = criterion(net, b)
        loss.backward()
        for optim in optims: optim.step()
        for sch in schedulers: sch.step()
        cum_loss += loss.item()
        ctr += 1
        t.set_description('Epoch: %d/%d, Loss: %.4E'%(epoch, args.n_epochs, (cum_loss/ctr)), refresh=True)
    
    print(f'mean loss after epoch {epoch}/{args.n_epochs}: {"%.4E"%(cum_loss/ctr)}', flush=True)
    if epoch%args.eval_interval == 0 or epoch == (args.n_epochs-1):
        score_mat = net.predict(tst_loader)
        metrics = XCMetrics(score_mat, tst_X_Y, inv_prop, method=args.expname, disp=True)

        if metrics.loc[args.expname]['nDCG@5'] > best_ndcg:
            best_ndcg = metrics.loc[args.expname]['nDCG@5']
            print(_c(f'Found new best model with nDCG@5: {"%.2f"%best_ndcg}\n', attr='blue'))
            sp.save_npz(f'{OUT_DIR}/score_mat.npz', score_mat)
            metrics.to_csv(f'{OUT_DIR}/metrics.tsv', sep='\t')
            torch.save(net.state_dict(), f'{OUT_DIR}/model.pt')
    sys.stdout.flush()

In [37]:
out = net(b)

RuntimeError: [enforce fail at CPUAllocator.cpp:68] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 67108864 bytes. Error code 12 (Cannot allocate memory)

In [17]:
num_params = 0
for p in model.parameters():
    num_params += np.prod(p.shape)

In [18]:
num_params

201108230