In [1]:
!pip install local_attention

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install axial_positional_embedding

Defaulting to user installation because normal site-packages is not writeable


In [24]:
import os
os.chdir('c:\\Users\\Guillaume\\ViT_vs_Performers\\src\\Performers')
from performers_pytorch import PerformerLM
from autoregressive_wrapper import AutoregressiveWrapper

import random
import tqdm
import gzip
import numpy as np
import torch
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler

# Load datasets

In [30]:
##########################################################################################################################################
################################################################## MNIST #################################################################
##########################################################################################################################################

os.chdir('c:\\Users\\Guillaume\\ViT_vs_Performers\\Data')
data = np.load('mnist.npz')
print(data.files)
print(data['x_train'][0])

x_train = data['x_train']
y_train = data['y_train']
x_test = data['x_test']
y_test = data['y_test']

## Changing to pytorch tensors

x_train = torch.from_numpy(x_train).float()
y_train = torch.from_numpy(y_train)
x_test = torch.from_numpy(x_test).float()
y_test = torch.from_numpy(y_test)

## Adding the channels dimension

x_train = x_train.unsqueeze(1)
""" y_train = y_train.unsqueeze(1) """
x_test = x_test.unsqueeze(1)
""" y_test = y_test.unsqueeze(1) """

['x_test', 'x_train', 'y_train', 'y_test']
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   3  18  18  18 126 136
  175  26 166 255 247 127   0   0   0   0]
 [  0   0   0   0   0   0   0   0  30  36  94 154 170 253 253 253 253 253
  225 172 253 242 195  64   0   0   0   0]
 [  0   0   0   0   0   0   0  49 238 253 253 253 253 253 253 253 253 251
   93  82  82  56  39   0   0   0   0   0]
 [  0   0   0   0   0

' y_test = y_test.unsqueeze(1) '

# Running the model

In [28]:
device = 'cpu'

# constants

NUM_BATCHES = int(1e5)
BATCH_SIZE = 4
GRADIENT_ACCUMULATE_EVERY = 4
LEARNING_RATE = 1e-4
VALIDATE_EVERY  = 100
GENERATE_EVERY  = 500
GENERATE_LENGTH = 2048
SEQ_LEN = 4096

# helpers

def cycle(loader):
    while True:
        for data in loader:
            yield data

def decode_token(token):
    return str(chr(max(32, token)))

def decode_tokens(tokens):
    return ''.join(list(map(decode_token, tokens)))

# instantiate model

model = PerformerLM(
    num_tokens = 256,
    dim = 512,
    depth = 6,
    max_seq_len = SEQ_LEN,
    heads = 8,
    causal = True,
    reversible = True,
    nb_features = 256,
    use_scalenorm = True,
    shift_tokens = True,
    local_attn_heads = (8, 8, 8, 6, 4, 2)
)

model = AutoregressiveWrapper(model)
## model.cuda()

# prepare enwik8 data

#with gzip.open('./data/enwik8.gz') as file:
#    X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
#    trX, vaX = np.split(X, [int(90e6)])
#    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)

class TextSamplerDataset(Dataset):
    def __init__(self, data, seq_len):
        super().__init__()
        self.data = data
        self.seq_len = seq_len

    def __getitem__(self, index):
        rand_start = torch.randint(0, self.data.size(0) - self.seq_len - 1, (1,))
        full_seq = self.data[rand_start: rand_start + self.seq_len + 1].long()
        return full_seq #.cuda()

    def __len__(self):
        return self.data.size(0) // self.seq_len

#train_dataset = TextSamplerDataset(data_train, SEQ_LEN)
#val_dataset   = TextSamplerDataset(data_val, SEQ_LEN)
#train_loader  = cycle(DataLoader(train_dataset, batch_size = BATCH_SIZE))
#val_loader    = cycle(DataLoader(val_dataset, batch_size = BATCH_SIZE))


# optimizer

optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
scaler = GradScaler()

# training

for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
    model.train()

    for __ in range(GRADIENT_ACCUMULATE_EVERY):
        with autocast():
            loss = model(next(train_loader), return_loss = True)
        scaler.scale(loss).backward()

    print(f'training loss: {loss.item()}')

    scaler.unscale_(optim)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    scaler.step(optim)
    scaler.update()
    optim.zero_grad()

    if i % VALIDATE_EVERY == 0:
        model.eval()
        with torch.no_grad():
            loss = model(next(val_loader), return_loss = True)
            print(f'validation loss: {loss.item()}')

    if i % GENERATE_EVERY == 0 and i != 0:
        model.eval()
        inp = random.choice(val_dataset)[:-1]
        prime = decode_tokens(inp)
        print(f'%s \n\n %s', (prime, '*' * 100))

        sample = model.generate(inp, GENERATE_LENGTH)
        output_str = decode_tokens(sample)
        print(output_str)

unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version


  scaler = GradScaler()
  with autocast():
training:   0%|          | 0/100000 [00:00<?, ?it/s]


NameError: name 'train_loader' is not defined