# Molecule transformers

## 1. input embedding 

## 2. Self attention and Feed forward

## 3. details

Activation function : gelu

The maximum length of positional embedding : 100

masked probability for each token : 0.15

masking probability : 0.8 | trainsition probability : 0.2

## 4. Questions
Is it OK to remain the synonyms for SMILES representations?

Should we use mask?? or not??

## 5. Note
Somehow, (guessing: just underflow problems of the CPU itself) CPU-based learning show poor convergence speed.

Most of them just being (C,4). We should add the class weights.

In [1]:
import torch
import torch.nn
import torchtext
import random
import math
from torchtext.data import Iterator
from torchtext.data.utils import get_tokenizer

In [2]:
class PositionalEncoding(torch.nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return self.pe

In [3]:
class TransformerModel(torch.nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.1):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.ninp = ninp
        self.src_mask = None
        
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.encoder = torch.nn.Embedding(ntoken, ninp)
        
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout, activation='gelu')
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.decoder = torch.nn.Linear(ninp, ntoken) ## embedded -> seq
        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != src.size(0):
            device = src.device
            mask = self._generate_square_subsequent_mask(src.size(0)).to(device)
            self.src_mask = mask

        pos_emb = self.encoder(src) * math.sqrt(self.ninp)

        mol_token_emb = self.pos_encoder(None) ### Input embedding = positional embedding + normal embedding
        input_emb = pos_emb + mol_token_emb
        
        output = self.transformer_encoder(input_emb, self.src_mask) ### Self-attention layers : dim = ninp
        output = self.decoder(output) ### decoding

        return output

# Load and batch data

In [4]:
train_file = './CID-SMILES_1M_train.txt'

smile_mol_tokenizer = torchtext.data.Field(init_token='<BEGIN>',
                                          pad_token='<PAD>',
                                          tokenize=list,
                                          eos_token='<END>',
                                          fix_length=100,
                                          batch_first=True) 

smile_data_training = torchtext.data.TabularDataset(path=train_file,
                                          format='csv',
                                          fields=[('input', smile_mol_tokenizer), ('output', smile_mol_tokenizer)])


train_data, test_data = smile_data_training.split(split_ratio=0.7)
smile_mol_tokenizer.build_vocab(smile_data_training)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_batch = Iterator(train_data, batch_size=128, device=device, repeat=False)
test_batch = Iterator(test_data, batch_size=128, device=device, repeat=False)



In [5]:
device

device(type='cuda')

# Training process

In [6]:
n_tokens = len(smile_mol_tokenizer.vocab.stoi)
mol_emsize = 128 # Embedded molecule sizes
n_layers = 8 # Number of attentions and feed-forwards
n_head = 8 # Attention heads
n_hid = 512 # feed forward dim

counts = list(smile_mol_tokenizer.vocab.freqs.values())
sum_counts = sum(counts)
class_weights = list(map(lambda x: math.log(sum_counts/x), counts))
class_weights = [1., 1., 1., 1.] + class_weights

criterion = torch.nn.CrossEntropyLoss(weight=torch.Tensor(class_weights).to(device))
model = TransformerModel(n_tokens, mol_emsize, n_head, n_hid, n_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.99)

In [None]:
import time

epochs = 3

def train():
    model.train()
    total_loss = 0.
    start_time = time.time()
    i = 0
    for batch in train_batch:
        data, targets = batch.input, batch.output
        #targets_onehot = torch.nn.functional.one_hot(targets, num_classes=72)
        optimizer.zero_grad()

        predicts = model(data)
        #print(predicts.size(), targets_onehot.size())
        #print(predicts.view(-1, n_tokens).size(), targets_onehot.size())
        loss = criterion(predicts.view(-1, n_tokens), targets.view(-1))
        loss.backward()
        #print(predicts, targets)
        #print(torch.max(predicts, 1)[1])
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()
    
        total_loss += loss.item()
        log_interval = 200
        if i % log_interval == 0 and i > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, i, len(train_batch), scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        i += 1

def evaluate(eval_model):
    eval_model.eval()
    total_loss = 0.
    with torch.no_grad():
        for batch in test_batch:
            data, targets = batch.input, batch.output
            predicts = eval_model(data)
            output_flat = output.view(-1, n_tokens)
            total_loss += len(data) * criterion(predicts.view(-1, n_tokens), targets.view(-1)).item()
    return total_loss / (len(data_source) - 1)

for epoch in range(1, epochs+1):
    epoch_start_time = time.time()
    train()



| epoch   1 |   200/ 5469 batches | lr 0.00 | ms/batch 214.24 | loss  0.93 | ppl     2.54
| epoch   1 |   400/ 5469 batches | lr 0.00 | ms/batch 208.97 | loss  0.34 | ppl     1.41
| epoch   1 |   600/ 5469 batches | lr 0.00 | ms/batch 209.85 | loss  0.33 | ppl     1.38
| epoch   1 |   800/ 5469 batches | lr 0.00 | ms/batch 209.61 | loss  0.32 | ppl     1.38
| epoch   1 |  1000/ 5469 batches | lr 0.00 | ms/batch 211.71 | loss  0.32 | ppl     1.37
| epoch   1 |  1200/ 5469 batches | lr 0.00 | ms/batch 214.79 | loss  0.31 | ppl     1.37
| epoch   1 |  1400/ 5469 batches | lr 0.00 | ms/batch 210.49 | loss  0.32 | ppl     1.37
| epoch   1 |  1600/ 5469 batches | lr 0.00 | ms/batch 217.20 | loss  0.31 | ppl     1.36
| epoch   1 |  1800/ 5469 batches | lr 0.00 | ms/batch 217.91 | loss  0.31 | ppl     1.37
| epoch   1 |  2000/ 5469 batches | lr 0.00 | ms/batch 221.50 | loss  0.31 | ppl     1.36
| epoch   1 |  2200/ 5469 batches | lr 0.00 | ms/batch 221.48 | loss  0.31 | ppl     1.37
| epoch   

In [17]:
import numpy as np
for batch in train_batch:
    predicts = model(batch.input)
    #print(predicts, predicts.size())
    print(batch.input[0])
    masked_num = 0
    hit = 0
    for i in range(len(batch.output[0])):
        if batch.input[0][i] == 8:
            masked_num += 1
            if batch.output[0][i] == torch.max(predicts, 2)[1][0][i]:
                hit += 1
            
    print(torch.max(predicts, 2)[1][0], torch.max(predicts, 2)[1].size())
    print(batch.output[0], batch.output.size())
    print(hit, masked_num)
    break

tensor([ 2,  4, 11,  4,  4, 12,  4, 70, 11,  4,  4, 12,  8,  4,  6,  5,  9,  7,
         4, 13,  5,  4,  4,  5,  4,  4,  5,  4, 13,  7, 10, 25,  4, 11,  4,  4,
        12,  4,  8, 11,  4,  4, 12,  6,  4,  6,  8,  9,  7,  4, 13,  5,  4,  4,
         5,  4,  4,  8,  4, 13,  7, 10,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1], device='cuda:0')
tensor([ 2,  4, 11,  4,  4, 12,  4, 18, 11,  4,  4, 12, 18,  4,  6,  5,  9,  7,
         4, 13,  5,  4,  4,  5,  4,  4,  5,  4, 13,  7, 10, 25,  4, 11,  4,  4,
        12,  4, 13, 11,  4,  4, 12,  6,  4,  6, 24,  9,  7,  4, 13,  5,  4,  4,
         5,  4,  4, 24,  4, 13,  7, 10,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1], device='cuda:0') torch.Size([128, 100])
tensor([ 2,  4, 11,  4,  4, 12,  4,  4, 11,  

# Some meaningless tests(sandbox)

In [62]:
import numpy as np

def to_onehot(x, dim, n_classes):
    y = torch.zeros(dim, n_classes)
    y[range(y.shape[0]), x] = 1
    return y
    
for batch in train_batch:
    print(batch.input, batch.input.size())
    print(batch.output, batch.output.size())
    print(to_onehot(batch.output,100,72).size())
    print(torch.nn.functional.one_hot(batch.input, num_classes=72))
    if 3 in batch.input[0]:
        print("valid")
    break
    


tensor([[ 2,  4, 11,  ...,  1,  1,  1],
        [ 2,  4,  4,  ...,  1,  1,  1],
        [ 2,  4,  8,  ...,  1,  1,  1],
        ...,
        [ 2,  4,  4,  ...,  1,  1,  1],
        [ 2,  4,  4,  ...,  1,  1,  1],
        [ 2,  4,  4,  ...,  1,  1,  1]]) torch.Size([512, 100])
tensor([[ 2,  4, 11,  ...,  1,  1,  1],
        [ 2,  4,  4,  ...,  1,  1,  1],
        [ 2,  4,  6,  ...,  1,  1,  1],
        ...,
        [ 2,  4,  4,  ...,  1,  1,  1],
        [ 2,  4,  4,  ...,  1,  1,  1],
        [ 2,  4,  4,  ...,  1,  1,  1]]) torch.Size([512, 100])
torch.Size([100, 72])
tensor([[[0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0]],

        [[0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         

In [10]:
smile_mol_tokenizer.vocab.extend
len(train_batch)

5469

In [20]:
#smile_mol_tokenizer.numericalize(train_data.examples[0].smile_mol) , train_data.examples[0].smile_mol
print(smile_mol_tokenizer.vocab.freqs)
print(smile_mol_tokenizer.vocab.stoi)
counts = list(smile_mol_tokenizer.vocab.freqs.values())
sum_counts = sum(counts)
class_weights = list(map(lambda x: math.log(sum_counts/x), counts))
class_weights = [1., 1., 1., 1.] + class_weights
len(class_weights)
torch.Tensor(class_weights)

Counter({'C': 31798080, '=': 12116178, '(': 8152226, ')': 8151141, ' ': 5617883, 'O': 5263368, 'N': 3874771, '1': 3511878, '2': 2976778, '3': 1881824, '[': 1449974, ']': 1449818, '@': 1329865, '4': 842379, 'H': 837995, 'S': 669320, 'l': 530728, 'F': 413261, '5': 287084, '+': 266362, '-': 260152, '.': 189588, '#': 157056, 'B': 153817, 'r': 148943, '/': 118896, '6': 109514, 'P': 78212, 'i': 62042, '7': 53697, 'I': 49771, '\\': 45922, '8': 36877, 'a': 32851, '9': 29761, 'e': 27934, 'A': 26198, 'n': 25904, 'g': 24982, 's': 24642, 'u': 24379, 'o': 24262, 'M': 23906, 't': 23409, 'T': 23050, 'K': 22859, 'R': 22364, 'Z': 22310, 'b': 22234, 'd': 22149, 'W': 22002, 'L': 21811, 'G': 21696, 'c': 21515, 'h': 21403, 'k': 21370, 'm': 21358, 'V': 21358, '0': 21323, '%': 21252, 'X': 21247, 'E': 21224, 'Y': 21185, 'p': 21089, 'y': 21061, 'D': 21020, 'f': 21018, 'U': 20982})
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fd72b7262b0>>, {'<unk>': 0, '<PAD>': 1, '

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0806, 2.4417, 2.0455, 2.8792, 2.4419,
        2.8141, 4.1685, 4.1686, 3.1855, 5.8629, 4.9415, 3.2839, 4.2549, 7.0884,
        7.8402, 5.8865, 5.1735, 3.4492, 5.4237, 8.3790, 8.3403, 8.1179, 8.3462,
        8.3184, 3.9078, 4.7115, 8.3566, 8.4024, 6.4442, 6.6695, 8.3863, 8.3863,
        8.3654, 7.6208, 8.3926, 8.3101, 8.0546, 8.4004, 8.2296, 4.7168, 8.2589,
        8.3880, 6.2029, 8.3913, 5.7880, 6.7517, 8.3842, 8.3990, 6.3912, 8.1934,
        7.5403, 7.9558, 7.4644, 8.3427, 8.3945, 7.3200, 8.2736, 8.3916, 8.1821,
        8.3500, 8.4041, 8.3706, 8.4023, 8.2947, 6.4120, 8.2541, 8.2433, 8.3858])

AttributeError: 'Iterator' object has no attribute 'size'