In [34]:
from collections import Counter
import numpy as np
import pandas as pd

In [29]:
import math
import os

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

In [28]:
def mse(pred, y):
    return np.sqrt(sum((pred-y)**2)/len(pred))

In [2]:
train = pd.read_csv('data/train.csv')

In [4]:
test = pd.read_csv('data/test.csv')

In [42]:
tokens_appeared = []
for smiles in train['SMILES']:
    smiles_splited = list(smiles)  # 'Cc1n' --> ['C', 'c', '1', 'n']
    tokens_appeared.extend(smiles_splited)  

In [45]:
tokens_counter = Counter(tokens_appeared)

In [46]:
len(tokens_counter)

32

In [52]:
train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43
...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51


In [53]:
tokens_counter.most_common(5)

[('c', 39802), ('C', 26001), ('(', 14796), (')', 14796), ('1', 10300)]

In [None]:
class TransformerModel(nn.Module):
    def __init__(self, ntoken, d_model, nhead, d_hid, nlayers, dropout = 0.2):
        """
        Args:
            ntoken (int): Token 개수. (32+1)개
            d_model (int): embedding dimension과 같음
            d_hid (int): 
            nlayers (int):
            dropout (float): 
        """
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, 1)  # MLM, HLM 값 하나를 예측
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor) -> Tensor:
        """
        Args:
            src (Tensor): shape ``[seq_len, batch_size]``
        Returns:
            output (Tensor): shape ``[seq_len, batch_size, ntoken]``
        """
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output)
        return output


마스킹은 필요없음

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """
        Args:
            d_model (int): 
            dropout (float): 
            max_len (int): 
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
class Tokenizer:
    def __init__(self, tokens_counter):
        self.tokens_counter = tokens_counter
    
    def token_mapping(self):
        tokens_counter

In [None]:
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

# ``train_iter`` was "consumed" by the process of building the vocab,
# so we have to create it again
train_iter, val_iter, test_iter = WikiText2()
train_data = data_process(train_iter)
val_data = data_process(val_iter)
test_data = data_process(test_iter)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into ``bsz`` separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size

    Returns:
        Tensor of shape ``[N // bsz, bsz]``
    """
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify(train_data, batch_size)  # shape [seq_len, batch_size]
val_data = batchify(val_data, eval_batch_size)
test_data = batchify(test_data, eval_batch_size)

https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/2

In [100]:
import torch, torch.nn as nn
q = torch.randn(3, 1, 10) # source sequence length 3, batch size 1, embedding size 10
attn = nn.MultiheadAttention(10, 1) # embedding size 10, one head
res = attn(q, q, q) # self attention

In [104]:
mask = torch.zeros(1,3)

In [105]:
mask[0][-1] = float('-inf')

In [172]:
mask = torch.zeros(1, 3, dtype=torch.bool)

In [173]:
mask[0][-1] = True

In [174]:
mask

tensor([[False, False,  True]])

In [175]:
res_mask, w = attn(q, q, q, key_padding_mask=mask)

tensor([1, 2, 3])

In [184]:
F.softmax(torch.FloatTensor([2,4,6]), dim=0)

tensor([0.0159, 0.1173, 0.8668])

In [176]:
w

tensor([[[0.4534, 0.5466, 0.0000],
         [0.8158, 0.1842, 0.0000],
         [0.6922, 0.3078, 0.0000]]], grad_fn=<DivBackward0>)

In [166]:
res, w =attn(q,q,q)

In [148]:
test = torch.zeros(3, 3)

In [153]:
test[-1] = float('-inf')

In [155]:
test

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [-inf, -inf, -inf]])

In [157]:
res = F.softmax(test, dim=-1)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])

In [162]:
torch.matmul(res, torch.ones(3,3))

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [nan, nan, nan]])

In [146]:
res_mask

tensor([[[ 0.5308,  0.5410,  0.4923, -0.3016, -0.1482,  0.2765,  0.4337,
          -0.0224,  0.3547, -0.0359]],

        [[ 0.5308,  0.5410,  0.4923, -0.3016, -0.1482,  0.2765,  0.4337,
          -0.0224,  0.3547, -0.0359]],

        [[ 0.5308,  0.5410,  0.4923, -0.3016, -0.1482,  0.2765,  0.4337,
          -0.0224,  0.3547, -0.0359]]], grad_fn=<ViewBackward0>)

In [134]:
mask

tensor([[-inf, -inf, -inf]])

In [133]:
F.softmax(mask, dim=1)

tensor([[nan, nan, nan]])

In [108]:
q

tensor([[[ 1.5008, -0.9272,  0.2531, -1.0852,  0.2230,  0.6262, -0.0557,
          -1.4180, -0.3587, -0.8511]],

        [[ 0.8764, -0.8869,  0.4847,  1.3660, -1.8363,  0.2446, -0.5027,
           0.6274, -0.2760, -0.0792]],

        [[-0.2222,  0.5592,  0.8395,  1.0963, -1.5894,  0.4925, -0.2025,
          -0.4314, -1.7024, -0.4290]]])

In [107]:
res_mask

tensor([[[ 0.1048, -0.1434, -0.1923, -0.3875, -0.1883,  0.0326,  0.4077,
           0.2217, -0.1304, -0.0842]],

        [[ 0.2626,  0.0537, -0.0712, -0.5399, -0.2916,  0.1719,  0.4479,
          -0.0824, -0.1685,  0.0877]],

        [[ 0.2088, -0.0135, -0.1125, -0.4879, -0.2563,  0.1244,  0.4342,
           0.0213, -0.1555,  0.0291]]], grad_fn=<ViewBackward0>)

In [86]:
res

tensor([[[ 0.0612,  0.1095, -0.3077,  0.2431,  0.1704, -0.3031,  0.0206,
          -0.3229, -0.2638,  0.2673]],

        [[ 0.1361,  0.0900, -0.2317,  0.3017,  0.2159, -0.3380,  0.0759,
          -0.2576, -0.2895,  0.2333]],

        [[ 0.0206,  0.1660, -0.2280,  0.3331,  0.1539, -0.3960,  0.0576,
          -0.2094, -0.2356,  0.2878]]], grad_fn=<ViewBackward0>)

In [67]:
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))

NameError: name 'sz' is not defined

In [66]:
res[0].shape

torch.Size([3, 1, 10])

In [61]:
res[1].shape

torch.Size([1, 3, 3])