In [30]:
import os
import torch
import torch.nn as nn

from torchtext import data
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
import time

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [31]:
epoch = 100
lr = 0.001

emb_size = 320
hidden_size = 320

batch_size = 16

Sequential sentence data to word vector

In [32]:
class DataLoader(object):

    def __init__(
        self, path,
        batch_size = 64,
        valid_ratio = 0.2,
        device = -1,
        max_vocab = 50000,
        min_freq = 1,
        use_eos = False, # not translation : eos unessential
        shuffle = True,
    ):

        super().__init__()

        self.label = data.Field(
            sequential=False,
            use_vocab=True,
            unk_token=None
        )

        self.text = data.Field(
            use_vocab=True,
            batch_first=True,
            include_lengths=False,
            init_token='<cls>', # for classification
            eos_token='<EOS>' if use_eos else None,
        )

        train, valid = data.TabularDataset(
            path=path,
            format='csv',
            skip_header=True, # @young : 'label', 'text' 제거
            fields=[
                ('label', self.label),
                ('text', self.text),
            ],
        ).split(split_ratio=(1 - valid_ratio))

        # batch 단위 iterator for NLP
        self.train_loader, self.valid_loader = data.BucketIterator.splits(
            (train, valid),
            batch_size=batch_size,
            device='cuda:0' ,
            shuffle=shuffle,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
        )

        # init word vocab
        self.label.build_vocab(train)
        self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)


In [33]:
dataLoad = DataLoader('../data/test2.csv', batch_size=batch_size)

In [34]:
label = dataLoad.label
text = dataLoad.text

train_loader = dataLoad.train_loader
val_loader = dataLoad.valid_loader

label.vocab.stoi

defaultdict(None, {'4': 0, '5': 1, '3': 2, '2': 3, '1': 4})

In [35]:
# set vocab label in order
label.vocab.stoi['1'] = 0
label.vocab.stoi['2'] = 1
label.vocab.stoi['3'] = 2
label.vocab.stoi['4'] = 3
label.vocab.stoi['5'] = 4
label.vocab.stoi

defaultdict(None, {'4': 3, '5': 4, '3': 2, '2': 1, '1': 0})

In [36]:
# check label distribution
vars(label.vocab)

{'freqs': Counter({'5': 15899, '4': 16738, '3': 9986, '2': 5555, '1': 1964}),
 'itos': ['4', '5', '3', '2', '1'],
 'unk_index': None,
 'stoi': defaultdict(None, {'4': 3, '5': 4, '3': 2, '2': 1, '1': 0}),
 'vectors': None}

In [37]:
n_words = len(text.vocab.stoi)
n_cls = len(label.vocab.stoi)

In [38]:
# save model weight
def save_checkpoint(model, pth):
    torch.save(model.state_dict(), pth)

Sequential Multi Head Attention for classification

In [None]:
class MultiHeadAttn(nn.Module):
    def __init__(
        self,
        hidden_size,
        emb_size,
        n_layers = 4,
        dropout_rate = 0.3,
    ):
        super().__init__()

        self.hidden_size = hidden_size # hidden embedding size
        self.emb_size = emb_size # word embedding vector size
        self.n_layers = n_layers
        self.dropout_rate = dropout_rate

        self.n_heads = 5
        self.n_hid = hidden_size//self.n_heads # hidden size for each head
        self.scale = torch.sqrt(torch.FloatTensor([self.n_hid])).to(device)

        # init q, k, v with linear layer
        self.mlp_q = nn.Linear(self.emb_size, self.hidden_size, bias=False)
        self.mlp_k = nn.Linear(self.emb_size, self.hidden_size, bias=False)
        self.mlp_v = nn.Linear(self.emb_size, self.hidden_size, bias=False)

        self.mlp_x = nn.Linear(self.hidden_size, self.emb_size, bias=False)

        self.sm = nn.Softmax(dim=-1)

    def forward(self, x, mask=None): # [batch, n+1->n, hidden_size]
        # init Query, Key and Value
        x_q = self.mlp_q(x) # [batch, n, hidden_size = n_heads*n_hid]
        x_k = self.mlp_k(x) # [batch, n, hidden_size = n_heads*n_hid]
        x_v = self.mlp_v(x) # [batch, n, hidden_size = n_heads*n_hid]

        # multi-head to Q, K and V
        x_q = x_q.view(x.shape[0], -1, self.n_heads, self.n_hid).permute(0, 2, 1, 3).contiguous() # [batch, n_heads, n, h_hid]
        x_k = x_k.view(x.shape[0], -1, self.n_heads, self.n_hid).permute(0, 2, 1, 3).contiguous() # [batch, n_heads, n, h_hid]
        x_v = x_v.view(x.shape[0], -1, self.n_heads, self.n_hid).permute(0, 2, 1, 3).contiguous() # [batch, n_heads, n, h_hid]

        x_q = x_q.reshape(x.shape[0]*self.n_heads, -1, self.n_hid) # [batch*n_heads, n, n_hid]
        x_k = x_k.reshape(x.shape[0]*self.n_heads, -1, self.n_hid) # [batch*n_heads, n, n_hid]
        x_v = x_v.reshape(x.shape[0]*self.n_heads, -1, self.n_hid) # [batch*n_heads, n, n_hid]

        # Attention Weight with Q & K
        w = torch.bmm(x_q, x_k.transpose(2, 1))/self.scale # [batch*n_heads, n, n]

        if mask != None:
            w = w.masked_fill(mask == 0, -1e10) # [batch*n_heads, n, n]

        w = self.sm(w) # [batch*n_heads, n, n]
        # w = nn.functional.dropout(w, self.dropout_rate)

        # Multiply attention weight with V
        attn = torch.bmm(w, x_v) # [batch*n_heads, n, n_hid]
        x = attn.view(x.shape[0], self.n_heads, -1, self.n_hid) # [batch, n_heads, n, n_hid]
        x = x.permute(0, 2, 1, 3) # [batch, n, n_heads, n_hid]
    
        # Concat multi-head attention outcomes
        x = x.reshape(x.shape[0], -1, self.hidden_size) # [batch, n, hidden_size]
        x = self.mlp_x(x) # [batch, n, hidden_size]

        return x


In [40]:
class SeqClassifier(nn.Module):
    def __init__(
        self,
        input_size,
        hidden_size,
        emb_size,
        n_classes,
        n_layers=4,
        dropout_rate=0.3,
    ):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.emb_size = emb_size
        self.n_classes = n_classes
        self.n_layers = n_layers
        self.dropout_rate = dropout_rate
        self.n_heads = 5
        self.n_tran = 4
        self.max_len = 2000 # average length 사용???? padding이 많이 들어갈수록 연산?? 만 많아지나??

        self.emb = nn.Embedding(self.input_size, self.emb_size)
        self.pemb = nn.Embedding(self.max_len, self.emb_size)

        self.attns = nn.ModuleList([MultiHeadAttn(hidden_size=self.hidden_size, emb_size=self.hidden_size) for _ in range(self.n_tran)])
        
        self.fc = nn.Linear(self.hidden_size, self.n_classes)
        self.actv = nn.Softmax(dim=-1)

    def forward(self, x): # [batch, n+1]
        # init mask for label <pad> (1)
        mask = (x != 1).unsqueeze(-1) # [batch, n+1, 1]
        mask = torch.bmm(mask.type(torch.LongTensor), mask.transpose(2, 1).type(torch.LongTensor)).unsqueeze(1) # [batch, 1, n+1, n+1]
        mask = mask.repeat(1, self.n_heads, 1, 1) # [batch, n_heads, n+1, n+1]
        mask = mask.view(mask.shape[0]*mask.shape[1], mask.shape[2], mask.shape[3]).to(device) # [batch*n_heads, n+1, n+1]

        # word embedding
        x_emb = self.emb(x) # [batch, n+1, hidden_size]

        # init position embedding for sequence
        pos = torch.arange(x.shape[1]).unsqueeze(0).repeat(x.shape[0], 1).to(device) #.expand(x_emb.shape[0], x_emb.shape[1]).to(device) # [batch, n+1]
        x_pos = self.pemb(pos).to(device) # [batch, n+1, hidden_size]
        x = x_emb # [batch, n+1, hidden_size]

        # init classification token
        # cls_token = torch.zeros((x.shape[0], 1)).type(torch.LongTensor).to(device) # [batch, 1]
        # cls_token = self.cls(cls_token).to(device) # [batch, 1, hidden_size]
        # x = torch.cat((cls_token, x), 1).to(device) # [batch, n+1, hidden_size]

        for idx in range(self.n_tran):
            x = x + x_pos
            x = self.attns[idx](x, mask=mask) # [batch, n+1, hidden_size]

        # time step n to 1
        x = self.fc(x[:, 0, :]) # [batch, n_class] [batch, 1, hidden_size]

        return x


In [41]:
model = SeqClassifier(input_size=len(text.vocab.stoi),
                      emb_size=emb_size,
                      hidden_size=hidden_size,
                      n_classes=n_cls,
                      n_layers=4)

model.to(device)
crit = nn.CrossEntropyLoss()
crit.to(device)
optim = torch.optim.Adam(model.parameters(), lr = lr)

In [None]:
for i in range(epoch):
    model.train()
    train_acc, train_cnt = 0, 0

    for batch in tqdm(train_loader):
        x, y = batch.text.to(device), batch.label.to(device)

        optim.zero_grad()
        ans = model(x)

        loss = crit(ans, y)
        loss.backward()
        optim.step()

        train_acc += (ans.argmax(1) == y).sum().item()
        train_cnt += y.size(0)

    model.eval()
    with torch.no_grad():
        val_acc, val_cnt = 0, 0

        for batch in val_loader:
            x, y = batch.text.to(device), batch.label.to(device)

            ans = model(x)
 
            loss = crit(ans, y)

            val_acc += (ans.argmax(1) == y).sum().item()
            val_cnt += y.size(0)

    print(ans.argmax(1).detach().cpu().numpy())

    print('Epoch : {:3d}/{} Loss : {:.4f} TrainAcc : {:.4f} ValAcc : {:.4f}'.format(i + 1, epoch, loss, train_acc/train_cnt, val_acc/val_cnt))

# save_checkpoint(model, './model.pth')