In [1]:
import numpy as np
import faiss
import copy
import json
import torch
import math
import random
from tqdm import tqdm
from augmentation import TfIdfAugmentation
from tools.utils import ExternalPreprocessor

from modelling.models import DAN, Embedding
from modelling.templates import SequenceTemplate

In [2]:
MAX_LEN = 32

In [3]:
VOCAB_PATH = '../routing/data/sberbank_embeddings/w2v_m5_w3_v300_norm_v48_vocab.txt'
W2V_MATRIX_PATH = '../routing/data/sberbank_embeddings/w2v_m5_w3_v300_norm_v48_vectors.npy'

In [4]:
with open('token2prob.json') as f:
    token2prob = json.loads(f.read())

In [5]:
with open(VOCAB_PATH) as f:
    vocab = f.read().split('\n')

In [6]:
vocab = {key: value for value, key in enumerate(vocab)}

In [7]:
index2prob = {vocab[token]: token2prob[token] for token in token2prob if token in vocab}

In [8]:
aug = TfIdfAugmentation(indexes_matrix=np.load('nearest_matrix.npy'), index2prob=index2prob)

In [9]:
with open('train.jsonl') as f:
    data = [json.loads(sample) for sample in f.read().split('\n')]

In [10]:
random.shuffle(data)

In [11]:
train, test = data[:-len(data) // 10], data[-len(data) // 10:]

In [12]:
len(train), len(test), len(train) + len(test) == len(data)

(11623, 1292, True)

In [13]:
TARGET2INDEX = {
    'ANNA.1.sales': 0,
    'ANNA.1.sbbol': 1,
    'ANNA.1.oper_support': 2
}

In [14]:
def sequence_padding(sequence, max_sequence_length, value) -> np.ndarray:

    sequence = sequence[:max_sequence_length]

    if len(sequence) < max_sequence_length:
        for _ in range((max_sequence_length - len(sequence))):
            sequence.append(value)

    sequence = np.array(sequence)

    return sequence

In [15]:
def indexing_batch(x, vocab, max_sequence_length):
    
    x = [[vocab[tok] for tok in sample if tok in vocab] for sample in x]
    
    x = np.array([sequence_padding(sample, max_sequence_length=max_sequence_length, value=0) for sample in x])
    
    return x

In [16]:
def batch_processing(batch):
    
    x = [sample['tokens'] for sample in batch]
    
    x = indexing_batch(x, vocab, MAX_LEN)
    x_aug = aug.replace_batch(copy.deepcopy(x))
    
    y = np.array([TARGET2INDEX[sample['target']] for sample in batch])
    
    x = torch.LongTensor(x)
    x_aug = torch.LongTensor(x_aug)
    y = torch.LongTensor(y)
    
    return [x, x_aug, y]

In [17]:
def loader(data, batch_size=32):

    for n_batch in range(math.ceil(len(data) / batch_size)):

        batch = data[n_batch * batch_size:(n_batch + 1) * batch_size]

        batch = batch_processing(batch)

        yield batch

In [18]:
from modelling.layers import BaseModule
from modelling.templates import SequenceTemplate

In [19]:
word_matrix = np.load(W2V_MATRIX_PATH)

In [29]:
class Model(BaseModule):
    
    def __init__(self):
        
        super().__init__()
        
        self.embedding = Embedding(vocab_size=word_matrix.shape[0],
                                   embedding_matrix=word_matrix)
        
        self.dan = DAN((300, 256), activation_function_output=torch.nn.ReLU())
        
        self.linear = torch.nn.Linear(256, 256)
        
        self.activation = torch.nn.ReLU()
        
        self.classifier = torch.nn.Linear(256, 3)
        
    def forward(self, x, x_aug):
        
        x_rep = self.embedding(x)
        x_rep = self.dan(x_rep)
        x_rep = self.linear(x_rep)
        x_rep = torch.nn.functional.log_softmax(x_rep, dim=1)
        
        with torch.no_grad():
            
            x_aug_rep = self.embedding(x_aug)
            x_aug_rep = self.dan(x_aug_rep)
            x_aug_rep = self.linear(x_aug_rep)
            x_aug_rep = torch.nn.functional.softmax(x_aug_rep, dim=1)
    
        y_pred = self.classifier(x_rep)
        
        return x_rep, x_aug_rep, y_pred

In [42]:
model = Model()

In [43]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [44]:
kl_div = torch.nn.KLDivLoss(reduction='batchmean')
cross_entropy = torch.nn.CrossEntropyLoss()

In [45]:
kl_losses = []
ce_losses = []
losses = []

l = 0.5

for n in range(5):
    
    epoch_kl_losses = []
    epoch_ce_losses = []
    epoch_losses = []

    model.train()
    
    pg = tqdm(total=len(train), desc=f'Epoch: {n}')

    for x, x_aug, y in loader(train):

        optimizer.zero_grad()

        x_rep, x_aug_rep, y_pred = model(x, x_aug)

        kl_loss = kl_div(x_rep, x_aug_rep)
        ce_loss = cross_entropy(y_pred, y)

        loss = ce_loss + 10 * kl_loss

        loss.backward()

        optimizer.step()
        
        epoch_kl_losses.append(kl_loss.item())
        epoch_ce_losses.append(ce_loss.item())
        epoch_losses.append(loss.item())

        pg.update(x.shape[0])
        pg.set_postfix(kl_loss=epoch_kl_losses[-1], ce_loss=epoch_ce_losses[-1], loss=epoch_losses[-1])

    pg.close()
    
    test_epoch_kl_losses = []
    test_epoch_ce_losses = []
    test_epoch_losses = []

    model.eval()

    for x, x_aug, y in loader(test):

        x_rep, x_aug_rep, y_pred = model(x, x_aug)

        kl_loss = kl_div(x_rep, x_aug_rep)
        ce_loss = cross_entropy(y_pred, y)

        loss = ce_loss + kl_loss

    test_epoch_kl_losses.append(kl_loss.item())
    test_epoch_ce_losses.append(ce_loss.item())
    test_epoch_losses.append(loss.item())
    
    print('KL Train - {:.3f} | Test - {:.3f}'.format(np.mean(epoch_kl_losses), np.mean(test_epoch_kl_losses)))
    print('CE Train - {:.3f} | Test - {:.3f}'.format(np.mean(epoch_ce_losses), np.mean(test_epoch_ce_losses)))
    print('Aggregated Train - {:.3f} | Test - {:.3f}'.format(np.mean(epoch_losses), np.mean(test_epoch_losses)))
    
    kl_losses.extend(copy.deepcopy(epoch_kl_losses))
    ce_losses.extend(copy.deepcopy(epoch_ce_losses))
    losses.extend(copy.deepcopy(epoch_losses))

Epoch: 0: 100%|██████████| 11623/11623 [00:02<00:00, 4602.69it/s, ce_loss=1.26, kl_loss=0.000572, loss=1.26]  
Epoch: 1:   8%|▊         | 896/11623 [00:00<00:02, 4628.95it/s, ce_loss=0.757, kl_loss=0.0016, loss=0.773]  

KL Train - 0.001 | Test - 0.001
CE Train - 1.521 | Test - 0.683
Aggregated Train - 1.527 | Test - 0.684


Epoch: 1: 100%|██████████| 11623/11623 [00:02<00:00, 4566.01it/s, ce_loss=1.34, kl_loss=0.00148, loss=1.36]  
Epoch: 2:   8%|▊         | 896/11623 [00:00<00:02, 4673.08it/s, ce_loss=0.715, kl_loss=0.00213, loss=0.736]

KL Train - 0.002 | Test - 0.002
CE Train - 0.642 | Test - 0.581
Aggregated Train - 0.660 | Test - 0.583


Epoch: 2: 100%|██████████| 11623/11623 [00:02<00:00, 4598.71it/s, ce_loss=1.34, kl_loss=0.00187, loss=1.36]  
Epoch: 3:   8%|▊         | 896/11623 [00:00<00:02, 4620.97it/s, ce_loss=0.699, kl_loss=0.00191, loss=0.718]

KL Train - 0.002 | Test - 0.002
CE Train - 0.597 | Test - 0.541
Aggregated Train - 0.621 | Test - 0.543


Epoch: 3: 100%|██████████| 11623/11623 [00:02<00:00, 4590.44it/s, ce_loss=1.31, kl_loss=0.00212, loss=1.33]  
Epoch: 4:   8%|▊         | 896/11623 [00:00<00:02, 4671.20it/s, ce_loss=0.684, kl_loss=0.00195, loss=0.703]

KL Train - 0.003 | Test - 0.002
CE Train - 0.577 | Test - 0.518
Aggregated Train - 0.602 | Test - 0.520


Epoch: 4: 100%|██████████| 11623/11623 [00:02<00:00, 4589.88it/s, ce_loss=1.27, kl_loss=0.00184, loss=1.29]  


KL Train - 0.003 | Test - 0.003
CE Train - 0.561 | Test - 0.499
Aggregated Train - 0.587 | Test - 0.503


In [25]:
class Model(BaseModule):
    
    def __init__(self):
        
        super().__init__()
        
        self.embedding = Embedding(vocab_size=word_matrix.shape[0],
                                   embedding_matrix=word_matrix)
        
        self.dan = DAN((300, 256), activation_function_output=torch.nn.ReLU())
        
        self.linear = torch.nn.Linear(256, 256)
        
        self.activation = torch.nn.ReLU()
        
        self.classifier = torch.nn.Linear(256, 3)
        
    def forward(self, x):
        
        x_rep = self.embedding(x)
        x_rep = self.dan(x_rep)
        x_rep = self.linear(x_rep)
        x_rep = torch.nn.functional.log_softmax(x_rep, dim=1)
    
        y_pred = self.classifier(x_rep)
        
        return y_pred

In [26]:
model = Model()

In [27]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [28]:
losses = []

l = 0.5

for n in range(5):
    
    epoch_losses = []

    model.train()
    
    pg = tqdm(total=len(train), desc=f'Epoch: {n}')

    for x, _, y in loader(train):

        optimizer.zero_grad()

        y_pred = model(x)

        loss = cross_entropy(y_pred, y)

        loss.backward()

        optimizer.step()
        
        epoch_losses.append(loss.item())

        pg.update(x.shape[0])
        pg.set_postfix(loss=epoch_losses[-1])

    pg.close()

    test_epoch_losses = []

    model.eval()

    for x, x_aug, y in loader(test):

        y_pred = model(x)

        loss = cross_entropy(y_pred, y)

    test_epoch_losses.append(loss.item())
    
    print('Loss Train - {:.3f} | Test - {:.3f}'.format(np.mean(epoch_losses), np.mean(test_epoch_losses)))

    losses.extend(copy.deepcopy(epoch_losses))

Epoch: 0: 100%|██████████| 11623/11623 [00:02<00:00, 5136.96it/s, loss=1.48] 
Epoch: 1:   9%|▉         | 1024/11623 [00:00<00:02, 5239.85it/s, loss=0.502]

Loss Train - 0.805 | Test - 0.625


Epoch: 1: 100%|██████████| 11623/11623 [00:02<00:00, 5149.50it/s, loss=1.58] 
Epoch: 2:   9%|▉         | 1024/11623 [00:00<00:02, 5291.72it/s, loss=0.471]

Loss Train - 0.610 | Test - 0.536


Epoch: 2: 100%|██████████| 11623/11623 [00:02<00:00, 5137.20it/s, loss=1.53] 
Epoch: 3:   9%|▉         | 1024/11623 [00:00<00:02, 5242.93it/s, loss=0.462]

Loss Train - 0.578 | Test - 0.492


Epoch: 3: 100%|██████████| 11623/11623 [00:02<00:00, 5144.45it/s, loss=1.46] 
Epoch: 4:   9%|▊         | 992/11623 [00:00<00:02, 5108.51it/s, loss=0.474]

Loss Train - 0.559 | Test - 0.468


Epoch: 4: 100%|██████████| 11623/11623 [00:02<00:00, 5114.81it/s, loss=1.39] 


Loss Train - 0.544 | Test - 0.451
