In [1]:
# import packages
import torch
import numpy as np
import torch.nn as nn 
import torch.nn.functional as F 
import pandas as pd 
from tqdm import tqdm
from torchtext.legacy import data
from torchtext.vocab import Vectors
from torchtext.vocab import GloVe
import time
import math

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline 
import nltk
# nltk.download('punkt')

In [3]:
# preprocessing
from nltk import word_tokenize

text = data.Field(sequential = True, lower = True, tokenize = word_tokenize)
term = data.Field(sequential = False, lower = True)
polarity = data.Field(sequential = False)

In [4]:
train_set, val_set = data.TabularDataset.splits(path=r'data/',
                                        skip_header=True,
                                        train='rest_train.csv',
                                        validation='rest_test.csv',
                                        format='csv',
                                        fields=[('text', text),
                                                ('term', term),
                                                ('polarity', polarity)])

In [5]:
vectors = GloVe(name='6B',dim=300)

In [6]:
text.build_vocab(train_set, val_set, vectors=vectors)
term.build_vocab(train_set, val_set, vectors=vectors)
polarity.build_vocab(train_set, val_set)

text_vocab_size = len(text.vocab)
term_vocab_size = len(term.vocab)
text_vector=text.vocab.vectors
term_vector=term.vocab.vectors

In [7]:
text_vocab_size, term_vocab_size, len(polarity.vocab)

(4545, 1529, 4)

In [8]:
# the text field will now return both the data tensor and the length of the input text
for x in data.BucketIterator(train_set, batch_size=len(train_set), shuffle=False):
     max_len = x.text[1].max().numpy()
for x in data.BucketIterator(val_set, batch_size=len(val_set), shuffle=False):
     max_len = int(max(x.text[1].max().numpy(), max_len))

In [9]:
text_vocab_size, term_vocab_size, max_len

(4545, 1529, 4538)

In [10]:
batch_size=512
train_iter, val_iter = data.Iterator.splits(
            (train_set, val_set),
            sort_key=lambda x: len(x.text),
            batch_sizes=(batch_size, len(val_set)), # batch_size only for training
    )   

In [81]:
class Attention_mlp(nn.Module):
    def __init__(self, embedding_dim, hidden_dim):
        super(Attention_mlp, self).__init__()
        self.wv = nn.Linear(embedding_dim, embedding_dim, bias= False)
        self.wh = nn.Linear(hidden_dim, embedding_dim, bias = False)
        self.embedding_dim = embedding_dim
        self.fc1 = nn.Linear(2 * embedding_dim, 1, bias = False)
    def forward(self, term, hidden):
        #### term shape: batch_size * 1 * embedding
        #### hidden shape: batch_size * seq_len * hidden_dim
        term1 = self.wv(term).transpose(-2,-1)
        # shape(batch_size * embedding_dim * 1)
        hidden1 = self.wh(hidden).transpose(-2,-1)
        # shape(batch_size * embedding_dim * seq_len)

        M = torch.cat((hidden1, term1.expand(hidden1.size())), dim = -2)
        # shape(batch_size * (2 * embedding_dim) * seq_len)

        alpha = F.softmax(self.fc1(torch.tanh(M.transpose(-2,-1))), dim = -2).transpose(-2,-1)
        # shape(batch_size * 1 * seq_len)
        
        h_star = torch.matmul(alpha, hidden)
        # shape(batch_size * 1 * hidden_dim)
        return h_star

class Final_pred(nn.Module):
    def __init__(self, hidden_dim):
        super(Final_pred, self).__init__()
        self.wp = nn.Linear(hidden_dim, hidden_dim, bias = False)
        self.wx = nn.Linear(hidden_dim, hidden_dim, bias = False)
        self.ws = nn.Linear(hidden_dim, 3)

    def forward(self, h_star, h_n):
        o_star = torch.tanh(self.wp(h_star) + self.wx(h_n))
        # shape(batch_size * 1 * hidden_dim)
        y = self.ws(o_star)
        # shape(batch_size * 1 * 3)
        return y.squeeze(1)
        
class ATAE_LSTM(nn.Module):
    def __init__(self, embedding_dim, num_hiddens, num_layers):
        super(ATAE_LSTM, self).__init__()
        self.text_embeddings = nn.Embedding(text_vocab_size, embedding_dim) # learnable embedding
        self.term_embeddings = nn.Embedding(term_vocab_size, embedding_dim) #learnable embedding
        self.text_embeddings = nn.Embedding.from_pretrained(text_vector,
                                                            freeze=False) # static embedding
        self.term_embeddings = nn.Embedding.from_pretrained(term_vector,
                                                              freeze=False) #static embedding
        self.lstm = nn.LSTM(input_size=2 * embedding_dim,
                            hidden_size=num_hiddens,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=True)

        # self.wp = nn.Parameter(torch.Tensor(num_hiddens * 2, num_hiddens * 2))
        # self.wx = nn.Parameter(torch.Tensor(num_hiddens * 2, num_hiddens * 2))
        # self.ws = nn.Parameter(torch.Tensor(3, num_hiddens * 2))
        
        self.attn = Attention_mlp(embedding_dim,2 * num_hiddens)

        self.final_pred = Final_pred(2 *num_hiddens)

    def forward(self, text, term):
        seq_len = len(text.t())
        # print('text2:',text.size(1))
        # print('term:',term.size())
        e1 = self.text_embeddings(text)
        # e1 shape(batch_size,seq_len, embedding_dim)
        e2 = self.term_embeddings(term).expand(e1.size())

        wv = torch.cat((e1, e2), dim=2)
        # e.g.
        # wv torch.Size([batch_size,seq_len,2*embedding_dim])

        out, (h, c) = self.lstm(wv)  # output, (h, c)
        # out shape(batch_size,seq_len, 2 * num_hiddens)
        # h shape(num_layers * num_directions, batch_size, 2*num_hiddens)

        r = self.attn(self.term_embeddings(term), out)
        
        # shape(batch_size * 1 * hidden_dim)
        h_n = out[:, -1:, :]
        # shape(batch_size * 1 * hidden_dim)
        y = self.final_pred(r, h_n)
        # shape(batch_size * 1 * 3)
        return y

# Codes below

In [11]:
# This allow you to automatically detect the device for training
device = torch.device("cuda:" + str(0) if torch.cuda.is_available() else "cpu")

In [12]:
# Here I implemented the proposed three components, GRU, Positional embedding and self-attention. You can use it for further ablation studies.
class PositionalEncoding(nn.Module):
    def __init__(self, emsize, dropout = 0.1, max_len = max_len):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emsize, 2) * (-math.log(10000.0) / emsize))
        pe = torch.zeros(max_len, 1, emsize)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class ATAE_GRU(nn.Module):
    def __init__(self, embedding_dim, num_hiddens, num_layers, dropout=0.2):
        super(ATAE_GRU, self).__init__()
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        self.text_embeddings = nn.Embedding(text_vocab_size, embedding_dim) # learnable embedding
        self.term_embeddings = nn.Embedding(term_vocab_size, embedding_dim) #learnable embedding
        self.text_embeddings = nn.Embedding.from_pretrained(text_vector,
                                                            freeze=False) # static embedding
        self.term_embeddings = nn.Embedding.from_pretrained(term_vector,
                                                              freeze=False) #static embedding
        self.gru = nn.GRU(input_size=2 * embedding_dim,
                            hidden_size=num_hiddens,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=True,
                            dropout=dropout)
        
        for name, p in self.gru.named_parameters():
            if "weight" in name:
                nn.init.orthogonal_(p)
            elif "bias" in name:
                nn.init.constant_(p, 0)
        
        self.attn = Attention_mlp(embedding_dim, 2 * num_hiddens)

        self.final_pred = Final_pred(2 *num_hiddens)

    def forward(self, text, term):
        seq_len = len(text.t())
        # print('text2:',text.size(1))
        # print('term:',term.size())
        e1 = self.text_embeddings(text)
        # e1 shape(batch_size,seq_len, embedding_dim)
        e2 = self.term_embeddings(term).expand(e1.size())

        wv = torch.cat((e1, e2), dim=2)
        # e.g.
        # wv torch.Size([batch_size,seq_len,2*embedding_dim])

        out, (h, c) = self.gru(wv)  # output, (h, c)
        # out shape(batch_size,seq_len, 2 * num_hiddens)
        # h shape(num_layers * num_directions, batch_size, 2*num_hiddens)

        r = self.attn(self.term_embeddings(term), out)
        
        # shape(batch_size * 1 * hidden_dim)
        h_n = out[:, -1:, :]
        # shape(batch_size * 1 * hidden_dim)
        y = self.final_pred(r, h_n)
        # shape(batch_size * 1 * 3)
        return y
    
class ATAE_GRU_Position(ATAE_GRU):
    def __init__(self, embedding_dim, num_hiddens, num_layers, dropout=0.2):
        super(ATAE_GRU_Position, self).__init__(embedding_dim, num_hiddens, num_layers, dropout)
        # self.position_embeddings = nn.Embedding(pos_vocab_size, embedding_dim) #TODO: learnable embedding
        self.position_encodings = PositionalEncoding(num_hiddens*2, dropout=dropout) #static encoding

    def forward(self, text, term):
        seq_len = len(text.t())
        # print('text2:',text.size(1))
        # print('term:',term.size())
        e1 = self.text_embeddings(text)
        # e1 shape(batch_size,seq_len, embedding_dim)
        e2 = self.term_embeddings(term).expand(e1.size())

        wv = torch.cat((e1, e2), dim=2)
        # e.g.
        # wv torch.Size([batch_size,seq_len,2*embedding_dim])

        out, (h, c) = self.gru(wv)  # output, (h, c)
        # out shape(batch_size,seq_len, 2 * num_hiddens)
        # h shape(num_layers * num_directions, batch_size, 2*num_hiddens)

        r = self.attn(self.term_embeddings(term), self.position_encodings(out.transpose(0,1)).transpose(0,1))
        # shape(batch_size * 1 * hidden_dim)
        h_n = out[:, -1:, :]
        # shape(batch_size * 1 * hidden_dim)
        y = self.final_pred(r, h_n)
        # shape(batch_size * 1 * 3)
        return y

In [13]:
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(data_iter):
            X1, X2, y = batch.text.to(device), batch.term.to(device), batch.polarity.to(device)
            X1 = X1.permute(1, 0)
            X2 = X2.unsqueeze(1)
            y.data.sub_(1)  # index start from 0
            if isinstance(net, torch.nn.Module):
                net.eval()  
                acc_sum += (net(X1,
                                X2).argmax(dim=1) == y).float().sum().item()
                net.train()  
            else:
                if ('is_training'
                        in net.__code__.co_varnames): 
                    acc_sum += (net(X1, X2, is_training=False).argmax(
                        dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(
                        X1, X2).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n

In [85]:
def train(train_iter, test_iter, net, loss, optimizer, num_epochs):
    batch_count = 0
    ret = ret = [[],[],[],[],[]]
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for batch_idx, batch in enumerate(train_iter):
            X1, X2, y = batch.text.to(device), batch.term.to(device), batch.polarity.to(device)
            X1 = X1.permute(1, 0)
            X2 = X2.unsqueeze(1)
            y.data.sub_(1)  # index start from 0
            y_hat = net(X1,X2)
            l = loss(y_hat, y)

            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print(
            'epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
            % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n,
               test_acc, time.time() - start))
        ret[0].append(epoch + 1)
        ret[1].append(train_l_sum / batch_count)
        ret[2].append(train_acc_sum / n)
        ret[3].append(test_acc)
        ret[4].append(time.time() - start)
                      
    return ret

## Baseline for ATAE_LSTM

In [86]:
embedding_dim, num_hiddens, num_layers = 300, 150, 1
net = ATAE_LSTM(embedding_dim, num_hiddens, num_layers).to(device)
print(net)
lr, num_epochs = 0.001, 20
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
lstm_data = train(train_iter, val_iter, net, loss, optimizer, num_epochs)

ATAE_LSTM(
  (text_embeddings): Embedding(4545, 300)
  (term_embeddings): Embedding(1529, 300)
  (lstm): LSTM(600, 150, batch_first=True, bidirectional=True)
  (attn): Attention_mlp(
    (wv): Linear(in_features=300, out_features=300, bias=False)
    (wh): Linear(in_features=300, out_features=300, bias=False)
    (fc1): Linear(in_features=600, out_features=1, bias=False)
  )
  (final_pred): Final_pred(
    (wp): Linear(in_features=300, out_features=300, bias=False)
    (wx): Linear(in_features=300, out_features=300, bias=False)
    (ws): Linear(in_features=300, out_features=3, bias=True)
  )
)
epoch 1, loss 0.9801, train acc 0.562, test acc 0.659, time 19.5 sec
epoch 2, loss 0.4395, train acc 0.614, test acc 0.671, time 19.4 sec
epoch 3, loss 0.2591, train acc 0.651, test acc 0.684, time 20.3 sec
epoch 4, loss 0.1831, train acc 0.678, test acc 0.710, time 19.4 sec
epoch 5, loss 0.1277, train acc 0.739, test acc 0.723, time 18.0 sec
epoch 6, loss 0.0956, train acc 0.774, test acc 0.731,

## Baseline for ATAE_GRU

In [15]:
embedding_dim, num_hiddens, num_layers = 300, 150, 1
net = ATAE_GRU(embedding_dim, num_hiddens, num_layers).to(device)
print(net)
lr, num_epochs = 0.001, 20
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
gru_data = train(train_iter, val_iter, net, loss, optimizer, num_epochs)



ATAE_GRU(
  (text_embeddings): Embedding(4545, 300)
  (term_embeddings): Embedding(1529, 300)
  (gru): GRU(600, 150, batch_first=True, dropout=0.2, bidirectional=True)
  (attn): Attention_mlp(
    (wv): Linear(in_features=300, out_features=300, bias=False)
    (wh): Linear(in_features=300, out_features=300, bias=False)
    (fc1): Linear(in_features=600, out_features=1, bias=False)
  )
  (final_pred): Final_pred(
    (wp): Linear(in_features=300, out_features=300, bias=False)
    (wx): Linear(in_features=300, out_features=300, bias=False)
    (ws): Linear(in_features=300, out_features=3, bias=True)
  )
)
epoch 1, loss 0.9785, train acc 0.568, test acc 0.665, time 13.6 sec
epoch 2, loss 0.4058, train acc 0.629, test acc 0.671, time 14.0 sec
epoch 3, loss 0.2609, train acc 0.653, test acc 0.704, time 13.8 sec
epoch 4, loss 0.1813, train acc 0.685, test acc 0.725, time 13.2 sec
epoch 5, loss 0.1258, train acc 0.735, test acc 0.723, time 12.6 sec
epoch 6, loss 0.1013, train acc 0.768, test 

## Baseline for ATAE_GRU+position-embedding

In [68]:
embedding_dim, num_hiddens, num_layers = 300, 150, 1
net = ATAE_GRU_Position(embedding_dim, num_hiddens, num_layers).to(device)
print(net)
lr, num_epochs = 0.001, 20
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
pos_data = train(train_iter, val_iter, net, loss, optimizer, num_epochs)

ATAE_GRU_Position(
  (text_embeddings): Embedding(4545, 300)
  (term_embeddings): Embedding(1529, 300)
  (gru): GRU(600, 150, batch_first=True, dropout=0.2, bidirectional=True)
  (attn): Attention_mlp(
    (wv): Linear(in_features=300, out_features=300, bias=False)
    (wh): Linear(in_features=300, out_features=300, bias=False)
    (fc1): Linear(in_features=600, out_features=1, bias=False)
  )
  (final_pred): Final_pred(
    (wp): Linear(in_features=300, out_features=300, bias=False)
    (wx): Linear(in_features=300, out_features=300, bias=False)
    (ws): Linear(in_features=300, out_features=3, bias=True)
  )
  (position_encodings): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
)
epoch 1, loss 1.0059, train acc 0.536, test acc 0.631, time 13.4 sec
epoch 2, loss 0.4575, train acc 0.606, test acc 0.661, time 13.8 sec
epoch 3, loss 0.3026, train acc 0.616, test acc 0.671, time 13.0 sec
epoch 4, loss 0.1995, train acc 0.659, test acc 0.673, time 13.3 sec
epoch 5, lo

## Prepare new data

In [14]:
import csv
def aggreation_data(path_to_single):
    single_data = pd.read_csv(path_to_single)
    single_data["polarity"] = single_data["polarity"].astype(str)
    aggre_data = pd.DataFrame({"text":single_data.groupby("text").count().index, "term":single_data.groupby("text")["term"].apply(lambda x:x.str.cat(sep='|')), "polarity":single_data.groupby("text")["polarity"].apply(lambda x:x.str.cat(sep='|'))})
    aggre_data.to_csv(path_to_single[:-4]+'_aggregation'+'.csv', index=False)
    # return aggre_data

In [15]:
aggreation_data('./data/rest_train.csv')
aggreation_data('./data/rest_test.csv')

In [16]:
a = pd.read_csv('./data/rest_train.csv')
b = pd.read_csv('./data/rest_test.csv')

In [17]:
c = pd.read_csv('./data/rest_train_aggregation.csv')
d = pd.read_csv('./data/rest_test_aggregation.csv')

In [18]:
c

Unnamed: 0,text,term,polarity
0,"$160 for 2 filets, 2 sides, an appetizer and d...",filets|sides|appetizer|drinks,0|0|0|0
1,$20 for all you can eat sushi cannot be beaten.,sushi,0
2,$20 gets you unlimited sushi of a very high qu...,sushi|sushi places|quality,1|1|1
3,"$6 and there is much tasty food, all of it fre...",food,1
4,"($200 for 2 glasses of champagne, not too expe...",glasses of champagne|bottle of wine|after dinn...,-1|-1|-1
...,...,...,...
1971,we were tired and cold when we got to the rest...,appetizers,0
1972,"word of advice, save room for pasta dishes and...",pasta dishes|tiramisu,1|1
1973,would have rather tried terrace in the sky or ...,price,-1
1974,you can actually get 2 salads worth if u take ...,salads|lettuce,-1|0


In [20]:
c.polarity

0        0|0|0|0
1              0
2          1|1|1
3              1
4       -1|-1|-1
          ...   
1971           0
1972         1|1
1973          -1
1974        -1|0
1975           0
Name: polarity, Length: 1976, dtype: object

In [19]:
max_term = max(a.groupby("text").count().max()[0], b.groupby("text").count().max()[0])

In [22]:
max_term

13

In [20]:
# preprocessing
from nltk import word_tokenize

text = data.Field(sequential = True, lower = True, tokenize = word_tokenize)
term = data.Field(sequential = True, lower = True, tokenize = lambda x:x.split("|"))
polarity = data.Field(sequential = True, tokenize = lambda x:x.split("|"))

In [21]:
train_set, val_set = data.TabularDataset.splits(path=r'data/',
                                        skip_header=True,
                                        train='rest_train_aggregation.csv',
                                        validation='rest_test_aggregation.csv',
                                        format='csv',
                                        fields=[('text', text),
                                                ('term', term),
                                                ('polarity', polarity)])

In [22]:
text.build_vocab(train_set, val_set, vectors=vectors)
term.build_vocab(train_set, val_set, vectors=vectors)
polarity.build_vocab(train_set, val_set)

text_vocab_size = len(text.vocab)
term_vocab_size = len(term.vocab)
polarity_size = len(polarity.vocab)
text_vector=text.vocab.vectors
term_vector=term.vocab.vectors

In [26]:
text_vocab_size, term_vocab_size, len(polarity.vocab)

(4545, 1530, 5)

In [27]:
batch_size=512
train_iter, val_iter = data.Iterator.splits(
            (train_set, val_set),
            sort_key=lambda x: len(x.text),
            batch_sizes=(batch_size, len(val_set)), # batch_size only for training
)

In [23]:
def evaluate_accuracy_sa(data_iter, net):
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(data_iter):
            X1, X2, y = batch.text.to(device), batch.term.to(device), batch.polarity.to(device)
            X1 = X1.permute(1, 0)
            X2 = X2.permute(1, 0)
            y = y.permute(1, 0)
            raw_mask = 1-(y==1).type(torch.long)
            y = y.reshape(-1,1).squeeze()
            # print(y, raw_mask)
            mask = raw_mask.reshape(-1,1).squeeze() # get rid of padding
            y.data.sub_(2)  # index start from 0
            y *= mask
            if isinstance(net, torch.nn.Module):
                net.eval()  
                res = net(X1, X2, 1-raw_mask).argmax(dim=1)*mask
                acc_sum += ((res == (y*mask)).float().sum().item() - (1-mask).sum())
                net.train()  
            else:
                if ('is_training'
                        in net.__code__.co_varnames): 
                    acc_sum += (net(X1, X2, is_training=False).argmax(
                        dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(
                        X1, X2).argmax(dim=1) == y).float().sum().item()
            n += (y.shape[0] - (1-mask).sum())
    return acc_sum / n

In [67]:
def train_sa(train_iter, test_iter, net, loss, optimizer, num_epochs):
    batch_count = 0
    ret = [[],[],[],[],[]]
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        # batch_count = 1
        # n=1
        for batch_idx, batch in enumerate(train_iter):
            X1, X2, y = batch.text.to(device), batch.term.to(device), batch.polarity.to(device)
            X1 = X1.permute(1, 0)
            X2 = X2.permute(1, 0)
            y = y.permute(1, 0)
            
            mask = 1-(y==1).type(torch.long)
            y_hat = net(X1,X2,1-mask)
            
            y = y.reshape(-1,1).squeeze()
            mask = mask.reshape(-1,1).squeeze() # get rid of padding
            y.data.sub_(2)  # index start from 0
            y *= mask
            # print(y_hat, y.min())
            l = loss(y_hat, y) * mask # get rid of padding
            l=l.mean()

            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            
            train_l_sum += l.item()
            train_acc_sum += ((y_hat.argmax(dim=1)*mask == (y*mask)).sum().item() - (1-mask).sum())
            # print(mask, mask.sum(), (1-mask).sum(), y.shape[0], (y_hat.argmax(dim=1)*mask == (y*mask)).sum().item())
            n += (y.shape[0] - (1-mask).sum())
            batch_count += 1
        test_acc = evaluate_accuracy_sa(test_iter, net)
        print(
            'epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
            % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n,
               test_acc, time.time() - start))
        ret[0].append(epoch + 1)
        ret[1].append(train_l_sum / batch_count)
        ret[2].append(train_acc_sum / n)
        ret[3].append(test_acc)
        ret[4].append(time.time() - start)
                      
    return ret

## Baseline for ATAE_GRU+position-embedding+self-attention

In [68]:
class ATAE_GRU_Position_SA(ATAE_GRU_Position):
    def __init__(self, embedding_dim, num_hiddens, num_layers, dropout=0.2, num_heads=2):
        super(ATAE_GRU_Position_SA, self).__init__(embedding_dim, num_hiddens, num_layers, dropout)
        self.multihead_attn = torch.nn.MultiheadAttention(2*num_hiddens, num_heads, dropout=dropout)
        
        self.num_attention_heads = num_heads
        self.attention_head_size = int(2 * num_hiddens / num_heads)
        self.all_head_size = 2 * num_hiddens
        
        self.query = nn.Linear(2 * num_hiddens, self.all_head_size)
        self.key = nn.Linear(2 * num_hiddens, self.all_head_size)
        self.value = nn.Linear(2 * num_hiddens, self.all_head_size)
        
        self.attn_dropout = nn.Dropout(dropout)
        self.LayerNorm = nn.LayerNorm(2 * num_hiddens, eps=1e-12)
        self.out_dropout = nn.Dropout(dropout)
        
    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, text_input, term_input, mask_input):
        seq_len = len(text_input.t())
        # print('text2:',text.size(1))
        # print('term:',term.size())
        e1 = self.text_embeddings(text_input)
        # e1 shape(batch_size, seq_len, embedding_dim)
        e2 = self.term_embeddings(term_input)
        # e2 shape(batch_size, term_len, embedding_dim)
        
        # print(0, e1.shape, e2.shape)
        
        seq_len = e1.size(1)
        term_len = e2.size(1)
        #三维扩充为四维
        e1 = e1.unsqueeze(1).repeat(1, term_len, 1, 1) # (batch_size, term_len, seq_len, embedding_dim)
        e2 = e2.unsqueeze(2).repeat(1, 1, seq_len, 1) # (batch_size, term_len, seq_len, embedding_dim)
        
        # print(1, e1.shape, e2.shape)
        
        e1 = e1.reshape(-1, e1.size(2), e1.size(3)) # (batch_size * term_len, seq_len, embedding_dim)
        e2 = e2.reshape(-1, e2.size(2), e2.size(3)) # (batch_size * term_len, seq_len, embedding_dim)
        
        # print(2, e1.shape, e2.shape)

        wv = torch.cat((e1, e2), dim=2)
        # e.g.
        # wv torch.Size([batch_size*term_len,seq_len,2*embedding_dim])

        out, (h, c) = self.gru(wv)  # output, (h, c)
        # out shape(batch_size* term_len, seq_len, 2 * num_hiddens)
        # h shape(num_layers * num_directions, batch_size * term_len, 2 * num_hiddens)
        
        # print(2.1, out.shape)

        posenc = self.position_encodings(out.transpose(0,1)).transpose(0,1)
        h_star = self.attn(e2, self.position_encodings(out.transpose(0,1)).transpose(0,1)) # (batch_size * term_len, 1, 2*hidden)
        h_star = h_star.squeeze().reshape(-1, term_len, 2*self.num_hiddens) # (batch_size, term_len, 2*hidden)
        out = out.squeeze().reshape(-1, term_len, seq_len, 2*self.num_hiddens) # (batch_size, term_len, 2*hidden)
        
        query = self.query(h_star).transpose(1, 0) # (batch_size, term_len, 2*hidden)
        key = self.key(h_star).transpose(1, 0) # (batch_size, term_len, 2*hidden)
        value = self.value(h_star).transpose(1, 0) # (batch_size, term_len, 2*hidden)
        
        attn_output, attn_output_weights = self.multihead_attn(query, key, value, key_padding_mask=mask_input) # (batch_size, term_len, 2*hidden),  (batch_size, term_len, term_len)
        # shape(batch_size, term_len, 2*hidden)
        h_n = out[:, :, -1:, :].squeeze()
        # print(2.3, h_star.shape, h_n.shape)
        # shape(batch_size, term_len, 2*hidden)
        y = self.final_pred(h_star, h_n).reshape(-1, 3)
        # shape(batch_size, term_len, 3)     
            # print(3, y.shape)

        
        return y

In [70]:
embedding_dim, num_hiddens, num_layers = 300, 256, 1
net = ATAE_GRU_Position_SA(embedding_dim, num_hiddens, num_layers).to(device)
print(net)
lr, num_epochs = 0.001, 20
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss(reduction="none")
sa_data = train_sa(train_iter, val_iter, net, loss, optimizer, num_epochs)

ATAE_GRU_Position_SA(
  (text_embeddings): Embedding(4545, 300)
  (term_embeddings): Embedding(1530, 300)
  (gru): GRU(600, 256, batch_first=True, dropout=0.2, bidirectional=True)
  (attn): Attention_mlp(
    (wv): Linear(in_features=300, out_features=300, bias=False)
    (wh): Linear(in_features=512, out_features=300, bias=False)
    (fc1): Linear(in_features=600, out_features=1, bias=False)
  )
  (final_pred): Final_pred(
    (wp): Linear(in_features=512, out_features=512, bias=False)
    (wx): Linear(in_features=512, out_features=512, bias=False)
    (ws): Linear(in_features=512, out_features=3, bias=True)
  )
  (position_encodings): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (multihead_attn): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
  )
  (query): Linear(in_features=512, out_features=512, bias=True)
  (key): Linear(in_features=512, out_features=512, bias=True)
  (value): Linear(in_features=512, out

In [69]:
del net
import gc
gc.collect()

355

In [None]:
def plot_data(data, name):
    
    epoch, train_loss, train_acc, test_acc, time = data
    
    plt.subplot(1, 3, 1)
    plt.plot(epoch, train_loss)
    plt.title("Training loss")

    #plot 2:
    x = np.array([1, 2, 3, 4])
    y = np.array([1, 4, 9, 16])

    plt.subplot(1, 3, 2)
    plt.plot(epoch, train_acc)
    plt.title("Training Acc")
    
    plt.subplot(1, 3, 3)
    plt.plot(epoch, test_acc)
    plt.title("Test Acc")

    plt.suptitle(name)
    # plt.show()

In [None]:
plot_data(lstm_data, "ATAE_LSTM")

In [None]:
plot_data(gru_data, "ATAE_GRU")

In [None]:
plot_data(pos_data, "ATAE_GRU_POS")

In [None]:
plot_data(sa_data, "ATAE_LSTM_POS_SA")