https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb

In [1]:
from sklearn.model_selection import train_test_split


import numpy as np
import pandas as pd
import pickle
from boltons.iterutils import windowed

# Helper libraries
import random
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torchtext

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import numpy as np

import random
import math
import time

In [2]:
# Select the compute device once for the whole notebook: CUDA when available, otherwise CPU.
# Uncomment the next line to force CPU execution.
#device = torch.device('cpu')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [3]:
# Location of the market-basket CSV, relative to the notebook's working directory.
# NOTE(review): save_obj/load_obj use a lower-case 'data/' directory while this path uses
# 'Data/' - on case-sensitive file systems these are different folders; confirm which is intended.
DATA_PATH = 'Data/Market_Basket_Optimisation.csv'

# Define Functions

In [4]:
def save_obj(obj, name ):
    """Pickle ``obj`` into ``data/<name>.pkl`` (relative to the working directory).

    The ``data/`` directory is not created here; ``open`` fails if it is missing.
    """
    target_path = ''.join(('data/', name, '.pkl'))
    handle = open(target_path, 'wb')
    try:
        pickle.dump(obj, handle)
    finally:
        handle.close()

def load_obj(name ):
    """Load and return the object pickled in ``data/<name>.pkl``.

    Unpickling can execute arbitrary code, so only use this on files you
    created yourself (e.g. via ``save_obj``).
    """
    source_path = ''.join(('data/', name, '.pkl'))
    handle = open(source_path, 'rb')
    try:
        restored = pickle.load(handle)
    finally:
        handle.close()
    return restored

In [5]:
def get_key(val, mapping=None):
    """Reverse lookup: return the item name whose index equals ``val``.

    Parameters
    ----------
    val : int or 0-dim tensor
        Index to look up; compared with ``==`` against every value in the mapping.
    mapping : dict, optional
        Item -> index dictionary. When omitted, the notebook-level ``item2idx``
        is used, so that global must have been defined before the call.

    Returns
    -------
    The first matching key, or the string ``"key doesn't exist"`` when no
    value matches (a sentinel string is returned, no exception is raised).
    """
    if mapping is None:
        # Backward-compatible default: fall back to the notebook global.
        mapping = item2idx
    for key, value in mapping.items():
        if val == value:
            return key
    return "key doesn't exist"

In [6]:
def get_val(keyval, mapping=None):
    """Return the index stored for item ``keyval``.

    Parameters
    ----------
    keyval : hashable
        Item name to look up.
    mapping : dict, optional
        Item -> index dictionary. When omitted, the notebook-level ``item2idx``
        is used, so that global must have been defined before the call.

    Returns
    -------
    The stored index, or the string ``"value doesn't exist"`` when the item is
    unknown (a sentinel string is returned, no exception is raised).
    """
    if mapping is None:
        # Backward-compatible default: fall back to the notebook global.
        mapping = item2idx
    try:
        # Direct dictionary lookup instead of scanning every entry.
        return mapping.get(keyval, "value doesn't exist")
    except TypeError:
        # Unhashable lookup keys can never be present in the dictionary.
        return "value doesn't exist"

In [7]:
def load_data(path, sequence_length=4, strip_items=False):
    """Read the basket file and cut every basket into sliding windows.

    Each line of the CSV is one basket: a comma-separated list of item names.
    Names are lower-cased, and every basket is expanded into all contiguous
    windows of ``sequence_length`` items (baskets shorter than that yield none).

    Parameters
    ----------
    path : str
        CSV file to read. (Previously this argument was ignored and the global
        ``DATA_PATH`` was always read.)
    sequence_length : int
        Number of items per window.
    strip_items : bool
        When True, surrounding whitespace is removed from every item name so
        that e.g. ``' asparagus'`` and ``'asparagus'`` become one token.
        Defaults to False because enabling it changes the vocabulary and
        therefore every item index.

    Returns
    -------
    list of tuples, one tuple of ``sequence_length`` item names per window.
    """
    baskets = (
        pd.read_csv(path, sep=';', names=['sequence'])
        .sequence
        .dropna()          # guard: a missing row would break .split below
        .str.lower()
        .tolist()
    )

    windows = []
    for basket in baskets:
        items = basket.split(',')
        if strip_items:
            items = [item.strip() for item in items]
        windows.extend(windowed(items, sequence_length))

    # Pass through tqdm only to keep the progress bar of the original cell.
    return list(tqdm_notebook(windows))


def get_unique_items(sequences):
    """Collect the set of distinct items that occur in any of the sequences."""
    distinct = set()
    for sequence in sequences:
        distinct.update(sequence)
    return distinct


def create_item2idx(sequences):
    """Build the item -> index vocabulary; indices follow the sorted item order."""
    ordered_items = sorted(get_unique_items(sequences))
    vocabulary = {}
    for position, item in enumerate(ordered_items):
        vocabulary[item] = position
    return vocabulary


def encode_sequence(sequence, item2idx):
    """Translate one sequence of item names into the list of their indices.

    Raises ``KeyError`` for an item that is not in ``item2idx``.
    """
    encoded = []
    for item in sequence:
        encoded.append(item2idx[item])
    return encoded


def encode_sequences(sequences, char2idx):
    """Encode every sequence with ``char2idx`` and stack the results into one array.

    A progress bar is shown while iterating over ``sequences``.
    """
    encoded_rows = []
    for sequence in tqdm_notebook(sequences):
        encoded_rows.append(encode_sequence(sequence, char2idx))
    return np.array(encoded_rows)

def split_list_x_and_y(encoded_seq, x_len=2, y_len=2):
    """Split encoded windows column-wise into model input and target.

    Parameters
    ----------
    encoded_seq : 2-D array, one encoded window per row
    x_len : int
        Number of leading columns used as input ``x``.
    y_len : int
        Number of columns directly after the input used as target ``y``.

    With the defaults (and windows of length 4) the first two items become
    ``x`` and the last two become ``y``. The split must be adapted when the
    window length passed to ``load_data`` changes.

    Returns
    -------
    (x_num, y_num) : the two column slices of ``encoded_seq``
    """
    x_num = encoded_seq[:, 0:x_len]
    y_num = encoded_seq[:, x_len:x_len + y_len]
    return x_num, y_num

Das Modell soll innerhalb einer Session den jeweiligen state feststellen, daher muss das Validation Set in der gleichen Session liegen wie die Trainsession; allerdings zu einem (im Idealfall) späteren/letzten Zustand

# Build Data Loader

In [8]:
class Sequences(Dataset):
    """Dataset of encoded basket windows, split chronologically into train and test.

    The whole file is loaded and encoded, then the first 80 % of the windows
    form the training part and the remaining windows the test part. The split
    is positional (not random) because the windows are ordered.

    Attributes
    ----------
    sequences : all windows of item names returned by ``load_data``
    item2idx / idx2item : vocabulary and its inverse (built from the full file,
        so train and test instances share the same indices)
    vocab_size : number of distinct items
    dataseq_len : total number of windows before splitting
    train_len : number of windows in the training part
    x, y : tensors on the notebook-level ``device`` holding input / target columns
    """

    def __init__(self, path, trainset=True):
        self.sequences = load_data(path)
        self.item2idx = create_item2idx(self.sequences)
        self.idx2item = {idx: item for item, idx in self.item2idx.items()}
        # The vocabulary already contains every distinct item exactly once.
        self.vocab_size = len(self.item2idx)

        x_all, y_all = split_list_x_and_y(encode_sequences(self.sequences, self.item2idx))
        self.dataseq_len = x_all.shape[0]
        # A random train/test split is not appropriate here: the windows are ordered.
        self.train_len = np.ceil(self.dataseq_len*0.8).astype(int)

        if trainset:
            part = slice(0, self.train_len)
        else:
            part = slice(self.train_len, None)

        self.x = torch.from_numpy(x_all[part]).to(device)
        self.y = torch.from_numpy(y_all[part]).to(device)

    def __getitem__(self, i):
        """Return the (input, target) pair at position ``i``."""
        return self.x[i], self.y[i]

    def __len__(self):
        """Number of windows in this split."""
        return len(self.x)

In [9]:
# Build the train and test splits. Each call re-reads and re-encodes the complete file,
# which is why four progress bars appear below (load + encode, once per split).
dataset_emb_train = Sequences(DATA_PATH, trainset=True)
dataset_emb_test = Sequences(DATA_PATH, trainset=False)

HBox(children=(IntProgress(value=0, max=11726), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11726), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11726), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11726), HTML(value='')))




In [10]:
dataset_emb_train.x.shape, dataset_emb_train.y.shape

(torch.Size([9381, 2]), torch.Size([9381, 2]))

In [11]:
# Batch both splits in their stored order (shuffle is disabled for training as well).
dataloader_emb_train=DataLoader(dataset=dataset_emb_train,batch_size=64, shuffle=False)
dataloader_emb_test=DataLoader(dataset=dataset_emb_test,batch_size=64, shuffle=False)

In [12]:
dataset_emb_train.vocab_size, dataset_emb_test.vocab_size

(120, 120)

In [13]:
dataloader_emb_train.dataset.x.shape, dataloader_emb_test.dataset.x.shape

(torch.Size([9381, 2]), torch.Size([2345, 2]))

In [14]:
max(dataloader_emb_test.dataset.idx2item)

119

In [15]:
dataloader_emb_test.dataset.x[17], dataloader_emb_test.dataset.x[18]

(tensor([ 71, 111], dtype=torch.int32), tensor([111,  37], dtype=torch.int32))

In [16]:
dataloader_emb_test.dataset.y[17], dataloader_emb_test.dataset.y[18]

(tensor([37, 25], dtype=torch.int32), tensor([25, 43], dtype=torch.int32))

In [17]:
# get_key / get_val read this notebook-level vocabulary; it must be defined before they are called.
item2idx = dataloader_emb_train.dataset.item2idx
get_key(71), get_key(111), get_key(37)

('milk', 'vegetables mix', 'eggs')

# Build Model 

## Encoder

In [18]:
class Encoder(nn.Module):
    """Transformer encoder: scaled token embeddings plus learned positional
    embeddings, followed by a stack of ``n_layers`` encoder layers.

    Parameters
    ----------
    input_dim : vocabulary size of the source tokens
    hid_dim : embedding / hidden size
    n_layers : number of ``EncoderLayer`` blocks
    n_heads : attention heads per layer
    pf_dim : inner size of the position-wise feed-forward block
    dropout : dropout probability
    device : device the scale constant is initially created on
    max_length : number of positions in the positional embedding; source
        sequences must not be longer than this
    """

    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = 5):
        super().__init__()

        # Kept for backward compatibility; forward() no longer relies on it.
        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        # sqrt(hid_dim): scales the token embeddings before positions are added.
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        """Encode ``src``.

        src      = [batch size, src len] token indices
        src_mask = mask passed unchanged to every layer
                   (Seq2Seq.make_src_mask produces [batch size, 1, 1, src len])

        Returns a tensor of shape [batch size, src len, hid dim].
        """
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        # Create the position indices directly on the input's device. The stored
        # self.device can be stale after .to(...) or torch.load(map_location=...).
        pos = torch.arange(0, src_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)
        
        #pos = [batch size, src len]
        
        # self.scale is a plain tensor attribute (not a buffer), so align it explicitly.
        scale = self.scale.to(src.device)
        
        src = self.dropout((self.tok_embedding(src) * scale) + self.pos_embedding(pos))
        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
            
        return src

### Encoder Layer 

In [19]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a position-wise feed-forward
    network, each followed by dropout, a residual connection and layer norm.

    NOTE(review): a single ``layer_norm`` module (one set of weights) is applied
    after both sub-layers. Separate norms per sub-layer are the more common
    design, but switching would change the parameter layout of saved models.
    """

    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()

        self.layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(
            hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Apply the block.

        src      = [batch size, src len, hid dim]
        src_mask = attention mask forwarded to the self-attention layer

        Returns a tensor with the same shape as ``src``.
        """
        # Sub-layer 1: self-attention (query, key and value are all ``src``).
        attended, _ = self.self_attention(src, src, src, src_mask)
        normed = self.layer_norm(src + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward.
        transformed = self.positionwise_feedforward(normed)
        return self.layer_norm(normed + self.dropout(transformed))

### MultiheadAttentionLayer

In [20]:
class MultiHeadAttentionLayer(nn.Module):
    """Multi-head scaled dot-product attention.

    Parameters
    ----------
    hid_dim : model size; must be divisible by ``n_heads``
    n_heads : number of attention heads
    dropout : dropout probability applied to the attention weights
    device : device the scale constant is initially created on
    """

    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        # sqrt(head_dim): divisor of the dot products.
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        """Attend from ``query`` over ``key`` / ``value``.

        query = [batch size, query len, hid dim]
        key   = [batch size, key len, hid dim]
        value = [batch size, value len, hid dim]
        mask  = optional tensor broadcastable to
                [batch size, n heads, query len, key len]; positions where it
                equals 0 are excluded from the softmax

        Returns ``(x, attention)`` with
        x         = [batch size, query len, hid dim]
        attention = [batch size, n heads, query len, key len]
        """
        batch_size = query.shape[0]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                
        # Split the hidden dimension into heads and move the head axis forward.
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
        
        # self.scale is a plain tensor attribute (not a buffer), so it does not
        # follow module.to(...); align it with the activations explicitly.
        scale = self.scale.to(Q.device)
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            # Large negative value -> (numerically) zero weight after softmax.
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
        
        x = torch.matmul(self.dropout(attention), V)
        
        #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, query len, n heads, head dim]
        
        # Merge the heads back into the hidden dimension.
        x = x.view(batch_size, -1, self.hid_dim)
        
        #x = [batch size, query len, hid dim]
        
        x = self.fc_o(x)
        
        #x = [batch size, query len, hid dim]
        
        return x, attention

###  Position-wise Feedforward Layer

In [21]:
class PositionwiseFeedforwardLayer(nn.Module):
    """Two-layer feed-forward network applied to the last dimension:
    hid_dim -> pf_dim (ReLU, dropout) -> hid_dim.
    """

    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()

        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape [batch size, seq len, hid dim] to a tensor of the same shape."""
        # Expand to the inner dimension and apply the non-linearity.
        hidden = torch.relu(self.fc_1(x))
        hidden = self.dropout(hidden)

        # Project back to the model dimension.
        return self.fc_2(hidden)

## Decoder

In [22]:
class Decoder(nn.Module):
    """Transformer decoder: scaled token embeddings plus learned positional
    embeddings, a stack of ``n_layers`` decoder layers and a final linear
    projection onto the output vocabulary.

    Parameters
    ----------
    output_dim : vocabulary size of the target tokens
    hid_dim : embedding / hidden size
    n_layers : number of ``DecoderLayer`` blocks
    n_heads : attention heads per layer
    pf_dim : inner size of the position-wise feed-forward block
    dropout : dropout probability
    device : device the scale constant is initially created on
    max_length : number of positions in the positional embedding; target
        sequences must not be longer than this
    """

    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 5):
        super().__init__()
        
        # Kept for backward compatibility; forward() no longer relies on it.
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        # sqrt(hid_dim): scales the token embeddings before positions are added.
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Decode ``trg`` while attending over the encoder output.

        trg      = [batch size, trg len] token indices
        enc_src  = [batch size, src len, hid dim]
        trg_mask = mask for the decoder self-attention
                   (Seq2Seq.make_trg_mask produces [batch size, 1, trg len, trg len])
        src_mask = mask for the encoder attention
                   (Seq2Seq.make_src_mask produces [batch size, 1, 1, src len])

        Returns ``(output, attention)`` with
        output    = [batch size, trg len, output dim] unnormalised scores
        attention = encoder-attention weights of the last layer,
                    [batch size, n heads, trg len, src len], or ``None`` when
                    the decoder has no layers
        """
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        # Create the position indices directly on the input's device. The stored
        # self.device can be stale after .to(...) or torch.load(map_location=...).
        pos = torch.arange(0, trg_len, device=trg.device).unsqueeze(0).repeat(batch_size, 1)
                            
        #pos = [batch size, trg len]
        
        # self.scale is a plain tensor attribute (not a buffer), so align it explicitly.
        scale = self.scale.to(trg.device)
            
        trg = self.dropout((self.tok_embedding(trg) * scale) + self.pos_embedding(pos))
                
        #trg = [batch size, trg len, hid dim]
        
        # Defined up front so an empty layer stack does not leave it unbound.
        attention = None
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        #trg = [batch size, trg len, hid dim]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output, attention

### Decoder Layer

In [23]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder
    output and a position-wise feed-forward network. Each sub-layer is followed
    by dropout, a residual connection and layer norm.

    NOTE(review): one ``layer_norm`` module (one set of weights) is reused after
    all three sub-layers. Separate norms per sub-layer are the more common
    design, but switching would change the parameter layout of saved models.
    """

    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()

        self.layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(
            hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Apply the block.

        trg      = [batch size, trg len, hid dim]
        enc_src  = [batch size, src len, hid dim]
        trg_mask = mask for the self-attention over ``trg``
        src_mask = mask for the attention over ``enc_src``

        Returns ``(trg, attention)``: the transformed target representation
        (same shape as the input ``trg``) and the encoder-attention weights
        [batch size, n heads, trg len, src len].
        """
        # Sub-layer 1: self-attention over the target sequence.
        self_attended, _ = self.self_attention(trg, trg, trg, trg_mask)
        state = self.layer_norm(trg + self.dropout(self_attended))

        # Sub-layer 2: queries come from the target, keys/values from the encoder.
        cross_attended, attention = self.encoder_attention(state, enc_src, enc_src, src_mask)
        state = self.layer_norm(state + self.dropout(cross_attended))

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.positionwise_feedforward(state)
        state = self.layer_norm(state + self.dropout(transformed))

        return state, attention

##  Seq2Seq

In [24]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that builds the attention masks and runs both parts.

    Parameters
    ----------
    encoder, decoder : the two sub-models
    src_pad_idx, trg_pad_idx : token indices treated as padding when the masks
        are built
    device : stored for backward compatibility; masks are created on the
        device of the incoming tensors
    """

    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        """Return a boolean mask [batch size, 1, 1, src len] that is True for
        every source position that is not padding.

        src = [batch size, src len]
        """
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #src_mask = [batch size, 1, 1, src len]

        return src_mask
    
    def make_trg_mask(self, trg):
        """Return a boolean mask [batch size, 1, trg len, trg len] combining the
        padding mask with a lower-triangular (no look-ahead) mask.

        trg = [batch size, trg len]
        """
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(3)
        
        #trg_pad_mask = [batch size, 1, trg len, 1]
        
        trg_len = trg.shape[1]
        
        # Allocate on the input's device; the stored self.device can be stale
        # after .to(...) or torch.load(map_location=...).
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = trg.device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

    def forward(self, src, trg):
        """Run encoder and decoder.

        src = [batch size, src len]
        trg = [batch size, trg len]

        Returns the decoder result ``(output, attention)``:
        output    = [batch size, trg len, output dim]
        attention = [batch size, n heads, trg len, src len]
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        return output, attention

# Train Seq2Seq Model

In [25]:
# Source and target share the same item vocabulary.
INPUT_DIM = dataset_emb_train.vocab_size
OUTPUT_DIM = dataset_emb_train.vocab_size
#HID_DIM = 256
# HID_DIM is also the embedding size; if pre-trained embeddings are used, HID_DIM has to match them.
HID_DIM = 128
ENC_LAYERS = 3
DEC_LAYERS = 3
# HID_DIM must be divisible by the number of heads (asserted in MultiHeadAttentionLayer).
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 100
DEC_PF_DIM = 100
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

# max_length is left at its default of 5 positions for both encoder and decoder.
enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [26]:
# The padding index is one past the largest item index, i.e. a value that is not part of the
# vocabulary. It therefore never occurs in the encoded data, and it also lies outside the
# range of the token embeddings (which have vocab_size rows).
SRC_PAD_IDX = max(dataset_emb_train.idx2item)+1
TRG_PAD_IDX = max(dataset_emb_train.idx2item)+1

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [27]:
model

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(120, 128)
    (pos_embedding): Embedding(5, 128)
    (layers): ModuleList(
      (0): EncoderLayer(
        (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=128, out_features=128, bias=True)
          (fc_k): Linear(in_features=128, out_features=128, bias=True)
          (fc_v): Linear(in_features=128, out_features=128, bias=True)
          (fc_o): Linear(in_features=128, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=128, out_features=100, bias=True)
          (fc_2): Linear(in_features=100, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (layer_

In [28]:
def count_parameters(model):
    """Return the total number of scalar entries in all trainable parameters of ``model``."""
    total = 0
    for parameter in model.parameters():
        if not parameter.requires_grad:
            continue  # frozen parameters are not counted
        total += parameter.numel()
    return total

# Report the size of the model built above (thousands separator via the ',' format spec).
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 798,416 trainable parameters


In [29]:
def initialize_weights(m):
    """Xavier-uniform initialisation for every weight with more than one dimension.

    Intended for ``model.apply(initialize_weights)``. Modules without a
    ``weight`` attribute, modules whose ``weight`` is ``None`` and
    one-dimensional weights (e.g. LayerNorm) are left untouched. Biases are
    not changed.
    """
    weight = getattr(m, 'weight', None)
    # Some modules expose ``weight = None`` (e.g. LayerNorm without affine parameters).
    if weight is not None and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

In [30]:
# Re-initialise all sub-modules recursively; the trailing ';' suppresses the model repr.
model.apply(initialize_weights);

In [31]:
# Adam with a fixed learning rate; no scheduler is configured in this notebook.
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [32]:
# No ignore_index is set: the padding index defined above never occurs in the targets.
criterion = nn.CrossEntropyLoss()

##  Train

In [33]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Parameters
    ----------
    model : callable as ``model(src, trg)`` returning ``(output, attention)``
        with ``output`` of shape [batch size, trg len, output dim]
    iterator : iterable of ``(inputs, outputs)`` batches, each
        [batch size, seq len] integer tensors
    optimizer : optimizer over ``model.parameters()``
    criterion : loss taking ``(logits [N, output dim], targets [N])``
    clip : maximum gradient norm (passed to ``clip_grad_norm_``)

    The decoder is fed the target without its last item and is trained to
    predict the target without its first item.

    Raises ``ZeroDivisionError`` when ``iterator`` yields no batch.
    """
    model.train()

    # Move batches to wherever the model lives instead of relying on a
    # notebook-level ``device`` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    epoch_loss = 0
    n_batches = 0

    for inputs, outputs in iterator:
        src = inputs.long()
        trg = outputs.long()
        if target_device is not None:
            src = src.to(target_device)
            trg = trg.to(target_device)

        optimizer.zero_grad()

        output, _ = model(src, trg[:,:-1])

        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]

        output_dim = output.shape[-1]

        # Flatten batch and time so the loss sees one prediction per row.
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)

        #output = [batch size * (trg len - 1), output dim]
        #trg = [batch size * (trg len - 1)]

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1

    return epoch_loss / n_batches

In [40]:
def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch without updating the model.

    Parameters
    ----------
    model : callable as ``model(src, trg)`` returning ``(output, attention)``
        with ``output`` of shape [batch size, trg len, output dim]
    iterator : iterable of ``(inputs, outputs)`` batches, each
        [batch size, seq len] integer tensors
    criterion : loss taking ``(logits [N, output dim], targets [N])``

    The model is switched to evaluation mode and gradients are disabled.

    Raises ``ZeroDivisionError`` when ``iterator`` yields no batch.
    """
    model.eval()

    # Move batches to wherever the model lives instead of relying on a
    # notebook-level ``device`` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    epoch_loss = 0
    n_batches = 0

    with torch.no_grad():

        for inputs, outputs in iterator:
            src = inputs.long()
            trg = outputs.long()
            if target_device is not None:
                src = src.to(target_device)
                trg = trg.to(target_device)

            output, _ = model(src, trg[:,:-1])

            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]

            output_dim = output.shape[-1]

            # Flatten batch and time so the loss sees one prediction per row.
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)

            #output = [batch size * (trg len - 1), output dim]
            #trg = [batch size * (trg len - 1)]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
            n_batches += 1

    return epoch_loss / n_batches

In [41]:
def epoch_time(start_time, end_time):
    """Split the duration between two timestamps (in seconds) into whole
    minutes and whole remaining seconds; fractions are truncated.

    Returns ``(minutes, seconds)``.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    remaining_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, remaining_seconds

In [90]:
# Train for a fixed number of epochs and report the training loss and perplexity per epoch.
N_EPOCHS = 3
CLIP = 1

# NOTE(review): best_valid_loss is initialised but never updated - this loop does not call
# evaluate(), so no validation loss is computed and no best checkpoint is kept.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, dataloader_emb_train, optimizer, criterion, CLIP)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
 
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    # Perplexity is exp(mean cross-entropy loss).
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

Epoch: 01 | Time: 0m 20s
	Train Loss: 2.489 | Train PPL:  12.043
Epoch: 02 | Time: 0m 23s
	Train Loss: 2.504 | Train PPL:  12.230
Epoch: 03 | Time: 0m 24s
	Train Loss: 2.474 | Train PPL:  11.872


Ein Problem beim Validation Set ist hier, dass die Sessions per se keine inhärente Logik haben wie ein Satz. Je genauer wir die Trainingssessions lernen, desto weiter entfernen wir uns vom Validation Set, weil wir uns immer "sicherer" werden. Wenn die Sequenz aber nicht zu 100 % stimmt, verschlechtern wir den Validation Loss.

In [91]:
# Same device selection as at the top of the notebook, repeated before saving/loading the model.
# Uncomment the next line to force CPU execution.
#device = torch.device('cpu')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [92]:
# Serialise the complete model object (not just its state_dict). This produces the
# serialization warnings shown below.
# NOTE(review): a whole-model pickle needs the same class definitions to be available when it
# is loaded again; saving model.state_dict() is the more portable alternative.
file_name = "Attention_Rebuild02.pth"
torch.save(model, file_name)

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [93]:
# Load the previously saved model; map_location places its tensors on the current device.
# The module's training/eval flag is restored as it was saved, so call .eval() before inference.
# NOTE(review): torch.load unpickles arbitrary objects - only load files you created yourself.
file_name = "Attention_Rebuild02.pth"
model_embeddings = torch.load(file_name, map_location=device)

In [94]:
model_embeddings

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(120, 128)
    (pos_embedding): Embedding(5, 128)
    (layers): ModuleList(
      (0): EncoderLayer(
        (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=128, out_features=128, bias=True)
          (fc_k): Linear(in_features=128, out_features=128, bias=True)
          (fc_v): Linear(in_features=128, out_features=128, bias=True)
          (fc_o): Linear(in_features=128, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=128, out_features=100, bias=True)
          (fc_2): Linear(in_features=100, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (layer_

# Evaluate Next Sequence

Hier noch mit target_seq (könnte man auch theoretisch nehmen, wenn man 10 Sequenzen abwartet und dann 5 als input und die nächsten 5 als Output, allerdings dann nicht mehr wirklich korrekt).

### künstliche Sequenz der Länge n erstellen

Problem ist noch bei BERT: die Input Query wird gegen die Output Query geworfen, um eine weitere Sequenz zu erzeugen. Was ich versuchen möchte, ist, den Input selbst als Query gegen den Session-state zu werfen und daraus selbst eine neue Sequenz zu erzeugen. Bzw sollte ich als Input n eingeben, als target n+1, also einfach einen weiter. Der aktuelle State ist der Target state.

Außerdem muss die Reihenfolge sein: vom neuesten angesehenen Artikel bis zu dem vor n Klicks angesehenen Artikel.

# Testing 

In [95]:
def gen_seq(src_tensor, trg_tensor, n=5, model=None):
    """Greedily extend a session by ``n`` predicted items.

    In every step the model is run on the current source/target windows, the
    most likely item at the first decoder position is taken as prediction,
    and both windows are shifted: the prediction is put in front and the
    oldest (last) item is dropped.

    Parameters
    ----------
    src_tensor : LongTensor [1, src len]
        Current source window, most recently seen item first.
    trg_tensor : LongTensor [1, trg len]
        Current target window, most recently seen item first.
    n : int
        Number of items to generate. ``n <= 0`` generates nothing.
    model : optional
        Model providing ``make_src_mask``, ``make_trg_mask``, ``encoder`` and
        ``decoder``. Defaults to the notebook-level ``model_embeddings``.

    Returns
    -------
    (src_window, complete_gen_seq)
        ``src_window`` is the source window after the last step ([1, src len],
        newest first). ``complete_gen_seq`` is [1, src len + n] in
        chronological order: the original source items (oldest first) followed
        by the predictions in the order they were generated.
    """
    if model is None:
        # Backward-compatible default: the model loaded earlier in the notebook.
        model = model_embeddings

    # Inference must run with dropout disabled; a loaded model keeps the
    # training flag it was saved with.
    model.eval()

    src_input = src_tensor
    trg_input = trg_tensor
    # Running history, newest item first (same orientation as src_tensor).
    history = src_tensor[0]

    with torch.no_grad():
        for _ in range(n):
            src_mask = model.make_src_mask(src_input)
            enc_src = model.encoder(src_input, src_mask)

            trg_mask = model.make_trg_mask(trg_input)
            output, _ = model.decoder(trg_input, enc_src, trg_mask, src_mask)

            # Take the most likely item at the first decoder position.
            pred_token = output.argmax(2)[:, 0].item()
            pred = torch.tensor([pred_token], device=src_input.device)

            # Drop the oldest item of each window and prepend the prediction.
            src_input = torch.cat((pred, src_input[0][:-1]), dim=0).unsqueeze(0)
            trg_input = torch.cat((pred.to(trg_input.device), trg_input[0][:-1]), dim=0).unsqueeze(0)

            history = torch.cat((pred, history), dim=0)

    # For display the order is reversed once: oldest item first, newest
    # prediction last. Doing this after the loop keeps the orientation the
    # same for every n (including n == 1).
    complete_gen_seq = torch.flip(history, [0]).unsqueeze(0)

    return src_input, complete_gen_seq

In [101]:
# Pick one test window as source and the following window as target.
# Both are reversed so that the most recently seen item comes first, as gen_seq expects
# (flipping dimension 0 has no effect here because it has size 1).
idx_no = 8
src_tensor = dataloader_emb_test.dataset.x[idx_no].long().unsqueeze(0)
src_tensor = torch.flip(src_tensor, [0,1])
trg_tensor = dataloader_emb_test.dataset.x[idx_no+1].long().unsqueeze(0)
trg_tensor = torch.flip(trg_tensor, [0,1])
src_tensor, trg_tensor

(tensor([[62,  7]]), tensor([[29, 62]]))

In [102]:
# Generate three items and print the sequence in chronological order.
src_tensor_, complete_gen_seq = gen_seq(src_tensor, trg_tensor, n=3)
for i in range(0,len(complete_gen_seq[0])):
    # The first two entries are the given source items; everything after is predicted.
    if i == 2:
        print('ab jetzt predictions')
    print(get_key(complete_gen_seq[0][i]))

barbecue sauce
ketchup
ab jetzt predictions
french fries
hot dogs
green tea


# selbst Sequenzen erstellen und gucken, wie BERT sie weiterführen würde

In [69]:
dataset_emb_train.idx2item[4], dataset_emb_train.item2idx['avocado']

('avocado', 4)

In [70]:
dataset_emb_train.idx2item

{0: ' asparagus',
 1: 'almonds',
 2: 'antioxydant juice',
 3: 'asparagus',
 4: 'avocado',
 5: 'babies food',
 6: 'bacon',
 7: 'barbecue sauce',
 8: 'black tea',
 9: 'blueberries',
 10: 'body spray',
 11: 'bramble',
 12: 'brownies',
 13: 'bug spray',
 14: 'burger sauce',
 15: 'burgers',
 16: 'butter',
 17: 'cake',
 18: 'candy bars',
 19: 'carrots',
 20: 'cauliflower',
 21: 'cereals',
 22: 'champagne',
 23: 'chicken',
 24: 'chili',
 25: 'chocolate',
 26: 'chocolate bread',
 27: 'chutney',
 28: 'cider',
 29: 'clothes accessories',
 30: 'cookies',
 31: 'cooking oil',
 32: 'corn',
 33: 'cottage cheese',
 34: 'cream',
 35: 'dessert wine',
 36: 'eggplant',
 37: 'eggs',
 38: 'energy bar',
 39: 'energy drink',
 40: 'escalope',
 41: 'extra dark chocolate',
 42: 'flax seed',
 43: 'french fries',
 44: 'french wine',
 45: 'fresh bread',
 46: 'fresh tuna',
 47: 'fromage blanc',
 48: 'frozen smoothie',
 49: 'frozen vegetables',
 50: 'gluten free bar',
 51: 'grated cheese',
 52: 'green beans',
 53: 'g

In [71]:
get_val('eggplant'), get_val('herb & pepper'),get_val('olive oil'),get_val('parmesan cheese')

(36, 59, 81, 83)

Darauf achten, dass die Reihenfolge vom gerade gesehenen Artikel zum vor n Klicks gesehenen Artikel verläuft.

In [72]:
# Hand-built session (indices looked up with get_val above), most recent item first.
src_tensor_created = torch.tensor([81, 83]).unsqueeze(0).to(device)
# trg_tensor is src_tensor shifted back by one position, with one newer item put in front
trg_tensor_created = torch.tensor([59, 81]).unsqueeze(0).to(device)

Was sich erkennen lässt: wenn der gerade gesehene Artikel (also der Artikel, der in trg_tensor aber nicht in src_tensor drin ist) anders ist, als der zuletzt gesehene Artikel, dann werden die vorgeschlagenen nächsten Artikel anders, eher dazu passend. Wenn es sich um einen ähnlichen Artikel handelt, dann werden auch ähnliche Artikel als folgende Sequenz generiert.

In [76]:
# Generate three items for the hand-built session and print them in chronological order.
src_tensor_, complete_gen_seq = gen_seq(src_tensor_created, trg_tensor_created, n=3)
for i in range(0,len(complete_gen_seq[0])):
    # The first two entries are the given source items; everything after is predicted.
    if i == 2:
        print('ab jetzt predictions')
    print(get_key(complete_gen_seq[0][i]))

parmesan cheese
olive oil
ab jetzt predictions
ground beef
spaghetti
mineral water


# BERT Seq Generation mit topk

In [103]:
def gen_seq_topk(src_tensor, trg_tensor, n=5, topk=5, model=None):
    """Extend a session by ``n`` items, sampling each one uniformly from the
    ``topk`` highest-scoring candidates.

    In every step the model is run on the current source/target windows, one
    of the ``topk`` best items at the first decoder position is drawn at
    random (via ``np.random``), and both windows are shifted: the drawn item is
    put in front and the oldest (last) item is dropped.

    Parameters
    ----------
    src_tensor : LongTensor [1, src len]
        Current source window, most recently seen item first.
    trg_tensor : LongTensor [1, trg len]
        Current target window, most recently seen item first.
    n : int
        Number of items to generate. ``n <= 0`` generates nothing.
    topk : int
        Number of candidates to sample from; limited to the vocabulary size.
    model : optional
        Model providing ``make_src_mask``, ``make_trg_mask``, ``encoder`` and
        ``decoder``. Defaults to the notebook-level ``model_embeddings``.

    Returns
    -------
    (src_window, complete_gen_seq)
        ``src_window`` is the source window after the last step ([1, src len],
        newest first). ``complete_gen_seq`` is [1, src len + n] in
        chronological order: the original source items (oldest first) followed
        by the sampled items in the order they were generated.
    """
    if model is None:
        # Backward-compatible default: the model loaded earlier in the notebook.
        model = model_embeddings

    model.eval()

    # The first step uses the given windows; afterwards they are overwritten
    # with windows that contain the generated items.
    src_input = src_tensor
    trg_input = trg_tensor
    # Running history, newest item first (same orientation as src_tensor).
    history = src_tensor[0]

    with torch.no_grad():
        for _ in range(n):
            src_mask = model.make_src_mask(src_input)
            enc_src = model.encoder(src_input, src_mask)

            trg_mask = model.make_trg_mask(trg_input)
            output, _ = model.decoder(trg_input, enc_src, trg_mask, src_mask)

            # Draw one of the k best candidates at the first decoder position.
            # k cannot exceed the number of available candidates.
            k = min(topk, output.shape[-1])
            rnd_nmbr = np.random.randint(k)
            _, top_art = torch.topk(output[0][0], k)
            pred = top_art[rnd_nmbr].detach().view(-1).to(src_input.device)

            # Drop the oldest item of each window and prepend the drawn item.
            src_input = torch.cat((pred, src_input[0][:-1]), dim=0).unsqueeze(0)
            trg_input = torch.cat((pred.to(trg_input.device), trg_input[0][:-1]), dim=0).unsqueeze(0)

            history = torch.cat((pred, history), dim=0)

    # For display the order is reversed once: oldest item first, newest
    # generated item last. Doing this after the loop keeps the orientation the
    # same for every n (including n == 1).
    complete_gen_seq = torch.flip(history, [0]).unsqueeze(0)

    return src_input, complete_gen_seq

In [104]:
# Pick one test window as source and the following window as target.
# Both are reversed so that the most recently seen item comes first, as gen_seq_topk expects
# (flipping dimension 0 has no effect here because it has size 1).
idx_no = 5
src_tensor = dataloader_emb_test.dataset.x[idx_no].long().unsqueeze(0)
src_tensor = torch.flip(src_tensor, [0,1])
trg_tensor = dataloader_emb_test.dataset.x[idx_no+1].long().unsqueeze(0)
trg_tensor = torch.flip(trg_tensor, [0,1])
src_tensor, trg_tensor

(tensor([[72, 55]]), tensor([[90, 72]]))

In [105]:
get_key(trg_tensor[0][0])

'rice'

In [108]:
# Sample three items (each drawn from the five best candidates) and print the sequence in
# chronological order. No random seed is set, so repeated runs give different results.
src_tensor_, complete_gen_seq = gen_seq_topk(src_tensor, trg_tensor, n=3, topk=5)
for i in range(0,len(complete_gen_seq[0])):
    # The first two entries are the given source items; everything after is generated.
    if i == 2:
        print('ab jetzt predictions')
    print(get_key(complete_gen_seq[0][i]))

ground beef
mineral water
ab jetzt predictions
protein bar
toothpaste
candy bars


###  selbst Sequenzen erstellen und gegen topk testen

In [81]:
dataset_emb_train.idx2item

{0: ' asparagus',
 1: 'almonds',
 2: 'antioxydant juice',
 3: 'asparagus',
 4: 'avocado',
 5: 'babies food',
 6: 'bacon',
 7: 'barbecue sauce',
 8: 'black tea',
 9: 'blueberries',
 10: 'body spray',
 11: 'bramble',
 12: 'brownies',
 13: 'bug spray',
 14: 'burger sauce',
 15: 'burgers',
 16: 'butter',
 17: 'cake',
 18: 'candy bars',
 19: 'carrots',
 20: 'cauliflower',
 21: 'cereals',
 22: 'champagne',
 23: 'chicken',
 24: 'chili',
 25: 'chocolate',
 26: 'chocolate bread',
 27: 'chutney',
 28: 'cider',
 29: 'clothes accessories',
 30: 'cookies',
 31: 'cooking oil',
 32: 'corn',
 33: 'cottage cheese',
 34: 'cream',
 35: 'dessert wine',
 36: 'eggplant',
 37: 'eggs',
 38: 'energy bar',
 39: 'energy drink',
 40: 'escalope',
 41: 'extra dark chocolate',
 42: 'flax seed',
 43: 'french fries',
 44: 'french wine',
 45: 'fresh bread',
 46: 'fresh tuna',
 47: 'fromage blanc',
 48: 'frozen smoothie',
 49: 'frozen vegetables',
 50: 'gluten free bar',
 51: 'grated cheese',
 52: 'green beans',
 53: 'g

In [82]:
get_val('protein bar'), get_val('nonfat milk'),get_val('mineral water')

(88, 78, 72)

In [87]:
# Hand-built session, most recent item first.
src_tensor_created = torch.tensor([88, 72]).unsqueeze(0).to(device)
# trg_tensor is src_tensor shifted back by one position, with one newer item put in front
# NOTE(review): index 63 is not one of the indices looked up in the preceding cell
# (88, 78, 72) - confirm that it is the intended item.
trg_tensor_created = torch.tensor([63, 88]).unsqueeze(0).to(device)

In [88]:
# Sample five items (each drawn from the 25 best candidates) for the hand-built session and
# print them in chronological order. No random seed is set, so repeated runs differ.
src_tensor_, complete_gen_seq = gen_seq_topk(src_tensor_created, trg_tensor_created, n=5, topk=25)
for i in range(0,len(complete_gen_seq[0])):
    # The first two entries are the given source items; everything after is generated.
    if i == 2:
        print('ab jetzt predictions')
    print(get_key(complete_gen_seq[0][i]))

mineral water
protein bar
ab jetzt predictions
cottage cheese
honey
champagne
body spray
salt
