In [1]:
import torch.nn.utils
# %% packed sequences
# Round-trip three variable-length sequences: pad -> pack -> unpad -> repack,
# printing every intermediate so the layouts can be compared.
a = torch.Tensor([1, 2, 3])
b = torch.Tensor([4, 5])
c = torch.Tensor([6])
sequences = [a, b, c]
lengths = [len(seq) for seq in sequences]

rnn_utils = torch.nn.utils.rnn
padded = rnn_utils.pad_sequence(sequences, batch_first=False)
# batch_first is left at its default from here on
packed_padded = rnn_utils.pack_padded_sequence(padded, lengths)
repadded, repadded_lengths = rnn_utils.pad_packed_sequence(packed_padded)
repacked = rnn_utils.pack_padded_sequence(repadded, lengths)

# One object per line, in the same order as the individual print calls.
print(sequences, padded, packed_padded, repadded, repacked, sep="\n")

[tensor([ 1.,  2.,  3.]), tensor([ 4.,  5.]), tensor([ 6.])]
tensor([[ 1.,  4.,  6.],
        [ 2.,  5.,  0.],
        [ 3.,  0.,  0.]])
PackedSequence(data=tensor([ 1.,  4.,  6.,  2.,  5.,  3.]), batch_sizes=tensor([ 3,  2,  1]))
tensor([[ 1.,  4.,  6.],
        [ 2.,  5.,  0.],
        [ 3.,  0.,  0.]])
PackedSequence(data=tensor([ 1.,  4.,  6.,  2.,  5.,  3.]), batch_sizes=tensor([ 3,  2,  1]))


In [2]:
from src.parameters import Params
from src.vocabulary import Vocabulary
from src.style_transfer import StyleTransfer
from src.greedy_decoding import Decoder
from src.generate_batches import preprocessSentences

In [3]:
# Build the objects the rest of the notebook relies on: hyper-parameters,
# vocabulary (loaded from disk) and the style-transfer model.
params = Params()
vocab = Vocabulary()
# NOTE(review): the file name suggests a pickle — only load trusted files.
vocab.loadVocabulary("data/yelp/vocabulary.pickle")
vocab.initializeEmbeddings(params.embedding_size)
model = StyleTransfer(params, vocab)

In [4]:
import torch
# Restore the model weights from a saved checkpoint.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# NOTE(review): no map_location is given; tensors are restored to the device they were saved from.
checkpoint = torch.load("data/models/yelp/model-2018-06-27-epoch_18-loss_45.287033")
model.load_state_dict(checkpoint)

In [9]:
import numpy as np

# Take the first 32 lines of the negative dev set (trailing newlines are kept),
# without reading the rest of the file.
with open('data/yelp/dev/negative.txt', 'r') as fp:
    testSents = [line for _, line in zip(range(32), fp)]

# Order the sentences by character length, shortest first.
testSents.sort(key=len)
# Every sentence comes from the negative file, so each one gets label 0.
labels = np.array([0] * len(testSents))

In [None]:
# Decoder used later by the rewriteBatch cell.
# NOTE(review): the meaning of the positional arguments 20 and 12 is not visible
# in this notebook — confirm against src.greedy_decoding.Decoder.
decoder = Decoder(model, 20, 12, params)

In [6]:
# Convert the raw test sentences into the four values the model consumes.
# `targets` is later passed to pad_packed_sequence, so it is a PackedSequence.
# `lenghts` (sic) is the spelling used throughout the notebook; renaming it
# would require changing every cell that reads it.
encoder_inputs, generator_inputs, targets, lenghts = \
            model._sentencesToInputs(testSents)

Lengths:  32


In [7]:
# Unpack the target ids into a padded (batch, time) tensor and print each
# sentence as words.
padded_targets = torch.nn.utils.rnn.pad_packed_sequence(targets, batch_first=True)[0]
# Iterate over the tensor's actual dimensions instead of hard-coded 32 x 16,
# so the cell keeps working when the batch size or longest sentence changes.
for target_ids in padded_targets.tolist():
    # int() yields a plain Python index, matching how other cells read id2word.
    print(" ".join(model.vocabulary.id2word[int(token_id)] for token_id in target_ids))

it was obvious it was the same damn one he brought the first time <unk> <eos>
it 's not really french food and the decor is n't really french either <unk> <eos>
waitress ( jen ) was nice but we waited very long for the food <unk> <eos>
if we were to get this server again , i 'd ask to move <unk> <eos>
we waited over half an hour to get menus and then only got _num_ <unk> <eos>
easter day nothing open , heard about this place figured it would ok <unk> <eos> <pad>
the host that walked us to the table and left without a word <unk> <eos> <pad>
the last couple years this place has been going down hill <unk> <eos> <pad> <pad> <pad>
i ordered a chicken sandwich with onion rings and a soda <unk> <eos> <pad> <pad> <pad>
short term memory apparently since they were still on main entree <unk> <eos> <pad> <pad> <pad>
about _num_ minutes after sitting down , our orders were taken <unk> <eos> <pad> <pad> <pad>
last night however it was way to thick and tasteless <unk> <eos> <pad> <pad> <pad> <pad>
i

In [10]:
# Tell the model how many sentences are in this evaluation batch.
model.eval_size = len(testSents)
# Called for its side effect: model.originalHiddens is read right after this call.
# NOTE(review): the trailing True is presumably an evaluation flag — confirm in StyleTransfer.
model._computeHiddens(
                encoder_inputs, generator_inputs, labels, lenghts, True)
# Run the generator on the prepared generator inputs, starting from the original hiddens.
generatorOutputs, h_teacher = model._generateTokens(
            generator_inputs, model.originalHiddens, lenghts, True)

In [40]:
# Shape of the generator outputs once flattened to one row of vocabulary scores per position.
generatorOutputs.view(-1, model.vocabulary.vocabSize).shape

torch.Size([512, 9603])

In [46]:
# Shape of the padded targets flattened to 1-D; it should have as many entries
# as the flattened generator outputs have rows.
torch.nn.utils.rnn.pad_packed_sequence(targets, batch_first=True)[0].contiguous().view(-1).shape

torch.Size([512])

In [47]:
# Pack the generator outputs and keep only the flat `data` tensor.
# Original author's note (translated): try with batch_first=True, it messes up the order!
packedGenOutput = torch.nn.utils.rnn.pack_padded_sequence(
    generatorOutputs, lenghts, batch_first=True).data

# The loss below is computed on the padded (not the packed) tensors:
# one row of vocabulary scores per position against one target id per position.
flat_logits = generatorOutputs.view(-1, model.vocabulary.vocabSize)
flat_targets = torch.nn.utils.rnn.pad_packed_sequence(
    targets, batch_first=True)[0].contiguous().view(-1)
model.rec_loss_criterion(flat_logits, flat_targets)

tensor(5.2853, device='cuda:0')

# Generate with previous Output

In [49]:
from src.rnn import SoftSampleWord

def _generateWithPrevOutput(
            model, h0, max_len, size, lengths=[], evaluation=False, soft=True):

    hidden = h0
    hiddens = torch.zeros(size, max_len,
                          model.params.autoencoder.hidden_size,
                          device="cuda")
    if soft:
        tokens = torch.zeros(
            size, max_len, model.params.embedding_size, device="cuda")
    else:
        tokens = torch.zeros(size, max_len, device="cuda")
        
    print(tokens.shape)

    goEmbedding = model.vocabulary(['<go>']).squeeze(0)
    goEmbedding = goEmbedding.repeat(size, 1)
    goEmbedding = goEmbedding.unsqueeze(1)
    currTokens = goEmbedding
    softSampleFunction = SoftSampleWord(
        dropout=model.params.dropout,
        embeddings=model.vocabulary.embeddings,
        gamma=model.params.gamma_init)
    
    if soft:

        for index in range(max_len):
            # generator need input (seq_len, batch_size, input_size)
            output, hidden = model.generator(
                currTokens, hidden, pad=False)
            currTokens, vocabLogits = softSampleFunction(
                output=output,
                hiddenToVocab=model.hiddenToVocab)
            tokens[:, index, :] = currTokens
            currTokens = currTokens.unsqueeze(1)
            hiddens[:, index, :] = hidden
            
    else:
        for index in range(max_len):
            output, hidden = model.generator(currTokens, hidden, pad=False)
            vocabLogit = model.hiddenToVocab(hidden)
            idxs = vocabLogit[0, : , :].max(1)[1]
            outputs[:, i] = idxs
            currTokens = model.vocabulary(idxs).unsqueeze(1)

    hiddens = torch.cat((h0.transpose(0, 1), hiddens), dim=1)
    # tokens = torch.cat((goEmbedding, tokens), dim=1)
    return hiddens, tokens

In [None]:
# Rebuild the model inputs for the test sentences and recompute the hiddens
# that seed the free-running generation below.
encoder_inputs, generator_inputs, targets, lenghts = \
            model._sentencesToInputs(testSents)

model.eval_size = len(testSents)
model._computeHiddens(
                encoder_inputs, generator_inputs, labels, lenghts, True)

# _generateWithPrevOutput returns (hiddens, tokens) in that order; the previous
# unpacking had the two names swapped. The batch size now follows the data
# instead of a hard-coded 32.
h_prof, generateWithPrevOutputs = _generateWithPrevOutput(
    model, model.originalHiddens, model.params.max_len, len(testSents),
    lengths=lenghts, evaluation=True, soft=True)

In [None]:
# Display the targets returned by model._sentencesToInputs.
targets

In [None]:
# Greedy free-running decoding: start from <go>, pick the arg-max word at each
# step and feed its embedding back in. The chosen ids are stored in `outputs`,
# which the next cell turns into sentences.
#
# The previous version of this cell referenced softSampleFunction, tokens and
# hiddens, which only exist as locals of _generateWithPrevOutput, and it never
# wrote to `outputs`.
size = 32
goEmbedding = model.vocabulary(['<go>']).squeeze(0)
goEmbedding = goEmbedding.repeat(size, 1)
goEmbedding = goEmbedding.unsqueeze(1)
currTokens = goEmbedding
outputs = torch.zeros(size, 20)
hidden = model.originalHiddens
for i in range(outputs.shape[1]):
    output, hidden = model.generator(
        currTokens, hidden, pad=False)
    vocabLogit = model.hiddenToVocab(hidden)
    # Assumes hidden is (1, size, hidden_size), so [0] selects the single layer — TODO confirm.
    idxs = vocabLogit[0, :, :].max(1)[1]
    outputs[:, i] = idxs
    currTokens = model.vocabulary(idxs).unsqueeze(1)

In [None]:
# Map every row of ids in `outputs` to its words and join them into a sentence.
sents = [
    " ".join(model.vocabulary.id2word[int(token_id)]
             for token_id in list(outputs[row, :]))
    for row in range(32)
]

sents

In [None]:
# For each sentence, take the arg-max over the last dimension at every step
# and look the resulting index up in the vocabulary.
# NOTE(review): this treats the last dimension of generateWithPrevOutputs as
# vocabulary scores — confirm that is what the tensor actually holds.
sents = []
# Was `len = 20`, which shadowed the builtin len() for every later cell
# (e.g. len(testSents)).
seq_len = 20
for index in range(32):
    tokensLogits = generateWithPrevOutputs[index, :, :]
    # int() converts the 0-dim argmax tensor into a plain Python index.
    sent = [model.vocabulary.id2word[int(tokensLogits[j, :].argmax())]
            for j in range(seq_len)]
    sents.append(" ".join(sent))
sents

In [None]:
# Compare the number of packed generator outputs with the number of packed targets.
print(packedGenOutput.shape)
# `targets` is a PackedSequence (it is passed to pad_packed_sequence elsewhere),
# which has no .shape; the flat tensor lives in its .data field.
print(targets.data.shape[0])

In [None]:
# One predicted word per packed position: arg-max of that position's scores.
sents = [
    " ".join([model.vocabulary.id2word[packedGenOutput[index].argmax()]])
    for index in range(packedGenOutput.shape[0])
]
sents

In [None]:
# One target word per packed position.
# `targets` is a PackedSequence: it has no .shape, and integer indexing selects
# its fields rather than positions, so iterate over the flat .data tensor.
sents = [model.vocabulary.id2word[int(word)] for word in targets.data]
sents

In [None]:
# Display the test sentences (sorted by character length when they were loaded).
testSents

In [None]:
# Display the targets returned by model._sentencesToInputs.
targets

In [None]:
# Run the decoder on the first two test sentences and their labels.
# NOTE(review): presumably `orig` holds reconstructions and `tsf` the
# style-transferred rewrites — confirm against Decoder.rewriteBatch.
orig, tsf = decoder.rewriteBatch(testSents[:2], labels[:2])

In [None]:
# Scratch check: Python's built-in any() applied to a tensor that has one
# non-zero and one zero element.
any(torch.FloatTensor([1, 0]))

In [None]:
# Feed one sentence through the generator word by word, starting from an
# all-zero hidden state, and print the arg-max prediction after each word.
sent = 'the restaurant was nice but food tasted ugly'.split()
# NOTE(review): 700 is presumably the generator's hidden size — confirm against Params.
h = torch.zeros(700).to('cuda')
h = h.unsqueeze(0).unsqueeze(0)
for token in sent:
    emb = vocab([token])
    emb = emb.unsqueeze(1)
    out, h = model.generator(emb, h, pad=False)
    voc = model.hiddenToVocab(out)
    # Keep only the arg-max indices. The old code bound them to `id`, shadowing
    # the builtin, assigned to `_`, which IPython reserves for the last output,
    # and carried a no-op `h = h`.
    word_id = voc.max(2)[1]
    print(vocab.id2word[int(word_id)])

In [None]:
# Max over dim 2 of the vocabulary scores for the last `out` produced by the
# single-sentence loop (values and indices).
model.hiddenToVocab(out).max(2)

In [None]:
# Shape of the last token embedding after the unsqueeze(1) in the single-sentence loop.
emb.size()