In [1]:
import torchtext
from torchtext import data
from torchtext import datasets
import spacy
import time

In [2]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
# Run on the GPU only when one is actually available, so that the `.cuda()`
# calls guarded by this flag do not crash on a CPU-only machine.
USE_CUDA = torch.cuda.is_available()

In [4]:
# Sentence boundary markers (passed to the TEXT field as init/eos tokens).
BOS_WORD, EOS_WORD = '<s>', '</s>'

# spaCy English pipeline; only its tokenizer is used in this cell.
spacy_en = spacy.load('en')


def tokenize(text):
    """Return the list of token strings produced by the spaCy tokenizer for ``text``."""
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

In [5]:
# Text field: lower-cased, tokenized with `tokenize`, batch-first, with the
# BOS/EOS markers supplied as init/eos tokens.
TEXT = data.Field(
    tokenize=tokenize,
    lower=True,
    batch_first=True,
    init_token=BOS_WORD,
    eos_token=EOS_WORD,
)
# Label field: declared as non-sequential.
LABEL = data.Field(sequential=False)

In [6]:
# make splits for data
# SST train / validation / test datasets built from the TEXT and LABEL fields.
# '../../data/' is passed as the third positional argument (presumably the
# dataset root directory -- confirm against the torchtext version in use).
# NOTE(review): the name `train` is rebound further down by `def train(...)`,
# so cells that use the `train` dataset cannot be re-run after that definition
# has been executed.
train, val, test = datasets.SST.splits(TEXT, LABEL, '../../data/')

In [7]:
# Show the training set's fields, its size and its first example.
for description, value in (
    ('train.fields', train.fields),
    ('len(train)', len(train)),
    ('vars(train[0])', vars(train[0])),
):
    print(description, value)

train.fields {'text': <torchtext.data.field.Field object at 0x7f0c41d36080>, 'label': <torchtext.data.field.Field object at 0x7f0c41d36128>}
len(train) 8544
vars(train[0]) {'text': ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', "'s", 'new', '``', 'conan', "''", 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean', '-', 'claud', 'van', 'damme', 'or', 'steven', 'segal', '.'], 'label': 'positive'}


In [8]:
# Both vocabularies are built from the training split only.
TEXT.build_vocab(train)
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))

# One iterator per split; the same batch size and options are used for all three.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test),
    batch_size=32,
    shuffle=True,
    repeat=False,
)

len(TEXT.vocab) 15483
len(LABEL.vocab) 4


In [9]:
# Vocabulary indices of the begin-of-sentence, end-of-sentence and padding tokens.
BOS_IDX, EOS_IDX, PAD_IDX = (
    TEXT.vocab.stoi[token] for token in (BOS_WORD, EOS_WORD, '<pad>')
)

In [11]:
class LSTMEncoder(nn.Module):
    """Encode a batch of token indices into one vector per sequence.

    Tokens are embedded, run through an LSTM, and the final hidden state of
    the last LSTM layer (after dropout) is returned.

    Args:
        embedding: ``nn.Embedding`` used to look up token vectors; its
            ``embedding_dim`` is the LSTM input size.
        hidden_size: size of the LSTM hidden state and of the returned vectors.
        num_layers: number of stacked LSTM layers.
        dropout_frac: dropout probability applied to the final hidden state.
    """

    def __init__(self, embedding, hidden_size, num_layers=1, dropout_frac=0.5):

        super(LSTMEncoder, self).__init__()

        self.embedding = embedding
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_size, num_layers)
        self.dropout = nn.Dropout(p=dropout_frac)

    def forward(self, batch):
        """Return a ``(batch, hidden_size)`` encoding.

        Args:
            batch: batch-first tensor of token indices, ``(batch, seq_len)``.
        """
        embedded = self.embedding(batch)
        # The LSTM is sequence-first, so swap the batch and time axes.
        # transpose(0, 1) is used instead of .t() because .t() is rejected for
        # 3-D tensors by current PyTorch releases.
        _, (hidden, _) = self.lstm(embedded.transpose(0, 1))
        hidden = self.dropout(hidden)
        # `hidden` is (num_layers, batch, hidden_size). Selecting the last
        # layer explicitly (rather than a bare squeeze()) keeps the batch axis
        # for a batch of one and gives a 2-D result when num_layers > 1.
        return hidden[-1]

In [12]:
class LSTMDecoder(nn.Module):
    """LSTM decoder conditioned on a per-sequence vector.

    The conditioning vector ``hidden_init`` is used twice: it is concatenated
    to the token embedding at every time step, and it initialises the LSTM
    hidden state (the cell state starts at zero).

    Args:
        embedding: ``nn.Embedding`` for the input tokens; its
            ``num_embeddings`` is also the size of the output distribution.
        hidden_size: size of the LSTM hidden state; ``hidden_init`` must have
            this many features.
        num_layers: number of stacked LSTM layers. Every layer is initialised
            with the same ``hidden_init``.
        dropout_frac: dropout probability applied to the LSTM outputs.
    """

    def __init__(self, embedding, hidden_size, num_layers=1, dropout_frac=0.5):

        super(LSTMDecoder, self).__init__()

        self.embedding = embedding
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(self.embedding.embedding_dim + hidden_size, hidden_size, num_layers)
        self.dropout = nn.Dropout(p=dropout_frac)
        self.linear = nn.Linear(hidden_size, self.embedding.num_embeddings)

    def forward(self, batch, hidden_init):
        """Return log-probabilities of shape ``(batch, seq_len, num_embeddings)``.

        Args:
            batch: batch-first tensor of input token indices, ``(batch, seq_len)``.
            hidden_init: conditioning vectors, ``(batch, hidden_size)``.
        """
        seq_len = batch.size(1)
        # The embedding output already lives on the module's device, so no
        # explicit .cuda() call (and no global flag) is needed here.
        embedded = self.embedding(batch)
        hidden_init_repeated = torch.stack([hidden_init] * seq_len, 1)
        lstm_input = torch.cat([embedded, hidden_init_repeated], dim=2)

        # Initial state is (num_layers, batch, hidden_size); for num_layers == 1
        # this equals hidden_init.unsqueeze(0).
        h0 = torch.stack([hidden_init] * self.num_layers, 0)
        # Zero cell state created with the same type/device as h0.
        c0 = Variable(h0.data.new(h0.size()).zero_())

        # The LSTM is sequence-first; transpose(0, 1) also works on 3-D tensors
        # where .t() does not.
        output, _ = self.lstm(lstm_input.transpose(0, 1), (h0, c0))
        output = self.dropout(output).transpose(0, 1)
        # log_softmax is computed in a numerically stable way, whereas
        # log(softmax(x)) yields -inf once a probability underflows to 0.
        return F.log_softmax(self.linear(output), dim=2)

In [13]:
class VAE(nn.Module):
    """Sequence VAE: encoder -> diagonal Gaussian latent -> decoder.

    Args:
        encoder: module mapping token indices to ``(batch, encoder.hidden_size)``.
        decoder: module called as ``decoder(tokens, hidden_init)`` that returns
            per-token log-probabilities; must expose ``hidden_size``.
        latent_dim: dimensionality of the latent variable.
    """

    def __init__(self, encoder, decoder, latent_dim):

        super(VAE, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.latent_dim = latent_dim

        self.linear_mean = nn.Linear(encoder.hidden_size, self.latent_dim)
        # Despite its name this layer predicts the *log*-variance: forward()
        # exponentiates half of its output to obtain the standard deviation.
        self.linear_var = nn.Linear(encoder.hidden_size, self.latent_dim)
        self.linear_decoder = nn.Linear(self.latent_dim, decoder.hidden_size)

    def forward(self, batch):
        """Encode ``batch``, sample a latent code and decode it.

        Args:
            batch: batch-first tensor of token indices ``(batch, seq_len)``
                that includes the BOS and EOS tokens.

        Returns:
            ``(log_probs, means, log_vars)``: the decoder output and the
            parameters of the approximate posterior.
        """
        batch_size = batch.size(0)

        # Replace every EOS by padding and drop the last column. The decoder
        # input therefore keeps BOS but no EOS, while the encoder input
        # (sliced below) has neither.
        # NOTE(review): the padding index was previously the literal 1; this
        # assumes PAD_IDX == 1 -- confirm.
        batch_no_eos = batch.clone()
        batch_no_eos[batch_no_eos == EOS_IDX] = PAD_IDX
        batch_no_eos = batch_no_eos[:, :-1]

        hidden = self.encoder(batch_no_eos[:, 1:])
        means = self.linear_mean(hidden)
        log_vars = self.linear_var(hidden)

        # Standard-normal noise created with the same type/device as `means`,
        # so no global CUDA flag is needed. (torch.normal(means=...) is not
        # accepted by current PyTorch releases.)
        noise = Variable(means.data.new(batch_size, self.latent_dim).normal_())

        # Reparameterisation: z = mu + eps * sigma, with sigma = exp(log_var / 2).
        latent = means + noise * torch.exp(0.5 * log_vars)
        decoder_hidden_init = self.linear_decoder(latent)
        log_probs = self.decoder(batch_no_eos, decoder_hidden_init)
        return log_probs, means, log_vars

In [15]:
def vae_loss(target, log_probs, means, log_vars, pad_idx=None):
    """Negative ELBO of a batch, summed over sequences.

    Args:
        target: ``(batch, seq_len)`` tensor of target token indices.
        log_probs: ``(batch, seq_len, vocab)`` log-probabilities.
        means: ``(batch, latent_dim)`` posterior means.
        log_vars: ``(batch, latent_dim)`` posterior log-variances.
        pad_idx: index of the padding token, whose positions are excluded
            from the reconstruction term. Defaults to the module-level
            ``PAD_IDX``.

    Returns:
        ``(batch_loss, rec_loss, reg_loss, n_tokens)``: total loss,
        reconstruction term, KL term (each summed over the batch) and the
        number of non-padding target tokens.
    """
    if pad_idx is None:
        pad_idx = PAD_IDX

    # Log-probability assigned to each target token. Only the gathered axis is
    # squeezed, so batches of size 1 and sequences of length 1 keep their 2-D shape.
    token_log_probs = torch.gather(log_probs, dim=2, index=target.unsqueeze(2)).squeeze(2)

    pad_mask = target == pad_idx
    n_tokens = (~pad_mask.data).long().sum()
    # Padding positions contribute nothing to the reconstruction loss.
    token_log_probs = token_log_probs.masked_fill(pad_mask, 0)

    rec_loss = -token_log_probs.sum(dim=1)
    # KL(N(mu, sigma^2) || N(0, 1)) for a diagonal Gaussian.
    reg_loss = -0.5 * torch.sum(1 + log_vars - means ** 2 - torch.exp(log_vars), dim=1)

    rec_total = rec_loss.sum()
    reg_total = reg_loss.sum()
    return rec_total + reg_total, rec_total, reg_total, n_tokens

def train(model, data, optimizer):
    """Run one training epoch and return per-token statistics.

    Each batch is optimised on its loss divided by its number of non-padding
    target tokens.

    Args:
        model: module called as ``model(tokens)`` that returns
            ``(log_probs, means, log_vars)``.
        data: iterable of batches exposing a ``text`` tensor.
        optimizer: optimizer over the model's parameters.

    Returns:
        ``(avg_loss_all, avg_reg_loss_all, ppl)``: total loss per token, KL
        term per token, and ``exp(avg_loss_all)``. Note that ``ppl`` is the
        exponential of the full loss (reconstruction + KL), not of the
        reconstruction term alone.
    """
    model.train()
    data_loss = 0
    data_size = 0
    total_reg_loss = 0
    for batch in data:
        model.zero_grad()

        if USE_CUDA:
            batch.text = batch.text.cuda()
        log_probs, means, log_vars = model(batch.text)
        # Targets are the inputs shifted left by one (everything after BOS).
        target = batch.text[:, 1:]
        batch_loss, _, reg_loss, n_tokens = vae_loss(target, log_probs, means, log_vars)

        avg_loss = batch_loss / n_tokens
        avg_loss.backward()
        optimizer.step()

        data_size += n_tokens
        # Accumulate detached values only: summing the graph-attached losses
        # would keep every batch's autograd history alive for the whole epoch.
        data_loss += batch_loss.detach()
        total_reg_loss += reg_loss.detach()
    avg_loss_all = data_loss / data_size
    avg_reg_loss_all = total_reg_loss / data_size
    ppl = torch.exp(avg_loss_all)
    return avg_loss_all, avg_reg_loss_all, ppl

def evaluate(model, data):
    """Compute per-token statistics of ``model`` on ``data`` without updating it.

    The model is put in eval mode for the duration of the call and returned to
    whatever mode it was in beforehand.

    Args:
        model: module called as ``model(tokens)`` that returns
            ``(log_probs, means, log_vars)``.
        data: iterable of batches exposing a ``text`` tensor.

    Returns:
        ``(avg_loss_all, avg_reg_loss_all, ppl)``: total loss per token, KL
        term per token, and ``exp(avg_loss_all)``.
    """
    was_training = model.training
    model.eval()
    data_loss = 0
    data_size = 0
    total_reg_loss = 0
    try:
        for batch in data:
            if USE_CUDA:
                batch.text = batch.text.cuda()
            log_probs, means, log_vars = model(batch.text)
            # Targets are the inputs shifted left by one (everything after BOS).
            target = batch.text[:, 1:]
            batch_loss, _, reg_loss, n_tokens = vae_loss(target, log_probs, means, log_vars)

            data_size += n_tokens
            # Detach before accumulating. No backward pass runs here, so
            # graph-attached sums would otherwise retain the activations of
            # every batch until the end of the evaluation.
            data_loss += batch_loss.detach()
            total_reg_loss += reg_loss.detach()
    finally:
        # Restore the caller's mode instead of unconditionally forcing train mode.
        model.train(was_training)
    avg_loss_all = data_loss / data_size
    avg_reg_loss_all = total_reg_loss / data_size
    ppl = torch.exp(avg_loss_all)
    return avg_loss_all, avg_reg_loss_all, ppl

In [32]:
# LSTM encoder / LSTM decoder VAE. Both halves share one embedding table.
embedding = nn.Embedding(len(TEXT.vocab), 256)
encoder = LSTMEncoder(embedding=embedding, hidden_size=256)
decoder = LSTMDecoder(embedding=embedding, hidden_size=256)
vae = VAE(encoder=encoder, decoder=decoder, latent_dim=64)
if USE_CUDA:
    vae = vae.cuda()

optimizer = optim.Adam(vae.parameters(), lr=1e-3)

In [33]:
# Train the LSTM-decoder VAE for 30 epochs, reporting statistics after each one.
# NOTE(review): the per-epoch monitoring runs on `test_iter`; the `val_iter`
# built alongside it is not used in this loop.
for i in range(30):
    t = time.time()
    train_loss, _, train_ppl = train(vae, train_iter, optimizer)
    train_time = time.time() - t
    test_loss, reg_loss, test_ppl = evaluate(vae, test_iter)

    # Take element 0 of each statistic's `.data` to get a plain number.
    train_loss, train_ppl, test_loss, reg_loss, test_ppl = (
        stat.data[0]
        for stat in (train_loss, train_ppl, test_loss, reg_loss, test_ppl)
    )

    print(('| Epoch #{:2d} | time {:4.2f} | train loss {:5.2f} | train ppl {:5.2f} '
           '| test loss {:5.2f} | test reg {:5.2f} | test ppl {:5.2f}').format(
               i, train_time, train_loss, train_ppl, test_loss, reg_loss, test_ppl))

| Epoch # 0 | time 33.61 | train loss  6.61 | train ppl 744.20 | test loss  6.15 | test reg  0.00 | test ppl 469.82
| Epoch # 1 | time 32.53 | train loss  5.92 | train ppl 371.89 | test loss  5.94 | test reg  0.00 | test ppl 380.69
| Epoch # 2 | time 33.67 | train loss  5.62 | train ppl 275.44 | test loss  5.83 | test reg  0.00 | test ppl 340.56
| Epoch # 3 | time 33.55 | train loss  5.37 | train ppl 214.85 | test loss  5.79 | test reg  0.00 | test ppl 325.86
| Epoch # 4 | time 33.30 | train loss  5.15 | train ppl 172.02 | test loss  5.76 | test reg  0.00 | test ppl 318.30
| Epoch # 5 | time 33.28 | train loss  4.94 | train ppl 139.38 | test loss  5.73 | test reg  0.00 | test ppl 308.48
| Epoch # 6 | time 33.08 | train loss  4.73 | train ppl 113.34 | test loss  5.75 | test reg  0.00 | test ppl 313.73
| Epoch # 7 | time 35.21 | train loss  4.53 | train ppl 92.89 | test loss  5.75 | test reg  0.00 | test ppl 315.21
| Epoch # 8 | time 38.41 | train loss  4.34 | train ppl 76.96 | test loss

KeyboardInterrupt: 

In [24]:
class ResidualBlock(nn.Module):
    """Residual bottleneck block built around a causal dilated 1-D convolution.

    Layout: ReLU -> 1x1 conv -> ReLU -> dilated k-wide conv -> ReLU -> 1x1 conv,
    with the block input added to the result. The dilated convolution is padded
    by ``(k - 1) * d`` and its output is cropped to the input length, so
    position ``t`` of the output only depends on positions ``<= t`` of the input.

    Args:
        d: dilation of the k-wide convolution.
        k: kernel size of the dilated convolution.
        in_channels: channels of the block input.
        mid_channels: channels inside the bottleneck.
        out_channels: channels of the block output; must equal ``in_channels``
            for the residual addition in ``forward`` to be valid.
    """

    def __init__(self, d, k=3, in_channels=512 + 256, mid_channels=512, out_channels=512 + 256):

        super(ResidualBlock, self).__init__()

        self.in_channels = in_channels
        self.mid_channels = mid_channels
        self.out_channels = out_channels
        self.d = d
        self.k = k
        # Amount of left context the dilated kernel needs: (k - 1) * d.
        self.n_pad = n_pad = k * d - d

        self.conv1x1_A = nn.Conv1d(in_channels, mid_channels, kernel_size=1)
        self.convkx1 = nn.Conv1d(mid_channels, mid_channels, kernel_size=k, dilation=d, padding=n_pad)
        self.conv1x1_B = nn.Conv1d(mid_channels, out_channels, kernel_size=1)

    def forward(self, input):
        """Apply the block to a ``(batch, channels, seq_len)`` tensor; the shape is preserved."""
        seq_len = input.size(2)
        out = F.relu(input)
        out = self.conv1x1_A(out)
        out = F.relu(out)
        # Conv1d pads both ends, giving seq_len + n_pad outputs; keeping the
        # first seq_len of them makes the convolution causal. Cropping by
        # length (rather than with [:-n_pad]) is also correct when n_pad is 0
        # (k == 1), where [:-0] would return an empty tensor.
        out = self.convkx1(out)[:, :, :seq_len]
        out = F.relu(out)
        out = self.conv1x1_B(out)
        return out + input

In [25]:
class DilatedCNNDecoder(nn.Module):
    """Decoder made of a stack of dilated-convolution residual blocks.

    The conditioning vector ``hidden_init`` is concatenated to the token
    embedding at every position, and the result is passed through one
    ``ResidualBlock`` per entry of ``dilation_layers`` followed by a linear
    projection onto the vocabulary.

    Args:
        embedding: ``nn.Embedding`` for the input tokens; its
            ``num_embeddings`` is also the size of the output distribution.
        hidden_size: number of features of ``hidden_init``.
        dilation_layers: dilation of each residual block, in order.
        filter_size: kernel size of the dilated convolutions.
    """

    def __init__(self, embedding, hidden_size, dilation_layers=(1, 2, 4), filter_size=3):

        super(DilatedCNNDecoder, self).__init__()

        self.embedding = embedding
        self.hidden_size = hidden_size
        # Private copy, so the attribute stays a list and never aliases the
        # caller's (or the default) sequence.
        self.dilation_layers = list(dilation_layers)
        self.filter_size = filter_size

        # The blocks operate on [embedding ; hidden_init]. Deriving the channel
        # count here (instead of relying on ResidualBlock's 512 + 256 default)
        # keeps the decoder valid for any embedding/hidden sizes; for
        # embedding_dim=256 and hidden_size=512 it is the same 768 as before.
        n_channels = embedding.embedding_dim + hidden_size
        self.resid_blocks = nn.ModuleList([
            ResidualBlock(d, k=filter_size, in_channels=n_channels, out_channels=n_channels)
            for d in self.dilation_layers
        ])
        self.linear = nn.Linear(n_channels, embedding.num_embeddings)

    def forward(self, batch, hidden_init):
        """Return log-probabilities of shape ``(batch, seq_len, num_embeddings)``.

        Args:
            batch: batch-first tensor of input token indices, ``(batch, seq_len)``.
            hidden_init: conditioning vectors, ``(batch, hidden_size)``.
        """
        seq_len = batch.size(1)
        # The embedding output already lives on the module's device, so no
        # explicit .cuda() call (and no global flag) is needed here.
        embedded = self.embedding(batch)
        hidden_init_repeated = torch.stack([hidden_init] * seq_len, 1)
        out = torch.cat([embedded, hidden_init_repeated], dim=2)
        # Conv1d expects (batch, channels, seq_len).
        out = out.transpose(1, 2)

        for r_block in self.resid_blocks:
            out = r_block(out)
        out = out.transpose(1, 2)
        # log_softmax is computed in a numerically stable way, whereas
        # log(softmax(x)) yields -inf once a probability underflows to 0.
        return F.log_softmax(self.linear(out), dim=2)

In [27]:
# LSTM encoder / dilated-CNN decoder VAE. Both halves share one embedding table.
embedding2 = nn.Embedding(len(TEXT.vocab), 256)
encoder2 = LSTMEncoder(embedding=embedding2, hidden_size=512)
decoder2 = DilatedCNNDecoder(embedding=embedding2, hidden_size=512)
vae_dilated = VAE(encoder=encoder2, decoder=decoder2, latent_dim=32)
if USE_CUDA:
    vae_dilated = vae_dilated.cuda()

optimizer2 = optim.Adam(vae_dilated.parameters(), lr=1e-3)

In [28]:
# Train the dilated-CNN-decoder VAE for 30 epochs, reporting statistics after each one.
# NOTE(review): the per-epoch monitoring runs on `test_iter`; the `val_iter`
# built alongside it is not used in this loop.
for i in range(30):
    t = time.time()
    train_loss, _, train_ppl = train(vae_dilated, train_iter, optimizer2)
    train_time = time.time() - t
    test_loss, reg_loss, test_ppl = evaluate(vae_dilated, test_iter)

    # Take element 0 of each statistic's `.data` to get a plain number.
    train_loss, train_ppl, test_loss, reg_loss, test_ppl = (
        stat.data[0]
        for stat in (train_loss, train_ppl, test_loss, reg_loss, test_ppl)
    )

    print(('| Epoch #{:2d} | time {:4.2f} | train loss {:5.2f} | train ppl {:5.2f} '
           '| test loss {:5.2f} | test reg {:5.2f} | test ppl {:5.2f}').format(
               i, train_time, train_loss, train_ppl, test_loss, reg_loss, test_ppl))

| Epoch # 0 | time 50.04 | train loss  6.39 | train ppl 594.97 | test loss  6.04 | test reg  0.00 | test ppl 420.01
| Epoch # 1 | time 51.38 | train loss  5.49 | train ppl 241.71 | test loss  5.94 | test reg  0.00 | test ppl 379.97
| Epoch # 2 | time 52.03 | train loss  5.00 | train ppl 148.71 | test loss  5.87 | test reg  0.00 | test ppl 355.22


KeyboardInterrupt: 

In [232]:
# Debug cell: display the raw output of the dilated-CNN decoder.
# NOTE(review): `batch_no_eos` and `hidden_init` are not defined at notebook
# scope in any of the visible cells (they only appear as a local variable of
# VAE.forward and as a parameter of the decoders), so this cell relies on
# leftover kernel state and will not run from a fresh kernel.
decoder2(batch_no_eos, hidden_init)

Variable containing:
(  0  ,.,.) = 
 -10.3526  -9.3471  -9.6030  ...  -10.1425  -9.4990  -9.5840
 -10.1305  -9.6689 -10.1028  ...  -10.2751  -9.4249  -9.9911
 -10.2381  -9.6977 -10.3961  ...   -9.6479  -9.6753  -9.1942
            ...               ⋱              ...            
 -10.2831  -9.1699 -10.9693  ...   -9.8108  -9.6649  -9.7658
  -9.9476  -9.0176  -9.9333  ...  -10.5735  -9.3686  -9.1843
 -10.2049  -8.5085  -9.5729  ...   -9.7321  -9.6015 -10.2293

(  1  ,.,.) = 
 -10.0810 -10.0030  -9.4868  ...  -10.1834  -9.5610  -9.2903
  -9.9780  -9.6010  -9.6552  ...  -10.4450  -9.1934  -9.4402
  -9.2953  -9.8903 -10.0582  ...   -9.9064  -9.3533  -9.3996
            ...               ⋱              ...            
 -10.3158  -9.1531 -10.1516  ...   -9.7145  -9.1483  -9.7620
  -9.6362  -9.6684  -9.7638  ...  -10.5614  -9.4083  -9.0233
  -9.8283  -9.1205  -9.5133  ...   -9.8005  -9.6368 -10.1266

(  2  ,.,.) = 
 -10.1020  -9.8092  -9.1300  ...  -10.0079  -9.7844  -9.7142
  -8.9096  -9.455