In [1]:
import torch
import torch.nn as nn
from torch.nn import functional
from torch.nn import Module
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

### Data Loading and Preparation


In [2]:
import pickle
PATH = "poem_dict.obj"

In [3]:

def read_file(path):
    poems = dict()
    
    with open(path,'rb') as f:
        data = pickle.load(f)
        
    poem_idx = 0
    
    for key in data.keys():
        for p in data[key]: 
            poems[poem_idx] = [key,prep_poem(p)]   
            poem_idx += 1
            
    return poems


In [4]:
def prep_poem(string):
    max_lines = 5  # 4 line poem
    res = []

    short_poem = string.lower().split('\n')[:max_lines]
    
    res = sum([i.split() for i in short_poem],['<BOP>'])
    res.append('<EOP>')
    
    # pad sequence
    
    max_len = 50
    if len(res) > max_len: # 120
        res = res[:max_len]
    elif len(res) < max_len: # 20
        diff = max_len - len(res)
        for i in range(diff):
            res.append("<pad>")

    return res

In [5]:
# corpus = read_file(PATH)
# cor = dict()

# n=0
# for key,val in corpus.items():
#     if n <1001:
#         cor[key]=val
#         n+=1
# corpus = cor

In [78]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchtext as tt
from itertools import chain
import tqdm
import pickle


class PoemDataset(Dataset):
    """
    """
    def __init__(self, corpus): # corpus = read_file(path)
        self.corpus = corpus
        self.vocab = self.build_vocab()
        

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, idx): 
        
        topic = self.corpus[idx][0]
        poem = self.corpus[idx][1]
        inp = poem[:-1]
        target = poem[1:]

        
         # convert sentences to tensors
        inp_tensor = torch.tensor(self.vocab.lookup_indices(inp))
        target_tensor = torch.tensor(self.vocab.lookup_indices(target)) 


        # instead do pad_sequence up to length of input
        topic_tensor = torch.zeros(1) 
        topic_tensor[0] = self.vocab.__getitem__(topic) 

#         print(topic_tensor.shape)
        
        sample = {"topic": topic_tensor, "input": inp_tensor, "target": target_tensor}

        return sample
    

    
    def build_vocab(self):

        sentences = [poem[1] for poem in self.corpus.values()]
        
        vocab = tt.vocab.build_vocab_from_iterator(sentences, specials=["<pad>","<unk>","<BOP>","<EOP>"])
        vocab.set_default_index(0)
        return vocab
    
    

In [106]:
def collate_fn(batch):
    
    for instance in batch:
        inp = instance["input"]
        targ = instance["target"]
        topic = instance["topic"]
        tensors = [topic.long(),inp,targ]
        tensors = pad_sequence(tensors,batch_first=True,padding_value=0)
    return tensors

In [82]:
# Dataset
train_dataset = PoemDataset(corpus)

# Data Loader
train_dataloader = DataLoader(train_dataset, batch_size=1, collate_fn=collate_fn)
# train_dataloader = DataLoader(train_dataset, batch_size=1)

In [83]:
for batch in train_dataloader:
    top = batch[0]
    inp = batch[1]
    targ = batch[2]
    print(top,inp,targ)
    break

torch.FloatTensor torch.Size([1])
tensor([46,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]) tensor([   2, 1774,  661,  207,   24,   87,   42,  340,   14,  127,   45,  180,
          98,  694,   15,   65, 2377,    9,  453, 1750,   15,   65, 2292,    3,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]) tensor([1774,  661,  207,   24,   87,   42,  340,   14,  127,   45,  180,   98,
         694,   15,   65, 2377,    9,  453, 1750,   15,   65, 2292,    3,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0])


# Model

In [90]:
class PoemLSTM(nn.Module):     
    """ 
    """
    def __init__(self, hidden_size, vocab_size, embedding_dim = 200, n_layers=1): 

        super(PoemLSTM, self).__init__()   

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        

        topic_embedding_dim = 100
        # embedding for topic 
        self.embed_topic = nn.Embedding(vocab_size, topic_embedding_dim) # [41634, 100]
    
        # embedding for sent
        self.embed_sent = nn.Embedding(vocab_size, embedding_dim)  # [41634, 200]
        
        #concatinate
        self.lstm = nn.LSTM(topic_embedding_dim+embedding_dim, hidden_size) 
        self.fc = nn.Linear(hidden_size, vocab_size) # [100,41634]

    def forward(self, topic, inp):
               
        # topic embedding matrix
        topic_emb = self.embed_topic(topic)
        
        # sent embedding for words
        emb = self.embed_sent(inp)
#         print('emb out',emb.shape)   
        
        # concatinate input sentence and topic
        input_combined = torch.cat((emb,topic_emb),1)
#         print('shape',input_combined.shape) 
        
    
        
        
        # prepare Embedding output for LSTM layer
        lstm_in = input_combined.view(input_combined.shape[0],1,input_combined.shape[1]) # lstm_in = emb.view(1, 1, -1)
#         print('lstm in',lstm_in.shape,'in dim',lstm_in.dim())

        
        # feed to LSTM layer
        lstm_out, hn_cn = self.lstm(lstm_in)
#         print('lstm_out shape',lstm_out.size())
       
        # re-shape LSTM output, prepare for Linear layer
        fc_in  = lstm_out.view(len(input_combined),-1)
#         print('fc in', fc_in.size())
        
        # feed to Linear
        fc_out = self.fc(fc_in)
#         print('fc_out',fc_out.shape)
        
        return fc_out, hn_cn

In [91]:
# Initialize Model 
vocabulary = train_dataset.build_vocab()
#
hidden_size = 100
n_layers = 1
lr = 0.001
vocab_size = len(vocabulary)

PoemGenerator = PoemLSTM(hidden_size, vocab_size)

# Set Hyperparameter Values

NUM_EPOCHS = 20 
LEARNING_RATE = 0.001
OPTIMIZER = optim.Adam          
LOSS_FUNCTION = nn.functional.cross_entropy 

In [93]:
def train_model(model, optimizer_type, loss_function, learning_rate):
    """
    """

    model.train()

    optimizer = optimizer_type(params=model.parameters(), lr=learning_rate)        
    loss = 0

    for batch in train_dataloader:
        
        topic = batch[0]
        input_sent = batch[1]
        targ_sent = batch[2]
        
        # forward pass
        output, _ = model(topic=topic, inp=input_sent)
        
        # calculate the loss and perform backprop
        loss = loss_function(output, targ_sent)

        # zero grad
        optimizer.zero_grad()

        loss.backward()

        # update parameters
        optimizer.step()

        
    return loss.item()


In [94]:
# train and sample
loss_avg=0

for epoch in range(NUM_EPOCHS):
    loss = train_model(model=PoemGenerator, optimizer_type=OPTIMIZER, loss_function=LOSS_FUNCTION, learning_rate=LEARNING_RATE)       
    print('epoch:',epoch,'loss:',loss)


torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTensor torch.Size([1])
torch.FloatTen

In [None]:
# hidden: 100
# NUM_EPOCHS = 30 
# LEARNING_RATE = 0.001
epoch: 29 loss: 3.4507241249084473

# hidden 200
# NUM_EPOCHS = 30 
# LEARNING_RATE = 0.001
epoch: 29 loss: 2.4526050090789795
        
# hidden 200
# NUM_EPOCHS = 50 
# LEARNING_RATE = 0.001
epoch: 49 loss: 1.3027650117874146

### Sample and generate

In [107]:


from torch.nn.utils.rnn import pad_sequence
import random


def sample(topic,model):
    
    with torch.no_grad(): 
    
        vocab = vocabulary
        start_token = "<BOP>"

        topic_tensor = torch.zeros(1)
        topic_tensor[0] = vocab.__getitem__(topic)
        topic_tensor = topic_tensor.long()
#         print('topic old',topic_tensor)
        
        inp_tensor = torch.zeros(1)
        inp_tensor[0] = vocab.__getitem__(start_token)
        inp_tensor = inp_tensor.long()
#         print('inp tensor old',inp_tensor)

        
        text = []
        for i in range(1,50):
            output, _ = model(topic_tensor, inp_tensor)
            
            
            p = functional.softmax(output, dim=1).data
            print(p)
            
            # get indices of top N values
            
            N = 10 #3,5
            vals, inds = torch.topk(p,N)
            print("top_n_ind",inds)

            # randomly select one of the three indices
            sample_index = random.sample(inds[0].tolist(),1)
            print("sampled_token_index",sample_index)

            # get token
            tok = vocab.lookup_token(sample_index[0])
            print('tok',tok)

            # add to generated text
            text.append(tok +' ')
            
            
            inp_tensor = torch.cat((inp_tensor,  torch.zeros(1).long()), 0)
            inp_tensor[i] = sample_index[0]
            print("inp tensor new", inp_tensor)
            

            tens = (topic_tensor,inp_tensor)
            topic_tensor,inp_tensor = pad_sequence(tens,batch_first=True,padding_value=0)
            

            text.append(tok +' ')
        print(text)
            


In [108]:
sample('alone',model=PoemGenerator)

raw output tensor([[-2.9735, -6.9983, -6.8745,  ..., -6.2339, -4.5142, -4.1965]]) of shape: torch.Size([1, 2618])
tensor([[1.8738e-04, 3.3479e-06, 3.7893e-06,  ..., 7.1906e-06, 4.0144e-05,
         5.5153e-05]])
top_n_ind tensor([[ 8, 13,  7, 41, 44, 45, 14, 46,  4, 54]])
sampled_token_index [46]
tok alone
inp tensor new tensor([ 2, 46])
raw output tensor([[-2.9735, -6.9983, -6.8745,  ..., -6.2339, -4.5142, -4.1965],
        [-4.9165, -8.2861, -8.2427,  ..., -7.5886, -5.4297, -5.1493]]) of shape: torch.Size([2, 2618])
tensor([[1.8738e-04, 3.3479e-06, 3.7893e-06,  ..., 7.1906e-06, 4.0144e-05,
         5.5153e-05],
        [1.4859e-05, 5.1119e-07, 5.3384e-07,  ..., 1.0268e-06, 8.8935e-06,
         1.1773e-05]])
top_n_ind tensor([[ 8, 13,  7, 41, 44, 45, 14, 46,  4, 54],
        [44, 41, 49,  4, 46, 42, 48, 54, 56, 68]])
sampled_token_index [14]
tok my
inp tensor new tensor([ 2, 46, 14])
raw output tensor([[-2.9735, -6.9983, -6.8745,  ..., -6.2339, -4.5142, -4.1965],
        [-4.9165, -8.