# Poem Title Generator

## References

__Data__

1. https://www.kaggle.com/ultrajack/modern-renaissance-poetry/data

__Code__

1. https://github.com/joosthub/PyTorchNLPBook/blob/master/chapters/chapter_5/5_2_CBOW/5_2_munging_frankenstein.ipynb

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
from collections import Counter
import string
import torch
from torch.utils.data import Dataset
import pandas as pd
import torch.nn as nn
from torch.utils.data import DataLoader
from argparse import Namespace
import torch.optim as optim
from tqdm.notebook import tqdm
import torch.nn.functional as F
import re
import json

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/PoetryGenerator/data/poetryfoundation.csv")
df.head()

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore


In [None]:
df.shape

(573, 5)

In [None]:
df['split'] = 'train'
poem_title_df = df[['poem name', 'split']].copy()

def assign_label(row):
    """Randomly choose a split label for one record.

    A first draw from 0..9 keeps the record in 'train' unless it is above 6.
    Otherwise a second draw from 0..1 picks 'val' (0) or 'test' (1).

    Args:
        row: accepted for the caller's convenience; it is not used.

    Returns:
        str: one of 'train', 'val' or 'test'.
    """
    first_draw = np.random.randint(0, 10)
    if first_draw <= 6:
        return 'train'

    second_draw = np.random.randint(0, 2)
    return 'val' if second_draw == 0 else 'test'
           
poem_title_df['split'] = poem_title_df.apply(lambda row: assign_label(row['split']), axis=1)

In [None]:
poem_title_df['split'].value_counts()

train    412
test      93
val       66
Name: split, dtype: int64

In [None]:
poem_title_df.drop(poem_title_df[poem_title_df['poem name'].isna()].index, inplace=True)

In [None]:
poem_title_df[poem_title_df['poem name'].isna()]

Unnamed: 0,poem name,split


In [None]:
poem_title_df.to_csv("/content/drive/MyDrive/Colab Notebooks/PoemTitleGenerator/poem_title.csv", index=False)

# Building The Model

## Sequence Vocabulary Class

In [None]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    Four special tokens are registered on construction, in this order:
    <BOS>, <EOS>, <MASK>, <UNK>. Their indices are exposed as
    ``bos_index``, ``eos_index``, ``mask_index`` and ``unk_index``.
    """

    def __init__(self):
        """
        Create sequence vocabulary
        """
        self.token_to_idx = {}
        self.idx_to_token = {}

        self.bos_token = "<BOS>"
        self.eos_token = "<EOS>"
        self.mask_token = "<MASK>"
        self.unk_token = "<UNK>"

        self.bos_index = self.add_token(self.bos_token)
        self.eos_index = self.add_token(self.eos_token)
        self.mask_index = self.add_token(self.mask_token)
        self.unk_index = self.add_token(self.unk_token)

    def add_token(self, token):
        """
        Add new token to vocabulary

        Returns the index of the token; a token that is already present
        keeps its existing index.
        """
        if token not in self.token_to_idx:
            # Indices are handed out densely, in insertion order
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token

        return self.token_to_idx[token]

    def lookup_index(self, index):
        """
        Find token given an index

        Raises KeyError when the index is not in the vocabulary.
        """
        return self.idx_to_token[index]

    def lookup_token(self, token):
        """
        Find an index of a token

        Unknown tokens map to ``unk_index`` instead of returning None, so
        the result can always be stored in an integer array.
        """
        return self.token_to_idx.get(token, self.unk_index)

    def __len__(self):
        """
        Return the length of vocabulary
        """
        return len(self.token_to_idx)

## Vectorizer Class

In [None]:
class Vectorizer(object):
    """Turn a title into integer vectors, one index per character."""

    def __init__(self, title_vocab):
        # Vocabulary used to translate characters into indices
        self.title_vocab = title_vocab

    def vectorize(self, title, vector_length=-1):
        """
        Vectorize the title and return two vectors.

        The title is wrapped as <BOS> + characters + <EOS>. The first
        vector is that sequence without its last element (the observation),
        the second is the sequence without its first element (the target).
        Both are padded on the right with the mask index.

        Args:
            title: iterable of tokens; iterating a string yields characters
            vector_length (int): length of the returned vectors; a negative
                value means "exactly as long as needed"

        Returns:
            tuple: (from_vector, to_vector), both np.int64 arrays

        Raises:
            ValueError: if vector_length is too small to hold the title
        """
        vocab = self.title_vocab

        indices = [vocab.bos_index]
        for token in title:
            index = vocab.lookup_token(token)
            # A vocabulary may answer None for a token it has never seen;
            # None cannot be stored in an int64 array, so use <UNK> instead.
            if index is None:
                index = vocab.unk_index
            indices.append(index)
        indices.append(vocab.eos_index)

        n_steps = len(indices) - 1
        if vector_length < 0:
            vector_length = n_steps
        elif vector_length < n_steps:
            raise ValueError(
                "vector_length=%d is too short for a title that needs %d "
                "positions" % (vector_length, n_steps))

        from_vector = np.empty(vector_length, dtype=np.int64)
        from_vector[:n_steps] = indices[:-1]
        from_vector[n_steps:] = vocab.mask_index

        to_vector = np.empty(vector_length, dtype=np.int64)
        to_vector[:n_steps] = indices[1:]
        to_vector[n_steps:] = vocab.mask_index

        return from_vector, to_vector

    @classmethod
    def from_dataframe(cls, title_df):
        """
        Build a vectorizer whose vocabulary holds every character that
        appears in the 'poem name' column of the dataframe.
        """
        title_vocab = Vocabulary()

        # Only one column is needed, so iterate it directly
        for title in title_df['poem name']:
            for char in title:
                title_vocab.add_token(char)

        return cls(title_vocab)

## Dataset Class

In [None]:
class TitleDataset(Dataset):
    """Dataset over a dataframe with 'poem name' and 'split' columns.

    Rows are partitioned into 'train', 'val' and 'test' by the 'split'
    column; ``set_split`` selects which partition ``__len__`` and
    ``__getitem__`` operate on. The 'train' split is active initially.
    """

    def __init__(self, title_df, vectorizer):
        self.title_df = title_df
        self.vectorizer = vectorizer

        # Two extra positions on top of the longest title
        longest_title = max(len(title) for title in self.title_df['poem name'])
        self.max_seq_length = longest_title + 2

        split_column = self.title_df['split']

        self.train_df = self.title_df[split_column == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.title_df[split_column == 'val']
        self.val_size = len(self.val_df)

        self.test_df = self.title_df[split_column == 'test']
        self.test_size = len(self.test_df)

        # split name -> (rows of that split, number of rows)
        self.lookup_dict = {
            "train": (self.train_df, self.train_size),
            "test": (self.test_df, self.test_size),
            "val": (self.val_df, self.val_size),
        }

        self.set_split("train")

    @classmethod
    def load_dataset_and_make_vectorizer(cls, title_csv):
        """Read the CSV at ``title_csv`` and build a vectorizer from it."""
        title_df = pd.read_csv(title_csv)
        vectorizer = Vectorizer.from_dataframe(title_df)
        return cls(title_df, vectorizer)

    def set_split(self, split="train"):
        """Make ``split`` ('train', 'val' or 'test') the active partition."""
        self.target_split = split
        selected_df, selected_size = self.lookup_dict[split]
        self.target_df = selected_df
        self.target_size = selected_size

    def __len__(self):
        """Number of rows in the active split."""
        return self.target_size

    def __getitem__(self, index):
        """
        Return the data point at position ``index`` of the active split.

        The title is vectorized to ``max_seq_length`` and returned as a dict
        with the observation under "x_data" and the target under "y_target".
        """
        title = self.target_df.iloc[index]['poem name']
        observation, target = self.vectorizer.vectorize(title,
                                                        self.max_seq_length)
        return {"x_data": observation, "y_target": target}

    def get_num_batches(self, batch_size):
        """Number of full batches of ``batch_size`` in the active split."""
        return len(self) // batch_size

    def get_vectorizer(self):
        """Return the vectorizer given at construction time."""
        return self.vectorizer


def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    Yield batches from ``dataset`` with every tensor moved to ``device``.

    A DataLoader is built from the given options; each batch it produces is
    a dict, and a new dict with the same keys is yielded whose values have
    been sent to ``device``.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {name: tensor.to(device) for name, tensor in batch.items()}

## Poem Title Model

In [None]:
class PoemTitleGenerator(nn.Module):
    """Embedding -> GRU -> dropout -> linear model over token indices."""

    def __init__(self, embedding_size, vocab_size, rnn_hidden_size,
                 padding_idx=0, dropout=.5, batch_first=True):
        """
        Args:
            embedding_size (int): size of each token embedding
            vocab_size (int): number of tokens; also the output width
            rnn_hidden_size (int): size of the GRU hidden state
            padding_idx (int): index passed to the embedding as padding
            dropout (float): dropout probability applied before the
                linear layer
            batch_first (bool): forwarded to the GRU
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_size,
                                      padding_idx=padding_idx)
        self.rnn = nn.GRU(embedding_size, rnn_hidden_size,
                          batch_first=batch_first)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x_in, apply_softmax=False):
        """
        Score every vocabulary entry at every position of ``x_in``.

        Args:
            x_in (torch.Tensor): tensor of token indices
            apply_softmax (bool): when True, scores are turned into
                probabilities over the last dimension

        Returns:
            torch.Tensor: the first two dimensions follow the GRU output,
            the last dimension has one entry per vocabulary token
        """
        rnn_out, _ = self.rnn(self.embedding(x_in))

        dim0, dim1, n_features = rnn_out.shape
        # Collapse the two leading dimensions so the linear layer sees a matrix
        flat = rnn_out.contiguous().view(dim0 * dim1, n_features)

        scores = self.fc(self.dropout(flat))
        if apply_softmax:
            scores = F.softmax(scores, dim=1)

        # Restore the two leading dimensions
        return scores.view(dim0, dim1, scores.shape[-1])

## Training Routine

### Helper Functions


In [None]:
def make_train_state(args):
    """Build the bookkeeping dict for a fresh training run.

    The learning rate is copied from ``args.learning_rate``; the per-epoch
    histories start as empty lists and the test metrics start at -1.
    """
    state = {
        'stop_early': False,
        'early_stopping_step': 0,
        'early_stopping_best_val': 1e8,
        'learning_rate': args.learning_rate,
        'epoch_index': 0,
    }
    # One independent history list per tracked metric
    for history_key in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
        state[history_key] = []
    state['test_loss'] = -1
    state['test_acc'] = -1
    return state

def normalize_sizes(y_pred, y_true):
    """Flatten predictions and targets to matching 2-D / 1-D layouts

    Args:
        y_pred (torch.Tensor): the output of the model
            A 3-dimensional tensor is reshaped to a matrix that keeps the
            last dimension; anything else is returned as is
        y_true (torch.Tensor): the target predictions
            A matrix is reshaped to a vector; anything else is returned as is
    Returns:
        tuple: (y_pred, y_true) after the reshapes above
    """
    if y_pred.dim() == 3:
        y_pred = y_pred.reshape(-1, y_pred.shape[2])
    if y_true.dim() == 2:
        y_true = y_true.reshape(-1)
    return y_pred, y_true

def compute_accuracy(y_pred, y_true, mask_index):
    """Percentage of non-masked targets whose top-scoring class is correct

    Args:
        y_pred (torch.Tensor): scores; flattened by `normalize_sizes` so
            that dim 1 holds one score per class
        y_true (torch.Tensor): target indices; flattened by
            `normalize_sizes`
        mask_index (int): target value marking positions to ignore
    Returns:
        float: accuracy in percent over the positions whose target differs
        from `mask_index`; 0.0 when there is no such position
    """
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)

    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()

    n_valid = valid_indices.sum().item()
    if n_valid == 0:
        # Every target is masked: nothing to score, avoid dividing by zero
        return 0.0

    # Multiplying by the validity mask drops matches at masked positions
    n_correct = (correct_indices * valid_indices).sum().item()

    return n_correct / n_valid * 100

def sequence_loss(y_pred, y_true, mask_index):
    """Cross-entropy between predictions and targets, skipping masked targets

    Both tensors are first flattened by `normalize_sizes`; targets equal to
    `mask_index` are passed to the loss as the index to ignore.
    """
    flat_pred, flat_true = normalize_sizes(y_pred, y_true)
    loss = F.cross_entropy(flat_pred, flat_true, ignore_index=mask_index)
    return loss

In [None]:
def sample_from_model(model, vectorizer, num_samples=1, sample_size=20,
                      temperature=1.0):
    """Sample sequences of indices from the model

    Every sequence starts with the begin-of-sequence index. At each step the
    previously sampled index is fed through the model's embedding, RNN and
    linear layer, and the next index is drawn from the resulting softmax
    distribution.

    Args:
        model: the trained model; must expose `embedding`, `rnn` and `fc`
        vectorizer: the corresponding vectorizer; its
            `title_vocab.bos_index` seeds every sequence
        num_samples (int): the number of sequences to sample
        sample_size (int): the number of sampling steps per sequence
        temperature (float): accentuates or flattens
            the distribution.
            0.0 < temperature < 1.0 will make it peakier.
            temperature > 1.0 will make it more uniform
    Returns:
        indices (torch.Tensor): the matrix of indices;
        shape = (num_samples, sample_size + 1); column 0 holds the
        begin-of-sequence index
    Raises:
        ValueError: if temperature is not strictly positive
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive, got %r" % (temperature,))

    # Create the start tokens where the embedding weights live, so sampling
    # also works when the model is not on the CPU
    device = model.embedding.weight.device

    begin_seq_index = [vectorizer.title_vocab.bos_index
                       for _ in range(num_samples)]
    begin_seq_index = torch.tensor(begin_seq_index, dtype=torch.int64,
                                   device=device).unsqueeze(dim=1)
    indices = [begin_seq_index]
    h_t = None

    # Sampling needs no gradients
    with torch.no_grad():
        for time_step in range(sample_size):
            x_t = indices[time_step]
            x_emb_t = model.embedding(x_t)
            rnn_out_t, h_t = model.rnn(x_emb_t, h_t)
            prediction_vector = model.fc(rnn_out_t.squeeze(dim=1))
            probability_vector = F.softmax(prediction_vector / temperature,
                                           dim=1)
            indices.append(torch.multinomial(probability_vector,
                                             num_samples=1))

    # Each entry is (num_samples, 1). Concatenating along dim 1 keeps the
    # batch dimension even when num_samples == 1, where a stack + squeeze
    # would drop it.
    return torch.cat(indices, dim=1)

def decode_samples(sampled_indices, vectorizer):
    """Transform sampled indices into title strings

    Begin-of-sequence indices are skipped, decoding of a row stops at the
    first end-of-sequence index, and every other index is translated with
    the vocabulary's `lookup_index`.

    Args:
        sampled_indices (torch.Tensor): the indices from `sample_from_model`,
            one row per sample
        vectorizer: the corresponding vectorizer; its `title_vocab` is used
            for decoding
    Returns:
        list: one decoded string per row
    """
    vocab = vectorizer.title_vocab
    decoded_titles = []

    for row in sampled_indices.tolist():
        pieces = []
        for token_index in row:
            if token_index == vocab.bos_index:
                continue
            if token_index == vocab.eos_index:
                break
            pieces.append(vocab.lookup_index(token_index))
        decoded_titles.append("".join(pieces))

    return decoded_titles

### Training Init

In [None]:
# Configuration for the whole run
args = Namespace(
    # Data and Path information
    title_csv="/content/drive/MyDrive/Colab Notebooks/PoemTitleGenerator/poem_title.csv",
    # Model hyper parameters
    char_embedding_size=128,
    rnn_hidden_size=128,
    # Training hyper parameters
    seed=1337,
    learning_rate=0.001,
    batch_size=32,
    num_epochs=1000,
    early_stopping_criteria=5,
    # Runtime options
    cuda=True,
)

train_state = make_train_state(args)

# Use the GPU only when it is both requested and actually available
args.device = "cuda" if args.cuda and torch.cuda.is_available() else "cpu"

# Apply the configured seed so that weight initialisation and batch
# shuffling are repeatable between runs
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.device == "cuda":
    torch.cuda.manual_seed_all(args.seed)

dataset = TitleDataset.load_dataset_and_make_vectorizer(args.title_csv)

vectorizer = dataset.get_vectorizer()

# Mask positions are used as the embedding's padding index
model = PoemTitleGenerator(embedding_size=args.char_embedding_size,
                           vocab_size=len(vectorizer.title_vocab),
                           rnn_hidden_size=args.rnn_hidden_size,
                           padding_idx=vectorizer.title_vocab.mask_index)
model.to(args.device)

optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

print(args.device)

cuda


### Training Loop

In [None]:
# Train for args.num_epochs epochs, running a validation pass after every
# training pass and recording the per-epoch loss/accuracy in train_state.
# NOTE(review): args.early_stopping_criteria and the 'stop_early' /
# 'early_stopping_*' entries of train_state are never consulted here, so
# training only ends after num_epochs epochs or on a keyboard interrupt.
mask_index = vectorizer.title_vocab.mask_index
epoch_bar = tqdm(desc='training routine',
                 total=args.num_epochs,
                 position=0)

dataset.set_split('train')
# One tick per training batch
train_bar = tqdm(desc='split=train',
                 total=dataset.get_num_batches(args.batch_size),
                 position=1,
                 leave=True)

dataset.set_split('val')
val_bar = tqdm(desc='split=val',
               total=dataset.get_num_batches(args.batch_size),
               position=1,
               leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        model.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = model(x_in=batch_dict['x_data'])

            # step 3. compute the loss
            loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()

            # compute the running loss and running accuracy
            # (incremental mean over the batches seen so far)
            running_loss += (loss.item() - running_loss) / (batch_index + 1)
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss,
                                  acc=running_acc,
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        model.eval()

        # No parameter update happens during validation, so skip building
        # the autograd graph
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(batch_generator):
                # compute the output
                y_pred = model(x_in=batch_dict['x_data'])

                # compute the loss
                loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

                # compute the running loss and running accuracy
                running_loss += (loss.item() - running_loss) / (batch_index + 1)
                acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
                running_acc += (acc_t - running_acc) / (batch_index + 1)

                # Update bar
                val_bar.set_postfix(loss=running_loss, acc=running_acc,
                                    epoch=epoch_index)
                val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        # Rewind both per-epoch bars before the next epoch
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

except KeyboardInterrupt:
    print("Exiting loop")

HBox(children=(FloatProgress(value=0.0, description='training routine', max=1000.0, style=ProgressStyle(descri…

HBox(children=(FloatProgress(value=0.0, description='split=train', max=11.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='split=val', max=2.0, style=ProgressStyle(description_widt…

### Sampling

In [None]:
print(dataset.max_seq_length)

83


In [None]:
num_names = 100
# Sampling is done on the CPU
model = model.cpu()
# Draw num_names titles, each at most as long as the longest sequence the
# dataset was vectorized to, and turn the sampled indices back into text
sample_title = decode_samples(
    sample_from_model(model, vectorizer, num_samples=num_names,
                      sample_size=dataset.max_seq_length),
    vectorizer)
# Show results
print ("-"*15)
for title in sample_title:
    print (title)

---------------
The Bait
Song: Orpheus more thes fore thy wild fork on hath were fore there her cronice alli
Winter Stars
Wild Peaches
He wishes his Beloved were Foud for the Being
OLen of Wasen
[love is more thes fale boled thou tree wear thers
Song: to Celia [Wonte, grest
Song for the Lady Mary Wroth
The Sauchs in the Reuch Heuch Hauch
A Coronet for heram
Rosalinds Madrigal
Magic
The Relic
Break of Day
Green Groweth the Holly
Prothalamian
The Siver Mirnaly beood
Sonnet 110: Alas, 'tin thoug with her sout
The Bait
A Graveyard
Delia 33: Hy wasme anlt and all thespe, now in thou mayst in the mary for led, more
Astrophil and Stella 109: O thou made her cherety
The Estoble Isn
At Melvisles
Farewell Love as Dais Son Hing of Renbsea
Intine Pigtres
Sonnet 33: Full many a glorious made to Celia [Afyloss 
The Song of the Wits
Bronzes
Astrophil and Stella 106: O absent presencong this whanly thXa
Unstable Dream
The Rivilet
Winter Song
The Wild Comman
Ballad of the Three Haymrel
Sonnet 146: Pass

In [None]:
# Check each sampled title against the saved titles and report every exact
# match, to see how much the model reproduces titles verbatim
title_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/PoemTitleGenerator/poem_title.csv")
known_titles = title_df['poem name']

for title in sample_title:
    title_list = known_titles[known_titles == title].to_list()
    if title_list:
        print("found ", title_list)
    else:
        print("no match found for ", title)

found  ['The Bait', 'The Bait']
no match found for  Song: Orpheus more thes fore thy wild fork on hath were fore there her cronice alli
found  ['Winter Stars']
found  ['Wild Peaches', 'Wild Peaches']
no match found for  He wishes his Beloved were Foud for the Being
no match found for  OLen of Wasen
no match found for  [love is more thes fale boled thou tree wear thers
no match found for  Song: to Celia [Wonte, grest
no match found for  Song for the Lady Mary Wroth
found  ['The Sauchs in the Reuch Heuch Hauch']
no match found for  A Coronet for heram
found  ['Rosalinds Madrigal']
found  ['Magic', 'Magic']
found  ['The Relic']
found  ['Break of Day', 'Break of Day']
found  ['Green Groweth the Holly', 'Green Groweth the Holly']
no match found for  Prothalamian
no match found for  The Siver Mirnaly beood
no match found for  Sonnet 110: Alas, 'tin thoug with her sout
found  ['The Bait', 'The Bait']
found  ['A Graveyard']
no match found for  Delia 33: Hy wasme anlt and all thespe, now in tho