# Assignment 7

Delelop language model, which generates death metal band names.  
You can get data from https://www.kaggle.com/zhangjuefei/death-metal.  
You are free to use any other data, but the most easy way is just to take the band name column.

Your language model should be char-based autogression RNN.  
Text generation should be terminated when either max length is reached or terminal symbol is generated.  

<img src="images/example.png">

<img src="images/example2.png">

Different band names can be generated by:  
1. init $h_0$ as random vector from some probabilty distribution.
2. sampling over tokens at each timestep with probability = softmax 

Calculate perplexity for your model = your objective quality metric.  
Also, sample 10 band names from your model for subjective evaluation. E.g. names like 'qwiouefiou23riop2h3' or 'death death death!' are bad examples.  

In [1]:
import pandas as pd
import numpy as np
import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data import Field, LabelField, BucketIterator, ReversibleField, TabularDataset, BPTTIterator
from torch.distributions.distribution import Distribution
from tqdm import tqdm_notebook
SEED = 42
import random
import string

In [2]:
data = pd.read_csv('death-metal\\bands.csv')

In [3]:
data.head()

Unnamed: 0,id,name,country,status,formed_in,genre,theme,active
0,1,('M') Inc.,United States,Unknown,2009.0,Death Metal,,2009-?
1,2,(sic),United States,Split-up,1993.0,Death Metal,,1993-1996
2,3,.F.O.A.D.,France,Active,2009.0,Death Metal,Life and Death,2009-present
3,4,100 Suns,United States,Active,2004.0,Death Metal,,2004-present
4,5,12 Days of Anarchy,United States,Split-up,1998.0,Death Metal,Anarchy,1998-2002


In [4]:
del data

In [5]:
def tokenizer(text): # create a tokenizer function
    return [tok for tok in text]  

In [101]:
TEXT = Field(tokenize=tokenizer, 
             include_lengths=True, 
             batch_first=True,
             init_token='<start>', eos_token='<end>',
             lower=False
            )

dataset = TabularDataset('death-metal\\bands.csv', format='csv', 
                         fields=[(None, None), ('text', TEXT), (None, None), (None, None), 
                                 (None, None), (None, None), (None, None), (None, None)])

In [102]:
TEXT.build_vocab(dataset, min_freq=10)
len(TEXT.vocab.itos)

120

In [143]:
TEXT.vocab.itos[:10]

['<unk>',
 '<pad>',
 '<start>',
 '<end>',
 'e',
 'a',
 'r',
 'o',
 'i',
 'n',
 't',
 ' ',
 's',
 'l',
 'u',
 'c',
 'h',
 'd',
 'm',
 'g',
 'D',
 'y',
 'S',
 'p',
 'f',
 'A',
 'C',
 'M',
 'b',
 'v',
 'T',
 'B',
 'k',
 'E',
 'I',
 'P',
 'F',
 'H',
 'R',
 'N',
 'G',
 'w',
 'L',
 'x',
 'O',
 'V',
 'W',
 'K',
 'z',
 '.',
 'U',
 "'",
 'q',
 'j',
 'Z',
 'J',
 '-',
 'Y',
 'X',
 '6',
 'о',
 'а',
 'ö',
 'е',
 'р',
 '1',
 'т',
 'ó',
 'Q',
 'и',
 'н',
 '0',
 'ä',
 'é',
 '3',
 'с',
 '2',
 'л',
 ':',
 'ü',
 'к',
 'í',
 'в',
 '&',
 '7',
 'м',
 'я',
 '4',
 'у',
 '!',
 'С',
 '5',
 '9',
 'á',
 'г',
 '8',
 'ы',
 'д',
 '|',
 'ç',
 'й',
 'ú',
 'М',
 'Р',
 'ã',
 'П',
 'п',
 'ë',
 'А',
 'К',
 'х',
 'æ',
 'ь',
 'ï',
 'В',
 'з',
 '(',
 ')',
 '?',
 'ø']

In [104]:
train, test = dataset.split()
train, valid = train.split()

In [138]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, target_vocab_size, embed_size, hidden_size):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True
                          )
        
        self.fc = nn.Linear(hidden_size * 2, target_vocab_size)
        
        self.init_weights()
        
    def init_weights(self):
        nn.init.uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)
        
    
    def forward(self, batch):
        
        x, x_lengths = batch.text
        batch_size = x.size(0)
        total_length = x.size(1)

        x = self.embedding(x)

        if x_lengths is not None:
            x_lengths = x_lengths.view(-1).tolist()
            x = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)
            
        x, _ = self.rnn(x)
        
        x, _ = nn.utils.rnn.pad_packed_sequence(x, total_length=total_length, batch_first=True)
        
        x = x.contiguous().view(batch_size * total_length, -1)
        x = self.fc(x)
        x = x.contiguous().view(batch_size, total_length, -1)
        
        return x.transpose(1,2)
    
    def evaluate(self, h_0, c_0):
        for n in range(10):
            out_1, (h_1, c_1) = self.rnn(self.embedding(tt.tensor([[2]])))
            token = tt.tensor([[np.random.randint(0, 120)]])
            
            brand_name = ''
            for i in range(np.random.randint(3, 10)):
                out_1, (h_1, c_1) = self.rnn(self.embedding(token), (h_1, c_1))
                p = self.fc(out_1)
                _, token = tt.max(p, 2)
                
                brand_name += TEXT.vocab.itos[token]
                
            print(brand_name)

In [111]:
def get_answers(iterator):
    answers = list()
    for batch in iterator:
        text = batch.text[0]
        for i, el in enumerate(text):
            text[i] = tt.tensor(list(el[1:]) + [4])
        answers.append(text)
    return answers

In [123]:
def _train_epoch(model, iterator, target, optimizer, criterion, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        pred = model(batch)
        real = target[i]
        loss = criterion(pred, real)
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, target, criterion):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for i, batch in enumerate(iterator):
            pred = model(batch)
            real = target[i]
            loss = criterion(pred, real)
            epoch_loss += loss.data.item()
            
    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()
    target_train = get_answers(train_iterator)
    target_valid = get_answers(valid_iterator)

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, target_train, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, target_valid, criterion)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [121]:
batch_size = 32

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=False,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True
    )

In [139]:
tt.cuda.empty_cache()


model = MyModel(vocab_size=len(TEXT.vocab.itos),
                target_vocab_size=len(TEXT.vocab.itos),
                embed_size=8,
                hidden_size=128
               )

optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss(ignore_index=1)

In [140]:
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=20, early_stopping=2)

HBox(children=(IntProgress(value=0, description='epoch 0', max=578), HTML(value='')))

validation loss 0.25246


HBox(children=(IntProgress(value=0, description='epoch 1', max=578), HTML(value='')))

validation loss 0.06825


HBox(children=(IntProgress(value=0, description='epoch 2', max=578), HTML(value='')))

validation loss 0.04175


HBox(children=(IntProgress(value=0, description='epoch 3', max=578), HTML(value='')))

validation loss 0.03043


HBox(children=(IntProgress(value=0, description='epoch 4', max=578), HTML(value='')))

validation loss 0.02397


HBox(children=(IntProgress(value=0, description='epoch 5', max=578), HTML(value='')))

validation loss 0.01980


HBox(children=(IntProgress(value=0, description='epoch 6', max=578), HTML(value='')))

validation loss 0.01686


HBox(children=(IntProgress(value=0, description='epoch 7', max=578), HTML(value='')))

validation loss 0.01459


HBox(children=(IntProgress(value=0, description='epoch 8', max=578), HTML(value='')))

validation loss 0.01279


HBox(children=(IntProgress(value=0, description='epoch 9', max=578), HTML(value='')))

validation loss 0.01113


HBox(children=(IntProgress(value=0, description='epoch 10', max=578), HTML(value='')))

validation loss 0.00966


HBox(children=(IntProgress(value=0, description='epoch 11', max=578), HTML(value='')))

validation loss 0.00830


HBox(children=(IntProgress(value=0, description='epoch 12', max=578), HTML(value='')))

validation loss 0.00706


HBox(children=(IntProgress(value=0, description='epoch 13', max=578), HTML(value='')))

validation loss 0.00608


HBox(children=(IntProgress(value=0, description='epoch 14', max=578), HTML(value='')))

validation loss 0.00498


HBox(children=(IntProgress(value=0, description='epoch 15', max=578), HTML(value='')))

validation loss 0.00418


HBox(children=(IntProgress(value=0, description='epoch 16', max=578), HTML(value='')))

validation loss 0.00350


HBox(children=(IntProgress(value=0, description='epoch 17', max=578), HTML(value='')))

validation loss 0.00295


HBox(children=(IntProgress(value=0, description='epoch 18', max=578), HTML(value='')))

validation loss 0.00251


HBox(children=(IntProgress(value=0, description='epoch 19', max=578), HTML(value='')))

validation loss 0.00215


In [141]:
target_test = get_answers(test_iterator)

test_loss = _test_epoch(model, test_iterator, target_test, criterion)
print('Crossentropy: ', test_loss, 'Perplexity: ', float(tt.exp(tt.tensor(test_loss))))

Crossentropy:  0.0018235069784517857 Perplexity:  1.001825213432312


In [142]:
with tt.no_grad():
    model.evaluate(tt.randn(2, 1, 128), tt.randn(2, 1, 128))

<end>í0
<end>E0E0E0
<end>xBxBxBxB
<end>u0u0u0u
<end>р0р0р0
<end>í0í0í0í
<end>j0j0j0j0
<end>e0
<end>e0e0e0
<end>iviviv


Не смогла разобраться, в чем проблема. Пробовалаи с инициализацией случайным h0, и без нее.