# Assignment 7

Delelop language model, which generates death metal band names.  
You can get data from https://www.kaggle.com/zhangjuefei/death-metal.  
You are free to use any other data, but the most easy way is just to take the band name column.

Your language model should be char-based autogression RNN.  
Text generation should be terminated when either max length is reached or terminal symbol is generated.  


Different band names can be generated by:  
1. init $h_0$ as random vector from some probabilty distribution.
2. sampling over tokens at each timestep with probability = softmax 

Calculate perplexity for your model = your objective quality metric.  
Also, sample 10 band names from your model for subjective evaluation. E.g. names like 'qwiouefiou23riop2h3' or 'death death death!' are bad examples.  

In [1]:
import torch as tt
import torch.nn as nn
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm
import string
import time
import os
import numpy as np
import pandas as pd
import random

In [2]:
class MyModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(MyModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        batch_size = input.size(0)
        encoded = self.encoder(input)
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        return Variable(tt.zeros(self.n_layers, batch_size, self.hidden_size))

In [3]:
def read_file(filename):
    with open(filename, 'r') as file:
        text = file.read()
    return text, len(text)

In [4]:
def char_tensor(string):
    tensor = tt.zeros(len(string)).long()
    for c in range(len(string)):
        try:
            tensor[c] = all_characters.index(string[c])
        except:
            continue
    return tensor

## Делим выборку на тренировочную и валидационную

In [5]:
from random import shuffle
df = pd.read_csv('bands.csv')
groups = list(df['text'])
print('Shuffling data...')
shuffle(groups)
print('Writting data...')
with open('bands.txt', 'a') as file:
    for group in groups:
        file.write(group + '\n')

Shuffling data...
Writting data...


In [6]:
with open('bands.txt', 'r') as file, open('train_bands.txt', 'a') as file_w1, open('valid_bands.txt', 'a') as file_w2:
    lines = file.readlines()
    sep = int(0.9 * len(lines))
    i = 0
    for line in lines:
        i += 1
        if i < sep:
            file_w1.write(line)
        else:
            file_w2.write(line)

In [8]:
all_characters = string.printable
n_characters = len(all_characters)

In [9]:
hidden_size = 100
batch_size = 32
chunk_len = 200
decoder = MyModel(
    n_characters,
    hidden_size,
    n_characters
)
optimizer = tt.optim.Adam(decoder.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [10]:
def save(name):
    save_filename = os.path.splitext(os.path.basename(name))[0] + '.pt'
    tt.save(decoder, save_filename)
    print('Saved as %s' % save_filename)

In [11]:
file_train, file_train_len = read_file('train_bands.txt')
file_valid, file_valid_len = read_file('valid_bands.txt')

In [12]:
def random_training_set(chunk_len, batch_size, file, file_len):
    inp = tt.LongTensor(batch_size, chunk_len)
    target = tt.LongTensor(batch_size, chunk_len)
    for bi in range(batch_size):
        start_index = random.randint(0, file_len - chunk_len)
        end_index = start_index + chunk_len + 1
        chunk = file[start_index:end_index]
        inp[bi] = char_tensor(chunk[:-1])
        target[bi] = char_tensor(chunk[1:])
    inp = Variable(inp)
    target = Variable(target)
    return inp, target

In [13]:
def perplexity(x):
    return 2**x

In [14]:
def _train_epoch(inp, target, model, optimizer, criterion, curr_epoch):

    decoder.train()
    hidden = decoder.init_hidden(batch_size)
    decoder.zero_grad()
    running_loss = 0
    perplexities = []
    
    for c in range(chunk_len):
        optimizer.zero_grad()
        
        output, hidden = decoder(inp[:,c], hidden)
        loss = criterion(output.view(batch_size, -1), target[:,c])
        perplexities.append(perplexity(loss.item()))
        
        curr_loss = loss.data.cpu().detach().item()
        loss_smoothing = c / (c+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss
    
    PERPLEXITY = np.mean(perplexities)
    loss.backward()
    optimizer.step()

    return running_loss, PERPLEXITY

def _test_epoch(inp, target, model, criterion):
    model.eval()
    epoch_loss = 0
    hidden = decoder.init_hidden(batch_size)
    loss = 0
    perplexities = []
    
    with tt.no_grad():
        for c in range(chunk_len):
            output, hidden = decoder(inp[:,c], hidden)
            loss = criterion(output.view(batch_size, -1), target[:,c])
            perplexities.append(perplexity(loss.item()))
            epoch_loss += loss.data.item()
    PERPLEXITY = np.mean(perplexities)
    
    return epoch_loss / chunk_len, PERPLEXITY


def nn_train(model, criterion, optimizer, n_epochs=100, scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()
    train_losses = []
    valid_losses = []
    
    for epoch in tqdm(range(n_epochs)):
        train_loss, train_per = _train_epoch(*random_training_set(300, batch_size, file_train, file_train_len),
                                             model, optimizer, criterion, epoch)
        valid_loss, valid_per = _test_epoch(*random_training_set(300, batch_size, file_valid, file_valid_len),
                                            model, criterion)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        if epoch % 100 == 0 or epoch == n_epochs-1:
            print('|| Epoch %s | Valid loss %.5f | Train loss %.5f | Valid perplexity %.5f | Train perplexity %.5f ||' % (str(epoch),
                                                                                                                          valid_loss,
                                                                                                                          train_loss,
                                                                                                                          train_per,
                                                                                                                          valid_per))

In [16]:
nn_train(decoder, criterion, optimizer, n_epochs=1000)
save('bands')

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

|| Epoch 0 | Valid loss 2.69484 | Train loss 2.63052 | Valid perplexity 6.33457 | Train perplexity 6.62086 ||
|| Epoch 100 | Valid loss 2.68137 | Train loss 2.68046 | Valid perplexity 6.53741 | Train perplexity 6.52927 ||
|| Epoch 200 | Valid loss 2.63052 | Train loss 2.61395 | Valid perplexity 6.25979 | Train perplexity 6.31929 ||
|| Epoch 300 | Valid loss 2.65130 | Train loss 2.67726 | Valid perplexity 6.52089 | Train perplexity 6.40427 ||
|| Epoch 400 | Valid loss 2.69372 | Train loss 2.66897 | Valid perplexity 6.49674 | Train perplexity 6.61305 ||
|| Epoch 500 | Valid loss 2.66746 | Train loss 2.64441 | Valid perplexity 6.37464 | Train perplexity 6.47759 ||
|| Epoch 600 | Valid loss 2.58373 | Train loss 2.57041 | Valid perplexity 6.03143 | Train perplexity 6.11563 ||
|| Epoch 700 | Valid loss 2.67036 | Train loss 2.62570 | Valid perplexity 6.28575 | Train perplexity 6.48682 ||
|| Epoch 800 | Valid loss 2.70903 | Train loss 2.71940 | Valid perplexity 6.71711 | Train perplexity 6.679

  "type " + obj.__name__ + ". It won't be checked "


## Генерируем названия групп

In [17]:
def generate(decoder, prime_str='\n', predict_len=30, temperature=0.8):
    hidden = decoder.init_hidden(1)
    prime_input = Variable(char_tensor(prime_str).unsqueeze(0))
    predicted = ''

    for p in range(len(prime_str) - 1):
        _, hidden = decoder(prime_input[:,p], hidden)
        
    inp = prime_input[:,-1]
    
    for p in range(predict_len):
        output, hidden = decoder(inp, hidden)
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = tt.multinomial(output_dist, 1)[0]

        predicted_char = all_characters[top_i]
        if predicted != '' and predicted_char == '\n':
            break
        else:
            predicted += predicted_char
            inp = Variable(char_tensor(predicted_char).unsqueeze(0))

    return predicted

In [23]:
filename = 'bands.pt'
decoder = tt.load(filename)

for x in range(10):
    print(generate(decoder))

Inencastarg
Folemioss
Thorsed
Bed
Nacinata
Ungar
Teded
Unidgel
Wiscid
Ferios
