# Train Language Model

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import progressbar
from multiprocessing import Pool
from tensorboardX import SummaryWriter
import time


In [2]:
def to_int(s):
    """Parse ``s`` with the built-in ``int`` and return the result."""
    value = int(s)
    return value

In [3]:
class text_corpus(Dataset):
    """Text corpus dataset of integer-indexed sentences.

    Each line of every input file is one sentence written as
    whitespace-separated integer token indices. Item ``idx`` is the pair
    ``(sentence[:-1], sentence[1:])``, i.e. the sentence and the same
    sentence shifted left by one token. A one-token sentence therefore
    yields a pair of empty lists.
    """

    def __init__(self, base_dir='resources', tok_ind_files_list=('bijankhan_indexed.txt',)):
        """
        Args:
            base_dir (string): directory with all the txt files.
            tok_ind_files_list (list): .txt filenames, each containing one
                sentence per line. A single filename string is also accepted.
        """
        super(text_corpus, self).__init__()
        self.ds = []
        # A bare string would otherwise be iterated character by character.
        if isinstance(tok_ind_files_list, str):
            tok_ind_files_list = [tok_ind_files_list]
        for i in tok_ind_files_list:
            print('loading {} corpus...'.format(i))
            self.load_txt(os.path.join(base_dir, i))

    def load_txt(self, tok_ind_files_list):
        """Append every sentence of one corpus file to ``self.ds``.

        Args:
            tok_ind_files_list (string): path of a single corpus file (the
                parameter name is kept for backward compatibility).

        Raises:
            ValueError: if a token is not a valid integer literal.
        """
        with open(tok_ind_files_list) as f:
            sentences = f.read().splitlines()

        # Tokens are converted in-process: spawning a worker pool and calling
        # map() once per sentence costs far more than int() itself and leaked
        # the pool's worker processes.
        bar = progressbar.ProgressBar()
        for sentence in bar(sentences):
            tokens = sentence.split()
            if not tokens:
                # Blank lines (or an empty file) carry no sentence.
                continue
            self.ds.append([int(tok) for tok in tokens])

    def __len__(self):
        """Return the number of loaded sentences."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Return ``(input_tokens, target_tokens)`` for sentence ``idx``."""
        sample = self.ds[idx]

        return sample[:-1], sample[1:]


In [4]:
def my_collate(batch, pad_value=1):
    """Stack variable-length (input, target) pairs into padded time-major tensors.

    Args:
        batch (list): items whose element 0 is the input token list and whose
            element 1 is the target token list; both must have the same length.
        pad_value (int): value written into positions past the end of a
            sequence. Defaults to 1, the value that was previously hard-coded.

    Returns:
        tuple: ``(in_batch, out_batch)``, two LongTensors of shape ``(T, B)``
        where ``B = len(batch)`` and ``T`` is the longest input length.
    """
    B = len(batch)
    T = max(len(item[0]) for item in batch)
    # Allocate integer tensors directly instead of building float ones and casting.
    in_batch = torch.LongTensor(T, B).fill_(pad_value)
    out_batch = torch.LongTensor(T, B).fill_(pad_value)
    for i, item in enumerate(batch):
        l = len(item[0])
        in_batch[0:l, i] = torch.LongTensor(item[0])
        out_batch[0:l, i] = torch.LongTensor(item[1])
    return in_batch, out_batch

In [5]:

class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder.

    Pipeline: embedding -> dropout -> single-layer LSTM -> dropout ->
    linear projection back to the embedding size -> dropout -> decoder.
    The decoder shares its weight matrix with the embedding.
    """

    def __init__(self, ntoken, bsz, nembd=128, nhid=256):
        """
        Args:
            ntoken (int): vocabulary size.
            bsz (int): default batch size used by ``init_hidden``.
            nembd (int): embedding dimension.
            nhid (int): LSTM hidden dimension.
        """
        super(RNNModel, self).__init__()
        self.nhid = nhid
        self.bsz = bsz
        self.nembd = nembd
        self.drop = nn.Dropout(.1)
        self.encoder = nn.Embedding(ntoken, nembd)
        # nn.LSTM applies its `dropout` argument only between stacked layers;
        # with a single layer it has no effect (and current PyTorch warns
        # about it), so it is not passed.
        self.rnn = nn.LSTM(nembd, nhid)
        self.fc = nn.Linear(nhid, nembd)
        self.decoder = nn.Linear(nembd, ntoken)
        # Weight tying: decoder and embedding share one (ntoken, nembd) matrix.
        self.decoder.weight = self.encoder.weight
        self.init_weights()
        self.hidden = self.init_hidden()

    def init_weights(self):
        """Uniformly initialise the shared embedding/decoder weight; zero the decoder bias."""
        initrange = 0.1
        # The decoder weight is the same tensor as the encoder weight, so a
        # single initialisation covers both.
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)

    def forward(self, x, hidden):
        """Compute per-position vocabulary scores.

        Args:
            x: token indices of shape ``(T, B)``.
            hidden: ``(h, c)`` LSTM state, each of shape ``(1, B, nhid)``.

        Returns:
            tuple: scores of shape ``(T, B, ntoken)`` and the new LSTM state.
        """
        # Take the batch size from the input rather than from the constructor
        # so that batches of any size can be processed.
        T, B = x.shape[0], x.shape[1]
        emb = self.drop(self.encoder(x))
        output, hidden = self.rnn(emb, hidden)
        dropped_output = self.drop(output)
        dropped_fc = self.drop(self.fc(dropped_output.view(T * B, self.nhid)))
        decoded = self.decoder(dropped_fc)
        return decoded.view(T, B, decoded.size(-1)), hidden

    def init_hidden(self, bsz=None):
        """Return a zero ``(h, c)`` LSTM state.

        Args:
            bsz (int, optional): batch size of the state; defaults to the
                batch size given to the constructor.

        Returns:
            tuple: two zero tensors of shape ``(1, bsz, nhid)`` with the same
            type and device as the model's embedding weight.
        """
        if bsz is None:
            bsz = self.bsz
        weight = self.encoder.weight.data
        return (Variable(weight.new(1, bsz, self.nhid).zero_()),
                Variable(weight.new(1, bsz, self.nhid).zero_()))


In [6]:
def repackage_hidden(h):
    """Reset hidden states to zero and detach them from their history.

    A tensor/Variable has its underlying data zeroed IN PLACE and is returned
    wrapped in a new Variable with no autograd history. Any other iterable is
    processed element by element and returned as a tuple.

    Args:
        h: a tensor/Variable, or a (possibly nested) iterable of them.

    Returns:
        The zeroed, detached state, with the same nesting expressed as tuples.
    """
    # isinstance/is_tensor rather than an exact type comparison: on
    # PyTorch >= 0.4 hidden states are plain tensors, for which
    # `type(h) == Variable` is False.
    if isinstance(h, Variable) or torch.is_tensor(h):
        return Variable(h.data.zero_())
    return tuple(repackage_hidden(v) for v in h)

In [7]:
if __name__ == '__main__':
    # Training script: builds the corpus, model and optimizer, then trains
    # with gradient accumulation and saves the model after each epoch.

    # CONSTANTS
    mb_size = 1          # sentences per forward/backward pass
    update_size = 128    # mini-batches accumulated per optimizer step
    max_updates = 1      # optimizer steps per epoch before stopping (the
                         # notebook stopped after one); None = full epoch
    ntoken = 48603       # vocabulary size
    t0 = time.time()
    lr = .001
    model_dir = 'garbage_model'

    tc = text_corpus()
    dataloader = DataLoader(tc,
                            collate_fn=my_collate,
                            batch_size=mb_size,
                            shuffle=True,
                            num_workers=2,
                            drop_last=True)
    model = RNNModel(ntoken, mb_size)
    # For GPU training, call model.cuda() here and move the batches and the
    # hidden state to the GPU as well.
    hidden = model.init_hidden()

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    writer = SummaryWriter('log')

    # Create the output directory up front so the save cannot fail after training.
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    try:
        counter = 0
        for epoch in range(1):
            print('-' * 40 + 'epoch{:03d}'.format(epoch + 1) + '-' * 40)

            updates_done = 0
            bar = progressbar.ProgressBar()
            # Gradients are cleared once here and after every optimizer step,
            # not on every iteration, so they accumulate over `update_size`
            # mini-batches before being applied.
            model.zero_grad()
            for ind, io_seqs in enumerate(bar(dataloader)):
                in_seq, out_seq = Variable(io_seqs[0]), Variable(io_seqs[1])
                hidden = repackage_hidden(hidden)
                model_score, hidden = model(in_seq, hidden)
                loss = loss_function(model_score.view(-1, ntoken), out_seq.view(-1))
                loss.backward()
                # `loss.data[0]` fails on 0-dim tensors in current PyTorch;
                # `.item()` does not exist on legacy Variables. Support both.
                loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
                writer.add_scalar('data/loss', loss_value, counter)
                counter += 1
                if ind % update_size == update_size - 1:
                    optimizer.step()
                    model.zero_grad()
                    updates_done += 1
                    if max_updates is not None and updates_done >= max_updates:
                        break

            # save model
            # NOTE(review): this pickles the whole module; saving
            # model.state_dict() is more portable but would change what
            # loaders of these files must expect.
            with open(os.path.join(model_dir, 'model_{:03d}.mdl'.format(epoch)), 'wb') as f:
                torch.save(model, f)
    finally:
        # Flush pending events and release the log file even if training fails.
        writer.close()

loading bijankhan_indexed.txt corpus...


100% (1001 of 1001) |#####################| Elapsed Time: 0:00:00 Time: 0:00:00


----------------------------------------epoch001----------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
