In [1]:
import io
import time
import nltk
import torch
from torch import nn

I'll try to build a small model that can predict the next word given a sequence of words.
I use pretrained word embedings from [here](https://nlp.stanford.edu/projects/glove/). To train the model I use Project Gutenberg.
Todos:
* Load the training data and clean it
* create embedding matrix from word2vec embeddings
* create word_to_idx and idx_to_word dict, include <pad> and <unk>
* create idx_from_word func (should return the index of <unk> for unknown words
* create training sequences:
  * sequences should have the length of the largest sentence. If sentence is shorter, then it is padded with <pad>
  * each word should predict the next and the last predict <pad>.

# Get text and create the word list (list of tokens)

In [202]:
nltk.set_proxy('http://proxy.brz.gv.at:8080')
nltk.download('brown')
nltk.corpus.brown.fileids()
#nltk.corpus.gutenberg.fileids()

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\dohr\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


['ca01',
 'ca02',
 'ca03',
 'ca04',
 'ca05',
 'ca06',
 'ca07',
 'ca08',
 'ca09',
 'ca10',
 'ca11',
 'ca12',
 'ca13',
 'ca14',
 'ca15',
 'ca16',
 'ca17',
 'ca18',
 'ca19',
 'ca20',
 'ca21',
 'ca22',
 'ca23',
 'ca24',
 'ca25',
 'ca26',
 'ca27',
 'ca28',
 'ca29',
 'ca30',
 'ca31',
 'ca32',
 'ca33',
 'ca34',
 'ca35',
 'ca36',
 'ca37',
 'ca38',
 'ca39',
 'ca40',
 'ca41',
 'ca42',
 'ca43',
 'ca44',
 'cb01',
 'cb02',
 'cb03',
 'cb04',
 'cb05',
 'cb06',
 'cb07',
 'cb08',
 'cb09',
 'cb10',
 'cb11',
 'cb12',
 'cb13',
 'cb14',
 'cb15',
 'cb16',
 'cb17',
 'cb18',
 'cb19',
 'cb20',
 'cb21',
 'cb22',
 'cb23',
 'cb24',
 'cb25',
 'cb26',
 'cb27',
 'cc01',
 'cc02',
 'cc03',
 'cc04',
 'cc05',
 'cc06',
 'cc07',
 'cc08',
 'cc09',
 'cc10',
 'cc11',
 'cc12',
 'cc13',
 'cc14',
 'cc15',
 'cc16',
 'cc17',
 'cd01',
 'cd02',
 'cd03',
 'cd04',
 'cd05',
 'cd06',
 'cd07',
 'cd08',
 'cd09',
 'cd10',
 'cd11',
 'cd12',
 'cd13',
 'cd14',
 'cd15',
 'cd16',
 'cd17',
 'ce01',
 'ce02',
 'ce03',
 'ce04',
 'ce05',
 'ce06',
 

In [221]:
book_name = 'cp07'
nltk.download('punkt')
sentences = [[w for w in (map(lambda w: w.lower().strip('_-.,?.!#'), s)) if w.isalnum()] for s in  nltk.corpus.brown.sents(book_name)]
corpus_set = set([w for s in sentences for w in s])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dohr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load the embbeddings

Load the embbeddings in a dictionary **word_vecs_en**.

In [280]:
word_vecs_en = {}
v_list = []
word_to_idx = {}
idx_to_word = {}
with io.open('../glove.6B.50d.txt','r', encoding='utf8') as f:
    idx = 0
    for line in f:
        tokens = line.split()
        w = tokens[0]
        if w in corpus_set or w == 'unk' or w == 'pad':
            if w == 'unk' or w == 'pad':
                print(w)
            v = torch.FloatTensor(list(map(float,tokens[1:])))
            v_list.append(v)
            word_to_idx[w] = idx
            idx_to_word[idx] = w
            word_vecs_en[w] = v
            idx += 1


embeddings_matrix = torch.stack(v_list)
embeddings_matrix.shape

pad
unk


torch.Size([703, 50])

# Get text and create the word list (list of tokens)

Get the available texts from Projekt Gutenberg

In [281]:
def idx_from_word(word):
    if word in word_to_idx:
        return word_to_idx[word]
    else:
        return word_to_idx['unk']

idx_from_word('hudri'), idx_from_word('and')

(700, 3)

In [282]:
print(f'Number of all words in the text: {len(corpus_set)}')
max([len(s) for s in sentences])
l = [3]*5
l.extend([4])
torch.LongTensor(l)

Number of all words in the text: 706


tensor([3, 3, 3, 3, 3, 4])

# Create the dataset to load the input sequences from the sentences

In [283]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self,sents):
        """
        :param sents: the list of sentences
        """
        L = max([len(s) for s in sents])
        self.input = torch.LongTensor(len(sents),L)
        self.target = torch.LongTensor(len(sents),L)
        for i, s in enumerate(sents):
            inp = [idx_from_word('pad')]*(L-len(s))
            tar = [idx_from_word('pad')]*(L-len(s))
            w_list = []
            for w in s:
                w_list.append(idx_from_word(w))
            inp.extend(w_list)
            tar.extend(w_list[1:])
            tar.append(idx_from_word('pad'))
            self.input[i] = torch.LongTensor(inp)
            self.target[i] = torch.LongTensor(tar)

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        return self.input[idx], self.target[idx]

dataset = Dataset(sentences)
dataset[10:12]

(tensor([[566, 566, 566, 566, 566, 566, 566, 566,  99, 194,   4,   0, 328, 292,
           26, 396, 362,   2, 540, 641,  88, 690,   1,  58, 196,   1, 395,  70,
           14,   2,  21, 155, 534, 397,  20, 321,   2, 547,  12,  73],
         [566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566,
          566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566,
          566, 566, 566, 485, 673,  26, 610,  17,  97, 383,   3, 693]]),
 tensor([[566, 566, 566, 566, 566, 566, 566, 566, 194,   4,   0, 328, 292,  26,
          396, 362,   2, 540, 641,  88, 690,   1,  58, 196,   1, 395,  70,  14,
            2,  21, 155, 534, 397,  20, 321,   2, 547,  12,  73, 566],
         [566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566,
          566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566,
          566, 566, 566, 673,  26, 610,  17,  97, 383,   3, 693, 566]]))

create the function `word_to_vec` which tuns a word or a list of words into vectors. If a word is not in the embbeddings the function returns a zero vector.

#  Build the model

In [368]:
class Model(nn.Module):
    def __init__(self,input_size,hidden_size,out_size,embed_mat,num_layers=1):
        super(Model, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(embed_mat)
        self.hidden_size = hidden_size
        self.batch_first = True
        self.num_layers = num_layers
        self.rnn = nn.LSTM(input_size=input_size,hidden_size=hidden_size,num_layers=num_layers,batch_first=True)
        self.linear1 = nn.Linear(hidden_size,hidden_size)
        self.linear2 = nn.Linear(hidden_size,out_size)

    def init_hidden(self,batch_size=1):
        return  torch.zeros((self.num_layers,batch_size,self.hidden_size)),torch.zeros((self.num_layers,batch_size,self.hidden_size))

    def forward(self,input,hidden):
        x = self.embeddings(input)
        out, hidden = self.rnn(x,hidden)
        out = self.linear1(out)
        dropout = nn.Dropout(p=0.2)
        out = dropout(out)
        out = torch.relu(out)
        return  self.linear2(out),hidden

# Train the model

## create the dataset for loading the data

In [371]:
n_epochs = 10
lr = 5e-3
batch_size = 8

#init dataset and data loader
dataset = Dataset(sentences)
data_loader = torch.utils.data.DataLoader(dataset,batch_size=batch_size)

# init the model
model = Model(input_size=50,hidden_size=1024,num_layers=5,out_size=len(word_to_idx),embed_mat=embeddings_matrix)

# init the optimizer
optim = torch.optim.Adam(model.parameters(),lr=lr)

# init the loss function
criterion = nn.CrossEntropyLoss()



# the training loop
start = time.time()
for i in range(n_epochs):
    step = 0
    h = model.init_hidden(batch_size=batch_size)
    for train_x, train_y in data_loader:
        if len(train_x) < batch_size:
            continue
        h = tuple([e.data  for e in h])
        #print('shape x:',train_x.shape)
        #print('shape y:',train_y.shape)
        #print('type y:',train_y.type())
        optim.zero_grad()
        pred,h = model(train_x,h)
        #print('pred:',pred.shape)
        loss = criterion(pred.reshape(batch_size,len(word_to_idx),40),train_y)
        loss.backward()
        optim.step()
        if step%5 == 0:
            print('Epoch-Step: %d-%d, elapsed time: %f  loss: %f'%(i,step,(time.time()-start),loss))
        step += 1

print('Finished in %f seconds'%(time.time()-start))

Epoch-Step: 0-0, elapsed time: 4.570958  loss: 6.558311
Epoch-Step: 0-5, elapsed time: 22.861003  loss: 6.299108
Epoch-Step: 0-10, elapsed time: 42.276511  loss: 5.265857
Epoch-Step: 0-15, elapsed time: 61.428453  loss: 5.383120
Epoch-Step: 1-0, elapsed time: 72.550402  loss: 4.744092
Epoch-Step: 1-5, elapsed time: 90.900595  loss: 5.823966
Epoch-Step: 1-10, elapsed time: 110.544825  loss: 5.059399
Epoch-Step: 1-15, elapsed time: 129.369892  loss: 5.058974
Epoch-Step: 2-0, elapsed time: 140.655609  loss: 4.574659
Epoch-Step: 2-5, elapsed time: 159.486790  loss: 5.646011
Epoch-Step: 2-10, elapsed time: 177.797986  loss: 5.019992
Epoch-Step: 2-15, elapsed time: 196.617591  loss: 4.947995
Epoch-Step: 3-0, elapsed time: 208.097561  loss: 4.550393
Epoch-Step: 3-5, elapsed time: 226.196033  loss: 5.682522
Epoch-Step: 3-10, elapsed time: 244.834483  loss: 4.967468
Epoch-Step: 3-15, elapsed time: 263.874659  loss: 5.044714
Epoch-Step: 4-0, elapsed time: 278.278709  loss: 4.592836
Epoch-Step: 4

In [359]:
with torch.no_grad():
    print('Hallo')
    model.eval()
    model.init_hidden(1)
    pred = model(dataset[5][0].reshape(1,-1))

Hallo


In [377]:
model.eval()
pred.new()


tensor([])

In [373]:
pred_idx = pred.topk(1,dim=2)[1].squeeze()

In [374]:
pred_idx

tensor([[149, 154, 156, 157, 154, 157, 144, 153, 144, 145, 154, 149, 157, 154,
         157, 153, 149, 153, 144, 153, 157, 144, 149, 157, 145, 144, 156, 153,
         144, 157, 144, 144, 157, 154, 154, 153, 156, 153, 146, 157],
        [157, 157, 144, 144, 145, 157, 157, 154, 144, 154, 152, 152, 144, 154,
         154, 154, 153, 153, 157, 151, 144, 154, 147, 147, 152, 152, 153, 147,
         147, 144, 144, 149, 144, 147, 157, 157, 157, 156, 149, 152],
        [144, 157, 153, 149, 157, 145, 144, 147, 153, 154, 147, 157, 144, 154,
         157, 157, 157, 153, 158, 157, 157, 144, 157, 147, 144, 157, 157, 147,
         146, 153, 157, 147, 145, 153, 144, 144, 144, 153, 157, 144],
        [157, 157, 144, 144, 156, 157, 152, 157, 157, 144, 156, 144, 144, 158,
         156, 149, 153, 144, 145, 144, 144, 144, 147, 144, 147, 147, 145, 157,
         144, 158, 154, 157, 156, 155, 147, 156, 147, 152, 154, 156],
        [147, 157, 157, 157, 153, 146, 144, 147, 144, 144, 144, 153, 154, 151,
         

In [375]:
for i in pred_idx:
    print(idx_to_word[i.item()])

ValueError: only one element tensors can be converted to Python scalars

In [364]:
sentences[5]

['very', 'well']

In [356]:
dataset[20]

(tensor([566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566,
         566, 566, 566, 566, 566, 566,  51,  10,   5, 523,  17,   0, 349,   3,
         507, 331,  12,   0, 302,   7, 697, 561,   2, 129,   0, 308]),
 tensor([566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566,
         566, 566, 566, 566, 566, 566,  10,   5, 523,  17,   0, 349,   3, 507,
         331,  12,   0, 302,   7, 697, 561,   2, 129,   0, 308, 566]))