In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import re
import pickle

In [2]:
import torch.utils.data as Data

In [3]:
# Tensor type that the hand-built NNLM parameters are cast to via `.type(dtype)`.
dtype = torch.FloatTensor

In [4]:
# Build the corpus and the vocabulary.
# Example of the resulting `sentences` format: ['i like dog', 'i love coffee', 'i hate milk']
regex = re.compile(r'[a-zA-Z]')

# NOTE(review): pickle.load can execute arbitrary code -- only load this file
# if it comes from a trusted source.
# The context manager closes the file handle, which a bare open() call never does.
with open("../DATA/raw_text_dataset.puckle", 'rb') as corpus_file:
    text_raw = pickle.load(corpus_file)[0]

sentences = []
for sen in text_raw:
    # Take the segment that follows the first blank line, split it on single
    # spaces and strip every non-letter character from each token.
    # NOTE(review): raises IndexError for an entry that contains no blank
    # line -- confirm every entry has one.
    temp = ' '.join(''.join(regex.findall(w)) for w in sen.split('\n\n')[1].split(' '))
    sentences.append(temp)

# Sort the vocabulary so word ids are identical on every run; iterating a bare
# set of strings depends on per-process hash randomisation.
word_list = sorted(set(" ".join(sentences).split()))
word_list.append('<pad>')
word_dict = {w: i for i, w in enumerate(word_list)}
number_dict = {i: w for i, w in enumerate(word_list)}
n_class = len(word_dict)

In [5]:
# Model hyper-parameters shared by the batching helper and the network.
m = 100        # embedding dimension
n_hidden = 3   # width of the hidden layer
n_step = 24    # number of context words per training example

In [6]:
def make_batch(sentences, step, vocab=None):
    """Cut sentences into fixed-size context windows with next-word targets.

    Each sentence is split on whitespace and sliced into consecutive,
    non-overlapping windows of ``step`` words. Every window that is followed
    by at least one more word yields one example: the ids of the window's
    words as input and the id of the word immediately after it as target.

    Args:
        sentences: iterable of whitespace-separated strings.
        step: window length in words (also the stride between windows).
        vocab: mapping from word to integer id. Defaults to the
            notebook-level ``word_dict``.

    Returns:
        A tuple ``(input_batch, target_batch)``: a list of ``step``-long id
        lists and the list of matching target ids.

    Raises:
        KeyError: if a word is missing from ``vocab``.
        ValueError: if ``step`` is 0.
    """
    if vocab is None:
        vocab = word_dict  # fall back to the notebook-level vocabulary

    input_batch = []
    target_batch = []

    for sen in sentences:
        words = sen.split()
        # A window starting at `start` needs a following word at index
        # `start + step`, so only iterate while that index is in range.
        for start in range(0, len(words) - step, step):
            input_batch.append([vocab[w] for w in words[start:start + step]])
            # The target is the word right after the window; indexing with
            # `start + step + 1` would skip one word.
            target_batch.append(vocab[words[start + step]])

    return input_batch, target_batch

In [7]:
class NNLM(nn.Module):
    """Feed-forward neural language model with a direct input-to-output path.

    All sizes are read from the notebook-level settings ``n_class``, ``m``,
    ``n_step`` and ``n_hidden``; the hand-built parameters are cast with the
    notebook-level ``dtype``.
    """

    def __init__(self):
        super().__init__()

        def randn_param(*shape):
            # Standard-normal initial values cast to the notebook-wide tensor type.
            return nn.Parameter(torch.randn(*shape).type(dtype))

        flat_dim = n_step * m
        self.C = nn.Embedding(n_class, m)          # word id -> m-dim embedding
        self.H = randn_param(flat_dim, n_hidden)   # input -> hidden weights
        self.W = randn_param(flat_dim, n_class)    # input -> output weights
        self.d = randn_param(n_hidden)             # hidden bias
        self.U = randn_param(n_hidden, n_class)    # hidden -> output weights
        self.b = randn_param(n_class)              # output bias

    def forward(self, X):
        """Return scores of shape [batch_size, n_class] for word-id input ``X``.

        ``X`` is embedded and flattened to ``n_step * m`` features per row
        (presumably ``X`` is [batch_size, n_step] -- confirm against callers).
        """
        flat = self.C(X).view(-1, n_step * m)           # [batch_size, n_step * m]
        hidden = torch.tanh(self.d + flat.mm(self.H))   # [batch_size, n_hidden]
        direct = self.b + flat.mm(self.W)               # [batch_size, n_class]
        return direct + hidden.mm(self.U)

In [8]:
# Instantiate the language model; NNLM takes no arguments and reads its sizes
# from the notebook-level settings.
model = NNLM()

In [9]:
# Move the model to the GPU when one is available; otherwise leave it on the
# CPU instead of raising. `.to()` returns the module, so this cell still
# displays the model summary.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

NNLM(
  (C): Embedding(24785, 100)
)

In [10]:
# Cross-entropy loss on the model's output scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [11]:
# Build (context window, next word) training pairs and turn them into id tensors.
# Plain tensors are used directly: the `Variable` wrapper is deprecated and no
# longer needed for autograd.
input_batch, target_batch = make_batch(sentences, n_step)
input_batch = torch.LongTensor(input_batch)    # [n_examples, n_step]
target_batch = torch.LongTensor(target_batch)  # [n_examples]

In [12]:
# Pair every context window with its target and serve shuffled mini-batches.
torch_dataset = Data.TensorDataset(input_batch, target_batch)
loader = Data.DataLoader(
    torch_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
)

In [None]:
# Train the model, printing the loss of the last mini-batch every 100 epochs.
# Batches are moved to whichever device the model's parameters live on rather
# than assuming CUDA.
device = next(model.parameters()).device
for epoch in range(5000):
    loss = None
    for batch_x, batch_y in loader:
        optimizer.zero_grad()
        output = model(batch_x.to(device))

        loss = criterion(output, batch_y.to(device))

        loss.backward()
        optimizer.step()
    if loss is None:
        # Without this guard an empty loader surfaces as a NameError on `loss`.
        raise ValueError("loader produced no batches; nothing to train on")
    if epoch % 100 == 0:
        # .item() extracts the Python float explicitly before formatting.
        print('Epoch:' '%04d' % (epoch + 1), 'cost = ', '{:.6f}'.format(loss.item()))

Epoch:0001 cost =  188.434616
Epoch:0101 cost =  0.000018
Epoch:0201 cost =  0.000000


In [None]:
# Predict the next-word id for examples 10..19.
# The inputs are moved to the model's device first: `input_batch` is a CPU
# tensor, so feeding it to a model that lives on the GPU raises a
# device-mismatch error. no_grad() replaces the legacy `.data` access and
# avoids building an autograd graph for inference.
with torch.no_grad():
    logits = model(input_batch[10:20].to(next(model.parameters()).device))
predict = logits.max(1, keepdim=True)[1]  # index of the highest score per row, one column