In [1]:
import config
from navermovie_comments import load_trained_embedding
from navermovie_comments import load_movie_comments

# Embedding requested with data_name='large', the 'soynlp_unsup' tokenizer
# and the 'word2vec' embedding type; later cells read its `.wv` attribute.
word2vec_model = load_trained_embedding(data_name='large', tokenize='soynlp_unsup', embedding='word2vec')

# The loader returns three items; only the middle one (`texts`) is kept.
# The tokenizer name matches the one used for the embedding above.
_, texts, _ = load_movie_comments(large=True, tokenize='soynlp_unsup')

In [2]:
# Keyed vectors of the loaded model (presumably a gensim Word2Vec model,
# given the `.wv.index2word` / `.wv.vectors` attributes used here -- confirm).
keyed_vectors = word2vec_model.wv

# gensim 4.x renamed `index2word` to `index_to_key`; accept either so the
# cell runs on both old and new gensim versions.
idx_to_vocab = (
    keyed_vectors.index_to_key
    if hasattr(keyed_vectors, 'index_to_key')
    else keyed_vectors.index2word
)
wv = keyed_vectors.vectors

wv.shape

(93234, 100)

In [3]:
import numpy as np
# Append one all-zero row (same width and dtype as `wv`) after the last
# vocabulary vector; its index is wv.shape[0].
zero_row = np.zeros((1, wv.shape[1]), dtype=wv.dtype)
wv_ = np.concatenate((wv, zero_row), axis=0)
wv_.shape

(93235, 100)

In [4]:
# Union of the 100 nearest neighbours of each query word
seed_words = {
    word
    for query in ('송강호', '디카프리오')
    for word, _ in word2vec_model.wv.most_similar(query, topn=100)
}

print(len(seed_words)) # 172

172


  if np.issubdtype(vec.dtype, np.int):


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
torch.__version__


'1.1.0'

In [10]:
def create_dataset(idx_to_vocab, texts, seed_words, window=2):
    """
    Build (context window, label) pairs, one per token of every text.

    The input row of a token holds the vocabulary indices of the `window`
    tokens on its left followed by the `window` tokens on its right; the
    token itself is not part of the row. The label is 1 when the token is
    in `seed_words` and 0 otherwise.

    Arguments
    ---------
    idx_to_vocab : list of str
        Vocabulary list; the position of a word is its index.
    texts : iterable of str
        Sentences; tokens are obtained with `str.split()`.
    seed_words : collection of str
        Words that receive the positive label.
    window : int
        Number of context tokens taken on each side. Default is 2.

    Returns
    -------
    x : torch.LongTensor
        Context word index. size is [n_tokens, 2 * window]
    y : torch.LongTensor
        Labels. size is [n_tokens]

    Raises
    ------
    ValueError
        If `window` is negative.

    Notes
    -----
    Out-of-vocabulary words and positions outside the sentence both map to
    the same index, `len(vocab_to_idx)` (one past the last vocabulary index).
    When there are no tokens at all, empty tensors of the sizes above are
    returned instead of raising.
    """
    if window < 0:
        raise ValueError('window must be non-negative, got {}'.format(window))

    vocab_to_idx = {vocab: idx for idx, vocab in enumerate(idx_to_vocab)}
    padding_idx = len(vocab_to_idx)

    # Constant-time membership test regardless of the container passed in
    seed_words = frozenset(seed_words)
    pad = [padding_idx] * window

    rows = []
    labels = []

    for text in texts:
        words = text.split()
        # Pad both ends once so that every position has a full window
        padded = pad + [vocab_to_idx.get(w, padding_idx) for w in words] + pad
        for i, word in enumerate(words):
            center = i + window  # position of the current token in `padded`
            rows.append(padded[i:center] + padded[center + 1:center + 1 + window])
            labels.append(1 if word in seed_words else 0)

    # Explicit shape keeps the result 2-D even when `rows` is empty
    x = np.asarray(rows, dtype=np.int64).reshape(len(rows), 2 * window)
    x = torch.from_numpy(x)
    y = torch.LongTensor(labels)
    return x, y

# Build the tensors with the default window; the sum of `y` is the
# number of tokens labelled 1.
x, y = create_dataset(
    idx_to_vocab = idx_to_vocab,
    texts = texts,
    seed_words = seed_words
)
y.sum()

tensor(362379)

In [7]:
class NERWindowDataset(Dataset):
    """
    Dataset pairing item i of `x` with item i of `y`.

    Arguments
    ---------
    x : indexable
        Inputs; also defines the length of the dataset.
    y : indexable
        Targets, indexed with the same position as `x`.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # Length is taken from the inputs only
        n_samples = len(self.x)
        return n_samples

    def __getitem__(self, idx):
        features = self.x[idx]
        target = self.y[idx]
        return features, target

# Shuffled mini-batches of 64 (context window, label) pairs
ner_dataloader = DataLoader(dataset=NERWindowDataset(x, y), batch_size=64, shuffle=True)

In [8]:
# Look at the first batch only, to check the tensor sizes
for input_, output_ in ner_dataloader:
    print(input_.size(), output_.size(), sep='\n')
    break

torch.Size([64, 4])
torch.Size([64])


In [9]:
class NamedEntityWindowClassifier(nn.Module):
    """
    Feed-forward classifier over a fixed-size window of context words.

    The context word indices are embedded, the embeddings are concatenated,
    and the result goes through one hidden layer (ReLU) and an output layer
    that returns unnormalized class scores.

    Arguments
    ---------
    wordvec : numpy.ndarray or torch.Tensor
        Pretrained word vectors. size is [n_vocabs, embed_dim]. The embedding
        layer is initialized with a copy of these values and stays trainable.
    n_classes : int
        Number of output classes.
    hidden_1_dim : int
        Size of the hidden layer. Default is 50.
    n_windows : int
        Number of context words on each side; the model expects
        2 * n_windows indices per sample. Default is 2.
    padding_idx : int or None
        If given, passed to `nn.Embedding` so that this row receives no
        gradient. Its values are still copied from `wordvec`. Default is None.
    """

    def __init__(self, wordvec, n_classes, hidden_1_dim=50, n_windows=2, padding_idx=None):
        super(NamedEntityWindowClassifier, self).__init__()

        self.n_windows = n_windows
        self.n_vocabs, self.embed_dim = wordvec.shape
        self.embed = nn.Embedding(
            num_embeddings = self.n_vocabs,
            embedding_dim = self.embed_dim,
            padding_idx = padding_idx
        )
        # Start from the pretrained vectors instead of the random default
        # initialization; copy_ leaves `wordvec` itself untouched.
        with torch.no_grad():
            self.embed.weight.copy_(
                torch.as_tensor(wordvec, dtype=self.embed.weight.dtype)
            )
        self.fc1 = nn.Linear(
            self.embed_dim * 2 * n_windows,
            hidden_1_dim,
            bias = False
        )
        self.fc2 = nn.Linear(
            hidden_1_dim,
            n_classes,
            bias = True
        )

    def forward(self, x):
        """
        Arguments
        ---------
        x : torch.LongTensor
            context word index. size is [batch, 2 * n_windows]

        Returns
        -------
        y : torch.FloatTensor
            Unnormalized class scores. size is [batch, n_classes]
        """
        y = self.embed(x) # [batch, 2 * window, embed]
        y = y.view(y.size(0), -1) # [batch, embed * 2 * window]
        y = F.relu(self.fc1(y))
        y = self.fc2(y)
        return y

# Binary window classifier sized from the zero-row-extended embedding matrix
model = NamedEntityWindowClassifier(wordvec=wv_, n_classes=2)

In [16]:
def train(data_loader, model, loss_func, optimizer, epochs):
    """
    Run a plain mini-batch training loop and report the running mean loss.

    Arguments
    ---------
    data_loader : sized iterable of (x, y)
        Yields one (input, target) batch per iteration; `len()` of it is
        used as the number of batches in the progress message.
    model : callable
        Called as `model(x)` to get the predictions.
    loss_func : callable
        Called as `loss_func(y_pred, y)`; must return a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer holding the parameters of `model`.
    epochs : int
        Number of passes over `data_loader`.

    Returns
    -------
    model
        The same object that was passed in, after training.
    """
    # Use the loader that was passed in, not a notebook-level global
    n_batchs = len(data_loader)
    for epoch in range(epochs):
        loss_sum = 0.0
        n_seen = 0
        for i, (x, y) in enumerate(data_loader):
            optimizer.zero_grad()
            y_pred = model(x)
            loss = loss_func(y_pred, y)
            loss.backward()
            optimizer.step()
            # .item() returns a Python float on any device
            loss_sum += loss.item()
            n_seen = i + 1

            if i % 100 == 0:
                loss_tmp = loss_sum / n_seen
                template = '\repoch = {}, batch = {} / {}, training loss = {}'
                message = template.format(epoch, i, n_batchs, '%.3f' % loss_tmp)
                print(message, end='')

        # An empty loader has no mean loss; report nan instead of failing
        loss_mean = loss_sum / n_seen if n_seen else float('nan')
        print('\r## epoch = {}, training loss = {}'.format(epoch, '%.3f' % loss_mean))

    return model

In [17]:
# Step size used by Adam
learning_rate = 1e-3

# Cross-entropy over the class scores, optimized with Adam
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train(
    data_loader = ner_dataloader,
    model = model,
    loss_func = loss_func,
    optimizer = optimizer,
    epochs = 10
)

epoch = 0, batch = 12200 / 20510, training loss = 0.047

KeyboardInterrupt: 

In [80]:
# Random float tensor of size [2, 3, 4] used to illustrate `view` below
z = torch.FloatTensor(np.random.random_sample(size=(2, 3, 4)))
print(z.shape)
z

torch.Size([2, 3, 4])


tensor([[[6.8958e-02, 7.4167e-01, 4.8499e-01, 6.2312e-01],
         [4.1036e-02, 4.2841e-01, 3.4260e-01, 3.8961e-01],
         [3.1294e-04, 4.9025e-01, 1.0517e-01, 9.9408e-01]],

        [[7.7422e-01, 2.0903e-01, 5.2505e-01, 1.3969e-01],
         [4.4244e-01, 3.0134e-01, 4.0958e-01, 7.1006e-01],
         [8.1710e-01, 1.5755e-01, 2.8302e-01, 2.5122e-01]]])

In [81]:
# Keep the first dimension and flatten the remaining ones: [2, 3, 4] -> [2, 12]
z.view(z.size(0), -1)

tensor([[6.8958e-02, 7.4167e-01, 4.8499e-01, 6.2312e-01, 4.1036e-02, 4.2841e-01,
         3.4260e-01, 3.8961e-01, 3.1294e-04, 4.9025e-01, 1.0517e-01, 9.9408e-01],
        [7.7422e-01, 2.0903e-01, 5.2505e-01, 1.3969e-01, 4.4244e-01, 3.0134e-01,
         4.0958e-01, 7.1006e-01, 8.1710e-01, 1.5755e-01, 2.8302e-01, 2.5122e-01]])