In [1]:
#!/usr/bin/env python

import numpy as np
import chainer
from chainer import cuda, Function, gradient_check, Variable, \
                        optimizers, serializers, utils
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L

from chainer.utils import walker_alias
import collections

In [24]:
# Set data: build the vocabulary and encode the corpus as a list of word ids
word2index = {}                 # word -> integer id, assigned in order of first appearance
index2word = {}                 # integer id -> word (inverse of word2index)
counts = collections.Counter()  # word id -> number of occurrences in the corpus
dataset = []                    # the corpus as one flat sequence of word ids
with open('./data/ptb.train.txt') as f:
    for line in f:
        for word in line.split():
            # First time this word is seen: give it the next free id.
            if word not in word2index:
                ind = len(word2index)
                word2index[word] = ind
                index2word[ind] = word
            wid = word2index[word]
            counts[wid] += 1
            dataset.append(wid)

# Sanity check: ids 1 and 2 should round-trip through both mappings.
for probe in (1, 2):
    print(index2word[probe])
    print(word2index[index2word[probe]])


Rush
1
(August
2


In [None]:
n_vocab = len(word2index)   # number of distinct words
datasize = len(dataset)     # number of tokens in the corpus

# Weights for drawing negative samples: per-id occurrence counts raised to the
# 0.75 power, which flattens the distribution so very frequent words are drawn
# less often than their raw frequency would suggest.
cs = [counts[w] for w in range(len(counts))]
power = np.float32(0.75)
p = np.array(cs, power.dtype)
# Apply the exponent in place. Previously `power` only supplied the dtype and
# the raw counts were handed to the sampler unchanged.
np.power(p, power, p)
sampler = walker_alias.WalkerAlias(p)  # used to draw random negative samples

In [3]:
# Define model
class MyW2V(chainer.Chain):
    """Word-embedding model scored by the dot product of two embeddings.

    A single ``EmbedID`` table is shared by both words of a pair; the pair's
    score is the dot product of their vectors, and training minimises the
    sigmoid cross entropy between those scores and 0/1 labels.
    """

    def __init__(self, n_vocab, n_units):
        """Create the embedding table.

        n_vocab: number of rows of the table (vocabulary size).
        n_units: length of each embedding vector.
        """
        super(MyW2V, self).__init__(
            embed=L.EmbedID(n_vocab, n_units),
        )

    def __call__(self, xb, yb, tb):
        """Return the sigmoid cross-entropy loss for a batch of word pairs.

        xb, yb: parallel sequences of word ids forming the pairs.
        tb: parallel sequence of integer labels for those pairs.
        All three are converted to int32 arrays and wrapped in Variables.
        """
        xc, yc, tc = [Variable(np.array(seq, dtype=np.int32))
                      for seq in (xb, yb, tb)]
        scores = self.fwd(xc, yc)
        return F.sigmoid_cross_entropy(scores, tc)

    def fwd(self, x, y):
        """Return one score per pair: the embeddings' product summed over axis 1."""
        return F.sum(self.embed(x) * self.embed(y), axis=1)

In [4]:
# my functions
ws = 3         ### window size: context offsets 1 .. ws-1 are used on each side
ngs = 5        ### negative sample size: negatives drawn per positive pair


def mkbatset(dataset, ids):
    """Build one training batch of positive and negative word pairs.

    For every corpus position in ``ids`` and every offset ``i`` in
    ``range(1, ws)``, the neighbour at ``pos - i`` and then the neighbour at
    ``pos + i`` is visited (neighbours outside the corpus are skipped). Each
    visited neighbour contributes:

    * one positive pair ``(center word, neighbour word)`` labelled 1, and
    * ``ngs`` negative pairs ``(neighbour word, sampled word)`` labelled 0,
      with the sampled words drawn from the module-level ``sampler``.

    Args:
        dataset: indexable sequence of word ids (the corpus).
        ids: iterable of positions in ``dataset`` to use as center words.

    Returns:
        ``[xb, yb, tb]``: three parallel lists holding the first word id,
        the second word id and the 0/1 label of every pair.
    """
    # Bound the window by the sequence actually passed in, not by the
    # notebook-level `datasize`, so the function does not depend on another
    # cell having been run (or on that value still being current).
    n_tokens = len(dataset)
    xb, yb, tb = [], [], []
    for pos in ids:
        xid = dataset[pos]
        for i in range(1, ws):
            # Left neighbour first, then right neighbour, for each offset.
            for p in (pos - i, pos + i):
                if not 0 <= p < n_tokens:
                    continue
                yid = dataset[p]
                xb.append(xid)
                yb.append(yid)
                tb.append(1)
                for nid in sampler.sample(ngs):
                    xb.append(yid)
                    yb.append(nid)
                    tb.append(0)
    return [xb, yb, tb]

In [5]:
# Initialize model

demb = 100         # length of each embedding vector (passed to MyW2V as n_units)
model = MyW2V(n_vocab, demb) # n_vocab is the vocabulary size computed in an earlier cell
optimizer = optimizers.Adam()  # constructed with no arguments, i.e. the library defaults
optimizer.setup(model)  # register the model with the optimizer before training

In [7]:
# Learn: 10 epochs over the corpus, visiting positions in a fresh random order

bs = 100  # number of center positions per mini-batch
for epoch in range(10):
    print('epoch: {0}'.format(epoch))
    indexes = np.random.permutation(datasize)
    for pos in range(0, datasize, bs):
        print(epoch, pos)
        # The final batch may be shorter: cap the slice end at the corpus size.
        stop = min(pos + bs, datasize)
        ids = indexes[pos:stop]
        xb, yb, tb = mkbatset(dataset, ids)
        # Reset accumulated gradients, then run forward, backward and update.
        model.zerograds()
        loss = model(xb, yb, tb)
        loss.backward()
        optimizer.update()

epoch: 0
0 0
0 100
0 200
0 300
0 400
0 500
0 600
0 700
0 800
0 900
0 1000
0 1100
0 1200
epoch: 1
1 0
1 100
1 200
1 300
1 400
1 500
1 600
1 700
1 800
1 900
1 1000
1 1100
1 1200
epoch: 2
2 0
2 100
2 200
2 300
2 400
2 500
2 600
2 700
2 800
2 900
2 1000
2 1100
2 1200
epoch: 3
3 0
3 100
3 200
3 300
3 400
3 500
3 600
3 700
3 800
3 900
3 1000
3 1100
3 1200
epoch: 4
4 0
4 100
4 200
4 300
4 400
4 500
4 600
4 700
4 800
4 900
4 1000
4 1100
4 1200
epoch: 5
5 0
5 100
5 200
5 300
5 400
5 500
5 600
5 700
5 800
5 900
5 1000
5 1100
5 1200
epoch: 6
6 0
6 100
6 200
6 300
6 400
6 500
6 600
6 700
6 800
6 900
6 1000
6 1100
6 1200
epoch: 7
7 0
7 100
7 200
7 300
7 400
7 500
7 600
7 700
7 800
7 900
7 1000
7 1100
7 1200
epoch: 8
8 0
8 100
8 200
8 300
8 400
8 500
8 600
8 700
8 800
8 900
8 1000
8 1100
8 1200
epoch: 9
9 0
9 100
9 200
9 300
9 400
9 500
9 600
9 700
9 800
9 900
9 1000
9 1100
9 1200


In [8]:
# Save model as text: a "<rows> <dims>" header line, then one line per word
# holding the word followed by its embedding values.
# Fetch the weights before opening the file so a failure here does not leave
# a truncated output file behind.
w = model.embed.W.data
# Take both header numbers from the matrix being written, so the header always
# matches the rows and columns that follow (the dimensionality used to be a
# hard-coded 100, which is wrong as soon as `demb` changes).
n_rows, n_dims = w.shape
with open('./model/w2v.model', 'w') as f:
    f.write('%d %d\n' % (n_rows, n_dims))
    for i in range(n_rows):
        v = ' '.join(['%f' % x for x in w[i]])
        f.write('%s %s\n' % (index2word[i], v))