# 第11回講義 演習

In [1]:
from __future__ import division
from collections import OrderedDict, Counter
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(42)
trng = RandomStreams(42)

Using gpu device 0: GRID K520 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 4007)


## 課題1. Word2vec

In [2]:
class Corpus:
    def __init__(self, file_path):
        self.file_path = file_path
    
    def __iter__(self):
        for line in open(self.file_path):
            instance = [l.strip().split() for l in line.split('|||')]
            # Return format : ['i', 'have', 'a', 'pen']
            yield instance[0]

In [3]:
# 文を返すイテレータを書く
sentences = Corpus('train.unk')

In [4]:
# sg -> Skipgram, hs -> hierachical softmax (not explained), negative -> negative sample size
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4, sg=1, hs=0, negative=100)

In [5]:
# similarity
model.similarity('woman', 'man')

0.92517703564983433

In [6]:
# model.most_similar(positive=['bank', 'company'], negative=['money'])
model.most_similar(positive=['Morgan'])

[('Stanley', 0.935018002986908),
 ('Grenfell', 0.8918687701225281),
 ('Peabody', 0.8881924152374268),
 ('Oppenheimer', 0.8812254667282104),
 ('underwriter', 0.8702781796455383),
 ('Stearns', 0.8643594980239868),
 ('Bear', 0.8634917140007019),
 ('Geneva', 0.8613843321800232),
 ('Alex', 0.8571962118148804),
 ('Sachs', 0.8532192707061768)]


## 課題2. Recurrent Neural Network (RNN) Encoder-Decoderモデルで日中翻訳

### 1. データセットの読み込みと単語・品詞のID化

train.zh-enの中身 (中国語の文 ||| 英語の文)
```
<s> 我 能 赶上 去 UNK 饭店 的 巴士 吗 ? </s> ||| <s> can i catch a bus that goes to the hilton hotel ? </s>
<s> 有 去 市里 的 火车 吗 ? </s> ||| <s> is there a train that goes to the city ? </s>
<s> 在 UNK 下面 。 </s> ||| <s> it 's just down the hall . </s>
...
```

In [7]:
def build_vocab(file_path):
    f_vocab, e_vocab = set(), set()
    for line in open(file_path):
        f, e = [l.strip().split()[1:-1] for l in line.split('|||')]
        f_vocab.update(f)
        e_vocab.update(e)
    
    f_w2i = {w: np.int32(i+2) for i, w in enumerate(f_vocab)}
    e_w2i = {w: np.int32(i+2) for i, w in enumerate(e_vocab)}
    
    f_w2i['<s>'], f_w2i['</s>'] = np.int32(0), np.int32(1)
    e_w2i['<s>'], e_w2i['</s>'] = np.int32(0), np.int32(1)
    return set(f_w2i.keys()), set(e_w2i.keys()), f_w2i, e_w2i
    
def encode(sentence, vocab, w2i):
    encoded_sentence = []
    for w in sentence:
        if w in vocab:
            encoded_sentence.append(w2i[w])
        else:
            encoded_sentence.append(w2i['UNK'])
    return encoded_sentence
    
def decode(encoded_sentence, w2i):
    i2w = {i:w for w, i in w2i.items()}
    decoded_sentence = []
    for i in encoded_sentence:
        decoded_sentence.append(i2w[i])
    return decoded_sentence
    
def load_data(file_path, f_vocab, e_vocab, f_w2i, e_w2i):
    x, y = [], []
    for line in open(file_path):
        f, e = [l.strip().split() for l in line.split('|||')]
        f_enc = encode(f, f_vocab, f_w2i)
        e_enc = encode(e, e_vocab, e_w2i)
        x.append(f_enc)
        y.append(e_enc)
    return x, y

f_vocab, e_vocab, f_w2i, e_w2i = build_vocab('./train.zh-en')
train_X, train_y = load_data('./train.zh-en', f_vocab, e_vocab, f_w2i, e_w2i)
train_X, test_X, train_y, test_y = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

### 2. 単語のembedding

In [8]:
def sharedX(X, name=None, dtype="float32"):
    return theano.shared(np.array(X, dtype=dtype), name=name)

class Projection:
    def __init__(self, in_dim, out_dim, scale):
        self.V = sharedX(rng.randn(in_dim, out_dim)*scale, name='V')
        self.params = [self.V]

    def f_prop(self, x):
        x_emb = self.V[x]
        return x_emb

### 3. Long short-term memory (LSTM)

LSTMの構造はスライド参照

- 入力ゲート: $\hspace{20mm}i_t = \sigma \left( W_{xi} x_t + W_{hi} h_{t-1} + W_{ci} c_{t-1} + b_i \right)$
- 忘却ゲート: $\hspace{20mm}f_t = \sigma \left( W_{xf} x_t + W_{hf} h_{t-1} + W_{cf} c_{t-1} + b_f \right)$  
- セル:　　　 $\hspace{20mm}c_t = f_t c_{t-1} + i_t \tanh \left( W_{xc} x_t + W_{hc} h_{t-1} + b_c \right)$  
- 出力ゲート: $\hspace{20mm}o_t = \sigma \left( W_{xo} x_t + W_{ho} h_{t-1} + W_{co} c_{t} + b_o \right)$  
- 隠れ層: 　　$\hspace{20mm}h_t = o_t\tanh \left( c_t \right)$

In [19]:
class LSTM:
    def __init__(self, in_dim, out_dim, scale=0.01, h_0=None, c_0=None):
        
        #- Input gate
        self.W_xi = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xi')
        self.W_hi = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_hi')
        self.W_ci = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_ci')
        self.b_i  = sharedX(rng.randn(out_dim)*scale, name='b_i')
        
        #- Forget gate
        self.W_xf = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xf')
        self.W_hf = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_hf')
        self.W_cf = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_cf')
        self.b_f  = sharedX(rng.randn(out_dim)*scale, name='b_f')
        
        #- Cell state
        self.W_xc = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xc')
        self.W_hc = sharedX(rng.randn(out_dim ,out_dim)*scale, name='W_hc')
        self.b_c  = sharedX(rng.randn(out_dim)*scale, name='b_c')
        
        #- Output gate
        self.W_xo = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_xo')
        self.W_ho = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_ho')
        self.W_co = sharedX(rng.randn(out_dim, out_dim)*scale, name='W_co')
        self.b_o  = sharedX(rng.randn(out_dim)*scale, name='b_o')

        #- Initial state
        if h_0 is None:
            self.h_0 = sharedX(np.zeros(out_dim), name='h_0')
        else:
            self.h_0 = h_0
        if c_0 is None:
            self.c_0 = sharedX(np.zeros(out_dim), name='c_0')
        else:
            self.c_0 = c_0

        self.output_info = [self.h_0, self.c_0]
        self.params = [self.W_xf, self.W_hf, self.W_cf, self.b_f
                       , self.W_xi, self.W_hi, self.W_ci, self.b_i
                       , self.W_xc, self.W_hc, self.b_c
                       , self.W_xo, self.W_ho, self.W_co, self.b_o]
    
    def f_prop(self, x):
        def fn(x, h_tm1, c_tm1):
            # Input gate
            i_t = T.nnet.sigmoid(T.dot(x, self.W_xi) + T.dot(h_tm1, self.W_hi) + T.dot(c_tm1, self.W_ci) + self.b_i)
            
            # Forget gate
            f_t =  T.nnet.sigmoid(T.dot(x, self.W_xf) + T.dot(h_tm1, self.W_hf) + T.dot(c_tm1, self.W_cf) + self.b_f)
            
            # Cell state
            c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x, self.W_xc) + T.dot(h_tm1, self.W_hc) + self.b_c)
            
            # Output gate
            o_t = T.nnet.sigmoid(T.dot(x, self.W_xo) + T.dot(h_tm1, self.W_ho) + T.dot(c_tm1, self.W_co) + self.b_o)
            
            # Hidden state
            h_t = o_t * T.tanh(c_t)
            
            return h_t, c_t
        
        [h,c], _ = theano.scan(fn = fn, sequences=[x],outputs_info=self.output_info)
        
        return h

### 4. 線形層

In [20]:
class Linear:
    def __init__(self, in_dim, out_dim, scale):
        self.W_out = sharedX(rng.randn(in_dim, out_dim)*scale, name='W_out')
        self.b_out = sharedX(rng.randn(out_dim,)*scale, name='b_out')
        self.params = [self.W_out, self.b_out]

    def f_prop(self, x):
        z = T.dot(x, self.W_out) + self.b_out
        return z

### 5. 活性化層

In [21]:
class Activation:
    def __init__(self, function):
        self.function = function
        self.params = []

    def f_prop(self, x):
        self.z = self.function(x)
        return self.z

### 6. 更新則

In [38]:
def sgd(cost, params, eps=np.float32(0.1)):
    g_params = T.grad(cost, params)
    updates = OrderedDict()
    for param, g_param in zip(params, g_params):
        updates[param] = param - eps*g_param
    return updates

def Adam(params, g_params, lr=0.001, b1=0.1, b2=0.001, e=1e-8):
    updates = []
    i = theano.shared(np.float32(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, g_params):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates

### 7. ネットワークの定義

In [54]:
x = T.ivector('x')
t = T.ivector('t')

# Target

#<s> I like ---</s>#
#|<-        ->| in#
##   |<-        ->|out#
t_in = t[:-1]
t_out = t[1:]

hid_dim = 500
out_dim = len(e_vocab)

def f_props(layers, x):
    layer_out = x
    for i, layer in enumerate(layers):
        if i == 0:
            layer_out = layer.f_prop(x)
        else:
            layer_out = layer.f_prop(layer_out)
    return layer_out

encoder = [
    Projection(len(f_vocab),500, scale=0.01),
    LSTM(in_dim = 500,out_dim = hid_dim),
]

h_enc = f_props(encoder, x)[-1] # Take the last state of encoder

decoder = [
    Projection(len(e_vocab),500, scale=0.01),
    LSTM(in_dim = 500,out_dim = hid_dim, h_0 = h_enc),
    Linear(in_dim = hid_dim, out_dim = out_dim, scale = 0.01),
    Activation(T.nnet.softmax)
]

### 8. train関数とvalid関数とtest関数

In [55]:
def join(layers):
    params = []
    for layer in layers:
        params += layer.params
    return params

y = f_props(decoder, t_in)
cost = T.mean(T.nnet.categorical_crossentropy(y, t_out))

params = join(encoder + decoder)
gparams = T.grad(cost, params)
updates = sgd(cost, params, eps=np.float32(0.2))
#updates = Adam(params, gparams, lr=0.01, b1=0.1, b2=0.005, e=1e-6)

train = theano.function(inputs=[x, t], outputs=cost, updates=updates)
valid = theano.function(inputs=[x, t], outputs=cost)
test  = theano.function(inputs=[x, t], outputs=[cost, T.argmax(y, axis=1)])

In [57]:
len(train_X)

35212

### 9. 学習

In [58]:
epochs = 5
cost_mean = 0
for epoch in xrange(epochs):
    train_X, train_y = shuffle(train_X, train_y)  # Shuffle Samples !!
    for i, (instance_x, instance_y) in enumerate(zip(train_X, train_y)):
        train_cost = train(instance_x, instance_y)
        cost_mean += train_cost
        if i%1000 == 0:
            print "EPOCH:: %i, Iteration %i, Training Cost: %.3f" % (epoch + 1, i, cost_mean / 1000)
            cost_mean = 0

EPOCH:: 1, Iteration 0, Training Cost: 0.003
EPOCH:: 1, Iteration 1000, Training Cost: 3.036
EPOCH:: 1, Iteration 2000, Training Cost: 2.981
EPOCH:: 1, Iteration 3000, Training Cost: 3.033
EPOCH:: 1, Iteration 4000, Training Cost: 2.983
EPOCH:: 1, Iteration 5000, Training Cost: 2.946
EPOCH:: 1, Iteration 6000, Training Cost: 2.969
EPOCH:: 1, Iteration 7000, Training Cost: 2.922
EPOCH:: 1, Iteration 8000, Training Cost: 2.905
EPOCH:: 1, Iteration 9000, Training Cost: 2.920
EPOCH:: 1, Iteration 10000, Training Cost: 2.889
EPOCH:: 1, Iteration 11000, Training Cost: 2.915
EPOCH:: 1, Iteration 12000, Training Cost: 2.912
EPOCH:: 1, Iteration 13000, Training Cost: 2.890
EPOCH:: 1, Iteration 14000, Training Cost: 2.902
EPOCH:: 1, Iteration 15000, Training Cost: 2.930
EPOCH:: 1, Iteration 16000, Training Cost: 2.918
EPOCH:: 1, Iteration 17000, Training Cost: 2.922
EPOCH:: 1, Iteration 18000, Training Cost: 2.860
EPOCH:: 1, Iteration 19000, Training Cost: 2.807
EPOCH:: 1, Iteration 20000, Train

KeyboardInterrupt: 

### 10. テスト

idからwordへの辞書を作成

In [59]:
f_i2w = {value:key for key, value in f_w2i.items()}
e_i2w = {value:key for key, value in e_w2i.items()}

テスト

In [61]:
num = 45
instance_x, instance_y = test_X[num], test_y[num]
test_cost, pred_y = test(instance_x, instance_y)
print "Test Cost: %.3f" % test_cost
print "元の文: %s" % ' '.join([f_i2w[com] for com in instance_x])
print "翻訳文: %s" % ' '.join([e_i2w[com] for com in pred_y])

Test Cost: 2.028
元の文: <s> 入场费 是 多少 钱 ? </s>
翻訳文: how much is it UNK ? ? </s>
