### Scan

Theano ではループを書くときに for 文ではなく、`theano.scan` (Scan) を使います。  
少しややこしいので、まずは簡単な例を見てみましょう。

In [11]:
## Identity function with scan: given the sequence [1, 2, 3, 4, 5], scan calls
## step once per element and collects the returned values.
x = T.fvector("x")


def step(x_t):
    """Return the current sequence element unchanged."""
    return x_t


# outputs_info=None: step has no recurrent state, so nothing is fed back.
h, _ = theano.scan(
    fn=step,
    sequences=x,
    outputs_info=None,
)

f = theano.function([x], h)

# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(f(numpy.array([1, 2, 3, 4, 5]).astype("float32")))

[ 1.  2.  3.  4.  5.]


  from scan_perform.scan_perform import *


In [12]:
## Accumulation (running sum): step receives the current element and the
## previous output, and returns their sum.
x = T.fvector("x")


def step(x_t, h_tm1):
    """Add the current element x_t to the previous output h_tm1."""
    return x_t + h_tm1


h, _ = theano.scan(
    fn=step,
    sequences=x,
    outputs_info=0.0,  # initial value for h
    # go_backwards=True  # you might use it for bi-directional RNNs
)

f = theano.function([x], h)

# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(f(numpy.array([1, 2, 3, 4, 5]).astype("float32")))

[  1.   3.   6.  10.  15.]


In [13]:
## Same accumulation with a matrix: scan walks over the rows (first axis), so
## every column is accumulated down the rows.
x = T.fmatrix("x")


def step(x_t, h_tm1):
    """Add the current row x_t to the previous accumulated row h_tm1."""
    return x_t + h_tm1


h, _ = theano.scan(
    fn=step,
    sequences=x,
    # Initial value for h: a float32 zero vector with one entry per column of x.
    # Allocating it symbolically avoids hard-coding the number of columns and
    # keeps the state in the same dtype as the float32 input.
    outputs_info=T.alloc(numpy.float32(0.), x.shape[1]),
)

f = theano.function([x], h)

# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(f(numpy.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]).astype("float32")))

[[  1.   2.   3.   4.   5.]
 [  2.   4.   6.   8.  10.]
 [  3.   6.   9.  12.  15.]]


In [14]:
## Advanced: take previous inputs. taps=[0, -1, -2] on the sequence makes scan
## pass the current row and the two rows before it to step.
x = T.fmatrix("x")


def step(x_t, x_tm1, x_tm2):
    """Sum the current input row with the two preceding input rows."""
    return x_t + x_tm1 + x_tm2


h, _ = theano.scan(
    fn=step,
    sequences=[dict(input=x, taps=[0, -1, -2])],
    outputs_info=None,  # no recurrent output: step only looks at inputs
)

f = theano.function([x], h)

# Five input rows produce three output rows, because the first two rows have
# no complete window of previous inputs.
print(f(numpy.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5],[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]).astype("float32")))

[[  3.   6.   9.  12.  15.]
 [  3.   6.   9.  12.  15.]
 [  3.   6.   9.  12.  15.]]


### [宿題] POS Tagging

文が与えられた時、その品詞を予測する RNN を学習します。

word2index は単語をIDに変換する辞書、tag2index は品詞をIDに変換する辞書です。  
train_data, dev_data には文と品詞タグのペアが入っています。  
文の長さと品詞タグの長さは必ず同じです。

encode_dataset を使うと単語と品詞をIDに変換することができます。

In [1]:
from collections import OrderedDict

import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from sklearn.utils import shuffle
from sklearn.metrics import f1_score

def load_data(file_path):
    """Read a tagged corpus with one ``words ||| tags`` pair per line.

    Each line is split on '|||' and every field is split on whitespace.

    Args:
        file_path: path of the corpus file.

    Returns:
        A tuple ``(dataset, vocab, tag)`` where ``dataset`` is a list of
        per-line field lists (``[words, tags]``), ``vocab`` is the set of all
        tokens in the first field and ``tag`` the set of all tokens in the
        second field.

    Raises:
        IndexError: if a non-blank line contains no '|||' separator.
    """
    dataset = []
    vocab, tag = set(), set()
    # The context manager closes the file even if parsing fails part-way.
    with open(file_path) as corpus:
        for line in corpus:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            instance = [field.strip().split() for field in line.split('|||')]
            vocab.update(instance[0])
            tag.update(instance[1])
            dataset.append(instance)
    return dataset, vocab, tag

def encode_dataset(dataset, word2index, tag2index):
    """Convert (sentence, tags) pairs into parallel lists of integer IDs.

    Words missing from ``word2index`` are encoded with the ID of '<unk>'.

    Args:
        dataset: iterable of ``(sentence, tags)`` pairs of token sequences.
        word2index: mapping from word to ID.
        tag2index: mapping from tag to ID.

    Returns:
        ``(X, y)``: a list of word-ID lists and a list of tag-ID lists, in the
        same order as ``dataset``.
    """
    encoded_sentences = []
    encoded_tags = []
    for sentence, tags in dataset:
        word_ids = []
        for word in sentence:
            # Fall back to the '<unk>' entry only when the word is unknown.
            key = word if word in word2index else '<unk>'
            word_ids.append(word2index[key])
        encoded_sentences.append(word_ids)
        encoded_tags.append([tag2index[tag] for tag in tags])
    return encoded_sentences, encoded_tags

# Load the training corpus together with the sets of words and tags it contains.
train_data, train_vocab, train_tags = load_data('train.unk')
# '<unk>' is the entry encode_dataset falls back to for unknown words.
special_words = {'<unk>'}

# Give every word / tag an integer ID taken from its position when iterating the set.
word2index = {word: index for index, word in enumerate(train_vocab | special_words)}
tag2index = {tag: index for index, tag in enumerate(train_tags)}

In [3]:
# Hold out the tail of the corpus as a development set: the first
# train_size // 10 * 8 examples stay in train_data, the rest become dev_data.
train_size = len(train_data)
split_at = train_size // 10 * 8
dev_data = train_data[split_at:]
train_data = train_data[:split_at]

In [10]:
# Show the first training sentence, one "word tag" pair per line.
# Formatting into a single string keeps the output identical on Python 2 and 3.
for word, tag in zip(train_data[0][0], train_data[0][1]):
    print("%s %s" % (word, tag))


In IN
an DT
Oct. NNP
19 CD
review NN
of IN
`` ``
The DT
Misanthrope NN
'' ''
at IN
Chicago NNP
's POS
Goodman NNP
Theatre NNP
`` ``
Revitalized VBN
Classics NNS
Take VBP
the DT
Stage NN
in IN
Windy NNP
City NNP
, ,
'' ''
Leisure NN
& CC
Arts NNS
, ,
the DT
role NN
of IN
Celimene NNP
, ,
played VBN
by IN
Kim NNP
Cattrall NNP
, ,
was VBD
mistakenly RB
attributed VBN
to TO
Christina NNP
Haag NNP
. .


次のセルを完成させて提出してください。  

今回の入力は単語のID列（ベクトル x）と品詞のID列 (ベクトル y)です。  
Projection レイヤーを使って、単語をベクトルに変換します。  
その後、RNN に入力し、その出力値を Softmax 関数を使って確率分布に変換します。  
予測は画像の時とおなじく、最大の確率を持つクラスを予測とします。

In [None]:
# NOTE(review): an earlier cell (In [3]) performs this same split on train_data.
# If both cells run in one session the data is split twice, which shrinks the
# training set and replaces dev_data — confirm which cell should own the split.
train_size = len(train_data)
train_data, dev_data = train_data[:train_size//10 * 8], train_data[train_size//10 * 8:]

# Encode words and tags as integer IDs.
train_X, train_y = encode_dataset(train_data, word2index, tag2index)
dev_X  , dev_y   = encode_dataset(dev_data,   word2index, tag2index)

# Seeded random generators: NumPy for weight initialisation, Theano MRG streams
# for symbolic random numbers.
rng = numpy.random.RandomState(42)
trng = RandomStreams(42)

def sharedX(X, dtype="float32"):
    """Convert X to a NumPy array of ``dtype`` and wrap it in a Theano shared variable."""
    values = numpy.asarray(X, dtype=dtype)
    return theano.shared(values)


class Activation:
    """Parameter-free layer that applies a given function to its input."""

    def __init__(self, func):
        # No trainable parameters, but `params` is still exposed as an empty list.
        self.params = []
        self.func = func

    def fprop(self, x):
        """Return ``func`` applied to ``x``."""
        activate = self.func
        return activate(x)


class Projection:
    """Projection layer holding a single weight matrix W of shape (in_dim, out_dim).

    Exercise template: fprop is intentionally left for the student to complete.
    """
    def __init__(self, in_dim, out_dim, scale):
        # W is drawn from a standard normal distribution and multiplied by `scale`.
        self.W = sharedX(rng.randn(in_dim, out_dim) * scale)
        self.params = [ self.W ]

    def fprop(self, x):
        # Exercise: turn the input x (word IDs, per the assignment text) into vectors using self.W.
        h = #WRITE ME
        return h
    
    
class Linear:
    """Affine layer computing ``dot(x, W) + b``."""

    def __init__(self, in_dim, out_dim, scale):
        # Draw the weight matrix before the bias so the shared `rng` is consumed
        # in the same order on every run.
        weight = rng.randn(in_dim, out_dim) * scale
        bias = rng.randn(out_dim,) * scale
        self.W = sharedX(weight)
        self.b = sharedX(bias)
        self.params = [self.W, self.b]

    def fprop(self, x):
        """Return the affine transform of ``x``."""
        return T.dot(x, self.W) + self.b

    
class RNN:
    def __init__(self, in_dim, out_dim, scale):
        self.scale = scale
        self.hid_dim = hid_dim

        ## 重みの次元を決める。
        self.Wx = sharedX(rng.randn(#WRITE ME, ) * scale)
        self.Wh = sharedX(rng.randn(#WRITE ME, ) * scale)
        self.bh = sharedX(rng.randn(#WRITE ME, ) * scale)
        ## Initial State をどのように初期化するか
        self.h0 = sharedX(#WEIRE ME）

        self.output_info = [ self.h0 ]
        self.params = [  ]

    def fprop(self, x):
        def step(u_t, h_tm1):
            h = #WRITE ME
            return h
        
        ## Scan の方法を考える 
        h, _ = theano.scan(#WRITE ME)
        return h
    

def sgd(cost, params, lr):
    """Build plain SGD updates: each parameter becomes ``param - lr * gradient``.

    Args:
        cost: symbolic cost to differentiate.
        params: list of parameters to update.
        lr: learning rate.

    Returns:
        An OrderedDict mapping each parameter to its update expression.
    """
    grads = T.grad(cost, params)
    updates = OrderedDict()
    for p, g in zip(params, grads):
        ## Advanced: implement gradient clipping here (optional)
        #WRITE ME
        updates[p] = p - lr * g
    return updates


def prop(layers, x):
    """Forward-propagate ``x`` through ``layers`` in order.

    Each layer's ``fprop`` receives the previous layer's output; the first
    layer receives ``x``.

    Args:
        layers: sequence of objects exposing ``fprop(input)``.
        x: input to the first layer.

    Returns:
        The output of the last layer, or ``x`` unchanged when ``layers`` is
        empty (previously this case raised UnboundLocalError).
    """
    layer_out = x
    for layer in layers:
        layer_out = layer.fprop(layer_out)
    return layer_out


def get_params(layers):
    """Return one flat list with the ``params`` of every layer, in layer order."""
    return [param for layer in layers for param in layer.params]


### build Model + Train
vocab_size = len(word2index)
hid_dim    = 100
out_dim    = len(tag2index)

x, t = T.lvector("x"), T.lvector("t")

layers = [
    　 # レイヤー構成を決める
    ]

prob = prop(layers, x) 
cost = # Loss function を決める　
pred = #　予測した確率から、予測値を決める

## Collect Parameters
params = get_params(layers) 

## Define update graph
updates = sgd(cost, params, lr=numpy.float32(0.01)) 

## Compile Function
train = theano.function([x,t], cost, updates=updates)
valid = theano.function([x,t], [cost, pred])
test  = theano.function([x]　,　pred)

epochs = 100
## Train
for epoch in range(epochs):
    train_X, train_y = shuffle(train_X, train_y)  # Shuffle Samples !!
    for i, (instance_x, instance_y) in enumerate(zip(train_X, train_y)):
        cost = train(instance_x, instance_y)
        if i % 1000 == 0:
            print "EPOCH:: %i, Iteration %i, cost: %.3f"%(epoch+1, i, cost)
    
    dev_true, dev_pred = [], []
    for i, (instance_x, instance_y) in enumerate(zip(dev_X, dev_y)):
        cost, pred = valid(instance_x, instance_y)
        dev_pred += list(pred) # 予測結果はベクトル
        dev_true += instance_y