# Many to Many Classification
A simple example of many-to-many classification (a simple POS tagger) with recurrent neural networks

- Creating the **data pipeline** with `tf.data`
- Preprocessing word sequences (variable input sequence length) with a padding technique, using a user-defined function (`pad_seq`)
- Using `tf.nn.embedding_lookup` to get the vectors of tokens (e.g. words, characters)
- Training **many to many classification** with `tf.contrib.seq2seq.sequence_loss`
- Masking invalid tokens with `tf.sequence_mask`
- Creating the model as **Class**
- Reference
    - https://github.com/aisolab/sample_code_of_Deep_learning_Basics/blob/master/DLEL/DLEL_12_2_RNN_(toy_example).ipynb

### Setup

In [1]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import string
%matplotlib inline

# Alias for the TF-Slim module.
slim = tf.contrib.slim
# Print the TensorFlow version in use (the recorded output below is 1.10.0).
print(tf.__version__)

1.10.0


### Prepare example data 

In [2]:
# Toy corpus: four tokenized sentences of different lengths.
sentences = [
    ['I', 'feel', 'hungry'],
    ['tensorflow', 'is', 'very', 'difficult'],
    ['tensorflow', 'is', 'a', 'framework', 'for', 'deep', 'learning'],
    ['tensorflow', 'is', 'very', 'fast', 'changing'],
]

# Gold part-of-speech tags, aligned token-for-token with `sentences`.
pos = [
    ['pronoun', 'verb', 'adjective'],
    ['noun', 'verb', 'adverb', 'adjective'],
    ['noun', 'verb', 'determiner', 'noun', 'preposition', 'adjective', 'noun'],
    ['noun', 'verb', 'adverb', 'adjective', 'verb'],
]

In [3]:
# Word vocabulary: '<pad>' takes index 0, followed by the unique words in sorted order.
word_list = ['<pad>'] + sorted({word for sentence in sentences for word in sentence})

# Map every word to its position in word_list.
word_dic = dict(zip(word_list, range(len(word_list))))
print(word_dic)

{'<pad>': 0, 'I': 1, 'a': 2, 'changing': 3, 'deep': 4, 'difficult': 5, 'fast': 6, 'feel': 7, 'for': 8, 'framework': 9, 'hungry': 10, 'is': 11, 'learning': 12, 'tensorflow': 13, 'very': 14}


In [4]:
# POS-tag vocabulary: '<pad>' takes index 0, followed by the unique tags in sorted order.
pos_list = ['<pad>'] + sorted({tag for tags in pos for tag in tags})
print(pos_list)

# Map every tag to its position in pos_list.
pos_dic = {tag: idx for idx, tag in enumerate(pos_list)}
pos_dic

['<pad>', 'adjective', 'adverb', 'determiner', 'noun', 'preposition', 'pronoun', 'verb']


{'<pad>': 0,
 'adjective': 1,
 'adverb': 2,
 'determiner': 3,
 'noun': 4,
 'preposition': 5,
 'pronoun': 6,
 'verb': 7}

In [5]:
# Reverse lookup table: tag index -> tag string.
pos_idx_to_dic = {idx: tag for tag, idx in pos_dic.items()}
pos_idx_to_dic

{0: '<pad>',
 1: 'adjective',
 2: 'adverb',
 3: 'determiner',
 4: 'noun',
 5: 'preposition',
 6: 'pronoun',
 7: 'verb'}

### Create pad_seq function

In [6]:
def pad_seq(sequences, max_len, dic):
    """Convert token sequences into fixed-length lists of indices.

    Every sequence is mapped through ``dic``, truncated to ``max_len`` tokens
    when it is longer, and right-padded with the index of ``'<pad>'`` when it
    is shorter, so each returned row has exactly ``max_len`` entries.

    Args:
        sequences: Iterable of sliceable token sequences (e.g. lists of words).
        max_len: Non-negative length of every returned index row.
        dic: Mapping from token to integer index; it should contain the
            ``'<pad>'`` token. Tokens missing from ``dic`` are mapped to
            ``None`` (the result of ``dict.get``).

    Returns:
        A tuple ``(seq_len, seq_indices)``: ``seq_len[i]`` is the number of
        real (non-padding) tokens kept for sequence ``i`` (never more than
        ``max_len``) and ``seq_indices[i]`` is its padded list of indices.
    """
    # The padding index does not change between sequences, so look it up once.
    pad_idx = dic.get('<pad>')
    seq_len, seq_indices = [], []
    for seq in sequences:
        # Truncate over-long sequences: without this the row would be longer
        # than max_len (ragged output) and the reported length would exceed
        # the padded width.
        seq_idx = [dic.get(token) for token in seq[:max_len]]
        seq_len.append(len(seq_idx))
        seq_idx += (max_len - len(seq_idx)) * [pad_idx]
        seq_indices.append(seq_idx)
    return seq_len, seq_indices

### Pre-process data

In [7]:
# Every sentence is padded to this fixed width.
max_length = 10
# Encode the sentences as padded word-index rows, keeping the original lengths.
X_length, X_indices = pad_seq(
    sequences=sentences,
    max_len=max_length,
    dic=word_dic,
)
print(X_length, np.shape(X_indices))

[3, 4, 7, 5] (4, 10)


In [8]:
# Encode the gold tags as indices and right-pad every row with the '<pad>' index
# up to max_length.
y = [
    [pos_dic.get(tag) for tag in tags] + [pos_dic.get('<pad>')] * (max_length - len(tags))
    for tags in pos
]
print(np.shape(y))

(4, 10)


In [9]:
# Display the padded tag-index rows (notebook cell output).
y

[[6, 7, 1, 0, 0, 0, 0, 0, 0, 0],
 [4, 7, 2, 1, 0, 0, 0, 0, 0, 0],
 [4, 7, 3, 4, 5, 1, 4, 0, 0, 0],
 [4, 7, 2, 1, 7, 0, 0, 0, 0, 0]]

### Define SimPosRNN

In [None]:
# Exercise skeleton for the POS-tagging RNN model.
# NOTE: several `with` blocks below contain only comments, so this class does not
# parse until their bodies are filled in.
class SimPosRNN:
    def __init__(self, X_length, X_indices, y, n_of_classes, hidden_dim, max_len, word_dic):
        
        # Data pipeline
        with tf.variable_scope('input_layer'):
            # Exercise: implement the input layer here.
            # Use tf.get_variable.
            # Use tf.nn.embedding_lookup.
            self._X_length = X_length
            self._X_indices = X_indices
            self._y = y
            
    
        # RNN cell (many to many)
        with tf.variable_scope('rnn_cell'):
            # Exercise: implement the RNN cell here.
            # Use tf.contrib.rnn.BasicRNNCell.
            # Use tf.nn.dynamic_rnn.
            # Use tf.contrib.rnn.OutputProjectionWrapper.
        
        with tf.variable_scope('seq2seq_loss'):
            # Define masks using tf.sequence_mask.
            # Pass masks as the weights argument of tf.contrib.seq2seq.sequence_loss.
            # NOTE: the training cell reads `sim_pos_rnn.seq2seq_loss`, so the loss
            # has to be stored under that attribute name.
    
        with tf.variable_scope('prediction'):
            # Use tf.argmax.
            # NOTE: predict() runs `self._prediction`, so the result has to be
            # stored under that attribute name.
    
    def predict(self, sess, X_length, X_indices):
        # Exercise: implement the predict instance method.
        # NOTE: `feed_prediction` is not defined yet; it has to be built here
        # before the return statement below can run.

        return sess.run(self._prediction, feed_dict = feed_prediction)

### Create a model of SimPosRNN

In [10]:
# Hyper-parameters
lr = .003
epochs = 100
batch_size = 2
# Number of full mini-batches in one pass over the data.
total_step = len(X_indices) // batch_size
print(total_step)

2


In [None]:
## create data pipeline with tf.data
# Exercise: implement this yourself using tf.data

In [None]:
# Ultimately, the model is created with the code below.
# n_of_classes is derived from the tag vocabulary (which includes '<pad>') rather
# than hard-coded as 8, so it stays correct if the tag set changes.
sim_pos_rnn = SimPosRNN(X_length = X_length_mb, X_indices = X_indices_mb, y = y_mb,
                        n_of_classes = len(pos_dic), hidden_dim = 16, max_len = max_length, word_dic = word_dic)

### Create training op and train model

In [None]:
## Create the training op: an Adam optimizer minimizing the model's seq2seq_loss
opt = tf.train.AdamOptimizer(learning_rate=lr)
training_op = opt.minimize(sim_pos_rnn.seq2seq_loss)

In [None]:
# Open a session and initialize all variables before training.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

tr_loss_hist = []  # average training loss of each epoch

for epoch in range(epochs):
    avg_tr_loss = 0
    tr_step = 0
    
    # Re-initialize the iterator at the start of every epoch.
    # NOTE(review): tr_iterator is not defined in the visible cells; presumably it
    # is created in the tf.data exercise cell. Confirm.
    sess.run(tr_iterator.initializer)
    try:
        while True:
            # Exercise: implement the training step here yourself.
            # NOTE: avg_tr_loss and tr_step must be updated here; otherwise tr_step
            # stays 0 and the division after the loop raises ZeroDivisionError.

            
    except tf.errors.OutOfRangeError:
        # OutOfRangeError ends the inner loop, i.e. the epoch.
        pass
    
    # Turn the accumulated loss into a per-step average for this epoch.
    avg_tr_loss /= tr_step
    tr_loss_hist.append(avg_tr_loss)
    # Report progress every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print('epoch : {:3}, tr_loss : {:.3f}'.format(epoch + 1, avg_tr_loss))

In [None]:
# Predict tag indices for the full (un-batched) padded data set.
yhat = sim_pos_rnn.predict(
    sess=sess,
    X_length=X_length,
    X_indices=X_indices,
)
yhat

In [None]:
# Display the gold tag indices for comparison with yhat above.
y

In [None]:
# Decode the predicted tag indices back into tag strings and print one row per sentence.
yhat = [[pos_idx_to_dic.get(idx) for idx in row] for row in yhat]
for tags in yhat:
    print(tags)