# Section 12. Parts-of-Speech tagging (Part 2 - RNN)

# Model 2: Recurrent Neural Network

### Recurrent Neural Nets 
- In its simplest form, an RNN looks similar to a feedforward network, except that it has a feedback loop: the hidden layer's output is fed back into the hidden layer at the next time step
- This allows the model to take past inputs into account
- This is useful for word sequences, where context matters:
    - p(tag | "milk") is ambiguous ("milk" can be a noun or a verb)
    - p(tag | "I just drank a glass of milk") is much clearer: here "milk" is clearly a noun
![](pictures/NLP_12_rnns.png)

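- GRUs and LSTMs are recurrent units that act like small neural networks in their own right, using gates to control what is remembered and what is forgotten.
- These units help the model learn long-term dependencies and mitigate the vanishing-gradient problem (exploding gradients are typically handled separately, e.g. with gradient clipping)
- They can be swapped for one another pretty easily, because TensorFlow exposes them through a common cell interface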

![](pictures/NLP_12_rnnsgrus.png)

# Let's write some code!

# 1. Import packages

In [6]:
# Basic packages
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import sys
from datetime import datetime

# ML package
from sklearn.utils import shuffle
from tensorflow.contrib.rnn import static_rnn as get_rnn_output
from tensorflow.contrib.rnn import BasicRNNCell, GRUCell
from sklearn.metrics import f1_score

# Custom package to get data
from MC_NLP_util import get_data

# 2. Create get_data function

In [1]:
def _read_tagged_sentences(path):
    """Yield each sentence of a chunking file as a list of (word, tag) pairs.

    Every non-blank line must hold exactly three whitespace-separated fields;
    the first two are used as word and tag, the third is ignored. Sentences
    are separated by blank lines. Empty sentences (e.g. from consecutive blank
    lines) are skipped, and a final sentence that is not followed by a blank
    line is still yielded.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields.
    """
    sentence = []
    # `with` closes the handle even when a malformed line raises.
    with open(path) as f:
        for line in f:
            line = line.rstrip()  # a whitespace-only line counts as blank
            if line:
                word, tag, _ = line.split()
                sentence.append((word, tag))
            elif sentence:
                yield sentence
                sentence = []
    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        yield sentence


def get_data(split_sequences=False, train_data = 'NLP_12_chunking_train.txt', test_data='NLP_12_chunking_test.txt', wordidx_tag_start=0):
    """Load the train/test tagging data and encode words and tags as integers.

    Args:
        split_sequences: if False, every split is one flat list of tokens
            (one word index / tag index per sample). If True, every split is
            a list of sentences, each a list of indices.
        train_data: path of the training file.
        test_data: path of the test file.
        wordidx_tag_start: first index handed out to both words and tags.
            NOTE: the default of 0 does not reserve index 0; pass 1 when 0
            must stay free (the original note says TF pads sequences with 0s).

    Returns:
        (Xtrain, Ytrain, Xtest, Ytest, word2idx). word2idx maps every training
        word to its index. Test words missing from word2idx are all encoded
        as ``wordidx_tag_start + len(word2idx)``, i.e. one past the largest
        training word index.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields.
        KeyError: if the test file contains a tag never seen in training.

    If either file is missing, a hint is printed and the process exits via
    ``exit()``.
    """
    if not os.path.exists(train_data):
        print("Training data is not in root folder.")
        print("Please check the comments to get the download link.")
        exit()
    elif not os.path.exists(test_data):
        print("Test data is not in root folder.")
        print("Please check the comments to get the download link.")
        exit()

    word2idx = {}
    tag2idx = {}

    # Training pass: grow both vocabularies while encoding.
    Xtrain = []
    Ytrain = []
    for sentence in _read_tagged_sentences(train_data):
        sentence_x = []
        sentence_y = []
        for word, tag in sentence:
            # setdefault evaluates the candidate index before inserting, so a
            # new entry receives start + (number of entries seen so far).
            sentence_x.append(word2idx.setdefault(word, wordidx_tag_start + len(word2idx)))
            sentence_y.append(tag2idx.setdefault(tag, wordidx_tag_start + len(tag2idx)))
        if split_sequences:
            Xtrain.append(sentence_x)  # keep sentence boundaries
            Ytrain.append(sentence_y)
        else:
            Xtrain.extend(sentence_x)  # one long flat list of tokens
            Ytrain.extend(sentence_y)

    # Test pass: vocabularies are frozen; unseen words share one extra index.
    unknown_idx = wordidx_tag_start + len(word2idx)
    Xtest = []
    Ytest = []
    for sentence in _read_tagged_sentences(test_data):
        sentence_x = [word2idx.get(word, unknown_idx) for word, _ in sentence]
        sentence_y = [tag2idx[tag] for _, tag in sentence]
        if split_sequences:
            Xtest.append(sentence_x)
            Ytest.append(sentence_y)
        else:
            Xtest.extend(sentence_x)
            Ytest.extend(sentence_y)

    return Xtrain, Ytrain, Xtest, Ytest, word2idx

# 3. Create LogisticRegression class (baseline model)

In [41]:
class LogisticRegression:
    """Softmax classifier that predicts a tag index from a single word index.

    The model is built with Theano: each word index selects one row of the
    weight matrix W, the bias b is added, and a softmax over the K tags gives
    the class probabilities. NOTE: `theano` and `T` (theano.tensor) must be
    available in the enclosing namespace when `fit` is called.
    """

    def __init__(self):
        pass

    def fit(self, X, Y, V=None, K=None, D=50, lr=1e-1, mu=0.99, batch_sz=100, epochs=6):
        """Train with mini-batch gradient descent plus momentum and plot the costs.

        Args:
            X: word indices, one per sample.
            Y: tag indices, one per sample.
            V: vocabulary size (rows of W); defaults to the number of distinct
                values in X.
            K: number of tags (columns of W); defaults to the number of
                distinct values in Y.
            D: accepted for interface compatibility; not used by this method.
            lr: learning rate.
            mu: momentum coefficient.
            batch_sz: samples per mini-batch; a trailing partial batch is not
                trained on.
            epochs: number of passes over the (reshuffled) data.
        """
        V = len(set(X)) if V is None else V
        K = len(set(Y)) if K is None else K
        N = len(X)

        # Shared parameters: scaled Gaussian weights, zero bias.
        self.W = theano.shared(np.random.randn(V, K) / np.sqrt(V + K))
        self.b = theano.shared(np.zeros(K))
        self.params = [self.W, self.b]

        # Symbolic inputs: word indices and target tag indices.
        word_ids = T.ivector('X')
        tag_ids = T.ivector('Y')

        # Indexing W by word id selects that word's row of scores.
        probs = T.nnet.softmax(self.W[word_ids] + self.b)
        predicted = T.argmax(probs, axis=1)

        # Mean negative log-likelihood of the correct tags.
        sample_rows = T.arange(tag_ids.shape[0])
        cost = -T.mean(T.log(probs[sample_rows, tag_ids]))
        grads = T.grad(cost, self.params)

        # One zero-initialised velocity per parameter for the momentum term.
        velocities = [theano.shared(param.get_value()*0) for param in self.params]

        # Evaluation-only function (no parameter updates); used by score/f1_score.
        self.cost_predict_op = theano.function(
            inputs=[word_ids, tag_ids],
            outputs=[cost, predicted],
            allow_input_downcast=True,
        )

        param_updates = []
        velocity_updates = []
        for param, vel, grad in zip(self.params, velocities, grads):
            param_updates.append((param, param + mu*vel - lr*grad))
            velocity_updates.append((vel, mu*vel - lr*grad))

        train_op = theano.function(
            inputs=[word_ids, tag_ids],
            outputs=[cost, predicted],
            updates=param_updates + velocity_updates,
            allow_input_downcast=True,
        )

        costs = []
        n_batches = N // batch_sz
        for epoch in range(epochs):
            X, Y = shuffle(X, Y)
            print("epoch:", epoch)
            for batch in range(n_batches):
                start = batch*batch_sz
                stop = start + batch_sz
                Xbatch = X[start:stop]
                Ybatch = Y[start:stop]

                batch_cost, batch_pred = train_op(Xbatch, Ybatch)
                costs.append(batch_cost)
                if batch % 200 == 0:
                    print(
                        "i:", epoch, "j:", batch,
                        "n_batches:", n_batches,
                        "cost:", batch_cost,
                        "error:", np.mean(batch_pred != Ybatch))
        plt.plot(costs)
        plt.show()

    def score(self, X, Y):
        """Return the fraction of samples in X whose predicted tag equals Y."""
        predictions = self.cost_predict_op(X, Y)[1]
        return np.mean(predictions == Y)

    def f1_score(self, X, Y):
        """Return the unweighted mean of the per-class F1 scores on (X, Y)."""
        predictions = self.cost_predict_op(X, Y)[1]
        per_class = f1_score(Y, predictions, average=None)
        return per_class.mean()

# 4. Get data

In [42]:
# Load the flat (one token per sample) train/test splits and the vocabulary
Xtrain, Ytrain, Xtest, Ytest, word2idx = get_data()

# From here on the training split is handled as numpy arrays
Xtrain, Ytrain = np.array(Xtrain), np.array(Ytrain)

# Sample count and vocabulary size; the +1 leaves room for the extra index
# that get_data assigns to test words missing from word2idx
N = Xtrain.shape[0]
V = 1 + len(word2idx)
print("vocabulary size:", V)
# One-hot (indicator) encoding of Xtrain, left disabled:
# Xtrain_indicator = np.zeros((N, V))
# Xtrain_indicator[np.arange(N), Xtrain] = 1

vocabulary size: 19123


# 5. DecisionTree model

In [43]:
# Decision-tree baseline trained on the raw word index.
# Cell-local import: DecisionTreeClassifier is not among the imports in the
# notebook's import cell, so using it here would otherwise raise NameError.
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

# without indicator: the word index is the only feature, so scikit-learn
# needs it as an (N, 1) column; reshape once and reuse it
Xtrain_col = Xtrain.reshape(N, 1)
dt.fit(Xtrain_col, Ytrain)
print("dt train score:", dt.score(Xtrain_col, Ytrain))
p = dt.predict(Xtrain_col)
# average=None yields one F1 per tag; .mean() gives their unweighted average
print("dt train f1:", f1_score(Ytrain, p, average=None).mean())

# with indicator -- too slow!!
# dt.fit(Xtrain_indicator, Ytrain)
# print("dt score:", dt.score(Xtrain_indicator, Ytrain))

dt train score: 0.964959594194
dt train f1: 0.907858696936


  'precision', 'predicted', average, warn_for)


# 6. LogisticRegression model

In [None]:
# Fit the logistic-regression baseline on the flat word-index data
model = LogisticRegression()
model.fit(Xtrain, Ytrain, V=V)
print("training complete")

# Training-set accuracy, then the mean of the per-class F1 scores
lr_train_accuracy = model.score(Xtrain, Ytrain)
print("lr train score:", lr_train_accuracy)
lr_train_f1 = model.f1_score(Xtrain, Ytrain)
print("lr train f1:", lr_train_f1)

epoch: 0
i: 0 j: 0 n_batches: 2117 cost: 3.784513628317909 error: 0.98


# 7. Compare baseline models

In [None]:
# Evaluate both baselines on the held-out test split
Xtest, Ytest = np.array(Xtest), np.array(Ytest)
Ntest = len(Xtest)
# One-hot (indicator) encoding of Xtest, left disabled:
# Xtest_indicator = np.zeros((Ntest, V))
# Xtest_indicator[np.arange(Ntest), Xtest] = 1

# decision tree: the word index is passed as a single (Ntest, 1) feature column
Xtest_col = Xtest.reshape(Ntest, 1)
print("dt test score:", dt.score(Xtest_col, Ytest))
p = dt.predict(Xtest_col)
print("dt test f1:", f1_score(Ytest, p, average=None).mean())
# print("dt test score:", dt.score(Xtest_indicator, Ytest)) # too slow!

# logistic regression: accuracy, then the mean of the per-class F1 scores
lr_test_accuracy = model.score(Xtest, Ytest)
print("lr test score:", lr_test_accuracy)
lr_test_f1 = model.f1_score(Xtest, Ytest)
print("lr test f1:", lr_test_f1)