In [32]:
import sys, os
import numpy as np
# Make the assignment modules importable. The checkout location can be
# overridden with the CS224N_A3_DIR environment variable; when it is unset the
# original hard-coded location is used.
assignment_dir = os.environ.get(
    'CS224N_A3_DIR',
    '/Users/matt.meng/dev/Stanford_CS224n_2017_assignments/assignment3')
# Guard the append so re-running this cell does not pile up duplicate entries.
if assignment_dir not in sys.path:
    sys.path.append(assignment_dir)

In [2]:
from data_util import load_and_preprocess_data, load_embeddings, ModelHelper
from q2_rnn import RNNModel, Config

In [3]:
# Directory containing the data files opened in the next cell. The assignment
# root can be overridden with the CS224N_A3_DIR environment variable; when it
# is unset the resulting path is identical to the original hard-coded one.
data_path = os.path.join(
    os.environ.get(
        'CS224N_A3_DIR',
        '/Users/matt.meng/dev/Stanford_CS224n_2017_assignments/assignment3'),
    'data')

#### mock the args from the argparse

In [4]:
class Args:
    """Empty attribute bag standing in for the namespace argparse would return."""


# Only the cell type is set on the namespace before it is handed to Config.
args = Args()
args.cell = 'rnn'
config = Config(args)

# Attach open file handles to the config, one per data file, in the same order
# as the original assignments: train split, dev split, vocabulary, vectors.
for _attr, _filename in (
        ('data_train', 'train.conll'),
        ('data_dev', 'dev.conll'),
        ('vocab', 'vocab.txt'),
        ('vectors', 'wordVectors.txt')):
    setattr(config, _attr, open(os.path.join(data_path, _filename)))


#### load the data and embeddings

In [5]:
# Load the corpus and the pre-trained embeddings, then release the file handles
# that were opened on `config`; the original cell left all four open for the
# lifetime of the kernel.
# NOTE(review): assumes both loaders finish reading before they return (they
# hand back fully built objects here) -- confirm neither keeps a handle for
# lazy reading.
try:
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(config)
    embeddings = load_embeddings(config, helper)
finally:
    for _handle in (config.data_train, config.data_dev,
                    config.vocab, config.vectors):
        _handle.close()


INFO:Loading training data...
INFO:Done. Read 14041 sentences
INFO:Loading dev data...
INFO:Done. Read 3250 sentences
INFO:Built dictionary for 10007 features.
INFO:Initialized embeddings.


#### data preprocess steps
1. the CoNLL data text is loaded
2. the `build` function from the `ModelHelper` class is used to create the token dictionary `tok2id`
3. `tok2id` is stored in the `helper` object and is also used by `load_embeddings` to load the pre-trained embeddings
4. the training data is also vectorized by `helper.vectorize`, which
    * converts the words of each sentence into indices
    * appends the casing result to each word
    * vectorizes the labels

In [12]:
# Number of training sentences and the embedding matrix shape. A single
# pre-formatted string is printed so the output is the same text under the
# Python 2 print statement and the Python 3 print function.
print("%d %s" % (len(train), embeddings.shape))

14041 (10008, 50)


In [30]:
# Show one vectorized training example. The parenthesized single-argument form
# prints the same text on Python 2 and Python 3.
print(train[5])

([[12, 10000], [62, 10002], [169, 10003], [194, 10003], [327, 10003], [203, 10003], [521, 10003], [6841, 10003], [158, 10003], [62, 10003], [169, 10003], [194, 10003], [802, 10003], [203, 10003], [3788, 10003], [16, 10003], [24, 10003], [4, 10000], [12, 10000], [2, 10003], [307, 10002], [15, 10003], [387, 10003], [171, 10003], [10006, 10002], [562, 10003], [2014, 10003], [10006, 10002], [82, 10003], [8, 10003], [172, 10003], [3368, 10003], [3, 10000]], [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 0, 0, 0, 0, 4, 4, 4, 4, 4])


#### data entry point for model
1. create the model in order to use the `preprocess_sequence_data`
2. the `preprocess_sequence_data` is used in `fit`

In [7]:
# Instantiate the RNN model from the helper, config and embeddings built in the
# earlier cells. The recorded output shows a warning about sparse IndexedSlices
# being converted to a dense Tensor when this runs.
model = RNNModel(helper, config, embeddings)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [8]:
# Echo the model object; the notebook displays its default repr.
model

<q2_rnn.RNNModel at 0x1197ef810>

#### flatten array

In [31]:
# sum() with an empty list as the start value concatenates the inner lists,
# flattening the nested list by one level.
sum([[1, 2], [3, 4]], [])

[1, 2, 3, 4]

#### use the model-specific `preprocess_sequence_data`
1. the internal `featurize_windows` function builds a moving window of fixed length
2. the list of word vectors in each window is flattened using `sum(window, [])`
3. all the sentences are padded to the same length using `pad_sequences`

In [9]:
# Run the model-specific preprocessing on the vectorized training data.
# NOTE(review): per the notes in this notebook it windows each sentence and
# pads to a common length -- confirm against q2_rnn.
train_examples = model.preprocess_sequence_data(train)

In [25]:
# Length of the third component of one preprocessed example. The parenthesized
# single-argument form prints the same text on Python 2 and Python 3.
print(len(train_examples[100][2]))

113


#### padded training data

In [40]:
# Show one padded example in full. The parenthesized single-argument form
# prints the same text on Python 2 and Python 3.
print(train_examples[110])

[[[10004, 10003, 12, 10000, 2, 10002], [12, 10000, 2, 10002, 6593, 10003], [2, 10002, 6593, 10003, 818, 10003], [6593, 10003, 818, 10003, 61, 10003], [818, 10003, 61, 10003, 5, 10003], [61, 10003, 5, 10003, 4130, 10002], [5, 10003, 4130, 10002, 51, 10003], [4130, 10002, 51, 10003, 1188, 10003], [51, 10003, 1188, 10003, 4, 10000], [1188, 10003, 4, 10000, 34, 10003], [4, 10000, 34, 10003, 385, 10003], [34, 10003, 385, 10003, 3, 10000], [385, 10003, 3, 10000, 10005, 10003], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 

### yield mini batches from data 
1. in `run_epoch`, the function `minibatches` is called to generate batches
2. `minibatches` first converts the (sentence, labels) element-type data into column-wise data
3. `get_minibatches` shuffles the index array and yields mini-batches

#### example to convert data into column-wise format

In [33]:
# Transpose the list of examples into per-field columns and turn every column
# into its own numpy array.
batches = list(map(np.array, zip(*train_examples)))

In [34]:
# Number of columns produced by the transpose. The recorded output for this
# cell is 3, which matches len(batches) rather than the contents of the first
# column, and this avoids echoing an entire array into the notebook.
len(batches)

3

In [36]:
# Small fixed 4x2 integer array for the zip(*...) demonstration. The original
# drew it with an unseeded np.random.randint(5, size=(4, 2)), so the recorded
# outputs could not be reproduced; the literal below is the recorded draw.
test_data = np.array([[1, 2],
                      [4, 1],
                      [2, 4],
                      [2, 0]])
# Parenthesized single-argument form works on both Python 2 and Python 3.
print(test_data)

[[1 2]
 [4 1]
 [2 4]
 [2 0]]


In [37]:
# Unpacking the rows into zip() transposes them: `a` collects the first element
# of every row and `b` the second.
a, b = zip(*test_data)

In [38]:
# Display the first column produced by the zip(*test_data) transpose.
a

(1, 4, 2, 2)