In [91]:
#import cPickle (python2.7)
#http://testpy.hatenablog.com/entry/2017/03/17/000626
import _pickle as cPickle

import numpy as np
from collections import defaultdict, OrderedDict

import re
import warnings
import sys
import time
warnings.filterwarnings("ignore") 

### Load pickle data

In [3]:
# the pickle file contains [revs, W, W2, word_idx_map, vocab]
# encoding="latin1" is required here: without it the load raised UnicodeDecodeError
# (presumably because the file was written by Python 2 -- confirm against the producer).
# NOTE(security): unpickling can execute arbitrary code; only load files you trust.
with open("mr.p", "rb") as pickle_file:  # the context manager closes the handle even on error
    x = cPickle.load(pickle_file, encoding="latin1")

In [4]:
# Give each of the five unpickled objects its own name:
#   revs         - per-sentence records (dicts with 'y', 'text', 'split', 'num_words')
#   W, W2        - word-vector matrices (W is used for word2vec, W2 for random vectors)
#   word_idx_map - word -> index
#   vocab        - word -> count
revs, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4]

In [77]:
import pandas as pd

# Longest sentence in the corpus, measured by each record's "num_words" field.
num_words_per_rev = pd.DataFrame(revs)["num_words"]
max_l = np.max(num_words_per_rev)
print("max sentence length: ", max_l)

max sentence length:  56


In [6]:
# Report how many entries each unpickled object holds.
# revs -> number of sentences; W2 -> randomly initialized vectors
for part_name, part in zip(("revs", "W", "W2", "word_idx_map", "vocab"), x):
    print(part_name, len(part))

revs 10662
W 18766
W2 18766
word_idx_map 18765
vocab 18765


In [56]:
# Inspect one record: 'y' is the label, 'text' the sentence, 'split' the
# cross-validation fold id and 'num_words' the sentence length.
revs[2]

{'y': 1, 'text': 'effective but too tepid biopic', 'split': 7, 'num_words': 5}

In [15]:
# word_idx_map maps a word to its integer index
word_idx_map['good']

12002

In [21]:
# vocab maps a word to its count
vocab['good']

393.0

In [11]:
# mode= sys.argv[1]
# word_vectors = sys.argv[2]

# The two settings are hard-coded in this notebook rather than read from the command line.
mode = "-nonstatic"
word_vectors = "-rand"

# non_static records which of the two model variants was selected.
if mode=="-nonstatic":
    print("model architecture: CNN-non-static")
    non_static=True
elif mode=="-static":
    print("model architecture: CNN-static")
    non_static=False
else:
    # Fail here instead of leaving non_static undefined and hitting a NameError later.
    raise ValueError("unknown mode " + repr(mode) + "; expected '-nonstatic' or '-static'")

#execfile("conv_net_classes.py")  

# U is the word-vector matrix handed to the model: random (W2) or word2vec (W).
if word_vectors=="-rand":
    print("using: random vectors")
    U = W2
elif word_vectors=="-word2vec":
    print("using: word2vec vectors")
    U = W
else:
    # Fail here instead of leaving U undefined and hitting a NameError later.
    raise ValueError("unknown word_vectors " + repr(word_vectors) + "; expected '-rand' or '-word2vec'")

model architecture: CNN-non-static
using: random vectors


### Make dataset
Convert each sentence into a list of word indices using `word_idx_map`.

In [26]:
def get_idx_from_sent(sent, word_idx_map, max_l=51, k=300, filter_h=5):
    """
    Convert a whitespace-separated sentence into a zero-padded list of word indices.

    The list starts with ``filter_h - 1`` zeros, followed by the index of every
    word found in ``word_idx_map`` (unknown words are skipped), and is then
    right-padded with zeros up to ``max_l + 2 * (filter_h - 1)`` entries.
    Sentences that already exceed that length are returned unshortened.
    ``k`` is accepted but not used.
    """
    pad = filter_h - 1
    target_len = max_l + 2 * pad

    indices = [0] * pad
    indices.extend(word_idx_map[token] for token in sent.split() if token in word_idx_map)

    shortfall = target_len - len(indices)
    if shortfall > 0:
        indices.extend([0] * shortfall)
    return indices

def make_idx_data_cv(revs, word_idx_map, cv, max_l=51, k=300, filter_h=5):
    """
    Build the train and test index matrices for one cross-validation fold.

    Each record in ``revs`` is converted to a padded index list by
    ``get_idx_from_sent`` and its label ``rev["y"]`` is appended as the last
    column. Records whose ``rev["split"]`` equals ``cv`` form the test set;
    all others form the training set.

    Returns ``[train, test]`` as integer numpy arrays.
    """
    held_out_rows, training_rows = [], []
    for review in revs:
        row = get_idx_from_sent(review["text"], word_idx_map, max_l, k, filter_h)
        row.append(review["y"])  # the label becomes the final column
        # "split" is the record's fold id; the fold equal to cv is held out
        bucket = held_out_rows if review["split"] == cv else training_rows
        bucket.append(row)
    return [np.array(training_rows, dtype="int"), np.array(held_out_rows, dtype="int")]

In [104]:
# Sanity-check get_idx_from_sent on one sentence, using the corpus-wide max_l computed above.
t = "effective but too tepid biopic"
t_sent = get_idx_from_sent(t, word_idx_map, max_l, k=300, filter_h=5)
print("sentence length(before)", len(t_sent)) # max_l (56 here) + 2*pad, where pad = filter_h-1
t_sent.append(1) # mimics sent.append(rev["y"]) in make_idx_data_cv
print("sentence length(after added y label)", len(t_sent))

sentence length(before) 64
sentence length(after added y label) 65


In [63]:
# Build the [train, test] matrices for cross-validation fold 0.
# max_l is the corpus-wide maximum computed earlier (56) rather than a repeated literal.
i = 0
datasets = make_idx_data_cv(revs, word_idx_map, i, max_l=max_l, k=300, filter_h=5)

In [79]:
# Row length = padded sentence length plus one trailing label column.
print(len(datasets[0][0]))
datasets[0][0] # first training row: word indices padded with zeros, label last

65


array([    0,     0,     0,     0,  5563, 10855, 10100,   262, 13764,
        3291,  5563, 10487, 12491,  6797,  9380,  9224, 16503,  6347,
        9195,  6797,  1773, 13764, 10123,  7252, 11843,  9366, 13213,
         599, 16044, 11417,  9596, 14903,  1356, 18642,  8580, 16278,
       12741,  2064,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     1])

In [87]:
# (rows, columns) of the training and test matrices for the current fold
print('train data size:', np.shape(datasets[0]))
print('test data size:', np.shape(datasets[1]))

train data size: (9572, 65)
test data size: (1090, 65)


### Model parameters

In [89]:
# Hyper-parameters for training a simple conv net:
#   img_h        = sentence length (padded where necessary)
#   img_w        = word vector length (300 for word2vec)
#   filter_hs    = filter window sizes
#   hidden_units = [x,y] x is the number of feature maps (per filter window), and y is the penultimate layer
#   sqr_norm_lim = s^2 in the paper
#   lr_decay     = adadelta decay parameter
# non_static is not reassigned here; it keeps the value chosen in the mode-selection cell.
lr_decay=0.95
filter_hs=[3,4,5]
conv_non_linear="relu"
hidden_units=[100,2]
shuffle_batch=True
n_epochs=25
sqr_norm_lim=9
batch_size=50
dropout_rate=[0.5]

img_w=300
# activations=[Iden]

rng = np.random.RandomState(3435)
img_h = len(datasets[0][0])-1  # sentence length (subtracted 1 for y label)
filter_w = img_w  # each filter spans the full word-vector width
feature_maps = hidden_units[0]
filter_shapes = []
pool_sizes = []
for filter_h in filter_hs:
    filter_shapes.append((feature_maps, 1, filter_h, filter_w))
    # one pooling window per filter height, covering every position the filter can take
    pool_sizes.append((img_h-filter_h+1, img_w-filter_w+1))

# filter_shapes [(100, 1, 3, 300), (100, 1, 4, 300), (100, 1, 5, 300)]
# pool_sizes [(62, 1), (61, 1), (60, 1)]

# "non_static" is listed twice on purpose so the printed summary matches the recorded output.
parameters = [("image shape",img_h,img_w),("filter shape",filter_shapes), ("hidden_units",hidden_units),
              ("dropout", dropout_rate), ("batch_size",batch_size),("non_static", non_static),
                ("learn_decay",lr_decay), ("conv_non_linear", conv_non_linear), ("non_static", non_static)
                ,("sqr_norm_lim",sqr_norm_lim),("shuffle_batch",shuffle_batch)]
print(parameters)

# [('image shape', 64, 300), ('filter shape', [(100, 1, 3, 300), (100, 1, 4, 300), (100, 1, 5, 300)]), 
# ('hidden_units', [100, 2]), ('dropout', [0.5]), ('batch_size', 50), 
# ('non_static', True), ('learn_decay', 0.95), ('conv_non_linear', 'relu'), 
# ('non_static', True), ('sqr_norm_lim', 9), ('shuffle_batch', True)]

[('image shape', 64, 300), ('filter shape', [(100, 1, 3, 300), (100, 1, 4, 300), (100, 1, 5, 300)]), ('hidden_units', [100, 2]), ('dropout', [0.5]), ('batch_size', 50), ('non_static', True), ('learn_decay', 0.95), ('conv_non_linear', 'relu'), ('non_static', True), ('sqr_norm_lim', 9), ('shuffle_batch', True)]


In [105]:
# Show the filter shapes, the shape of one input mini-batch and the pooling sizes.
print(filter_shapes)
# (batch size, 1, sentence length, word-vector width); the 1 is presumably a single input channel -- TODO confirm
image_shape=(batch_size, 1, img_h, img_w)
print('one batch', image_shape)
print(pool_sizes)

[(100, 1, 3, 300), (100, 1, 4, 300), (100, 1, 5, 300)]
one batch (50, 1, 64, 300)
[(62, 1), (61, 1), (60, 1)]


### Network

- Theano:
    - conv_layer: LeNetConvPoolLayer
    - classifier: MLPDropout

### Prepare Train/Val/Test dataset

In [103]:
# Shuffle the training data and size it to a whole number of mini-batches.
# If the number of rows is not a multiple of batch_size, randomly chosen rows
# are replicated to fill the last batch.
np.random.seed(3435)  # fixed seed so the permutations below are reproducible
if datasets[0].shape[0] % batch_size > 0:
    extra_data_num = batch_size - datasets[0].shape[0] % batch_size
    # train_set is only a temporary shuffled copy here; the name is reassigned further down
    train_set = np.random.permutation(datasets[0])
    extra_data = train_set[:extra_data_num]
    new_data=np.append(datasets[0],extra_data,axis=0)
else:
    new_data = datasets[0]
new_data = np.random.permutation(new_data)

# datasets[0].shape (9572, 65)
# new_data.shape (9600, 65) (dividable by batch_size)

# NOTE(review): "/" gives a float under Python 3; the int(...) on the next line
# absorbs it, but any later use of n_batches as an integer would need "//".
n_batches = new_data.shape[0]/batch_size
n_train_batches = int(np.round(n_batches*0.9)) # 90% for train, 10% for val

# Split features and labels for the test set, then divide the padded training data into train/val.
test_set_x = datasets[1][:,:img_h]  # first img_h columns: word indices
test_set_y = np.asarray(datasets[1][:,-1], "int32")  # last column: label
train_set = new_data[:n_train_batches * batch_size,:]
val_set = new_data[n_train_batches*batch_size:,:]


### Train model

In [None]:
# Train and evaluate one model per cross-validation fold, collecting each
# fold's performance in `results`.
# NOTE(review): train_conv_net is not defined in the visible cells (the execfile
# of conv_net_classes.py is commented out) -- confirm it is available before running.
results = []
#r = range(0, 10) # original code: all 10 folds
r = range(0, 1)
for i in r:
    # fold i is held out as the test set; max_l is the corpus-wide maximum computed earlier
    datasets = make_idx_data_cv(revs, word_idx_map, i, max_l=max_l, k=300, filter_h=5)
    perf = train_conv_net(datasets,
                          U,
                          lr_decay=0.95,
                          filter_hs=[3,4,5],
                          conv_non_linear="relu",
                          hidden_units=[100,2],
                          shuffle_batch=True,
                          n_epochs=25,
                          sqr_norm_lim=9,
                          non_static=non_static,
                          batch_size=50,
                          dropout_rate=[0.5])
    # print() as a function: the Python 2 print statement is a SyntaxError in Python 3
    print("cv: " + str(i) + ", perf: " + str(perf))
    results.append(perf)