In [91]:
#import cPickle (python2.7)
#http://testpy.hatenablog.com/entry/2017/03/17/000626
import _pickle as cPickle

import numpy as np
from collections import defaultdict, OrderedDict

import re
import warnings
import sys
import time
warnings.filterwarnings("ignore") 

In [271]:
import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.autograd import Variable

import pandas as pd

### Load pickle data

In [3]:
# the pickle file contains [revs, W, W2, word_idx_map, vocab]
x = cPickle.load(open("mr.p","rb"), encoding="latin1") # Add encoding="latin1" because got UnicodeDecodeError

In [4]:
revs, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4]

In [77]:
max_l = np.max(pd.DataFrame(revs)["num_words"])
print("max sentence length: ", max_l)

max sentence length:  56


In [6]:
print('revs',len(x[0])) # number of sentence
print('W', len(x[1]))
print('W2', len(x[2])) # W2 are randomly initialized vectors
print('word_idx_map', len(x[3]))
print('vocab', len(x[4]))

revs 10662
W 18766
W2 18766
word_idx_map 18765
vocab 18765


In [56]:
revs[2]

{'y': 1, 'text': 'effective but too tepid biopic', 'split': 7, 'num_words': 5}

In [15]:
word_idx_map['good'] # word and its index

12002

In [21]:
vocab['good'] # word and its count

393.0

In [11]:
# mode= sys.argv[1]
# word_vectors = sys.argv[2]

mode = "-nonstatic"
word_vectors = "-rand"

if mode=="-nonstatic":
    print("model architecture: CNN-non-static")
    non_static=True
elif mode=="-static":
    print("model architecture: CNN-static")
    non_static=False

#execfile("conv_net_classes.py")  

if word_vectors=="-rand":
    print("using: random vectors")
    U = W2
elif word_vectors=="-word2vec":
    print("using: word2vec vectors")
    U = W

model architecture: CNN-non-static
using: random vectors


In [126]:
U.shape

(18766, 300)

### Make dataset (check original code)
make each sentence an word index map using word_idx_map

In [241]:
def get_idx_from_sent(sent, word_idx_map, max_l=51, k=300, filter_h=5):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    pad = filter_h - 1
    for i in range(pad):
        x.append(0)
    words = sent.split()
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
    while len(x) < max_l+2*pad:
        x.append(0)
    return x

def make_idx_data_cv(revs, word_idx_map, cv, max_l=51, k=300, filter_h=5):
    """
    Transforms sentences into a 2-d matrix.
    """
    train, test = [], []
    for rev in revs:
        sent = get_idx_from_sent(rev["text"], word_idx_map, max_l, k, filter_h) # one sentence
        sent.append(rev["y"])
        if rev["split"]==cv:  # "split" is random number of np.random.randint(0,10)
            test.append(sent)        
        else:  
            train.append(sent)   
    train = np.array(train, dtype="int")
    test = np.array(test, dtype="int")
    return [train, test] 

In [242]:
t = "effective but too tepid biopic"
t_sent = get_idx_from_sent(t, word_idx_map, max_l, k=300, filter_h=5)
print("sentence length(before)", len(t_sent)) # max_l(51)+2*pad(filter_h-1)
t_sent.append(1) #sent.append(rev["y"])
print("sentence length(after added y label)", len(t_sent))

sentence length(before) 64
sentence length(after added y label) 65


In [243]:
i = 0
datasets = make_idx_data_cv(revs, word_idx_map, i, max_l=56, k=300, filter_h=5)

In [244]:
print(len(datasets[0][0]))
datasets[0][0] # sentence => word index map padding with zeros

65


array([    0,     0,     0,     0,  5563, 10855, 10100,   262, 13764,
        3291,  5563, 10487, 12491,  6797,  9380,  9224, 16503,  6347,
        9195,  6797,  1773, 13764, 10123,  7252, 11843,  9366, 13213,
         599, 16044, 11417,  9596, 14903,  1356, 18642,  8580, 16278,
       12741,  2064,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     1])

In [245]:
print('train data size:', datasets[0].shape)
print('test data size:', datasets[1].shape)

train data size: (9572, 65)
test data size: (1090, 65)


### Make dataset (using vectors)

In [246]:
def get_idx_from_sent_2vec(sent, U, word_idx_map, max_l=51, k=300, filter_h=5):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    pad = filter_h - 1
    x = np.zeros((max_l+2*pad, k))

    words = sent.split()
    # starting after padding
    i = pad
    for word in words:
        if word in word_idx_map:
            x[i] = U[word_idx_map[word]]
            i += 1
    return x

def make_idx_data_cv_2vec(revs, U, word_idx_map, cv, max_l=51, k=300, filter_h=5):
    """
    Transforms sentences into a 2-d matrix.
    """
    train_image, train_label = [], []
    test_image, test_label = [], []
    for rev in revs:
        sent = get_idx_from_sent_2vec(rev["text"], U, word_idx_map, max_l, k, filter_h) # one sentence
        if rev["split"]==cv:  # "split" is random number of np.random.randint(0,10)
            test_image.append(sent) 
            test_label.append(rev["y"])
        else:  
            train_image.append(sent)
            train_label.append(rev["y"])
    train_image = np.array(train_image)
    train_label = np.array(train_label)
    test_image = np.array(test_image)
    test_label = np.array(test_label)
    return (train_image, train_label), (test_image, test_label)

In [247]:
t = "effective but too tepid biopic"
t_sent_2vec = get_idx_from_sent_2vec(t, W, word_idx_map, max_l, k=300, filter_h=5)
print("sentence length(before)", len(t_sent_2vec)) # max_l(51)+2*pad(filter_h-1)
# t_sent_2vec.append(1) #sent.append(rev["y"])
# print("sentence length(after added y label)", len(t_sent_2vec ))

sentence length(before) 64


In [248]:
t_sent_2vec[0].shape

(300,)

In [249]:
p = pd.DataFrame(t_sent_2vec[:64])
p.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.028931,-0.034912,-0.125977,0.078613,-0.182617,0.080078,0.143555,-0.00528,0.308594,0.070801,...,-0.177734,-0.08252,0.108398,0.208008,-0.145508,0.375,-0.070801,-0.003143,-0.10498,0.339844
5,-0.047607,0.081543,0.045654,0.091797,-0.014709,0.111328,0.06543,-0.09668,0.138672,0.143555,...,-0.114746,0.041504,-0.041992,0.092285,-0.000713,0.075195,0.049316,-0.055664,0.10498,-0.108398
6,0.129883,0.131836,-0.032959,0.148438,-0.138672,0.141602,0.192383,-0.053955,0.110352,0.068848,...,0.006165,0.079102,-0.070312,0.025757,-0.137695,-0.045166,0.070801,-0.065918,0.032959,0.208984
7,0.248047,0.236328,0.107422,0.217773,-0.257812,-0.014771,-0.118164,-0.064941,0.076172,0.291016,...,-0.057861,-0.306641,-0.055176,0.128906,0.067383,-0.069336,-0.001877,0.133789,0.057617,0.175781
8,0.285156,-0.195312,0.109863,0.237305,0.322266,0.363281,0.015869,-0.137695,0.017334,-0.427734,...,-0.07666,0.111328,-0.365234,0.025757,0.011292,0.02478,0.001472,0.109863,-0.053467,0.296875
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [250]:
i = 0
datasets_2vec = make_idx_data_cv_2vec(revs, W, word_idx_map, i, max_l=56, k=300, filter_h=5)

In [251]:
(train_image, train_label), (test_image, test_label) = datasets_2vec

In [287]:
t_image = torch.FloatTensor(train_image).reshape(-1, 1, 64, 300)
t_label = torch.LongTensor(train_label)
train_dataset = list(zip(t_image, t_label))
len(train_dataset)

9572

In [288]:
# Data loader
batch_size = 50
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size, 
                                           shuffle=True)


In [300]:
for images, labels in train_loader:
    print('images:', images.shape, '\nlabels:', labels.shape)
    print(images[1][0])
    print(labels[1])
    break

images: torch.Size([50, 1, 64, 300]) 
labels: torch.Size([50])
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])
tensor(1)


In [319]:
c_image = torch.FloatTensor(test_image).reshape(-1, 1, 64, 300)
c_label = torch.LongTensor(test_label)
test_dataset = list(zip(c_image, c_label))
len(test_dataset)

1090

In [320]:
# Data loader
batch_size = 50
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                           batch_size=batch_size, 
                                           shuffle=True)


In [329]:
for images, labels in test_loader:
    print('images:', images.shape, '\nlabels:', labels.shape)
    print(images[1][0])
    print(labels[1])
    break

images: torch.Size([50, 1, 64, 300]) 
labels: torch.Size([50])
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])
tensor(1)


### Model parameters

In [89]:
lr_decay=0.95
filter_hs=[3,4,5]
conv_non_linear="relu"
hidden_units=[100,2]
shuffle_batch=True
n_epochs=25
sqr_norm_lim=9
non_static=non_static
batch_size=50
dropout_rate=[0.5]

img_w=300
shuffle_batch=True
# activations=[Iden]

"""
Train a simple conv net
img_h = sentence length (padded where necessary)
img_w = word vector length (300 for word2vec)
filter_hs = filter window sizes    
hidden_units = [x,y] x is the number of feature maps (per filter window), and y is the penultimate layer
sqr_norm_lim = s^2 in the paper
lr_decay = adadelta decay parameter
"""    
rng = np.random.RandomState(3435)
img_h = len(datasets[0][0])-1  # sentence length (subtracted 1 for y label)
filter_w = img_w    
feature_maps = hidden_units[0]
filter_shapes = []
pool_sizes = []
for filter_h in filter_hs:
    filter_shapes.append((feature_maps, 1, filter_h, filter_w))
    pool_sizes.append((img_h-filter_h+1, img_w-filter_w+1))

# filter_shapes [(100, 1, 3, 300), (100, 1, 4, 300), (100, 1, 5, 300)]
# pool_sizes [(62, 1), (61, 1), (60, 1)]

parameters = [("image shape",img_h,img_w),("filter shape",filter_shapes), ("hidden_units",hidden_units),
              ("dropout", dropout_rate), ("batch_size",batch_size),("non_static", non_static),
                ("learn_decay",lr_decay), ("conv_non_linear", conv_non_linear), ("non_static", non_static)
                ,("sqr_norm_lim",sqr_norm_lim),("shuffle_batch",shuffle_batch)]
print(parameters)   

# [('image shape', 64, 300), ('filter shape', [(100, 1, 3, 300), (100, 1, 4, 300), (100, 1, 5, 300)]), 
# ('hidden_units', [100, 2]), ('dropout', [0.5]), ('batch_size', 50), 
# ('non_static', True), ('learn_decay', 0.95), ('conv_non_linear', 'relu'), 
# ('non_static', True), ('sqr_norm_lim', 9), ('shuffle_batch', True)]

[('image shape', 64, 300), ('filter shape', [(100, 1, 3, 300), (100, 1, 4, 300), (100, 1, 5, 300)]), ('hidden_units', [100, 2]), ('dropout', [0.5]), ('batch_size', 50), ('non_static', True), ('learn_decay', 0.95), ('conv_non_linear', 'relu'), ('non_static', True), ('sqr_norm_lim', 9), ('shuffle_batch', True)]


In [105]:
print(filter_shapes)
image_shape=(batch_size, 1, img_h, img_w)
print('one batch train', image_shape)
print(pool_sizes)

[(100, 1, 3, 300), (100, 1, 4, 300), (100, 1, 5, 300)]
one batch (50, 1, 64, 300)
[(62, 1), (61, 1), (60, 1)]


### Padding & Stride
- Output size

    $ O = \frac {W-K+2P}{S} + 1 $
    - O: output h/w
    - W: input h/w
    - K: filter size(kernel size)
    - P: padding
        - $  P = \frac {K-1}{2} $
    - S: stride

### Model:


```
Network
Input ->
Conv -> ReLU -> MaxPool |
Conv -> ReLU -> MaxPool | -> concat
Conv -> ReLU -> MaxPool |
Fully Connected Layer(Logits -> Softmax) -> Labels
```

```
Convolutional layer formula:
- Filter 1(Kernel) size K = 3 => (3 x 300)
- P(same padding) P = (3-1)/2=1
- S(stride) S = 1
- in_channels = 1
- out_channels (int) – Number of channels produced by the convolution = 100
Pooling layer formula:
- K
```

```
*Filter dimensions*:
Conv1 (W_conv, (100, 1, 3, 300))
Conv1 (b_conv, (100,))
Conv2 (W_conv, (100, 1, 4, 300))
Conv2 (b_conv, (100,))
Conv3 (W_conv, (100, 1, 5, 300))
Conv3 (b_conv, (100,))

*Layer input dimensions*:
- Input image(64, 300) 

----------------------------------------------------------------------
|  Conv1  (100, 3, 300)   Conv2  (100, 4, 300)   Conv3  (100, 5, 300) |
|  MaxPool (100, 62, 1)   MaxPool (100, 61, 1)   MaxPool (100, 60, 1) |
-----------------------------Concat ----------------------------------

- Concatenated (100, 1, 1) + (100, 1, 1) + (100, 1, 1) => (300, 1, 1) 

- Fully Connected Layer(Logits (100, 1) -> Logits (2, 1) -> Softmax) -> Labels
```

### Network

- Theano:
    - conv_layer: LeNetConvPoolLayer
    - classifier: MLPDropout

In [311]:
class ConvPoolLayer(nn.Module):
    def __init__(self, num_classes=2):
        super(ConvPoolLayer, self).__init__()

        # Layer 1: conv - relu - conv- relu - pool
        self.ngram1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(3, 300), stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(62, 1), stride=None))
        self.ngram2 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(4, 300), stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(61, 1), stride=None))
        self.ngram3 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(5, 300), stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(60, 1), stride=None))
        
        # Fully Connected 1 (readout)
        self.fc1 = nn.Linear(300, 100)
        self.fc2 = nn.Linear(100, num_classes)

    def forward(self, x):

        out1 = self.ngram1(x)
        out2 = self.ngram2(x)
        out3 = self.ngram3(x)
        out = torch.cat((out1, out2, out3), 1)
        
        out = out.view(out.size(0), -1)
    
        # Linear function (readout)
        out = self.fc1(out)
        out = self.fc2(out)
        
        return out

In [312]:
model = ConvPoolLayer(2)
print(model)

ConvPoolLayer(
  (ngram1): Sequential(
    (0): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(62, 1), stride=(62, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (ngram2): Sequential(
    (0): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(61, 1), stride=(61, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (ngram3): Sequential(
    (0): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(60, 1), stride=(60, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Linear(in_features=300, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=2, bias=True)
)


In [313]:
criterion = nn.CrossEntropyLoss()

In [314]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [315]:
for p in model.parameters():
    print(p.shape)

torch.Size([100, 1, 3, 300])
torch.Size([100])
torch.Size([100, 1, 4, 300])
torch.Size([100])
torch.Size([100, 1, 5, 300])
torch.Size([100])
torch.Size([100, 300])
torch.Size([100])
torch.Size([2, 100])
torch.Size([2])


### Train model

In [335]:
num_epochs = 20
iter = 0
# accuracies = []
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Load images as Variable
        images = Variable(images) # Now we dont need to resize like images.view(xx)
        labels = Variable(labels)
        
        # Clear gradients w.r.t parameters
        optimizer.zero_grad()
        
        # Forward pass to get output/logits
        outputs = model(images)
        
        # Calculate Loss: Softmax --> cross entropy loss
        loss = criterion(outputs, labels)
        
        # Getting gradients w.r.t paramters
        loss.backward()
        
        # Updating parameters
        optimizer.step()
        
        iter += 1
        
        if iter % 50 == 0:
            # Print Loss
            print('Iteration: {}. Loss: {}.'.format(iter, loss.item()))

Iteration: 50. Loss: 0.20017242431640625.
Iteration: 100. Loss: 0.08401701599359512.
Iteration: 150. Loss: 0.12604370713233948.
Iteration: 200. Loss: 0.15190933644771576.
Iteration: 250. Loss: 0.09675393998622894.
Iteration: 300. Loss: 0.11735799908638.
Iteration: 350. Loss: 0.0904175266623497.
Iteration: 400. Loss: 0.023380205035209656.
Iteration: 450. Loss: 0.14850455522537231.
Iteration: 500. Loss: 0.14952605962753296.
Iteration: 550. Loss: 0.24317945539951324.
Iteration: 600. Loss: 0.013713160529732704.
Iteration: 650. Loss: 0.03922988474369049.
Iteration: 700. Loss: 0.2628262937068939.
Iteration: 750. Loss: 0.061310939490795135.
Iteration: 800. Loss: 0.0010832254774868488.
Iteration: 850. Loss: 0.04209603741765022.
Iteration: 900. Loss: 0.03877854719758034.
Iteration: 950. Loss: 0.011271928437054157.
Iteration: 1000. Loss: 0.05429753661155701.
Iteration: 1050. Loss: 0.008044209331274033.
Iteration: 1100. Loss: 0.027259407564997673.
Iteration: 1150. Loss: 0.00010646475129760802.
It

### Test model

In [336]:
wrong_predictions = []

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = Variable(images)
        labels = Variable(labels)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        # See which are error predictions
        result = (predicted == labels)
        err_imgs = images[result == 0] # 0 means wrong prediction
        err_labels = labels[result == 0]
        err_outputs = predicted[result == 0]
        for img, lbl, out in zip(err_imgs, err_labels, err_outputs):
            wrong_predictions.append((img, lbl, out))
     
    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))


Test Accuracy of the model on the 10000 test images: 77.79816513761467 %


### Prepare Train/Val/Test dataset

In [103]:
#shuffle dataset and assign to mini batches. if dataset size is not a multiple of mini batches, replicate 
#extra data (at random)
np.random.seed(3435)
if datasets[0].shape[0] % batch_size > 0:
    extra_data_num = batch_size - datasets[0].shape[0] % batch_size
    train_set = np.random.permutation(datasets[0])   
    extra_data = train_set[:extra_data_num]
    new_data=np.append(datasets[0],extra_data,axis=0)
else:
    new_data = datasets[0]
new_data = np.random.permutation(new_data)

# datasets[0].shape (9572, 65)
# new_data.shape (9600, 65) (dividable by batch_size)

n_batches = new_data.shape[0]/batch_size
n_train_batches = int(np.round(n_batches*0.9)) # 90% for train, 10% for val

#divide train set into train/val sets 
test_set_x = datasets[1][:,:img_h] 
test_set_y = np.asarray(datasets[1][:,-1], "int32")
train_set = new_data[:n_train_batches * batch_size,:]
val_set = new_data[n_train_batches*batch_size:,:]


### Train model

In [None]:
results = []
#r = range(0, 10) # orginal code
r = range(0, 1) 
for i in r:
    datasets = make_idx_data_cv(revs, word_idx_map, i, max_l=56,k=300, filter_h=5)
    perf = train_conv_net(datasets,
                          U,
                          lr_decay=0.95,
                          filter_hs=[3,4,5],
                          conv_non_linear="relu",
                          hidden_units=[100,2], 
                          shuffle_batch=True, 
                          n_epochs=25, 
                          sqr_norm_lim=9,
                          non_static=non_static,
                          batch_size=50,
                          dropout_rate=[0.5])
    print "cv: " + str(i) + ", perf: " + str(perf)
    results.append(perf)  