In [1]:
from utils import *
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import time
import random
import os



In [2]:
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['negative', 'positive']
10662
10662


In [3]:
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 20332
Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]
Sample data [6040, 9003, 8532, 556, 1066, 375, 489, 26, 1054, 5] ['zany', 'exuberantly', 'irreverent', 'animated', 'space', 'adventure', 'perhaps', 'best', 'sports', 'movie']


In [4]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [5]:
size_layer = 128
dimension_output = len(trainset.target_names)
maxlen = 50
batch_size = 32

In [6]:
class Attention:
    def __init__(self,hidden_size):
        self.hidden_size = hidden_size
        self.dense_layer = tf.layers.Dense(hidden_size)
        self.v = tf.random_normal([hidden_size],mean=0,stddev=1/np.sqrt(hidden_size))
        
    def score(self, hidden_tensor, encoder_outputs):
        energy = tf.nn.tanh(self.dense_layer(tf.concat([hidden_tensor,encoder_outputs],2)))
        energy = tf.transpose(energy,[0,2,1])
        batch_size = tf.shape(encoder_outputs)[0]
        v = tf.expand_dims(tf.tile(tf.expand_dims(self.v,0),[batch_size,1]),1)
        energy = tf.matmul(v,energy)
        return tf.squeeze(energy,1)
    
    def __call__(self, hidden, encoder_outputs):
        seq_len = tf.shape(encoder_outputs)[1]
        batch_size = tf.shape(encoder_outputs)[0]
        H = tf.tile(tf.expand_dims(hidden, 1),[1,seq_len,1])
        attn_energies = self.score(H,encoder_outputs)
        return tf.expand_dims(tf.nn.softmax(attn_energies),1)

class Model:
    def __init__(
        self,
        dict_size,
        size_layers,
        learning_rate,
        num_classes,
        maxlen,
        num_blocks = 3,
        block_size = 128,
    ):
        self.X = tf.placeholder(tf.int32,[None, maxlen])
        self.Y = tf.placeholder(tf.int32,[None])
        embeddings = tf.Variable(tf.random_uniform([dict_size, size_layers], -1, 1))
        embedded = tf.nn.embedding_lookup(embeddings, self.X)
        self.attention = Attention(size_layers)

        def residual_block(x, size, rate, block):
            with tf.variable_scope(
                'block_%d_%d' % (block, rate), reuse = False
            ):
                attn_weights = self.attention(tf.reduce_sum(x,axis=1), x)
                conv_filter = tf.layers.conv1d(
                    attn_weights,
                    x.shape[2] // 4,
                    kernel_size = size,
                    strides = 1,
                    padding = 'same',
                    dilation_rate = rate,
                    activation = tf.nn.tanh,
                )
                conv_gate = tf.layers.conv1d(
                    x,
                    x.shape[2] // 4,
                    kernel_size = size,
                    strides = 1,
                    padding = 'same',
                    dilation_rate = rate,
                    activation = tf.nn.sigmoid,
                )
                out = tf.multiply(conv_filter, conv_gate)
                out = tf.layers.conv1d(
                    out,
                    block_size,
                    kernel_size = 1,
                    strides = 1,
                    padding = 'same',
                    activation = tf.nn.tanh,
                )
                return tf.add(x, out), out

        forward = tf.layers.conv1d(
            embedded, block_size, kernel_size = 1, strides = 1, padding = 'SAME'
        )
        zeros = tf.zeros_like(forward)
        for i in range(num_blocks):
            for r in [1, 2, 4, 8, 16]:
                forward, s = residual_block(
                    forward, size = 7, rate = r, block = i
                )
                zeros = tf.add(zeros, s)
        self.logits = tf.reduce_sum(tf.layers.conv1d(
            forward, num_classes, kernel_size = 1, strides = 1, padding = 'SAME'
        ), 1)
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1,output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [7]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(len(dictionary), size_layer, 1e-3, dimension_output, maxlen)
sess.run(tf.global_variables_initializer())

In [8]:
vectors = str_idx(trainset.data,dictionary,maxlen)
train_X, test_X, train_Y, test_Y = train_test_split(vectors, trainset.target,test_size = 0.2)

In [9]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 267/267 [00:10<00:00, 25.66it/s, accuracy=0.529, cost=0.891]
test minibatch loop: 100%|██████████| 67/67 [00:01<00:00, 59.19it/s, accuracy=0.571, cost=1.18] 
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.60it/s, accuracy=0.594, cost=0.912]

epoch: 0, pass acc: 0.000000, current acc: 0.592727
time taken: 11.537999629974365
epoch: 0, training loss: 2.407873, training acc: 0.536281, valid loss: 1.033647, valid acc: 0.592727



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.36it/s, accuracy=0.824, cost=0.379]
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 72.09it/s, accuracy=0.667, cost=1.02] 
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.55it/s, accuracy=0.75, cost=0.554] 

epoch: 1, pass acc: 0.592727, current acc: 0.637756
time taken: 10.34778881072998
epoch: 1, training loss: 1.108567, training acc: 0.658149, valid loss: 0.912250, valid acc: 0.637756



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.33it/s, accuracy=0.941, cost=0.15] 
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.62it/s, accuracy=0.476, cost=3.1] 
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.32it/s, accuracy=0.875, cost=0.417]

time taken: 10.361033916473389
epoch: 2, training loss: 1.363167, training acc: 0.680516, valid loss: 2.513253, valid acc: 0.539727



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.23it/s, accuracy=0.471, cost=1.59]  
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.38it/s, accuracy=0.714, cost=1.33] 
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.46it/s, accuracy=0.75, cost=0.854] 

epoch: 3, pass acc: 0.637756, current acc: 0.700355
time taken: 10.397602319717407
epoch: 3, training loss: 0.872497, training acc: 0.749098, valid loss: 1.075091, valid acc: 0.700355



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.23it/s, accuracy=0.941, cost=0.123] 
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.70it/s, accuracy=0.667, cost=3.35]
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.51it/s, accuracy=0.688, cost=1.19] 

time taken: 10.393543720245361
epoch: 4, training loss: 0.951980, training acc: 0.803508, valid loss: 3.091106, valid acc: 0.603063



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.24it/s, accuracy=1, cost=0.0568]    
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.82it/s, accuracy=0.429, cost=5.94]
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 27.65it/s, accuracy=0.75, cost=1.54]  

time taken: 10.388004779815674
epoch: 5, training loss: 0.663642, training acc: 0.858835, valid loss: 4.446053, valid acc: 0.581676



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.25it/s, accuracy=0.941, cost=0.0587]
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.41it/s, accuracy=0.714, cost=3.03] 
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.27it/s, accuracy=0.969, cost=0.36] 

epoch: 6, pass acc: 0.700355, current acc: 0.711138
time taken: 10.390414237976074
epoch: 6, training loss: 0.381432, training acc: 0.913016, valid loss: 2.420590, valid acc: 0.711138



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.31it/s, accuracy=1, cost=4.46e-5]   
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 72.12it/s, accuracy=0.667, cost=3.22] 
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.56it/s, accuracy=0.938, cost=0.325]

epoch: 7, pass acc: 0.711138, current acc: 0.716518
time taken: 10.360514640808105
epoch: 7, training loss: 0.278529, training acc: 0.938563, valid loss: 2.731792, valid acc: 0.716518



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.27it/s, accuracy=1, cost=5.52e-5]   
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.76it/s, accuracy=0.714, cost=4.91]
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.49it/s, accuracy=0.969, cost=0.405]

epoch: 8, pass acc: 0.716518, current acc: 0.726140
time taken: 10.378602981567383
epoch: 8, training loss: 0.395688, training acc: 0.935162, valid loss: 3.874101, valid acc: 0.726140



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.36it/s, accuracy=1, cost=2.99e-5]   
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.81it/s, accuracy=0.762, cost=3.98]
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.48it/s, accuracy=0.969, cost=0.0812]

epoch: 9, pass acc: 0.726140, current acc: 0.728730
time taken: 10.351183652877808
epoch: 9, training loss: 0.181422, training acc: 0.968226, valid loss: 3.625677, valid acc: 0.728730



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.25it/s, accuracy=0.941, cost=0.0919]
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.68it/s, accuracy=0.762, cost=3.92] 
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.34it/s, accuracy=1, cost=0.0163]    

time taken: 10.387167692184448
epoch: 10, training loss: 0.122172, training acc: 0.975861, valid loss: 3.925538, valid acc: 0.724511



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.29it/s, accuracy=1, cost=0.000703]  
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.79it/s, accuracy=0.714, cost=5.44]
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.46it/s, accuracy=0.938, cost=0.36]

epoch: 11, pass acc: 0.728730, current acc: 0.729891
time taken: 10.373524904251099
epoch: 11, training loss: 0.142030, training acc: 0.972447, valid loss: 4.970537, valid acc: 0.729891



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.32it/s, accuracy=1, cost=4.32e-5]   
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.68it/s, accuracy=0.619, cost=7.37]
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.68it/s, accuracy=1, cost=2.65e-5]   

time taken: 10.365166187286377
epoch: 12, training loss: 0.128455, training acc: 0.980068, valid loss: 5.587411, valid acc: 0.724243



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.39it/s, accuracy=1, cost=0.000676]  
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 70.62it/s, accuracy=0.619, cost=8.16]
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 26.87it/s, accuracy=0.969, cost=0.0973]

time taken: 10.35442066192627
epoch: 13, training loss: 0.096518, training acc: 0.984172, valid loss: 6.090484, valid acc: 0.709240



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.37it/s, accuracy=0.941, cost=0.0766]
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.38it/s, accuracy=0.762, cost=7.99]
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.38it/s, accuracy=1, cost=0.00228]  

epoch: 14, pass acc: 0.729891, current acc: 0.730605
time taken: 10.351580619812012
epoch: 14, training loss: 0.684831, training acc: 0.940804, valid loss: 6.728577, valid acc: 0.730605



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.50it/s, accuracy=1, cost=0.000274]  
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.60it/s, accuracy=0.619, cost=8.15]
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.48it/s, accuracy=1, cost=1.96e-5]   

time taken: 10.305923700332642
epoch: 15, training loss: 0.126788, training acc: 0.982413, valid loss: 6.516656, valid acc: 0.723774



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.48it/s, accuracy=1, cost=0.00104]   
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 72.09it/s, accuracy=0.667, cost=8.14]
train minibatch loop:   1%|          | 3/267 [00:00<00:09, 28.62it/s, accuracy=1, cost=2.47e-5] 

time taken: 10.307766437530518
epoch: 16, training loss: 0.057105, training acc: 0.990855, valid loss: 7.102426, valid acc: 0.724488



train minibatch loop: 100%|██████████| 267/267 [00:09<00:00, 28.43it/s, accuracy=0.941, cost=0.0527]
test minibatch loop: 100%|██████████| 67/67 [00:00<00:00, 71.50it/s, accuracy=0.667, cost=7.68]

time taken: 10.329671382904053
epoch: 17, training loss: 0.051003, training acc: 0.991572, valid loss: 7.450499, valid acc: 0.716987

break epoch:18






In [10]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    predict_Y += np.argmax(
        sess.run(
            model.logits, feed_dict = {model.X: batch_x, model.Y: batch_y}
        ),
        1,
    ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 67/67 [00:01<00:00, 65.03it/s]


In [11]:
print(metrics.classification_report(real_Y, predict_Y, target_names = trainset.target_names))

             precision    recall  f1-score   support

   negative       0.73      0.68      0.70      1057
   positive       0.70      0.75      0.72      1076

avg / total       0.71      0.71      0.71      2133

