In [1]:
from utils import *
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import time



In [2]:
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['negative', 'positive']
10662
10662


In [3]:
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 20332
Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]
Sample data [534, 2497, 3124, 11717, 36, 8564, 217, 151, 19, 4279] ['rock', 'destined', '21st', 'centurys', 'new', 'conan', 'hes', 'going', 'make', 'splash']


In [4]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [5]:
class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, dimension_output, learning_rate):
        
        def cells(size, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        
        encoder_embedded = tf.layers.dense(encoder_embedded, embedded_size * 2)
        
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layer // 2),
                cell_bw = cells(size_layer // 2),
                inputs = encoder_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            encoder_embedded = tf.concat((out_fw, out_bw), 2)
        self.logits = encoder_embedded[:, -1]
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1,output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [6]:
size_layer = 128
num_layers = 1
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-2
maxlen = 50
batch_size = 128

In [7]:
vectors = str_idx(trainset.data, dictionary, maxlen)
train_X, test_X, train_Y, test_Y = train_test_split(vectors, 
                                                    trainset.target,
                                                    test_size = 0.2)

In [8]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),dimension_output,learning_rate)
sess.run(tf.global_variables_initializer())

In [9]:
from tqdm import tqdm

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for i in pbar:
        batch_x = train_X[i:min(i+batch_size,train_X.shape[0])]
        batch_y = train_Y[i:min(i+batch_size,train_X.shape[0])]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
        batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(train_X) / batch_size)
    train_acc /= (len(train_X) / batch_size)
    test_loss /= (len(test_X) / batch_size)
    test_acc /= (len(test_X) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1

train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.30it/s, accuracy=0.469, cost=3.16]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 46.16it/s, accuracy=0.494, cost=3.16]
train minibatch loop:   4%|▍         | 3/67 [00:00<00:02, 22.29it/s, accuracy=0.648, cost=3.16]

epoch: 0, pass acc: 0.000000, current acc: 0.509727
time taken: 3.6873512268066406
epoch: 0, training loss: 3.362462, training acc: 0.490450, valid loss: 3.221023, valid acc: 0.509727



train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.39it/s, accuracy=0.469, cost=3.16]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 49.01it/s, accuracy=0.506, cost=3.16]
train minibatch loop:   4%|▍         | 3/67 [00:00<00:02, 21.63it/s, accuracy=0.648, cost=3.16]

time taken: 3.4824252128601074
epoch: 1, training loss: 3.173714, training acc: 0.523279, valid loss: 3.219762, valid acc: 0.509495



train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.20it/s, accuracy=0.457, cost=3.16]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 48.37it/s, accuracy=0.494, cost=3.16]
train minibatch loop:   4%|▍         | 3/67 [00:00<00:02, 21.35it/s, accuracy=0.656, cost=3.16]

time taken: 3.514200210571289
epoch: 2, training loss: 3.173489, training acc: 0.524149, valid loss: 3.219747, valid acc: 0.509727



train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.28it/s, accuracy=0.469, cost=3.16]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 51.00it/s, accuracy=0.494, cost=3.16]
train minibatch loop:   4%|▍         | 3/67 [00:00<00:03, 20.92it/s, accuracy=0.656, cost=3.16]

epoch: 3, pass acc: 0.509727, current acc: 0.510664
time taken: 3.484762191772461
epoch: 3, training loss: 3.173480, training acc: 0.526914, valid loss: 3.219506, valid acc: 0.510664



train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.36it/s, accuracy=0.481, cost=3.15]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 48.99it/s, accuracy=0.494, cost=3.15]
train minibatch loop:   4%|▍         | 3/67 [00:00<00:02, 21.56it/s, accuracy=0.641, cost=3.15]

time taken: 3.487187147140503
epoch: 4, training loss: 3.164650, training acc: 0.529913, valid loss: 3.209525, valid acc: 0.507383



train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.42it/s, accuracy=0.481, cost=3.14]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 50.85it/s, accuracy=0.482, cost=3.14]
train minibatch loop:   4%|▍         | 3/67 [00:00<00:02, 21.85it/s, accuracy=0.641, cost=3.14]

time taken: 3.4660778045654297
epoch: 5, training loss: 3.155919, training acc: 0.531672, valid loss: 3.199303, valid acc: 0.509021



train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.21it/s, accuracy=0.494, cost=3.14]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 53.04it/s, accuracy=0.506, cost=3.14]
train minibatch loop:   3%|▎         | 2/67 [00:00<00:03, 18.49it/s, accuracy=0.57, cost=3.14] 

epoch: 6, pass acc: 0.510664, current acc: 0.511370
time taken: 3.5042474269866943
epoch: 6, training loss: 3.153342, training acc: 0.533030, valid loss: 3.199299, valid acc: 0.511370



train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.16it/s, accuracy=0.494, cost=3.14]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 45.29it/s, accuracy=0.506, cost=3.14]
train minibatch loop:   4%|▍         | 3/67 [00:00<00:02, 22.26it/s, accuracy=0.648, cost=3.14]

time taken: 3.5448806285858154
epoch: 7, training loss: 3.153340, training acc: 0.532561, valid loss: 3.199297, valid acc: 0.509964



train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.59it/s, accuracy=0.481, cost=3.14]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 50.59it/s, accuracy=0.506, cost=3.14]
train minibatch loop:   3%|▎         | 2/67 [00:00<00:03, 19.12it/s, accuracy=0.648, cost=3.14]

time taken: 3.432685613632202
epoch: 8, training loss: 3.153339, training acc: 0.532493, valid loss: 3.199295, valid acc: 0.509026



train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.27it/s, accuracy=0.469, cost=3.14]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 49.57it/s, accuracy=0.494, cost=3.14]
train minibatch loop:   4%|▍         | 3/67 [00:00<00:02, 21.99it/s, accuracy=0.648, cost=3.14]

time taken: 3.49562931060791
epoch: 9, training loss: 3.153339, training acc: 0.533597, valid loss: 3.199294, valid acc: 0.506914



train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.20it/s, accuracy=0.481, cost=3.14]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 51.53it/s, accuracy=0.506, cost=3.14]
train minibatch loop:   4%|▍         | 3/67 [00:00<00:03, 20.85it/s, accuracy=0.648, cost=3.14]

time taken: 3.4959912300109863
epoch: 10, training loss: 3.153338, training acc: 0.535072, valid loss: 3.199292, valid acc: 0.506213



train minibatch loop: 100%|██████████| 67/67 [00:03<00:00, 21.01it/s, accuracy=0.481, cost=3.14]
test minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 47.83it/s, accuracy=0.506, cost=3.14]

time taken: 3.4796535968780518
epoch: 11, training loss: 3.153338, training acc: 0.534017, valid loss: 3.199291, valid acc: 0.510433

break epoch:12






In [12]:
real_Y, predict_Y = [], []

pbar = tqdm(range(0, len(test_X), batch_size), desc='validation minibatch loop')
for i in pbar:
    batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]
    batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]
    predict_Y += np.argmax(sess.run(model.logits, feed_dict = {model.X : batch_x, model.Y : batch_y}),1).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 17/17 [00:00<00:00, 49.74it/s]


In [13]:
from sklearn import metrics
print(metrics.classification_report(real_Y, predict_Y, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.50      0.60      0.55      1069
   positive       0.50      0.40      0.44      1064

avg / total       0.50      0.50      0.49      2133

