# RNN for text classification

In [1]:
import numpy as np
import h5py
import string
import random
import tensorflow as tf


  from ._conv import register_converters as _register_converters


In [2]:
# Download word vectors
# Fetches the ConceptNet Numberbatch "mini" embeddings into the working
# directory unless a file named 'mini.h5' is already present.
from urllib.request import urlretrieve
import os
if not os.path.isfile('mini.h5'):
    print("Downloading Conceptnet Numberbatch word embeddings...")
    conceptnet_url = 'http://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5'
    # Download under a temporary name and rename only after the transfer has
    # finished: an interrupted download must not leave a truncated 'mini.h5'
    # behind, because the isfile() guard above would accept it on the next run.
    partial_path = 'mini.h5.part'
    urlretrieve(conceptnet_url, partial_path)
    os.replace(partial_path, 'mini.h5')

Downloading Conceptnet Numberbatch word embeddings...


In [164]:
import numpy as np
import h5py

# Pull the embedding matrix and its row labels out of the HDF5 file.
# Labels are stored as bytes and are decoded to str here.
with h5py.File('mini.h5', 'r') as f:
    all_embeddings = f['mat']['block0_values'][:]
    all_words = [raw.decode('utf-8') for raw in f['mat']['axis1'][:]]

# Keep only the rows whose label starts with the English prefix '/c/en/';
# the stored word is the label with that 6-character prefix stripped.
english_word_indices = [pos for pos, label in enumerate(all_words)
                        if label.startswith('/c/en/')]
english_words = [all_words[pos][6:] for pos in english_word_indices]
english_embedddings = all_embeddings[english_word_indices]

# Scale every row to unit L2 norm (as float32) so that a plain dot product
# between two rows is their cosine similarity.
norms = np.linalg.norm(english_embedddings, axis=1)
normalized_embeddings = (english_embedddings.astype('float32')
                         / norms.astype('float32')[:, np.newaxis])

# Map each English word to its row in `normalized_embeddings`.
index = dict(zip(english_words, range(len(english_words))))

def similarity_score(w1, w2):
    """Return the dot product of the normalized embeddings of two words.

    Both words are looked up in the module-level `index`; a word that is
    not a key of `index` raises KeyError.
    """
    first_vec = normalized_embeddings[index[w1], :]
    second_vec = normalized_embeddings[index[w2], :]
    return np.dot(first_vec, second_vec)

def print_similarity(w1,w2):
    """Print two words and their similarity score, separated by tabs.

    Both arguments are converted to strings before the lookup. If either
    word is missing from the vocabulary, a notice is printed instead of
    the score. Always returns None.
    """
    try:
        score = similarity_score('{}'.format(w1), '{}'.format(w2))
    except KeyError:
        # Only a failed vocabulary lookup is reported as a missing word.
        # Any other error (e.g. the embeddings were never loaded, or the
        # user interrupts the kernel) is allowed to propagate instead of
        # being mislabelled.
        print('One of the words is not in the dictionary.')
    else:
        print('{0}\t{1}\t'.format(w1,w2), score)
    return None


In [165]:
# Translation table that deletes every ASCII punctuation character.
remove_punct=str.maketrans('','',string.punctuation)

# This function converts a line of our data file into a dict with keys
# 'x' (the mean of the word embeddings of the review), 'y' (its integer
# label) and 'w' (the list of per-word embeddings, in review order).
def convert_line_to_example(line):
    """Convert one line of the data file into a training example.

    The first character of `line` is parsed as the integer label; the text
    from the third character onwards is the review.

    Returns:
        dict with
          'x': mean of the embeddings of the in-vocabulary words,
          'y': the integer label,
          'w': list of the individual word embeddings.

    Raises:
        ValueError: if the label is not an integer, or if none of the
            words in the review are in the pretrained vocabulary.
    """
    # Pull out the first character: that's our label (0 or 1)
    y = int(line[0])
    # Strip punctuation, lowercase, and split on whitespace
    words = line[2:].translate(remove_punct).lower().split()
    # Look up the embeddings of each word, ignoring words not
    # in our pretrained vocabulary.
    embeddings = [normalized_embeddings[index[w]] for w in words
                  if w in index]
    if not embeddings:
        # np.vstack([]) would fail with an opaque message; say which line
        # is the problem instead.
        raise ValueError(
            'No in-vocabulary words found in line: {!r}'.format(line))
    # Take the mean of the embeddings
    x = np.mean(np.vstack(embeddings), axis=0)
    return {'x': x, 'y': y, 'w':embeddings}

# Build the dataset by converting every line of the reviews file into an
# example dict.
enc = 'utf-8' # This is necessary from within the singularity shell
with open("movie-simple.txt", "r", encoding=enc) as f:
    dataset = list(map(convert_line_to_example, f))

In [166]:
import random

# Randomise the order of the examples in place before splitting.
random.shuffle(dataset)

batch_size = 100
total_batches = len(dataset) // batch_size

# Three quarters of the full batches go to training; every remaining
# example (including any partial final batch) goes to the test set.
train_batches = (3 * total_batches) // 4
train = dataset[:train_batches * batch_size]
test = dataset[train_batches * batch_size:]

## MLP

In [27]:
# Grid search over learning rate and the two hidden-layer sizes of an MLP
# trained on the mean-embedding features ('x'). For every configuration the
# test accuracy and the hyperparameters are recorded in parallel lists.
results_acc=[]
results_param=[]
for lr in [0.0005, 0.005, 0.05, 0.1]:
    for layer_size_1 in [25, 50,100,150,200]:
        for layer_size_2 in [25, 50,100,150,200]:
            # Start every configuration from an empty graph.
            tf.reset_default_graph()

            # Placeholders for input
            X = tf.placeholder(tf.float32, [None, 300])
            y = tf.placeholder(tf.float32, [None, 1])

            # Three-layer MLP
            h1 = tf.layers.dense(X, layer_size_1, tf.nn.relu, kernel_initializer=tf.initializers.truncated_normal)
            h2 = tf.layers.dense(h1, layer_size_2, tf.nn.relu, kernel_initializer=tf.initializers.truncated_normal)
            logits = tf.layers.dense(h2, 1)
            probabilities = tf.sigmoid(logits)

            # Loss and metrics
            loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))
            accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(probabilities), y), tf.float32))

            # Training
            train_step = tf.train.GradientDescentOptimizer(lr).minimize(loss)

            # Initialization of variables
            initialize_all = tf.global_variables_initializer()

            # Scope the session to this configuration so it is closed before
            # the next tf.reset_default_graph() call; otherwise one session
            # per configuration would stay open for the whole search.
            with tf.Session() as sess:
                sess.run(initialize_all)
                for epoch in range(500):
                    for batch in range(train_batches):
                        data = train[batch*batch_size:(batch+1)*batch_size]
                        reviews = [sample['x'] for sample in data]
                        labels  = [sample['y'] for sample in data]
                        labels = np.array(labels).reshape([-1, 1])
                        _, l, acc = sess.run([train_step, loss, accuracy], feed_dict={X: reviews, y: labels})
                    if epoch % 10 == 0:
                        # l / acc are the values of the last batch of the epoch.
                        print("Epoch", epoch, "Loss", l, "Acc", acc)
                    random.shuffle(train)

                # Evaluate on test set
                test_reviews = [sample['x'] for sample in test]
                test_labels  = [sample['y'] for sample in test]
                test_labels = np.array(test_labels).reshape([-1, 1])
                acc = sess.run(accuracy, feed_dict={X: test_reviews, y: test_labels})

            print("Final accuracy:", acc)
            results_acc.append(acc)
            results_param.append([lr, layer_size_1, layer_size_2])

# Report the configuration with the highest test accuracy.
argmax=np.argmax(results_acc)
print ('Max accuracy: ', results_acc[argmax])
print ('max parameters: ', results_param[argmax])

After tuning the hyperparameters, the best test accuracy was 0.95, obtained with the following settings:

Size of hidden layer 1: 25
Size of hidden layer 2: 50
Learning rate: 0.1


Accuracy: 0.95

## RNN

In [167]:
# Re-read the reviews and redo the shuffle/split with a batch size of 1,
# so that every batch holds a single example.
with open("movie-simple.txt", "r",encoding=enc) as f:
    dataset = list(map(convert_line_to_example, f))
import random
random.shuffle(dataset)

batch_size = 1
total_batches = len(dataset) // batch_size

# Three quarters of the batches are used for training, the rest for testing.
train_batches = (3 * total_batches) // 4
train = dataset[:train_batches * batch_size]
test = dataset[train_batches * batch_size:]

In [168]:
# Grid search over the number of RNN units and the learning rate. Each review
# is fed as a sequence of word embeddings ('w'), one review per batch. For
# every configuration the mean test accuracy and the hyperparameters are
# recorded in parallel lists.
results_acc=[]
results_param=[]

for n_neurons in [10,20,50,100,200]:
    for lr in [0.001, 0.01, 0.1]:
        # Start every configuration from an empty graph.
        tf.reset_default_graph()
        # sizes
        n_steps = None   # sequence length is left unspecified
        n_inputs = 300
        # Build RNN
        X= tf.placeholder(tf.float32, [None, n_steps, n_inputs])
        y= tf.placeholder(tf.float32, [None, 1])
        basic_cell = tf.contrib.rnn.BasicRNNCell(n_neurons,activation=tf.nn.tanh)
        outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)
        # Classify from the RNN output at the last time step.
        last_cell_output=outputs[:,-1,:]
        y_=tf.layers.dense(last_cell_output,1)

        # Loss and metrics
        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_, labels=y))
        accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_)), y), tf.float32))

        # Training
        train_step = tf.train.AdamOptimizer(lr).minimize(loss)

        initialize_all = tf.global_variables_initializer()

        # Scope the session to this configuration so it is closed before the
        # next tf.reset_default_graph() call; otherwise one session per
        # configuration would stay open for the whole search.
        with tf.Session() as sess:
            sess.run(initialize_all)
            # Exponential moving averages (decay 0.99) of the per-example
            # loss and accuracy, used only for progress reporting.
            l_ma=.74
            acc_ma=.5
            for epoch in range(5):
                for batch in range(train_batches):
                    data = train[batch*batch_size:(batch+1)*batch_size]
                    # One review per step: shape (1, n_words, n_inputs).
                    reviews = np.array([sample['w'] for sample in data]).reshape([1,-1,n_inputs])
                    labels  = np.array([sample['y'] for sample in data]).reshape([1,1])
                    _, l, acc = sess.run([train_step, loss, accuracy], feed_dict={X: reviews, y: labels})
                    l_ma=.99*l_ma+(.01)*l
                    acc_ma=.99*acc_ma+(.01)*acc
                    if (batch+1) % 100 == 0:
                        print("batch", batch, "Loss", l_ma, "Acc", acc_ma)
                print("Epoch", epoch, "Loss", l_ma, "Acc", acc_ma)
                random.shuffle(train)

            # Evaluate on test set, one review at a time, and average the
            # per-example accuracies.
            test_acc=0
            n=0
            for sample in test:
                test_reviews = np.array([sample['w'] ]).reshape([1,-1,n_inputs])
                test_labels  = np.array([sample['y']]).reshape([1,1])
                test_acc += sess.run(accuracy, feed_dict={X: test_reviews, y: test_labels})
                n+=1

        acc=test_acc/n
        print("Final accuracy:", acc)
        results_acc.append(acc)
        # Record the hyperparameters actually searched in this cell.
        results_param.append([lr, n_neurons])

# Report the configuration with the highest test accuracy.
argmax=np.argmax(results_acc)
print ('Max accuracy: ', results_acc[argmax])
print ('max parameters: ', results_param[argmax])



batch 99 Loss 0.6890430946503397 Acc 0.5679267292680439
batch 199 Loss 0.6452920381140875 Acc 0.637673310413806
batch 299 Loss 0.5929292699282466 Acc 0.7139748132471735
batch 399 Loss 0.5777827962740152 Acc 0.6991921835980426
batch 499 Loss 0.45764420688484003 Acc 0.7995061641592535
batch 599 Loss 0.44890900851499205 Acc 0.8242916622930305
batch 699 Loss 0.37686708342248737 Acc 0.8656283694864586
batch 799 Loss 0.3355433456304097 Acc 0.8689760916827961
batch 899 Loss 0.2987315440251814 Acc 0.9054652230764809
batch 999 Loss 0.22958673844969288 Acc 0.9265061950572931
Epoch 0 Loss 0.23755461405959188 Acc 0.9308909410781158
batch 99 Loss 0.1834649188244278 Acc 0.9530707386449337
batch 199 Loss 0.2136503709762892 Acc 0.9371769508088312
batch 299 Loss 0.24893887145855337 Acc 0.9093194403831523
batch 399 Loss 0.18573083285803635 Acc 0.9361814622153067
batch 499 Loss 0.18992625614935113 Acc 0.9361551760904817
batch 599 Loss 0.2196896773019936 Acc 0.9102254590929971
batch 699 Loss 0.16047697517

batch 199 Loss 0.6999153401126215 Acc 0.674025576187964
batch 299 Loss 0.6964332185044196 Acc 0.6758463508950977
batch 399 Loss 0.758182352767888 Acc 0.6615615799955862
batch 499 Loss 0.7778918484959784 Acc 0.6399580689040285
batch 599 Loss 0.6996220263830355 Acc 0.6789968686781622
batch 699 Loss 0.7107403005297368 Acc 0.6310814491471803
batch 799 Loss 0.7671936429976595 Acc 0.6275104773864335
batch 899 Loss 0.7273004408059797 Acc 0.645121984496663
batch 999 Loss 0.7632513690162444 Acc 0.6052939122024075
Epoch 3 Loss 0.7517338800619747 Acc 0.6213502877212884
batch 99 Loss 0.7839349181215559 Acc 0.56310697665866
batch 199 Loss 0.7813894709688121 Acc 0.6323161086001202
batch 299 Loss 0.7130465794738682 Acc 0.6413323299615559
batch 399 Loss 0.8140676341472399 Acc 0.57265715870532
batch 499 Loss 0.7771496098883882 Acc 0.6061744519901362
batch 599 Loss 0.8622973208490164 Acc 0.6116288833476228
batch 699 Loss 0.7747244015669684 Acc 0.6448590005762913
batch 799 Loss 0.7756000849491566 Acc 0.6

batch 299 Loss 0.9008175050489206 Acc 0.5497390711200434
batch 399 Loss 0.8627725850154366 Acc 0.559488984875775
batch 499 Loss 0.9166894551455351 Acc 0.5595192260346036
batch 599 Loss 1.028829331423228 Acc 0.5702703455646485
batch 699 Loss 1.0440773563626635 Acc 0.5473328896911931
batch 799 Loss 0.9126141569428391 Acc 0.5481491533336174
batch 899 Loss 0.8285546153720692 Acc 0.5406207673616358
batch 999 Loss 0.7736159984814307 Acc 0.6094969992210771
Epoch 1 Loss 0.9823598504946953 Acc 0.5678535451805692
batch 99 Loss 0.8118138489707809 Acc 0.6611545175600805
batch 199 Loss 0.7998729633489222 Acc 0.661327556623255
batch 299 Loss 0.8926119748826398 Acc 0.6112268039877938
batch 399 Loss 0.8353235447659232 Acc 0.6254092010784538
batch 499 Loss 0.9008066595448931 Acc 0.6278441569982175
batch 599 Loss 0.8039494470047975 Acc 0.6498422356753144
batch 699 Loss 0.8113323431456906 Acc 0.6278228410493212
batch 799 Loss 0.8526169295897501 Acc 0.6406471844063568
batch 899 Loss 1.0964416553342637 Acc

batch 299 Loss 0.6040048766161472 Acc 0.7680341201036825
batch 399 Loss 0.6189196136391321 Acc 0.6903981481376905
batch 499 Loss 0.5558889227882851 Acc 0.7185116108051592
batch 599 Loss 0.48979867109341274 Acc 0.7728630732856252
batch 699 Loss 0.51962734369642 Acc 0.7667221452136953
batch 799 Loss 0.5680741126008407 Acc 0.7141250021154685
batch 899 Loss 0.5183742205654934 Acc 0.7022111639388284
batch 999 Loss 0.4951565644501967 Acc 0.752591685048371
Epoch 4 Loss 0.5599841441388069 Acc 0.7004777086213305
Final accuracy: 0.6458923512747875
batch 99 Loss 1.2779178681445218 Acc 0.5839997188887098
batch 199 Loss 1.1338655559852868 Acc 0.5400665633809886
batch 299 Loss 1.467515512209052 Acc 0.4643499134822689
batch 399 Loss 1.2331155568557042 Acc 0.49232586262044276
batch 499 Loss 1.8945450139946631 Acc 0.486843036788335
batch 599 Loss 2.127750557751528 Acc 0.4912568170269294
batch 699 Loss 2.10154731930898 Acc 0.4974658030342793
batch 799 Loss 1.748054223649093 Acc 0.47944006080080187
batch

batch 299 Loss 0.7669031590226141 Acc 0.5912613378502917
batch 399 Loss 0.820261545155693 Acc 0.5561833136926619
batch 499 Loss 0.7986800935409806 Acc 0.509223803225904
batch 599 Loss 0.8281327616727683 Acc 0.47840913768819154
batch 699 Loss 0.7992261125375523 Acc 0.532117893556792
batch 799 Loss 0.7934839433999653 Acc 0.5139918123699265
batch 899 Loss 0.8184559222478046 Acc 0.5076895512938444
batch 999 Loss 0.7982031589518148 Acc 0.5085114296283384
Epoch 2 Loss 0.7851258012969545 Acc 0.5090109180038495
batch 99 Loss 0.844658489891707 Acc 0.5397939698252321
batch 199 Loss 0.8730173143869551 Acc 0.5254573879215967
batch 299 Loss 0.8281851204738335 Acc 0.5401213604288934
batch 399 Loss 0.826237049474842 Acc 0.565071548073966
batch 499 Loss 0.8917416495199503 Acc 0.49651947913879424
batch 599 Loss 0.82704617404733 Acc 0.5007636068998558
batch 699 Loss 0.737016414903358 Acc 0.604657234659229
batch 799 Loss 0.7510710688043167 Acc 0.5895789910659714
batch 899 Loss 0.7886255790632327 Acc 0.51

batch 399 Loss 0.9105648774827116 Acc 0.5890944461949873
batch 499 Loss 0.9242953308702428 Acc 0.5374457755143317
batch 599 Loss 0.9446107252212541 Acc 0.5352160922795922
batch 699 Loss 1.185294135479346 Acc 0.4492592592278209
batch 799 Loss 0.9717739192317059 Acc 0.4521832883485542
batch 899 Loss 0.9456283007534574 Acc 0.5351570634587598
batch 999 Loss 1.0726272426042402 Acc 0.5283894207011407
Epoch 0 Loss 1.0630553625829693 Acc 0.5198020385077372
batch 99 Loss 1.0177807434355175 Acc 0.5248114941074811
batch 199 Loss 0.9856888085467843 Acc 0.5211989256393522
batch 299 Loss 0.9497293796784723 Acc 0.5276814813115606
batch 399 Loss 0.8889635934713707 Acc 0.4923713160865309
batch 499 Loss 0.8998282124856068 Acc 0.5412592689596639
batch 599 Loss 0.8069256593185833 Acc 0.5520810000712962
batch 699 Loss 0.8324706128736824 Acc 0.5157741173997502
batch 799 Loss 0.9873943505276581 Acc 0.48940703725490764
batch 899 Loss 1.002240018997958 Acc 0.5860369276883198
batch 999 Loss 1.111102791087572 Ac

After tuning the hyperparameters, the accuracy was 0.95 after 5 epochs.

The optimal number of hidden units (neurons) in the RNN layer is 50,

and the optimal learning rate is 0.001.

