In [19]:
import tensorflow as tf
import numpy as np
import time
import os
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
import collections
import random
from unidecode import unidecode
import time
from sklearn.cross_validation import train_test_split
import itertools
import pickle

english_stopwords = stopwords.words('english')



In [6]:
def clearstring(string):
    string = unidecode(string)
    string = re.sub('[^A-Za-z ]+', '', string)
    string = word_tokenize(string)
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = [y for y in string if len(y) > 2 and y.find('nbsp') < 0 and y.find('href') < 0 and y not in english_stopwords]
    string = ' '.join(string).lower()
    return ''.join(''.join(s)[:2] for _, s in itertools.groupby(string))

def read_data(location):
    list_folder = os.listdir(location)
    label = list_folder
    label.sort()
    outer_string, outer_label = [], []
    for i in range(len(list_folder)):
        list_file = os.listdir(location + list_folder[i])
        strings = []
        for x in range(len(list_file)):
            with open(location + list_folder[i] + '/' + list_file[x], 'r') as fopen:
                strings += fopen.read().split('\n')
        strings = list(filter(None, strings))
        for k in range(len(strings)):
            strings[k] = clearstring(strings[k])
        labels = [i] * len(strings)
        outer_string += strings
        outer_label += labels
    
    dataset = np.array([outer_string, outer_label])
    dataset = dataset.T
    np.random.shuffle(dataset)
    
    string = []
    for i in range(dataset.shape[0]):
        string += dataset[i][0].split()
    
    return string, dataset, label

In [11]:
_,df,label = read_data('/home/husein/space/text-dataset/sentiment/data/')
df[:5,:]

array([[ 'bland inert production one shakespeare vibrant plays guess intent make play accessible understandable possible audience exposed shakespeare though making every line clear every intent obvious drained play life turned flat caricature somehow actually boring hard feat given wonderful material acting forgettable best sam waterston benedick douglas watson pedro others however fare well april shawnham hero pouty breathless airhead frequently provokes winces jerry mayer john nonsensical cartoon character level snidely whiplash though snidley much enjoyable murray abraham know guy killed mozart version unless disguise name removed credits given producer joseph papp basically theater god production disappointing head scratching well bother watch branagh much ado instead version overflowing vitality humor say nothing wonderful performances',
        '0'],
       [ 'german private ill renowned copying dutch naturally formats well case edel starck xeroxing went far basics screwball stan

In [9]:
with open('vector-sentiment.p', 'rb') as fopen:
    vectors = pickle.load(fopen)
with open('dict-sentiment.p', 'rb') as fopen:
    dictionary = pickle.load(fopen)

In [20]:
train_X, test_X, train_Y, test_Y = train_test_split(df[:,0], df[:, 1].astype('int'), test_size = 0.2)

In [21]:
class Model:
    
    def __init__(self, num_layers, size_layer, dimension_input, dimension_output, learning_rate):
        def lstm_cell():
            return tf.nn.rnn_cell.LSTMCell(size_layer)
        self.rnn_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in range(num_layers)])
        self.X = tf.placeholder(tf.float32, [None, None, dimension_input])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        drop = tf.contrib.rnn.DropoutWrapper(self.rnn_cells, output_keep_prob = 0.5)
        self.outputs, self.last_state = tf.nn.dynamic_rnn(drop, self.X, dtype = tf.float32)
        self.rnn_W = tf.Variable(tf.random_normal((size_layer, dimension_output)))
        self.rnn_B = tf.Variable(tf.random_normal([dimension_output]))
        self.logits = tf.matmul(self.outputs[:, -1], self.rnn_W) + self.rnn_B
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        l2 = sum(0.0005 * tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        self.cost += l2
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        self.correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))

In [14]:
lens = [len(df[i,0].split())for i in range(df.shape[0])]

In [15]:
np.mean(lens)

117.961

In [16]:
maxlen = 60
location = os.getcwd()
num_layers = 2
size_layer = 256
learning_rate = 0.0001
batch = 100

In [23]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(num_layers, size_layer, vectors.shape[1], len(label), learning_rate)
sess.run(tf.global_variables_initializer())
dimension = vectors.shape[1]
saver = tf.train.Saver(tf.global_variables())
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 10, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:', EPOCH)
        break
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (train_X.shape[0] // batch) * batch, batch):
        batch_x = np.zeros((batch, maxlen, dimension))
        batch_y = np.zeros((batch, len(label)))
        for k in range(batch):
            tokens = train_X[i + k].split()[:maxlen]
            emb_data = np.zeros((maxlen, dimension), dtype = np.float32)
            for no, text in enumerate(tokens[::-1]):
                try:
                    emb_data[-1 - no, :] += vectors[dictionary[text], :]
                except Exception as e:
                    print(e)
                    continue
            batch_y[k, int(train_Y[i + k])] = 1.0
            batch_x[k, :, :] = emb_data[:, :]
        loss, _ = sess.run([model.cost, model.optimizer], feed_dict = {model.X : batch_x, model.Y : batch_y})
        train_loss += loss
        train_acc += sess.run(model.accuracy, feed_dict = {model.X : batch_x, model.Y : batch_y})
    
    for i in range(0, (test_X.shape[0] // batch) * batch, batch):
        batch_x = np.zeros((batch, maxlen, dimension))
        batch_y = np.zeros((batch, len(label)))
        for k in range(batch):
            tokens = test_X[i + k].split()[:maxlen]
            emb_data = np.zeros((maxlen, dimension), dtype = np.float32)
            for no, text in enumerate(tokens[::-1]):
                try:
                    emb_data[-1 - no, :] += vectors[dictionary[text], :]
                except:
                    continue
            batch_y[k, int(test_Y[i + k])] = 1.0
            batch_x[k, :, :] = emb_data[:, :]
        loss, acc = sess.run([model.cost, model.accuracy], feed_dict = {model.X : batch_x, model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        
    train_loss /= (train_X.shape[0] // batch)
    train_acc /= (train_X.shape[0] // batch)
    test_loss /= (test_X.shape[0] // batch)
    test_acc /= (test_X.shape[0] // batch)
    if test_acc > CURRENT_ACC:
        print('epoch:', EPOCH, ', pass acc:', CURRENT_ACC, ', current acc:', test_acc)
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
        saver.save(sess, os.getcwd() + "/rnn-sentiment.ckpt")
    else:
        CURRENT_CHECKPOINT += 1
    EPOCH += 1
    print('time taken:', time.time()-lasttime)
    print('epoch:', EPOCH, ', training loss:', train_loss, ', training acc:', train_acc, ', valid loss:', test_loss, ', valid acc:', test_acc)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

epoch: 0 , pass acc: 0 , current acc: 0.611799997091
time taken: 26.790687561035156
epoch: 1 , training loss: 1.26772443533 , training acc: 0.578849998564 , valid loss: 1.1737433815 , valid acc: 0.611799997091
epoch: 1 , pass acc: 0.611799997091 , current acc: 0.68400000453
time taken: 26.385164976119995
epoch: 2 , training loss: 1.11333983958 , training acc: 0.659199998677 , valid loss: 1.07287054181 , valid acc: 0.68400000453
epoch: 2 , pass acc: 0.68400000453 , current acc: 0.699800001383
time taken: 26.471251726150513
epoch: 3 , training loss: 1.02642693698 , training acc: 0.704400001764 , valid loss: 1.01873482704 , valid acc: 0.699800001383
epoch: 3 , pass acc: 0.699800001383 , current acc: 0.705400002003
time taken: 26.53897476196289
epoch: 4 , training loss: 0.982846246958 , training acc: 0.7