In [1]:
import tensorflow as tf
import numpy as np
import time
import os
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
import collections
import random
from unidecode import unidecode
import time
from sklearn.cross_validation import train_test_split
import itertools
import pickle

english_stopwords = stopwords.words('english')



In [2]:
def clearstring(string):
    string = unidecode(string)
    string = re.sub('[^A-Za-z ]+', '', string)
    string = word_tokenize(string)
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = [y for y in string if len(y) > 2 and y.find('nbsp') < 0 and y.find('href') < 0 and y not in english_stopwords]
    string = ' '.join(string).lower()
    return ''.join(''.join(s)[:2] for _, s in itertools.groupby(string))

def read_data(location):
    list_folder = os.listdir(location)
    label = list_folder
    label.sort()
    outer_string, outer_label = [], []
    for i in range(len(list_folder)):
        list_file = os.listdir(location + list_folder[i])
        strings = []
        for x in range(len(list_file)):
            with open(location + list_folder[i] + '/' + list_file[x], 'r') as fopen:
                strings += fopen.read().split('\n')
        strings = list(filter(None, strings))
        for k in range(len(strings)):
            strings[k] = clearstring(strings[k])
        labels = [i] * len(strings)
        outer_string += strings
        outer_label += labels
    
    dataset = np.array([outer_string, outer_label])
    dataset = dataset.T
    np.random.shuffle(dataset)
    
    string = []
    for i in range(dataset.shape[0]):
        string += dataset[i][0].split()
    
    return string, dataset, label

In [3]:
_,df,label = read_data('/home/husein/space/text-dataset/message/data/')
df[:5,:]

array([[ 'statement congressman danny davis military intervention syria apparent use chemical weapons syria human tragedy hearts families lost lives much remains unknown events today congress beginning debate issue constituents deeply concerned events syria searching appropriate response end killing move quickly toward political solution war syria however overwhelmingly opposed military intervention kind concerned morality action concerned legality action concerned loss life resulting action concerned effectiveness lack effectiveness action concerned action involving deeply civil war syria concerned follow conflict action might trigger concerned sabotaging possible agreement iran nuclear weapons concerned drawing resources focus critical immediate problems home share concerns remain today staunch support president obama however first responsibility member congress represent interests views constituents calls emails letters forms communication office running opposition military interven

In [4]:
with open('vector-message.p', 'rb') as fopen:
    vectors = pickle.load(fopen)
with open('dict-message.p', 'rb') as fopen:
    dictionary = pickle.load(fopen)

In [5]:
train_X, test_X, train_Y, test_Y = train_test_split(df[:,0], df[:, 1].astype('int'), test_size = 0.2)

In [6]:
class Model:
    
    def __init__(self, num_layers, size_layer, dimension_input, dimension_output, learning_rate):
        def lstm_cell():
            return tf.nn.rnn_cell.LSTMCell(size_layer)
        self.rnn_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in range(num_layers)])
        self.X = tf.placeholder(tf.float32, [None, None, dimension_input])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        drop = tf.contrib.rnn.DropoutWrapper(self.rnn_cells, output_keep_prob = 0.5)
        self.outputs, self.last_state = tf.nn.dynamic_rnn(drop, self.X, dtype = tf.float32)
        self.rnn_W = tf.Variable(tf.random_normal((size_layer, dimension_output)))
        self.rnn_B = tf.Variable(tf.random_normal([dimension_output]))
        self.logits = tf.matmul(self.outputs[:, -1], self.rnn_W) + self.rnn_B
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        l2 = sum(0.0005 * tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        self.cost += l2
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        self.correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))

In [9]:
lens = [len(df[i,0].split())for i in range(df.shape[0])]

In [10]:
np.mean(lens)

17.80012016823553

In [11]:
maxlen = 20
location = os.getcwd()
num_layers = 2
size_layer = 256
learning_rate = 0.0001
batch = 100

In [12]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(num_layers, size_layer, vectors.shape[1], len(label), learning_rate)
sess.run(tf.global_variables_initializer())
dimension = vectors.shape[1]
saver = tf.train.Saver(tf.global_variables())
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 10, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:', EPOCH)
        break
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (train_X.shape[0] // batch) * batch, batch):
        batch_x = np.zeros((batch, maxlen, dimension))
        batch_y = np.zeros((batch, len(label)))
        for k in range(batch):
            tokens = train_X[i + k].split()[:maxlen]
            emb_data = np.zeros((maxlen, dimension), dtype = np.float32)
            for no, text in enumerate(tokens[::-1]):
                try:
                    emb_data[-1 - no, :] += vectors[dictionary[text], :]
                except Exception as e:
                    print(e)
                    continue
            batch_y[k, int(train_Y[i + k])] = 1.0
            batch_x[k, :, :] = emb_data[:, :]
        loss, _ = sess.run([model.cost, model.optimizer], feed_dict = {model.X : batch_x, model.Y : batch_y})
        train_loss += loss
        train_acc += sess.run(model.accuracy, feed_dict = {model.X : batch_x, model.Y : batch_y})
    
    for i in range(0, (test_X.shape[0] // batch) * batch, batch):
        batch_x = np.zeros((batch, maxlen, dimension))
        batch_y = np.zeros((batch, len(label)))
        for k in range(batch):
            tokens = test_X[i + k].split()[:maxlen]
            emb_data = np.zeros((maxlen, dimension), dtype = np.float32)
            for no, text in enumerate(tokens[::-1]):
                try:
                    emb_data[-1 - no, :] += vectors[dictionary[text], :]
                except:
                    continue
            batch_y[k, int(test_Y[i + k])] = 1.0
            batch_x[k, :, :] = emb_data[:, :]
        loss, acc = sess.run([model.cost, model.accuracy], feed_dict = {model.X : batch_x, model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        
    train_loss /= (train_X.shape[0] // batch)
    train_acc /= (train_X.shape[0] // batch)
    test_loss /= (test_X.shape[0] // batch)
    test_acc /= (test_X.shape[0] // batch)
    if test_acc > CURRENT_ACC:
        print('epoch:', EPOCH, ', pass acc:', CURRENT_ACC, ', current acc:', test_acc)
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
        saver.save(sess, os.getcwd() + "/rnn-message.ckpt")
    else:
        CURRENT_CHECKPOINT += 1
    EPOCH += 1
    print('time taken:', time.time()-lasttime)
    print('epoch:', EPOCH, ', training loss:', train_loss, ', training acc:', train_acc, ', valid loss:', test_loss, ', valid acc:', test_acc)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

epoch: 0 , pass acc: 0 , current acc: 0.252222221759
time taken: 3.4906442165374756
epoch: 1 , training loss: 3.15088228079 , training acc: 0.254102564393 , valid loss: 3.01215646002 , valid acc: 0.252222221759
epoch: 1 , pass acc: 0.252222221759 , current acc: 0.282222222951
time taken: 3.2484922409057617
epoch: 2 , training loss: 2.82656818781 , training acc: 0.312564102503 , valid loss: 2.93745064735 , valid acc: 0.282222222951
time taken: 3.1713643074035645
epoch: 3 , training loss: 2.72776541954 , training acc: 0.358717949727 , valid loss: 2.85411951277 , valid acc: 0.276666667726
epoch: 3 , pass acc: 0.282222222951 , current acc: 0.294444446762
time taken: 3.2841989994049072
epoch: 4 , training loss: 2.63015687771 , training acc: 0.37307692109 , valid loss: 2.85543113285 , valid acc: 0.29444444