In [67]:
import tensorflow as tf
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import re
import collections
import random
import time

class Model_vec:
    
    def __init__(self, batch_size, dimension_size, learning_rate, vocabulary_size):
        self.train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        self.train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, dimension_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, self.train_inputs)
        self.nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, dimension_size], stddev = 1.0 / np.sqrt(dimension_size)))
        self.nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        self.loss = tf.reduce_mean(tf.nn.nce_loss(weights = self.nce_weights, biases = self.nce_biases, labels = self.train_labels,
                                                  inputs=embed, num_sampled = batch_size / 2, num_classes = vocabulary_size))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
        self.norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        self.normalized_embeddings = embeddings / self.norm
        
class Model:
    def __init__(self, sequence_length, dimension_input, dimension_output, 
                 learning_rate, filter_sizes, pooling_size, out_dimension, num_layer):
        self.X = tf.placeholder(tf.float32, shape=[None, sequence_length, dimension_input, 1])
        self.Y = tf.placeholder(tf.float32, shape=[None, dimension_output])
        pooled_outputs = []
        reduce_size = int(np.ceil((sequence_length) * 1.0 / pooling_size))
        for i in filter_sizes:
            w = tf.Variable(tf.truncated_normal([i, dimension_input, 1, out_dimension], stddev=0.1))
            b = tf.Variable(tf.truncated_normal([out_dimension], stddev = 0.01))
            conv = tf.nn.relu(tf.nn.conv2d(self.X, w, strides=[1, 1, 1, 1],padding="VALID") + b)
            pooled = tf.nn.max_pool(conv,ksize=[1, pooling_size, 1, 1],strides=[1, pooling_size, 1, 1],padding='VALID')
            pooled = tf.reshape(pooled, [-1, reduce_size -1, out_dimension])
            pooled_outputs.append(pooled)
        h_pool = tf.concat(pooled_outputs, 2)
        def lstm_cell():
            return tf.nn.rnn_cell.LSTMCell(out_dimension)
        self.rnn_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in range(num_layers)])
        drop = tf.contrib.rnn.DropoutWrapper(self.rnn_cells, output_keep_prob = 0.5)
        self.outputs, self.last_state = tf.nn.dynamic_rnn(drop, h_pool, dtype = tf.float32)
        self.rnn_W = tf.Variable(tf.random_normal((out_dimension, dimension_output)))
        self.rnn_B = tf.Variable(tf.random_normal([dimension_output]))
        self.logits = tf.matmul(self.outputs[:, -1], self.rnn_W) + self.rnn_B
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        l2 = sum(0.0005 * tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        self.cost += l2
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        self.correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))

In [6]:
import os
from sklearn.preprocessing import LabelEncoder
import re
import collections
import random
import time

In [7]:
def clearstring(string):
    string = re.sub('[^\'\"A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    return ' '.join(string)

def read_data():
    list_folder = os.listdir('data/')
    label = list_folder
    label.sort()
    outer_string, outer_label = [], []
    for i in range(len(list_folder)):
        list_file = os.listdir('data/' + list_folder[i])
        strings = []
        for x in range(len(list_file)):
            with open('data/' + list_folder[i] + '/' + list_file[x], 'r') as fopen:
                strings += fopen.read().split('\n')
        strings = list(filter(None, strings))
        for k in range(len(strings)):
            strings[k] = clearstring(strings[k])
        labels = [i] * len(strings)
        outer_string += strings
        outer_label += labels
    
    dataset = np.array([outer_string, outer_label])
    dataset = dataset.T
    np.random.shuffle(dataset)
    
    string = []
    for i in range(dataset.shape[0]):
        string += dataset[i][0].split()
    
    return string, dataset, label

In [8]:
def build_dataset(words, vocabulary_size):
    count = []
    count.extend(collections.Counter(words).most_common(vocabulary_size))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary) + 1
    data = []
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        data.append(index)
    dictionary['PAD'] = 0
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, dictionary, reverse_dictionary

def generate_batch_skipgram(words, batch_size, num_skips, skip_window):
    data_index = 0
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    for i in range(span):
        buffer.append(words[data_index])
        data_index = (data_index + 1) % len(words)
    for i in range(batch_size // num_skips):
        target = skip_window
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(words[data_index])
        data_index = (data_index + 1) % len(words)
    data_index = (data_index + len(words) - span) % len(words)
    return batch, labels

def generatevector(dimension, batch_size, skip_size, skip_window, num_skips, iteration, words_real):
    
    print ("Data size:", len(words_real))
    data, dictionary, reverse_dictionary = build_dataset(words_real, len(words_real))
    sess = tf.InteractiveSession()
    print ("Creating Word2Vec model..")
    model = Model_vec(batch_size, dimension, 0.1, len(dictionary))
    sess.run(tf.global_variables_initializer())
    last_time = time.time()
    for step in range(iteration):
        new_time = time.time()
        batch_inputs, batch_labels = generate_batch_skipgram(data, batch_size, num_skips, skip_window)
        feed_dict = {model.train_inputs: batch_inputs, model.train_labels: batch_labels}
        _, loss = sess.run([model.optimizer, model.loss], feed_dict=feed_dict)
        if ((step + 1) % 1000) == 0:
            print ("epoch:", step + 1, ", loss:", loss, ", speed:", (time.time() - new_time) * 1000, "s / 1000 epoch")
    tf.reset_default_graph()       
    return dictionary, reverse_dictionary, model.normalized_embeddings.eval()

In [9]:
string, data, label = read_data()
maxlen = 50
location = os.getcwd()
dimension = 512
skip_size = 8
skip_window = 1
num_skips = 2
iteration_train_vectors = 20000
num_layers = 3
size_layer = 256
learning_rate = 0.0001
epoch = 100
batch = 100

In [10]:
dictionary, reverse_dictionary, vectors = generatevector(dimension, 32, skip_size, skip_window, num_skips, iteration_train_vectors, string)

Data size: 8007324
Creating Word2Vec model..
epoch: 1000 , loss: 54.275 , speed: 76.04336738586426 s / 1000 epoch
epoch: 2000 , loss: 35.8356 , speed: 76.1864185333252 s / 1000 epoch
epoch: 3000 , loss: 50.1899 , speed: 75.96135139465332 s / 1000 epoch
epoch: 4000 , loss: 28.8578 , speed: 75.98257064819336 s / 1000 epoch
epoch: 5000 , loss: 33.8236 , speed: 76.04789733886719 s / 1000 epoch
epoch: 6000 , loss: 21.5477 , speed: 76.07746124267578 s / 1000 epoch
epoch: 7000 , loss: 24.3524 , speed: 76.1268138885498 s / 1000 epoch
epoch: 8000 , loss: 20.1253 , speed: 76.04265213012695 s / 1000 epoch
epoch: 9000 , loss: 31.1462 , speed: 76.15041732788086 s / 1000 epoch
epoch: 10000 , loss: 11.2176 , speed: 76.08556747436523 s / 1000 epoch
epoch: 11000 , loss: 97.3372 , speed: 76.18188858032227 s / 1000 epoch
epoch: 12000 , loss: 20.1947 , speed: 76.02334022521973 s / 1000 epoch
epoch: 13000 , loss: 21.858 , speed: 76.30228996276855 s / 1000 epoch
epoch: 14000 , loss: 21.6708 , speed: 76.1294

In [12]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(data[:, 0], data[:, 1], test_size = 0.2)



In [None]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(maxlen, dimension, len(label), learning_rate, [3, 3, 3], 5, size_layer, num_layers)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 100, 0, 0, 0
while True:
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:', EPOCH)
        break
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (train_X.shape[0] // batch) * batch, batch):
        batch_x = np.zeros((batch, maxlen, dimension))
        batch_y = np.zeros((batch, len(label)))
        for k in range(batch):
            tokens = train_X[i + k].split()[:maxlen]
            for no, text in enumerate(tokens[::-1]):
                try:
                    batch_x[k, -1 - no, :] += vectors[dictionary[text], :]
                except:
                    continue
            batch_y[k, int(train_Y[i + k])] = 1.0
        batch_x = np.expand_dims(batch_x, axis=-1)
        loss, _ = sess.run([model.cost, model.optimizer], feed_dict = {model.X : batch_x, model.Y : batch_y})
        train_loss += loss
        train_acc += sess.run(model.accuracy, feed_dict = {model.X : batch_x, model.Y : batch_y})
    
    for i in range(0, (test_X.shape[0] // batch) * batch, batch):
        batch_x = np.zeros((batch, maxlen, dimension))
        batch_y = np.zeros((batch, len(label)))
        for k in range(batch):
            tokens = test_X[i + k].split()[:maxlen]
            for no, text in enumerate(tokens[::-1]):
                try:
                    batch_x[k, -1 - no, :] += vectors[dictionary[text], :]
                except:
                    continue
            batch_y[k, int(test_Y[i + k])] = 1.0
        batch_x = np.expand_dims(batch_x, axis=-1)
        loss, acc = sess.run([model.cost, model.accuracy], feed_dict = {model.X : batch_x, model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        
    train_loss /= (train_X.shape[0] // batch)
    train_acc /= (train_X.shape[0] // batch)
    test_loss /= (test_X.shape[0] // batch)
    test_acc /= (test_X.shape[0] // batch)
    if test_acc > CURRENT_ACC:
        print('epoch:', EPOCH, ', pass acc:', CURRENT_ACC, ', current acc:', test_acc)
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
        saver.save(sess, os.getcwd() + "/model-cnn-vector.ckpt")
    else:
        CURRENT_CHECKPOINT += 1
    EPOCH += 1
    print('epoch:', EPOCH, ', training loss:', train_loss, ', training acc:', train_acc, ', valid loss:', test_loss, ', valid acc:', test_acc)

epoch: 0 , pass acc: 0 , current acc: 0.678547396666
epoch: 1 , training loss: 2.60119728518 , training acc: 0.492237535869 , valid loss: 1.71816542612 , valid acc: 0.678547396666
epoch: 1 , pass acc: 0.678547396666 , current acc: 0.734249680555
epoch: 2 , training loss: 1.48295797434 , training acc: 0.737414498278 , valid loss: 1.363624875 , valid acc: 0.734249680555
epoch: 2 , pass acc: 0.734249680555 , current acc: 0.74009601838
epoch: 3 , training loss: 1.22629718503 , training acc: 0.77064185337 , valid loss: 1.22613368787 , valid acc: 0.74009601838
epoch: 3 , pass acc: 0.74009601838 , current acc: 0.742785093593
epoch: 4 , training loss: 1.07522352561 , training acc: 0.785602861507 , valid loss: 1.13900834 , valid acc: 0.742785093593
epoch: 5 , training loss: 0.974301998996 , training acc: 0.795821817785 , valid loss: 1.08851915784 , valid acc: 0.739543796659
epoch: 6 , training loss: 0.904681019874 , training acc: 0.803524277867 , valid loss: 1.08474090091 , valid acc: 0.7351140