In [15]:
import tensorflow as tf
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import re
import collections
import random
import time

class Model_vec:
    
    def __init__(self, batch_size, dimension_size, learning_rate, vocabulary_size):
        self.train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        self.train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, dimension_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, self.train_inputs)
        self.nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, dimension_size], stddev = 1.0 / np.sqrt(dimension_size)))
        self.nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        self.loss = tf.reduce_mean(tf.nn.nce_loss(weights = self.nce_weights, biases = self.nce_biases, labels = self.train_labels,
                                                  inputs=embed, num_sampled = batch_size / 2, num_classes = vocabulary_size))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
        self.norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        self.normalized_embeddings = embeddings / self.norm
        
class Model:
    def __init__(self, sequence_length, dimension_input, dimension_output, 
                 learning_rate, filter_sizes, out_dimension):
        self.X = tf.placeholder(tf.float32, shape=[None, sequence_length, dimension_input, 1])
        self.Y = tf.placeholder(tf.float32, shape=[None, dimension_output])
        pooled_outputs = []
        for i in filter_sizes:
            w = tf.Variable(tf.truncated_normal([i, dimension_input, 1, out_dimension], stddev=0.1))
            b = tf.Variable(tf.truncated_normal([out_dimension], stddev = 0.01))
            conv = tf.nn.relu(tf.nn.conv2d(self.X, w, strides=[1, 1, 1, 1],padding="VALID") + b)
            pooled = tf.nn.max_pool(conv,ksize=[1, sequence_length - i + 1, 1, 1],strides=[1, 1, 1, 1],padding='VALID')
            pooled_outputs.append(pooled)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.nn.dropout(tf.reshape(h_pool, [-1, out_dimension * len(filter_sizes)]), 0.1)
        w = tf.Variable(tf.truncated_normal([out_dimension * len(filter_sizes), dimension_output], stddev=0.1))
        b = tf.Variable(tf.truncated_normal([dimension_output], stddev = 0.01))
        self.logits = tf.matmul(h_pool_flat, w) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        l2 = sum(0.0005 * tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        self.cost += l2
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        self.correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))

In [2]:
def clearstring(string):
    string = re.sub('[^\'\"A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    return ' '.join(string)

def read_data():
    list_folder = os.listdir('data/')
    label = list_folder
    label.sort()
    outer_string, outer_label = [], []
    for i in range(len(list_folder)):
        list_file = os.listdir('data/' + list_folder[i])
        strings = []
        for x in range(len(list_file)):
            with open('data/' + list_folder[i] + '/' + list_file[x], 'r') as fopen:
                strings += fopen.read().split('\n')
        strings = list(filter(None, strings))
        for k in range(len(strings)):
            strings[k] = clearstring(strings[k])
        labels = [i] * len(strings)
        outer_string += strings
        outer_label += labels
    
    dataset = np.array([outer_string, outer_label])
    dataset = dataset.T
    np.random.shuffle(dataset)
    
    string = []
    for i in range(dataset.shape[0]):
        string += dataset[i][0].split()
    
    return string, dataset, label

In [3]:
def build_dataset(words, vocabulary_size):
    count = []
    count.extend(collections.Counter(words).most_common(vocabulary_size))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary) + 1
    data = []
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        data.append(index)
    dictionary['PAD'] = 0
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, dictionary, reverse_dictionary

def generate_batch_skipgram(words, batch_size, num_skips, skip_window):
    data_index = 0
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    for i in range(span):
        buffer.append(words[data_index])
        data_index = (data_index + 1) % len(words)
    for i in range(batch_size // num_skips):
        target = skip_window
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(words[data_index])
        data_index = (data_index + 1) % len(words)
    data_index = (data_index + len(words) - span) % len(words)
    return batch, labels

def generatevector(dimension, batch_size, skip_size, skip_window, num_skips, iteration, words_real):
    
    print ("Data size:", len(words_real))
    data, dictionary, reverse_dictionary = build_dataset(words_real, len(words_real))
    sess = tf.InteractiveSession()
    print ("Creating Word2Vec model..")
    model = Model_vec(batch_size, dimension, 0.1, len(dictionary))
    sess.run(tf.global_variables_initializer())
    last_time = time.time()
    for step in range(iteration):
        new_time = time.time()
        batch_inputs, batch_labels = generate_batch_skipgram(data, batch_size, num_skips, skip_window)
        feed_dict = {model.train_inputs: batch_inputs, model.train_labels: batch_labels}
        _, loss = sess.run([model.optimizer, model.loss], feed_dict=feed_dict)
        if ((step + 1) % 1000) == 0:
            print ("epoch:", step + 1, ", loss:", loss, ", speed:", (time.time() - new_time) * 1000, "s / 1000 epoch")
    tf.reset_default_graph()       
    return dictionary, reverse_dictionary, model.normalized_embeddings.eval()

In [10]:
string, data, label = read_data()
maxlen = 50
location = os.getcwd()
dimension = 512
skip_size = 8
skip_window = 1
num_skips = 2
iteration_train_vectors = 20000
size_layer = 128
learning_rate = 0.0001
epoch = 100
filter_sizes = [2,3,4,5]
batch = 100

In [11]:
dictionary, reverse_dictionary, vectors = generatevector(dimension, 32, skip_size, skip_window, num_skips, iteration_train_vectors, string)

Data size: 72822
Creating Word2Vec model..
epoch: 1000 , loss: 26.4191 , speed: 10.108232498168945 s / 1000 epoch
epoch: 2000 , loss: 9.94636 , speed: 10.172843933105469 s / 1000 epoch
epoch: 3000 , loss: 21.9353 , speed: 10.111331939697266 s / 1000 epoch
epoch: 4000 , loss: 0.229738 , speed: 10.18381118774414 s / 1000 epoch
epoch: 5000 , loss: 2.6353 , speed: 10.280370712280273 s / 1000 epoch
epoch: 6000 , loss: 0.594088 , speed: 10.149240493774414 s / 1000 epoch
epoch: 7000 , loss: 1.70112 , speed: 10.061502456665039 s / 1000 epoch
epoch: 8000 , loss: 10.6174 , speed: 10.412931442260742 s / 1000 epoch
epoch: 9000 , loss: 0.461871 , speed: 10.251045227050781 s / 1000 epoch
epoch: 10000 , loss: 0.0962568 , speed: 10.248184204101562 s / 1000 epoch
epoch: 11000 , loss: 1.08778 , speed: 10.14256477355957 s / 1000 epoch
epoch: 12000 , loss: 0.103811 , speed: 10.224103927612305 s / 1000 epoch
epoch: 13000 , loss: 0.171167 , speed: 9.927749633789062 s / 1000 epoch
epoch: 14000 , loss: 1.0941

In [17]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(data[:, 0], data[:, 1], test_size = 0.2)



In [19]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(maxlen, dimension, len(label), learning_rate, [2,3,4], size_layer)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 100, 0, 0, 0
while True:
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:', EPOCH)
        break
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (train_X.shape[0] // batch) * batch, batch):
        batch_x = np.zeros((batch, maxlen, dimension))
        batch_y = np.zeros((batch, len(label)))
        for k in range(batch):
            tokens = train_X[i + k].split()[:maxlen]
            for no, text in enumerate(tokens[::-1]):
                try:
                    batch_x[k, -1 - no, :] += vectors[dictionary[text], :]
                except:
                    continue
            batch_y[k, int(train_Y[i + k])] = 1.0
        batch_x = np.expand_dims(batch_x, axis=-1)
        loss, _ = sess.run([model.cost, model.optimizer], feed_dict = {model.X : batch_x, model.Y : batch_y})
        train_loss += loss
        train_acc += sess.run(model.accuracy, feed_dict = {model.X : batch_x, model.Y : batch_y})
    
    for i in range(0, (test_X.shape[0] // batch) * batch, batch):
        batch_x = np.zeros((batch, maxlen, dimension))
        batch_y = np.zeros((batch, len(label)))
        for k in range(batch):
            tokens = test_X[i + k].split()[:maxlen]
            for no, text in enumerate(tokens[::-1]):
                try:
                    batch_x[k, -1 - no, :] += vectors[dictionary[text], :]
                except:
                    continue
            batch_y[k, int(test_Y[i + k])] = 1.0
        batch_x = np.expand_dims(batch_x, axis=-1)
        loss, acc = sess.run([model.cost, model.accuracy], feed_dict = {model.X : batch_x, model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        
    train_loss /= (train_X.shape[0] // batch)
    train_acc /= (train_X.shape[0] // batch)
    test_loss /= (test_X.shape[0] // batch)
    test_acc /= (test_X.shape[0] // batch)
    if test_acc > CURRENT_ACC:
        print('epoch:', EPOCH, ', pass acc:', CURRENT_ACC, ', current acc:', test_acc)
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
        saver.save(sess, os.getcwd() + "/model-cnn-vector.ckpt")
    else:
        CURRENT_CHECKPOINT += 1
    EPOCH += 1
    print('epoch:', EPOCH, ', training loss:', train_loss, ', training acc:', train_acc, ', valid loss:', test_loss, ', valid acc:', test_acc)

epoch: 0 , pass acc: 0 , current acc: 0.569999953111
epoch: 1 , training loss: 2.1690743821 , training acc: 0.534285690103 , valid loss: 2.09954635302 , valid acc: 0.569999953111
epoch: 2 , training loss: 2.1325142639 , training acc: 0.555714270898 , valid loss: 2.14290912946 , valid acc: 0.543333311876
epoch: 3 , training loss: 2.09860954114 , training acc: 0.570714273623 , valid loss: 2.1156903108 , valid acc: 0.513333340486
epoch: 4 , training loss: 2.06246621268 , training acc: 0.600714262043 , valid loss: 2.20896625519 , valid acc: 0.523333321015
epoch: 4 , pass acc: 0.569999953111 , current acc: 0.599999964237
epoch: 5 , training loss: 2.0325673478 , training acc: 0.612142833216 , valid loss: 2.13578724861 , valid acc: 0.599999964237
epoch: 5 , pass acc: 0.599999964237 , current acc: 0.619999965032
epoch: 6 , training loss: 2.0317456552 , training acc: 0.612857120378 , valid loss: 2.01069108645 , valid acc: 0.619999965032
epoch: 7 , training loss: 2.01486952816 , training acc: 0.

epoch: 62 , training loss: 1.5464478646 , training acc: 0.682857117483 , valid loss: 1.74197351933 , valid acc: 0.623333315055
epoch: 63 , training loss: 1.54342799527 , training acc: 0.681428534644 , valid loss: 1.73484798272 , valid acc: 0.59333328406
epoch: 64 , training loss: 1.55317475966 , training acc: 0.689285687038 , valid loss: 1.68947911263 , valid acc: 0.589999953906
epoch: 65 , training loss: 1.55169016974 , training acc: 0.680714275156 , valid loss: 1.7280617555 , valid acc: 0.596666634083
epoch: 66 , training loss: 1.53982486044 , training acc: 0.677857117993 , valid loss: 1.69142993291 , valid acc: 0.629999975363
epoch: 67 , training loss: 1.53782448598 , training acc: 0.684285687549 , valid loss: 1.73403048515 , valid acc: 0.589999953906
epoch: 68 , training loss: 1.4959212201 , training acc: 0.703571417502 , valid loss: 1.58885852496 , valid acc: 0.616666654746
epoch: 69 , training loss: 1.50776251725 , training acc: 0.679999985865 , valid loss: 1.63167945544 , valid 

epoch: 126 , training loss: 1.23340810197 , training acc: 0.735714261021 , valid loss: 1.43108602365 , valid acc: 0.669999976953
epoch: 127 , training loss: 1.22415469374 , training acc: 0.760714279754 , valid loss: 1.46215625604 , valid acc: 0.616666634878
epoch: 128 , training loss: 1.23342826537 , training acc: 0.758571407625 , valid loss: 1.45790040493 , valid acc: 0.623333295186
epoch: 129 , training loss: 1.24410293783 , training acc: 0.739999979734 , valid loss: 1.43600241343 , valid acc: 0.633333305518
epoch: 130 , training loss: 1.20977280821 , training acc: 0.73142855508 , valid loss: 1.42746881644 , valid acc: 0.616666674614
epoch: 131 , training loss: 1.2327112811 , training acc: 0.76857141086 , valid loss: 1.38090348244 , valid acc: 0.656666636467
epoch: 132 , training loss: 1.22347086668 , training acc: 0.749285710709 , valid loss: 1.46551601092 , valid acc: 0.606666624546
epoch: 133 , training loss: 1.22394770384 , training acc: 0.752857106073 , valid loss: 1.47016521295

epoch: 190 , training loss: 1.03363832406 , training acc: 0.816428567682 , valid loss: 1.29850240548 , valid acc: 0.649999976158
epoch: 191 , training loss: 1.03149575421 , training acc: 0.808571406773 , valid loss: 1.29158624013 , valid acc: 0.626666665077
epoch: 192 , training loss: 1.02905701739 , training acc: 0.800714271409 , valid loss: 1.29826243718 , valid acc: 0.646666646004
epoch: 193 , training loss: 1.02497036116 , training acc: 0.813571406262 , valid loss: 1.3480070432 , valid acc: 0.639999985695
epoch: 194 , training loss: 1.0166658674 , training acc: 0.802857130766 , valid loss: 1.32548379898 , valid acc: 0.609999994437
epoch: 195 , training loss: 1.02206483058 , training acc: 0.822142839432 , valid loss: 1.29571600755 , valid acc: 0.636666635672
epoch: 196 , training loss: 0.994606775897 , training acc: 0.817142848458 , valid loss: 1.31538939476 , valid acc: 0.616666634878
epoch: 197 , training loss: 1.02512193578 , training acc: 0.816428567682 , valid loss: 1.325972755