In [15]:
import tensorflow as tf
import numpy as np

class Model_vec:
    
    def __init__(self, batch_size, dimension_size, learning_rate, vocabulary_size):
        self.train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        self.train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, dimension_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, self.train_inputs)
        self.nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, dimension_size], stddev = 1.0 / np.sqrt(dimension_size)))
        self.nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        self.loss = tf.reduce_mean(tf.nn.nce_loss(weights = self.nce_weights, biases = self.nce_biases, labels = self.train_labels,
                                                  inputs=embed, num_sampled = batch_size / 2, num_classes = vocabulary_size))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
        self.norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        self.normalized_embeddings = embeddings / self.norm
        
class Model:
    
    def __init__(self, dimension_input, size_layer, dimension_output, learning_rate):
        self.X = tf.placeholder(tf.float32, [None, dimension_input])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        layer1 = tf.Variable(tf.random_normal([dimension_input, size_layer], stddev=0.5))
        bias1 = tf.Variable(tf.random_normal([size_layer], stddev=0.1))
        layer2 = tf.Variable(tf.random_normal([size_layer, size_layer], stddev=0.5))
        bias2 = tf.Variable(tf.random_normal([size_layer], stddev=0.1))
        layer3 = tf.Variable(tf.random_normal([size_layer, size_layer], stddev=0.5))
        bias3 = tf.Variable(tf.random_normal([size_layer], stddev=0.1))
        layer4 = tf.Variable(tf.random_normal([size_layer, dimension_output], stddev=0.5))
        bias4 = tf.Variable(tf.random_normal([dimension_output], stddev=0.1))
        feed = tf.nn.tanh(tf.matmul(self.X, layer1) + bias1)
        feed = tf.nn.tanh(tf.matmul(feed, layer2) + bias2)
        feed = tf.nn.tanh(tf.matmul(feed, layer3) + bias3)
        self.logits = tf.matmul(feed, layer4) + bias4
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        l2 = sum(0.0005 * tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        self.cost += l2
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        self.correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))

In [2]:
import os
from sklearn.preprocessing import LabelEncoder
import re
import collections
import random
import time

In [3]:
def clearstring(string):
    string = re.sub('[^\'\"A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    return ' '.join(string)

def read_data():
    list_folder = os.listdir('data/')
    label = list_folder
    label.sort()
    outer_string, outer_label = [], []
    for i in range(len(list_folder)):
        list_file = os.listdir('data/' + list_folder[i])
        strings = []
        for x in range(len(list_file)):
            with open('data/' + list_folder[i] + '/' + list_file[x], 'r') as fopen:
                strings += fopen.read().split('\n')
        strings = list(filter(None, strings))
        for k in range(len(strings)):
            strings[k] = clearstring(strings[k])
        labels = [i] * len(strings)
        outer_string += strings
        outer_label += labels
    
    dataset = np.array([outer_string, outer_label])
    dataset = dataset.T
    np.random.shuffle(dataset)
    
    string = []
    for i in range(dataset.shape[0]):
        string += dataset[i][0].split()
    
    return string, dataset, label

In [4]:
def build_dataset(words, vocabulary_size):
    count = []
    count.extend(collections.Counter(words).most_common(vocabulary_size))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary) + 1
    data = []
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        data.append(index)
    dictionary['PAD'] = 0
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, dictionary, reverse_dictionary

def generate_batch_skipgram(words, batch_size, num_skips, skip_window):
    data_index = 0
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    for i in range(span):
        buffer.append(words[data_index])
        data_index = (data_index + 1) % len(words)
    for i in range(batch_size // num_skips):
        target = skip_window
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(words[data_index])
        data_index = (data_index + 1) % len(words)
    data_index = (data_index + len(words) - span) % len(words)
    return batch, labels

def generatevector(dimension, batch_size, skip_size, skip_window, num_skips, iteration, words_real):
    
    print ("Data size:", len(words_real))
    data, dictionary, reverse_dictionary = build_dataset(words_real, len(words_real))
    sess = tf.InteractiveSession()
    print ("Creating Word2Vec model..")
    model = Model_vec(batch_size, dimension, 0.1, len(dictionary))
    sess.run(tf.global_variables_initializer())
    last_time = time.time()
    for step in range(iteration):
        new_time = time.time()
        batch_inputs, batch_labels = generate_batch_skipgram(data, batch_size, num_skips, skip_window)
        feed_dict = {model.train_inputs: batch_inputs, model.train_labels: batch_labels}
        _, loss = sess.run([model.optimizer, model.loss], feed_dict=feed_dict)
        if ((step + 1) % 1000) == 0:
            print ("epoch:", step + 1, ", loss:", loss, ", speed:", (time.time() - new_time) * 1000, "s / 1000 epoch")
    tf.reset_default_graph()       
    return dictionary, reverse_dictionary, model.normalized_embeddings.eval()

In [5]:
string, data, label = read_data()
location = os.getcwd()
dimension = 512
skip_size = 8
skip_window = 1
num_skips = 2
iteration_train_vectors = 20000
num_layers = 3
size_layer = 256
learning_rate = 0.0001
epoch = 100
batch = 100
maxlen = 50

In [6]:
dictionary, reverse_dictionary, vectors = generatevector(dimension, 32, skip_size, skip_window, num_skips, iteration_train_vectors, string)

Data size: 8007324
Creating Word2Vec model..
epoch: 1000 , loss: 20.2202 , speed: 76.17592811584473 s / 1000 epoch
epoch: 2000 , loss: 21.1476 , speed: 75.83856582641602 s / 1000 epoch
epoch: 3000 , loss: 60.0386 , speed: 76.28560066223145 s / 1000 epoch
epoch: 4000 , loss: 23.9346 , speed: 76.0960578918457 s / 1000 epoch
epoch: 5000 , loss: 31.1849 , speed: 76.11989974975586 s / 1000 epoch
epoch: 6000 , loss: 31.3076 , speed: 76.03096961975098 s / 1000 epoch
epoch: 7000 , loss: 30.9381 , speed: 76.13253593444824 s / 1000 epoch
epoch: 8000 , loss: 30.5808 , speed: 76.02190971374512 s / 1000 epoch
epoch: 9000 , loss: 7.30779 , speed: 76.32803916931152 s / 1000 epoch
epoch: 10000 , loss: 23.1935 , speed: 76.1103630065918 s / 1000 epoch
epoch: 11000 , loss: 30.0706 , speed: 76.21216773986816 s / 1000 epoch
epoch: 12000 , loss: 12.0578 , speed: 76.06911659240723 s / 1000 epoch
epoch: 13000 , loss: 12.3838 , speed: 76.26795768737793 s / 1000 epoch
epoch: 14000 , loss: 15.965 , speed: 76.462

In [17]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(data[:, 0], data[:, 1], test_size = 0.25)

In [22]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(dimension, 128, len(label), learning_rate)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 20, 0, 0, 0
batch_size = 200
while True:
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:', EPOCH)
        break
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (train_X.shape[0] // batch) * batch, batch):
        batch_x = np.zeros((batch, dimension))
        batch_y = np.zeros((batch, len(label)))
        for k in range(batch):
            tokens = train_X[i + k].split()
            for no, text in enumerate(tokens):
                try:
                    batch_x[k, :] += vectors[dictionary[text], :]
                except:
                    continue
            batch_y[k, int(train_Y[i + k])] = 1.0
        loss, _ = sess.run([model.cost, model.optimizer], feed_dict = {model.X : batch_x, model.Y : batch_y})
        train_loss += loss
        train_acc += sess.run(model.accuracy, feed_dict = {model.X : batch_x, model.Y : batch_y})
    
    for i in range(0, (test_X.shape[0] // batch) * batch, batch):
        batch_x = np.zeros((batch, dimension))
        batch_y = np.zeros((batch, len(label)))
        for k in range(batch):
            tokens = test_X[i + k].split()
            for no, text in enumerate(tokens):
                try:
                    batch_x[k, :] += vectors[dictionary[text], :]
                except:
                    continue
            batch_y[k, int(test_Y[i + k])] = 1.0
        loss, acc = sess.run([model.cost, model.accuracy], feed_dict = {model.X : batch_x, model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        
    train_loss /= (train_X.shape[0] // batch)
    train_acc /= (train_X.shape[0] // batch)
    test_loss /= (test_X.shape[0] // batch)
    test_acc /= (test_X.shape[0] // batch)
    if test_acc > CURRENT_ACC:
        print('epoch:', EPOCH, ', pass acc:', CURRENT_ACC, ', current acc:', test_acc)
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
        saver.save(sess, os.getcwd() + "/model-rnn-vector.ckpt")
    else:
        CURRENT_CHECKPOINT += 1
    EPOCH += 1
    print('epoch:', EPOCH, ', training loss:', train_loss, ', training acc:', train_acc, ', valid loss:', test_loss, ', valid acc:', test_acc)

epoch: 0 , pass acc: 0 , current acc: 0.277380028622
epoch: 1 , training loss: 9.78156181886 , training acc: 0.25695456574 , valid loss: 8.53324538 , valid acc: 0.277380028622
epoch: 1 , pass acc: 0.277380028622 , current acc: 0.327456803245
epoch: 2 , training loss: 7.90836092546 , training acc: 0.301628268588 , valid loss: 7.43985049601 , valid acc: 0.327456803245
epoch: 2 , pass acc: 0.327456803245 , current acc: 0.349126667957
epoch: 3 , training loss: 7.07367070775 , training acc: 0.339158658131 , valid loss: 6.66108467071 , valid acc: 0.349126667957
epoch: 3 , pass acc: 0.349126667957 , current acc: 0.375844517433
epoch: 4 , training loss: 6.07985553403 , training acc: 0.360930890193 , valid loss: 5.44558788124 , valid acc: 0.375844517433
epoch: 4 , pass acc: 0.375844517433 , current acc: 0.405834920461
epoch: 5 , training loss: 4.79457229189 , training acc: 0.393272540434 , valid loss: 4.2004150333 , valid acc: 0.405834920461
epoch: 5 , pass acc: 0.405834920461 , current acc: 0.

epoch: 43 , pass acc: 0.771333956615 , current acc: 0.772744705661
epoch: 44 , training loss: 0.815229911798 , training acc: 0.81295903621 , valid loss: 0.922611611647 , valid acc: 0.772744705661
epoch: 44 , pass acc: 0.772744705661 , current acc: 0.773618026395
epoch: 45 , training loss: 0.807733850779 , training acc: 0.814478550366 , valid loss: 0.9168575469 , valid acc: 0.773618026395
epoch: 45 , pass acc: 0.773618026395 , current acc: 0.774520138289
epoch: 46 , training loss: 0.800606312007 , training acc: 0.81601725758 , valid loss: 0.911440972403 , valid acc: 0.774520138289
epoch: 46 , pass acc: 0.774520138289 , current acc: 0.775652573971
epoch: 47 , training loss: 0.793819363062 , training acc: 0.817735109833 , valid loss: 0.906340670792 , valid acc: 0.775652573971
epoch: 47 , pass acc: 0.775652573971 , current acc: 0.776353150663
epoch: 48 , training loss: 0.787347490827 , training acc: 0.819309005067 , valid loss: 0.901536854181 , valid acc: 0.776353150663
epoch: 48 , pass ac

epoch: 85 , pass acc: 0.794587313614 , current acc: 0.794788849445
epoch: 86 , training loss: 0.654205057343 , training acc: 0.851769028588 , valid loss: 0.817991905701 , valid acc: 0.794788849445
epoch: 86 , pass acc: 0.794788849445 , current acc: 0.795249501311
epoch: 87 , training loss: 0.65220199063 , training acc: 0.852290462357 , valid loss: 0.816871017847 , valid acc: 0.795249501311
epoch: 87 , pass acc: 0.795249501311 , current acc: 0.795566198522
epoch: 88 , training loss: 0.650240615029 , training acc: 0.852872675474 , valid loss: 0.815784011572 , valid acc: 0.795566198522
epoch: 88 , pass acc: 0.795566198522 , current acc: 0.7959980602
epoch: 89 , training loss: 0.648319590372 , training acc: 0.853362120102 , valid loss: 0.814730613566 , valid acc: 0.7959980602
epoch: 89 , pass acc: 0.7959980602 , current acc: 0.796506697923
epoch: 90 , training loss: 0.646437645588 , training acc: 0.853909144398 , valid loss: 0.81371073891 , valid acc: 0.796506697923
epoch: 90 , pass acc: 0

epoch: 128 , pass acc: 0.804788850033 , current acc: 0.804913609934
epoch: 129 , training loss: 0.593657608758 , training acc: 0.868490083833 , valid loss: 0.793491426448 , valid acc: 0.804913609934
epoch: 130 , training loss: 0.592658165339 , training acc: 0.868717211622 , valid loss: 0.793302770654 , valid acc: 0.804769656194
epoch: 130 , pass acc: 0.804913609934 , current acc: 0.805047966812
epoch: 131 , training loss: 0.591671077872 , training acc: 0.868982727129 , valid loss: 0.793123232884 , valid acc: 0.805047966812
epoch: 131 , pass acc: 0.805047966812 , current acc: 0.805115145193
epoch: 132 , training loss: 0.590696145423 , training acc: 0.869280231243 , valid loss: 0.792952059813 , valid acc: 0.805115145193
epoch: 132 , pass acc: 0.805115145193 , current acc: 0.805182324433
epoch: 133 , training loss: 0.589733168206 , training acc: 0.869510558997 , valid loss: 0.792789072571 , valid acc: 0.805182324433
epoch: 133 , pass acc: 0.805182324433 , current acc: 0.805230308174
epoch

epoch: 177 , pass acc: 0.80771591144 , current acc: 0.807744703572
epoch: 178 , training loss: 0.556206755335 , training acc: 0.879245047701 , valid loss: 0.791787581172 , valid acc: 0.807744703572
epoch: 179 , training loss: 0.555632170755 , training acc: 0.879436985354 , valid loss: 0.791869146267 , valid acc: 0.807658331637
epoch: 180 , training loss: 0.555063358005 , training acc: 0.879712098658 , valid loss: 0.791953244273 , valid acc: 0.807648734202
epoch: 180 , pass acc: 0.807744703572 , current acc: 0.807792687256
epoch: 181 , training loss: 0.554500225533 , training acc: 0.87998721034 , valid loss: 0.792039563297 , valid acc: 0.807792687256
epoch: 182 , training loss: 0.553942674515 , training acc: 0.880115168795 , valid loss: 0.792128325853 , valid acc: 0.807744703057
epoch: 182 , pass acc: 0.807792687256 , current acc: 0.807898253603
epoch: 183 , training loss: 0.553390664221 , training acc: 0.880287914381 , valid loss: 0.792218853582 , valid acc: 0.807898253603
epoch: 184 ,