In [1]:
import numpy as np
import tensorflow as tf
import random
tf.compat.v1.disable_eager_execution()
tf.compat.v1.reset_default_graph()


In [2]:
NUM_CLASSES = 10

In [3]:
class MLP:
    def __init__(self, vocab_size, hidden_size):
        self._vocab_size = vocab_size
        self._hidden_size = hidden_size

    def build_graph(self):
        self._X = tf.compat.v1.placeholder(tf.float32, shape=[None, self._vocab_size])
        self._real_Y = tf.compat.v1.placeholder(tf.int32, shape=[None, ])

        weights_1 = tf.compat.v1.get_variable(name='weights_input_hidden', shape=(self._vocab_size, self._hidden_size), initializer=tf.random_normal_initializer(seed=2018))
        biases_1 = tf.compat.v1.get_variable(name='biases_input_hidden', shape=(self._hidden_size), initializer=tf.random_normal_initializer(seed=2018))
        weights_2 = tf.compat.v1.get_variable(name='weights_output_hidden', shape=(self._hidden_size, NUM_CLASSES), initializer=tf.random_normal_initializer(seed=2018))
        biases_2 = tf.compat.v1.get_variable(name='biases_output_hidden', shape=(NUM_CLASSES), initializer=tf.random_normal_initializer(seed=2018))     

        hidden = tf.matmul(self._X, weights_1) + biases_1
        hidden = tf.sigmoid(hidden)
        logits = tf.matmul(hidden, weights_2) + biases_2
        labels_one_hot = tf.one_hot(indices=self._real_Y, depth=NUM_CLASSES, dtype=tf.float32)
        loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels_one_hot, logits=logits)
        loss = tf.reduce_mean(loss)

        probs = tf.nn.softmax(logits)
        predicted_labels = tf.argmax(probs, axis=1)
        predicted_labels = tf.squeeze(predicted_labels)

        return predicted_labels, loss
    def trainer(self, loss, learning_rate):
        train_op = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(loss)
        return train_op

    


In [4]:
with open('../datasets/20news-bydate/words_idfs.txt') as f:
    vocab_size = len(f.read().splitlines())

mlp = MLP(
    vocab_size=vocab_size,
    hidden_size=50
)
predicted_labels, loss = mlp.build_graph()
train_op = mlp.trainer(loss=loss, learning_rate=0.1)

In [5]:
class DataReader:
    def __init__(self, data_path, batch_size, vocab_size):
        self._batch_size = batch_size
        with open(data_path) as f:
            d_lines = f.read().splitlines()
        
        self._data = []
        self._labels = []
        for data_id, line in enumerate(d_lines):
            vector = [0.0 for _ in range(vocab_size)]
            features = line.split('<fff>')
            label, doc_id = int(features[0]), int(features[1])
            tokens = features[2].split()
            for token in tokens:
                index,value = int(token.split(':')[0]), float(token.split(':')[1])
                vector[index] = value
            self._data.append(vector)
            self._labels.append(label)
            
        self._data = np.array(self._data)
        self._labels = np.array(self._labels)
        self._num_epoch = 0
        self._batch_id = 0

    def next_batch(self):
        start = self._batch_id * self._batch_size
        end = start + self._batch_size
        self._batch_id += 1
        
        if end + self._batch_size > len(self._data):  
            end = len(self._data)
            self._num_epoch += 1
            self._batch_id = 0
            indices = list(range(len(self._data)))
            random.seed(2018)
            random.shuffle(indices)
            self._data, self._labels = self._data[indices], self._labels[indices]

        return self._data[start:end], self._labels[start:end]

In [6]:
def load_dataset():
    train_data_reader = DataReader(
        data_path='../datasets/20news-bydate/train_tf_idf.txt',
        batch_size=50,
        vocab_size=vocab_size
    )
    test_data_reader = DataReader(
        data_path='../datasets/20news-bydate/test_tf_idf.txt',
        batch_size=50,
        vocab_size=vocab_size
    )
    return train_data_reader, test_data_reader

In [7]:
def save_parameters(name, value, epoch):
    filename = name.replace(':', '-colon-') + '-epoch-{}.txt'.format(epoch)
    if len(value.shape) == 1:
        string_form = ','.join([str(number) for number in value])
    else:
        string_form = '\n'.join([','.join([str(number) for number in value[row]]) for row in range(value.shape[0])])

    with open('../datasets/saved-paras/'+filename, 'w') as f:
        f.write(string_form)

In [8]:
with tf.compat.v1.Session() as sess:
    train_data_reader, test_data_reader = load_dataset()
    step, MAX_STEP = 0, 100**2
    
    sess.run(tf.compat.v1.global_variables_initializer())
    while step < MAX_STEP:
        train_data, train_labels = train_data_reader.next_batch()
        plabels_eval, loss_eval, _ = sess.run(
            [predicted_labels, loss, train_op],
            feed_dict= {
                mlp._X: train_data,
                mlp._real_Y: train_labels
            }
        )
        step += 1
        print('step: {}, loss: {}'.format(step, loss_eval))   
    variables = tf.compat.v1.trainable_variables()
    for var in variables:
        save_parameters(name=var.name, value=var.eval(),epoch=train_data_reader._num_epoch)

step: 1, loss: 2.385011672973633
step: 2, loss: 0.05340058356523514
step: 3, loss: 0.0010517913615331054
step: 4, loss: 3.8310623494908214e-05
step: 5, loss: 3.0159897050907603e-06
step: 6, loss: 3.9100635262911965e-07
step: 7, loss: 0.0
step: 8, loss: 0.0
step: 9, loss: 0.0
step: 10, loss: 9.413915634155273
step: 11, loss: 21.154146194458008
step: 12, loss: 17.496200561523438
step: 13, loss: 12.8384428024292
step: 14, loss: 8.523136138916016
step: 15, loss: 4.672196865081787
step: 16, loss: 1.800886631011963
step: 17, loss: 0.4612773060798645
step: 18, loss: 0.15451422333717346
step: 19, loss: 0.09243655949831009
step: 20, loss: 0.09362736344337463
step: 21, loss: 0.07769663631916046
step: 22, loss: 4.246654033660889
step: 23, loss: 5.437527656555176
step: 24, loss: 4.275923252105713
step: 25, loss: 3.7616355419158936
step: 26, loss: 2.8618884086608887
step: 27, loss: 2.4857499599456787
step: 28, loss: 2.1409740447998047
step: 29, loss: 1.992520809173584
step: 30, loss: 1.640207529067

In [9]:
def restore_parameters(name, epoch):
    filename = name.replace(':', '-colon-') + '-epoch-{}.txt'.format(epoch)
    with open('../datasets/saved-paras/'+filename) as f:
        lines = f.read().splitlines()
    if len(lines) == 1:
        value = [float(number) for number in lines[0].split(',')]
    else:
        value = [[float(number) for number in lines[row].split(',')]for row in range(len(lines))]
    return value
    


In [10]:
test_data_reader = DataReader(
    data_path='../datasets/20news-bydate/test_tf_idf.txt',
    batch_size=50,
    vocab_size=vocab_size
)
with tf.compat.v1.Session() as sess:
    epoch = 44
    trainable_variables = tf.compat.v1.trainable_variables()
    for variable in trainable_variables:
        saved_value = restore_parameters(variable.name, epoch)
        assign_op = variable.assign(saved_value)
        sess.run(assign_op)
    num_true_preds = 0
    while True:
        test_data, test_labels = test_data_reader.next_batch()
        test_plabels_eval = sess.run(
            predicted_labels,
            feed_dict = {
                mlp._X:test_data,
                mlp._real_Y:test_labels
            }
        )
        matches = np.equal(test_plabels_eval, test_labels)
        num_true_preds += np.sum(matches.astype(float))

        if test_data_reader._batch_id == 0:
            break
    print('Epoch', epoch)
    print('Accuracy on test data: ', num_true_preds/ len(test_data_reader._data))
    

Epoch 44
Accuracy on test data:  0.04580456718003186
