In [1]:
import numpy as np
import tensorflow as tf
import random

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
ROOT_PATH = '/content/drive/MyDrive/Colab Notebooks/'

In [4]:
class MLP:
    def __init__(self, vocab_size, hidden_size):
        self._vocab_size = vocab_size
        self._hidden_size = hidden_size

    def build_graph(self):
        NUM_CLASSES = 20
        self._X = tf.compat.v1.placeholder(tf.float32, shape=[None, self._vocab_size])
        self._real_Y = tf.compat.v1.placeholder(tf.int32, shape=[None, ])

        weights_1 = tf.compat.v1.get_variable(
            name='weights_input_hidden',
            shape=(self._vocab_size, self._hidden_size),
            initializer=tf.random_normal_initializer(seed=2023),
        )

        biases_1 = tf.compat.v1.get_variable(
            name='biases_input_hidden',
            shape=self._hidden_size,
            initializer=tf.random_normal_initializer(seed=2023),
        )

        weights_2 = tf.compat.v1.get_variable(
            name='weights_hidden_output',
            shape=(self._hidden_size, NUM_CLASSES),
            initializer=tf.random_normal_initializer(seed=2023),
        )

        biases_2 = tf.compat.v1.get_variable(
            name='biases_hidden_output',
            shape=NUM_CLASSES,
            initializer=tf.random_normal_initializer(seed=2023),
        )

        hidden = tf.matmul(self._X, weights_1) + biases_1
        hidden = tf.sigmoid(hidden)
        logits = tf.matmul(hidden, weights_2) + biases_2

        labels_one_hot = tf.one_hot(indices=self._real_Y, depth=NUM_CLASSES, dtype=tf.float32)
        loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels_one_hot, logits=logits)
        loss = tf.reduce_mean(loss)

        probs = tf.nn.softmax(logits)
        predicted_labels = tf.argmax(probs, axis=1)
        predicted_labels = tf.squeeze(predicted_labels)

        return predicted_labels, loss

    def trainer(self, loss, learning_rate):
        train_op = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(loss)
        return train_op

In [5]:
class DataReader:
    def __init__(self, data_path, batch_size, vocab_size, purpose):
        assert(purpose in ('train', 'test'))            
        self._batch_size = batch_size
        with open(data_path) as f:
            d_lines = f.read().splitlines()

        self._data = []
        self._labels = []

        for data_id, line in enumerate(d_lines):
            vector = [(0.0) for _ in range(vocab_size)]
            features = line.split('<fff>')
            label = int(features[0])
            # doc_id = int(features[1])
            tokens = features[2].split()
            for token in tokens:
                index = int(token.split(':')[0])
                value = float(token.split(':')[1])
                vector[index] = value
            self._data.append(vector)
            self._labels.append(label)

        indices = list(range(len(self._data)))
        random.seed(128)
        random.shuffle(self._data)
        random.seed(128)
        random.shuffle(self._labels)
        _splitter = int(0.8 * len(self._data))
        if purpose == 'train': 
            self._data = np.array(self._data[:_splitter])
            self._labels = np.array(self._labels[:_splitter])
        else:
            self._data = np.array(self._data[_splitter:])
            self._labels = np.array(self._labels[_splitter:])

        self._num_epoch = 0
        self._batch_id = 0

    def next_batch(self):
        start = self._batch_id * self._batch_size
        end = start + self._batch_size
        self._batch_id += 1

        if end + self._batch_size > len(self._data):
            end = len(self._data)
            self._num_epoch += 1
            self._batch_id = 0
            indices = list(range(len(self._data)))
            random.seed(2023)
            random.shuffle(indices)
            self._data = self._data[indices]
            self._labels = self._labels[indices]

        return self._data[start:end], self._labels[start:end]

In [6]:
with open(ROOT_PATH + 'test-word_idfs.txt', 'rb') as f:
  # print(f)
  # print(len(f.read().splitlines()))
  vocab_size = len(f.read().splitlines())
  print(vocab_size)

70350


In [7]:
def save_parameters(name, value, epoch):
    filename = name.replace(':', '-colon') + '-epoch-{}.txt'.format(epoch)
    if len(value.shape) == 1:  # is a list
        string_form = ','.join([str(number) for number in value])
    else:
        string_form = '\n'.join([','.join([str(number) for number in value[row]])
                                 for row in range(value.shape[0])])

    with open(ROOT_PATH + 'saved-params/' + filename, 'w') as f:
        f.write(string_form)

In [8]:
def load_dataset():
    _test_data_reader = DataReader(
        data_path=ROOT_PATH + 'test-tf-idf.txt',
        batch_size=50,
        vocab_size=vocab_size,
        purpose='test'
    )
    _train_data_reader = DataReader(
        data_path=ROOT_PATH + 'test-tf-idf.txt',
        batch_size=50,
        vocab_size=vocab_size,
        purpose='train'
    ) 
    return _train_data_reader, _test_data_reader

In [9]:
tf.compat.v1.disable_eager_execution()

In [10]:
mlp = MLP(
        vocab_size=vocab_size,
        hidden_size=50,
    )
predicted_labels, loss = mlp.build_graph()
train_op = mlp.trainer(loss=loss, learning_rate=0.1)

In [11]:
train_data_reader, test_data_reader = load_dataset()

In [12]:
with tf.compat.v1.Session() as session:
    step = 0
    MAX_STEP = 3000

    session.run(tf.compat.v1.global_variables_initializer())
    while step < MAX_STEP:
        train_data, train_labels = train_data_reader.next_batch()
        plabels_eval, loss_eval, _ = session.run(
            [predicted_labels, loss, train_op],
            feed_dict = {
                mlp._X: train_data,
                mlp._real_Y: train_labels
            }
        )
        step += 1
        if step % 100 == 0: print('step: {}, loss: {}'.format(step, loss_eval))
    trainable_variables = tf.compat.v1.trainable_variables()
    for variable in trainable_variables:
      save_parameters(
          name=variable.name,
          value=variable.eval(),
          epoch=train_data_reader._num_epoch
      )
    

step: 100, loss: 0.3776494562625885
step: 200, loss: 0.01657969318330288
step: 300, loss: 0.0012482856400310993
step: 400, loss: 0.0007727577467449009
step: 500, loss: 0.0005408198921941221
step: 600, loss: 0.0001748702343320474
step: 700, loss: 0.00011850064765894786
step: 800, loss: 0.0001043211595970206
step: 900, loss: 0.0001256909454241395
step: 1000, loss: 8.385296678170562e-05
step: 1100, loss: 6.866596959298477e-05
step: 1200, loss: 9.408150799572468e-05
step: 1300, loss: 3.583434590836987e-05
step: 1400, loss: 6.453356036217883e-05
step: 1500, loss: 3.231943992432207e-05
step: 1600, loss: 4.201159754302353e-05
step: 1700, loss: 6.0002730606356636e-05
step: 1800, loss: 0.01597045734524727
step: 1900, loss: 9.061457240022719e-05
step: 2000, loss: 1.496758977737045e-05
step: 2100, loss: 3.6726269172504544e-05
step: 2200, loss: 4.82721661683172e-05
step: 2300, loss: 1.536058698548004e-05
step: 2400, loss: 1.4307875972008333e-05
step: 2500, loss: 2.7775608032243326e-05
step: 2600, 

In [13]:
def restore_parameters(name, epoch):
    filename = name.replace(':', '-colon') + '-epoch-{}.txt'.format(epoch)
    with open(ROOT_PATH + 'saved-params/' + filename) as f:
        lines = f.read().splitlines()
    if len(lines) == 1: # is a vector
        value = [float(number) for number in lines[0].split(',')]
    else:
        value = [[float(number) for number in lines[row].split(',')]
                 for row in range(len(lines))]
    return value

In [14]:
with tf.compat.v1.Session() as session:
    _num_epoch = train_data_reader._num_epoch

    trainable_variables = tf.compat.v1.trainable_variables()
    for variable in trainable_variables:
        _saved_value = restore_parameters(variable.name, _num_epoch)
        # print(_saved_value)
        _assign_op = variable.assign(_saved_value)
        session.run(_assign_op)

    num_true_preds = 0
    while True:
        test_data, test_labels = test_data_reader.next_batch()
        # print(test_labels)
        test_pred_labels_eval = session.run(
            predicted_labels,
            feed_dict = {
                mlp._X: test_data,
                mlp._real_Y: test_labels
            }
        )
        matches = np.equal(test_pred_labels_eval, test_labels)
        num_true_preds += np.sum(matches.astype(float))

        if test_data_reader._batch_id == 0: break
    print('Accuracy on test data: ', num_true_preds / len(test_data_reader._data))

Accuracy on test data:  0.9051094890510949
