In [0]:
!rm -R sentiment_analysis
!git clone https://github.com/KisuYang/sentiment_analysis.git

In [0]:
import os
import collections
import numpy as np

class Data():
    """Vocabulary loading, dataset loading and batch construction.

    All files are read from ``os.path.join(args.base_dir, args.data)``:

    * ``train.vocab``   - one word per line (line index becomes the word id)
    * ``label.vocab``   - one label per line (line index becomes the label id)
    * ``<type>.inputs`` - one space-separated sentence per line
    * ``<type>.labels`` - space-separated label(s) per line

    None of the methods modify the lists passed in by the caller; id
    conversion and padding always build new lists.
    """

    def __init__(self, args):
        """Store the configuration and eagerly build word/label/char vocabularies.

        Args:
            args: object exposing ``hparams``, ``base_dir`` and ``data`` attributes.
        """
        self.hparams = args.hparams
        self.data_path = os.path.join(args.base_dir, args.data)
        # Running maxima, updated by every call to load_data().
        self.max_sentence_length = 0
        self.max_word_length = 0

        # word_vocab -> train.vocab (also loads label.vocab)
        self.get_vocab()
        # char_vocab -> derived from train.inputs
        self.get_char_vocab()

    def _read_nonempty_lines(self, filename):
        """Return the stripped, non-empty lines of ``filename`` inside the data dir."""
        with open(os.path.join(self.data_path, filename), "r") as f_handle:
            stripped_lines = [line.strip() for line in f_handle]
        return [line for line in stripped_lines if line]

    @staticmethod
    def _pad(sequence, target_length):
        """Return a copy of ``sequence`` right-padded with 0 up to ``target_length``.

        Sequences that are already long enough are copied unchanged.
        """
        return list(sequence) + [0] * (target_length - len(sequence))

    def get_vocab(self):
        """Load ``train.vocab`` and ``label.vocab`` and build both lookup directions.

        Sets ``id2word``/``word2id`` and ``id2label``/``label2id``.
        """
        # train_vocab
        self.id2word = self._read_nonempty_lines("train.vocab")
        self.word2id = {word: i for i, word in enumerate(self.id2word)}

        # label.vocab
        self.id2label = self._read_nonempty_lines("label.vocab")
        self.label2id = {label: i for i, label in enumerate(self.id2label)}

    def get_char_vocab(self):
        """Build the character vocabulary from ``train.inputs``.

        Characters are ordered by descending frequency (spaces excluded) and
        ``"<PAD>"`` is placed at index 0. Sets ``id2char`` and ``char2id``.
        """
        # Count sentence by sentence instead of concatenating the whole corpus
        # into one string; the resulting counts and tie order are the same.
        char_counter = collections.Counter()
        for sentence in self._read_nonempty_lines("train.inputs"):
            char_counter.update(sentence.replace(" ", ""))

        self.id2char = ["<PAD>"]
        self.id2char.extend(char for char, _ in char_counter.most_common())
        self.char2id = {char: i for i, char in enumerate(self.id2char)}

    def load_data(self, data_type="train"):
        """Read ``<data_type>.inputs`` and ``<data_type>.labels``.

        Also raises ``self.max_sentence_length`` / ``self.max_word_length`` to
        the longest sentence (in words) and longest word (in characters) seen.

        Args:
            data_type: file name prefix, e.g. ``"train"`` or ``"test"``.

        Returns:
            ``(char_inputs, char_lengths), (inputs, labels, lengths)`` where
            ``inputs`` is a list of word lists, ``labels`` a list of label
            lists, ``lengths`` the word count per sentence, ``char_inputs``
            the characters of every word and ``char_lengths`` their counts.
        """
        inputs = []
        char_inputs, char_lengths = [], []

        with open(os.path.join(self.data_path, "%s.inputs" % data_type), "r") as f_handle:
            for line in f_handle:
                # Blank lines are kept (as a single empty word) so that inputs
                # stay aligned line-by-line with the labels file.
                words = line.strip().split(' ')
                inputs.append(words)

                # Track the longest sentence seen so far (the original
                # comparison was inverted and never updated the maximum).
                if len(words) > self.max_sentence_length:
                    self.max_sentence_length = len(words)

                word_lengths = [len(word) for word in words]
                char_inputs.append([list(word) for word in words])
                char_lengths.append(word_lengths)

                longest_word = max(word_lengths)
                if longest_word > self.max_word_length:
                    self.max_word_length = longest_word

        with open(os.path.join(self.data_path, "%s.labels" % data_type), "r") as f_handle:
            labels = [line.strip().split(' ') for line in f_handle]

        lengths = [len(sentence) for sentence in inputs]

        return (char_inputs, char_lengths), (inputs, labels, lengths)

    def data_id(self, inputs, labels, chars):
        """Convert words, labels and characters to integer ids.

        New nested lists are returned; the arguments are left untouched, so
        the method can safely be called again on the same raw data.

        Args:
            inputs: list of sentences, each a list of word strings.
            labels: list of label-string lists.
            chars: list of sentences, each a list of per-word character lists.

        Returns:
            ``(inputs_id, labels_id, chars_id)`` with the same nesting as the
            arguments.

        Raises:
            KeyError: if a label is not present in ``label2id``.
        """
        # Out-of-vocabulary words all share the id right after the vocabulary.
        unknown_word_id = len(self.word2id)
        inputs_id = [
            [self.word2id.get(word, unknown_word_id) for word in sentence]
            for sentence in inputs
        ]

        labels_id = [
            [self.label2id[label] for label in sentence]
            for sentence in labels
        ]

        chars_id = []
        for sentence in chars:
            sentence_ids = []
            for word in sentence:
                word_ids = []
                for char in word:
                    if char not in self.char2id:
                        # Unseen characters are reported once and then mapped
                        # to the id right after the known character vocabulary
                        # (id2char is not extended, so they all share that id).
                        print("char key error : ", char)
                        self.char2id[char] = len(self.id2char)
                    word_ids.append(self.char2id[char])
                sentence_ids.append(word_ids)
            chars_id.append(sentence_ids)

        return inputs_id, labels_id, chars_id

    def get_batch_data(self, input_id, labels_id, train_lengths, chars_id, char_lengths, iter, batch_size):
        """Slice out batch number ``iter`` and zero-pad it to rectangular shape.

        Padding is applied to copies, so the full dataset passed in keeps its
        original (unpadded) rows no matter how often or in which order batches
        are requested.

        Args:
            input_id: word ids per sentence.
            labels_id: label ids per sentence.
            train_lengths: word count per sentence.
            chars_id: character ids per word per sentence.
            char_lengths: character count per word per sentence.
            iter: zero-based batch index (name kept for keyword compatibility,
                although it shadows the builtin).
            batch_size: number of sentences per batch.

        Returns:
            ``(batch_inputs, batch_labels, batch_lengths, batch_char_inputs,
            batch_char_lengths)``. ``batch_labels`` is flattened into a single
            list. A slice past the end of the data yields five empty lists.
        """
        idx = iter * batch_size
        end = idx + batch_size
        batch_lengths = train_lengths[idx:end]
        raw_char_lengths = char_lengths[idx:end]

        # default=0 keeps an empty slice from raising ValueError.
        max_sentence_len = max(batch_lengths, default=0)
        max_word_length = max(
            (max(word_lengths, default=0) for word_lengths in raw_char_lengths),
            default=0,
        )

        # sentence padding
        batch_inputs = [
            self._pad(sentence, max_sentence_len) for sentence in input_id[idx:end]
        ]

        # batch_char_inputs: pad every word, then add all-zero padding words
        batch_char_inputs = []
        for words_list in chars_id[idx:end]:
            padded_words = [self._pad(word, max_word_length) for word in words_list]
            for _ in range(max_sentence_len - len(words_list)):
                padded_words.append(self._pad([0], max_word_length))
            batch_char_inputs.append(padded_words)

        # batch_char_lengths: padding words have length 0
        batch_char_lengths = [
            self._pad(word_lengths, max_sentence_len) for word_lengths in raw_char_lengths
        ]

        # Flatten the per-sentence label lists into one list.
        batch_labels = [
            label for sentence in labels_id[idx:end] for label in sentence
        ]

        return batch_inputs, batch_labels, batch_lengths, batch_char_inputs, batch_char_lengths


In [0]:
import os
import logging
import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np

class TextClassifier:
    """Convolutional sentence classifier on top of pre-trained word embeddings.

    Written against the TensorFlow 1.x graph API (``tf.placeholder``,
    ``tf.layers``, ``tf.Session``).
    """

    def __init__(self, args, hparams):
        """Load the training data, vocabularies and pre-trained embeddings.

        Args:
            args: object exposing ``data``, ``eval_dir`` and ``base_dir``
                (and whatever ``Data`` reads from it).
            hparams: hyper-parameter record; this class reads ``glove_dir``,
                ``filter_size``, ``num_filters``, ``embedding_dim``,
                ``dropout_keep_prob``, ``batch_size``, ``num_epochs`` and
                ``model`` from it.
        """
        self.hparams = hparams
        self.data_dir = args.data
        self.eval_dir = args.eval_dir
        self.base_dir = args.base_dir
        #logger
        self._logger = logging.getLogger(__name__)

        #data_process
        self.data_process = Data(args)
        (self.char_inputs, self.char_lengths), (self.inputs, self.labels, self.lengths) = \
            self.data_process.load_data()

        # word, id
        self.word2id = self.data_process.word2id  # dict()
        self.id2word = self.data_process.id2word  # vocabulary

        # label, id
        self.label2id = self.data_process.label2id
        self.id2label = self.data_process.id2label

        # pre-trained word2vec
        with np.load(os.path.join(self.base_dir, self.hparams.glove_dir, "glove.6B.300d.trimmed.npz")) as pretrained_data:
            self.word_embeddings = pretrained_data["embeddings"]
            print(np.shape(self.word_embeddings))

    def _inference(self, inputs, lengths, char_inputs, char_lengths):
        """Build the forward pass and return the unnormalized class logits.

        Args:
            inputs: int32 word-id tensor (fed from ``inputs_ph``, rank 2).
            lengths: sentence lengths; currently not used by the graph.
            char_inputs: character ids; currently not used by the graph.
            char_lengths: character lengths; currently not used by the graph.

        Returns:
            Logits tensor with one column per entry of ``self.id2label``.
        """
        print("Building graph for model: Text Classifier")

        # Number of possible output categories.
        output_dim = len(self.id2label)

        word_embeddings = tf.Variable(
            self.word_embeddings,
            name="word_embeddings",
            dtype=tf.float32,
            trainable=True
        )

        ## shape = [batch_size, time, embed_dim]
        word_embedded = tf.nn.embedding_lookup(word_embeddings, inputs)
        word_feature_map = tf.expand_dims(word_embedded, -1)

        # Convolution & Maxpool
        features = []
        for size in self.hparams.filter_size:
            with tf.variable_scope("CNN_filter_%d" % size):
                # Pad the time axis so the filter also covers the sentence
                # boundaries. The padded tensor gets its own name: reassigning
                # `word_feature_map` here would stack the padding of every
                # earlier filter size onto the later ones.
                pad_height = size - 1
                pad_shape = [[0, 0], [pad_height, pad_height], [0, 0], [0, 0]]
                padded_feature_map = tf.pad(word_feature_map, pad_shape)
                feature = tf.layers.conv2d(
                    inputs=padded_feature_map,
                    filters=self.hparams.num_filters,
                    kernel_size=[size, self.hparams.embedding_dim],
                    use_bias=False
                )
                # shape = [batch, time, 1, out_channels] (assuming
                # hparams.embedding_dim equals the embedding width)
                feature = tf.reduce_max(feature, axis=1)
                # Reshape straight to [batch, out_channels]; an axis-less
                # tf.squeeze would also drop the batch axis for a batch of one.
                feature = tf.reshape(feature, [tf.shape(inputs)[0], self.hparams.num_filters])
                print(feature.shape)
                self.feature_shape = tf.shape(feature)
                features.append(feature)

        # shape = [batch, num_filters * number_of_filter_sizes]
        layer_out = tf.concat(features, axis=1)
        print(layer_out.shape)

        with tf.variable_scope("layer_out"):
            logits = tf.layers.dense(
                inputs=layer_out,
                units=output_dim,
                activation=None,
                kernel_initializer=tf.initializers.variance_scaling(
                    scale=2.0, mode="fan_in", distribution="normal"
                )
            )

        return logits

    def make_placeholder(self):
        """Create the input placeholders and store them on ``self``."""

        self.inputs_ph = tf.placeholder(tf.int32, shape=[None, None], name="train_input_ph")
        self.labels_ph = tf.placeholder(tf.int32, shape=[None], name="train_label_ph")
        self.lengths_ph = tf.placeholder(tf.int32, shape=[None], name="train_lengths_ph")

        #[batch_size, word_time, char_time]
        self.char_inputs_ph = tf.placeholder(tf.int32, shape=[None, None, None], name="char_input_ph")
        self.char_lengths_ph = tf.placeholder(tf.int32, shape=[None, None], name="char_lengths_ph")

        self._dropout_keep_prob_ph = tf.placeholder(tf.float32, shape=[], name="dropout_keep_prob")

    def make_feed_dict(self, batch_data):
        """Map a batch tuple from ``Data.get_batch_data`` onto the placeholders.

        Args:
            batch_data: ``(inputs, labels, lengths, char_inputs, char_lengths)``.

        Returns:
            Dict suitable for the ``feed_dict`` argument of ``Session.run``.
        """
        batch_inputs, batch_labels, batch_lengths, batch_char_inputs, batch_char_lengths = batch_data

        return {
            # word-level
            self.inputs_ph: batch_inputs,
            self.labels_ph: batch_labels,
            self.lengths_ph: batch_lengths,
            # char-level
            self.char_inputs_ph: batch_char_inputs,
            self.char_lengths_ph: batch_char_lengths,
            self._dropout_keep_prob_ph: self.hparams.dropout_keep_prob,
        }

    def build_graph(self):
        """Build logits, loss, training op and accuracy.

        Requires ``make_placeholder()`` to have been called. Sets
        ``global_step``, ``loss_op``, ``train_op`` and ``accuracy``.
        """

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        # logits
        with tf.variable_scope("inference", reuse=False):
            logits = self._inference(self.inputs_ph, self.lengths_ph, self.char_inputs_ph, self.char_lengths_ph)

        with tf.name_scope("cross_entropy"):
            per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=self.labels_ph, name="cross_entropy")
            self.loss_op = tf.reduce_mean(per_example_loss, name='cross_entropy_mean')
            # Optimize the scalar mean that is also reported, not the
            # unreduced per-example vector (whose gradient is the batch sum
            # and therefore scales with the batch size).
            self.train_op = tf.train.AdamOptimizer().minimize(self.loss_op, global_step=self.global_step)

        is_correct = tf.nn.in_top_k(logits, self.labels_ph, 1)
        correct_count = tf.reduce_sum(tf.cast(is_correct, tf.int32))
        with tf.name_scope("accuracy"):
            self.accuracy = tf.divide(correct_count, tf.shape(self.labels_ph)[0])

    def train(self):
        """Train for ``hparams.num_epochs`` epochs, checkpoint after each
        epoch, then evaluate once on the ``test`` split.

        Side effects: writes checkpoints under ``hparams.model`` and replaces
        ``self.inputs``/``self.labels``/``self.lengths``/``self.char_inputs``/
        ``self.char_lengths`` with the test split at the end.
        """
        # The context manager installs the session as default and closes it
        # (releasing its resources) when training is done.
        with tf.Session() as sess:
            # build placeholder
            self.make_placeholder()
            # build train graph; this creates the only global_step variable
            # (a second, unused one used to be created here as well).
            self.build_graph()

            # checkpoint file saver
            saver = tf.train.Saver()

            # get data
            inputs_id, labels_id, chars_id = \
                self.data_process.data_id(self.inputs, self.labels, self.char_inputs)

            # Ceiling division: no trailing empty batch when the dataset size
            # is an exact multiple of the batch size.
            batch_size = self.hparams.batch_size
            total_batch = (len(inputs_id) + batch_size - 1) // batch_size
            sess.run(tf.global_variables_initializer())

            global_step_val = 0
            for epochs_completed in range(self.hparams.num_epochs):

                for batch_index in range(total_batch):
                    batch_data = self.data_process.get_batch_data(inputs_id, labels_id, self.lengths,
                                                                  chars_id, self.char_lengths,
                                                                  batch_index, batch_size)

                    accuracy_val, loss_val, global_step_val, _ = sess.run(
                        [self.accuracy, self.loss_op, self.global_step, self.train_op],
                        feed_dict=self.make_feed_dict(batch_data)
                    )

                    if global_step_val % 10 == 0:
                        self._logger.info("[Step %d] loss: %.4f, accuracy: %.2f%%" % (
                            global_step_val, loss_val, accuracy_val * 100))

                self._logger.info("End of epoch %d." % (epochs_completed + 1))
                save_path = saver.save(sess, "%s/model.ckpt" % self.hparams.model, global_step=global_step_val)
                self._logger.info("Model saved at: %s" % save_path)

            # evaluation
            (self.char_inputs, self.char_lengths), (self.inputs, self.labels, self.lengths) = \
                self.data_process.load_data(data_type='test')

            inputs_id, labels_id, chars_id = \
                self.data_process.data_id(self.inputs, self.labels, self.char_inputs)

            # The whole test split is evaluated as a single batch.
            batch_data = self.data_process.get_batch_data(inputs_id, labels_id, self.lengths,
                                                          chars_id, self.char_lengths,
                                                          0, len(inputs_id))

            # Only metrics are fetched here: running train_op would update
            # the weights on the test data.
            accuracy_val, loss_val = sess.run(
                [self.accuracy, self.loss_op],
                feed_dict=self.make_feed_dict(batch_data)
            )

            self._logger.info("[Test] loss: %.4f, accuracy: %.2f%%" % (
                loss_val, accuracy_val * 100))

In [0]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
import argparse
import json
import collections
from datetime import datetime

import logging

def init_logger(path):
    """Configure the root logger to log to the console and to two files.

    Any handlers already attached to the root logger are removed and closed
    first, so calling this repeatedly (e.g. re-running a notebook cell) does
    not leak open log files or duplicate output.

    Args:
        path: directory for ``debug.log`` and ``info.log``; created
            (including parents) if it does not exist.

    Returns:
        The root logger, set to DEBUG, with three handlers attached:
        console (INFO), ``debug.log`` (DEBUG) and ``info.log`` (INFO).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    logger = logging.getLogger()
    # Detach and close previous handlers; merely resetting the list would
    # leave their underlying files open.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()
    logger.setLevel(logging.DEBUG)

    info_formatter = logging.Formatter('%(asctime)s | %(levelname)-8s | %(message)s')
    debug_formatter = logging.Formatter('%(asctime)s | %(levelname)-8s | %(message)s | %(lineno)d:%(funcName)s')

    debug_fh = logging.FileHandler(os.path.join(path, "debug.log"))
    debug_fh.setLevel(logging.DEBUG)
    debug_fh.setFormatter(debug_formatter)

    info_fh = logging.FileHandler(os.path.join(path, "info.log"))
    info_fh.setLevel(logging.INFO)
    info_fh.setFormatter(info_formatter)

    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(info_formatter)

    # Same attachment order as before: console, debug file, info file.
    logger.addHandler(ch)
    logger.addHandler(debug_fh)
    logger.addHandler(info_fh)

    return logger

def train_model(args, builder_class):
    """Load hyper-parameters, set up a timestamped run directory and train.

    Args:
        args: object exposing ``base_dir`` and ``hparams`` (path of the JSON
            hyper-parameter file relative to ``base_dir``); it is also passed
            through to ``builder_class``.
        builder_class: callable invoked as ``builder_class(args, hparams)``
            whose result provides a ``train()`` method.
    """
    config_file = os.path.join(args.base_dir, args.hparams)
    with open(config_file, "r") as f_handle:
        hparams_dict = json.load(f_handle)

    # Every run gets its own sub-directory named after the start time.
    run_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    root_dir = os.path.join(hparams_dict["root_dir"], "%s/" % run_stamp)

    logger = init_logger(root_dir)
    logger.info("Loaded hyper-parameter configuration from file: %s", args.hparams)
    # Logged before root_dir is replaced, i.e. with the values as loaded.
    logger.info("Hyper-parameters: %s", str(hparams_dict))
    hparams_dict["root_dir"] = root_dir

    # Immutable record with one field per hyper-parameter key.
    hparams_type = collections.namedtuple("HParams", sorted(hparams_dict))
    hparams = hparams_type(**hparams_dict)

    # Keep a copy of the effective configuration next to the logs.
    snapshot_file = os.path.join(root_dir, "hparams.json")
    with open(snapshot_file, "w") as f_handle:
        json.dump(hparams._asdict(), f_handle, indent=2)

    # Build graph
    builder_class(args, hparams).train()

if __name__ == "__main__":
    class Args:
        """Stand-in for parsed command-line arguments (paths into the cloned repo)."""
        # No separate evaluation directory is configured.
        eval_dir = None
        # Dataset directory, relative to base_dir.
        data = 'amazon_reviews/'
        # Hyper-parameter JSON file, relative to base_dir.
        hparams = 'hparams/default.json'
        # Root of the repository checkout.
        base_dir = 'sentiment_analysis/'

    args = Args()
    train_model(args, TextClassifier)