In [22]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [23]:
# Load the pre-processed Jigsaw splits straight from S3 (network access required).
# The two reads are independent of each other.
train_df = pd.read_csv(
    'https://s3.amazonaws.com/ccwf-ml-data/jigsaw/processed_train.csv'
)
test_df = pd.read_csv(
    'https://s3.amazonaws.com/ccwf-ml-data/jigsaw/processed_test.csv'
)

In [24]:
# Binarize the continuous toxicity score into class ids {0, 1}.
# The Jigsaw convention treats target >= 0.5 as the positive (toxic) class, so
# a score of exactly 0.5 must map to 1 (the previous `x <= 0.5` test sent it to 0).
# The comparison is vectorized instead of a per-row Python lambda.
# NOTE(review): a NaN target compares False and becomes 0 here (the old lambda
# turned NaN into 1) -- confirm the column has no missing values.
y_train = (train_df['target'] >= 0.5).astype('int64')

In [25]:
# Locations of pre-trained word-vector files on the local machine.
# NOTE(review): these are absolute, user-specific paths; they are not referenced
# by the cells visible here -- confirm whether they are still needed.
GLOVE = '/home/luis/ml-data/glove.840B.300d.txt'
FAST_TEXT = '/home/luis/ml-data/crawl-300d-2M.vec'

In [26]:
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
from tensorflow.contrib.cudnn_rnn import CudnnCompatibleLSTMCell

In [27]:
class ABLSTM(object):
    """Attention-based bidirectional LSTM classifier (TensorFlow 1.x graph mode).

    Constructing the object only creates the input placeholders. Call
    ``build_graph`` afterwards to add the embedding, Bi-LSTM, attention layer,
    classifier head, loss and training op to the current default graph.

    ``config`` must provide the keys ``"max_len"``, ``"hidden_size"``,
    ``"vocab_size"``, ``"embedding_size"``, ``"n_class"`` and
    ``"learning_rate"``.
    """

    def __init__(self, config):
        """Read hyper-parameters from ``config`` and create the placeholders."""
        self.max_len = config["max_len"]
        self.hidden_size = config["hidden_size"]
        self.vocab_size = config["vocab_size"]
        self.embedding_size = config["embedding_size"]
        self.n_class = config["n_class"]
        self.learning_rate = config["learning_rate"]

        # placeholders
        self.x = tf.placeholder(tf.int32, [None, self.max_len])  # token ids, (batch, max_len)
        self.label = tf.placeholder(tf.int32, [None])  # class ids, (batch,)
        self.keep_prob = tf.placeholder(tf.float32)  # dropout keep probability

    def build_graph(self):
        """Add the model, loss and optimizer ops to the default graph.

        Sets ``self.alpha`` (attention weights, batch x max_len),
        ``self.loss``, ``self.prediction``, ``self.global_step``,
        ``self.optimizer`` and ``self.train_op``.
        """
        print("building graph")
        # Word embedding, randomly initialised and trained with the model.
        embeddings_var = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
                                     trainable=True)
        batch_embedded = tf.nn.embedding_lookup(embeddings_var, self.x)

        # NOTE(review): no sequence_length is passed, so padded positions are
        # run through the LSTMs and also take part in the attention softmax.
        rnn_outputs, _ = bi_rnn(CudnnCompatibleLSTMCell(self.hidden_size),
                                CudnnCompatibleLSTMCell(self.hidden_size),
                                inputs=batch_embedded, dtype=tf.float32)

        fw_outputs, bw_outputs = rnn_outputs

        W = tf.Variable(tf.random_normal([self.hidden_size], stddev=0.1))
        H = fw_outputs + bw_outputs  # (batch_size, seq_len, HIDDEN_SIZE)
        M = tf.tanh(H)  # M = tanh(H)  (batch_size, seq_len, HIDDEN_SIZE)

        # Score every time step with W, then normalise over the sequence axis.
        self.alpha = tf.nn.softmax(tf.reshape(tf.matmul(tf.reshape(M, [-1, self.hidden_size]),
                                                        tf.reshape(W, [-1, 1])),
                                              (-1, self.max_len)))  # batch_size x seq_len
        # Attention-weighted sum of the hidden states: (batch, HIDDEN_SIZE, 1).
        r = tf.matmul(tf.transpose(H, [0, 2, 1]),
                      tf.reshape(self.alpha, [-1, self.max_len, 1]))
        # Drop only the trailing singleton axis. A bare tf.squeeze(r) would also
        # remove the batch axis when the batch holds a single example, which
        # breaks the dense layer below.
        r = tf.squeeze(r, axis=2)
        h_star = tf.tanh(r)  # (batch, HIDDEN_SIZE)

        h_drop = tf.nn.dropout(h_star, self.keep_prob)

        # Fully connected layer (dense layer)
        FC_W = tf.Variable(tf.truncated_normal([self.hidden_size, self.n_class], stddev=0.1))
        FC_b = tf.Variable(tf.constant(0., shape=[self.n_class]))
        y_hat = tf.nn.xw_plus_b(h_drop, FC_W, FC_b)

        self.loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y_hat, labels=self.label))

        # prediction: softmax is monotonic, so argmax over the logits is enough.
        self.prediction = tf.argmax(y_hat, 1)

        # optimization: Adam on gradients clipped to a global norm of 1.0
        loss_to_minimize = self.loss
        tvars = tf.trainable_variables()
        gradients = tf.gradients(loss_to_minimize, tvars, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
        grads, _ = tf.clip_by_global_norm(gradients, 1.0)

        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op = self.optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step,
                                                       name='train_step')
        print("graph built successfully!")

In [28]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

def data_preprocessing_v2(train, test, max_len, max_words=50000):
    """Convert raw texts to fixed-length integer sequences.

    A Keras ``Tokenizer`` limited to ``max_words`` is fitted on ``train`` only
    and then applied to both ``train`` and ``test``. Every sequence is padded
    or truncated at the end to exactly ``max_len`` entries.

    Returns:
        ``(train_padded, test_padded, vocab_size)`` where ``vocab_size`` is
        ``max_words + 2``. It is derived from the ``max_words`` cap, not from
        the number of distinct words seen; the extra 2 is presumably headroom
        for padding / unknown ids -- TODO confirm.
    """
    tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)
    tok.fit_on_texts(train)

    def _encode(texts):
        # Index-encode the texts, then pad / truncate on the right-hand side.
        return pad_sequences(tok.texts_to_sequences(texts), maxlen=max_len,
                             padding='post', truncating='post')

    train_padded = _encode(train)
    test_padded = _encode(test)
    return train_padded, test_padded, max_words + 2

def split_dataset(x_test, y_test, dev_ratio):
    """Split a dataset into a leading dev part and the remaining part.

    The first ``int(len(x_test) * dev_ratio)`` items become the dev set and
    everything after them is kept as the remainder. No shuffling is done.

    Returns:
        ``(x_rest, x_dev, y_rest, y_dev, dev_size, rest_size)``.
    """
    total = len(x_test)
    n_dev = int(total * dev_ratio)
    head = slice(None, n_dev)   # dev portion
    tail = slice(n_dev, None)   # remainder
    return (x_test[tail], x_test[head],
            y_test[tail], y_test[head],
            n_dev, total - n_dev)

def fill_feed_dict(data_X, data_Y, batch_size):
    """Yield shuffled ``(x_batch, y_batch)`` pairs of exactly ``batch_size`` rows.

    ``data_X`` and ``data_Y`` are shuffled together once per call. Only
    ``data_X.shape[0] // batch_size`` full batches are produced, so a trailing
    partial batch is dropped.
    """
    shuffled_X, shuffled_Y = shuffle(data_X, data_Y)
    n_batches = data_X.shape[0] // batch_size
    for start in range(0, n_batches * batch_size, batch_size):
        stop = start + batch_size
        yield shuffled_X[start:stop], shuffled_Y[start:stop]
        
def make_train_feed_dict(model, batch):
    """Build the feed dict for a training step (dropout keep probability 0.5).

    ``batch[0]`` is fed to ``model.x`` and ``batch[1]`` to ``model.label``.
    """
    feed_dict = {}
    feed_dict[model.x] = batch[0]
    feed_dict[model.label] = batch[1]
    feed_dict[model.keep_prob] = .5
    return feed_dict


def make_test_feed_dict(model, batch):
    """Build the feed dict for evaluation (dropout disabled, keep probability 1.0).

    ``batch[0]`` is fed to ``model.x`` and ``batch[1]`` to ``model.label``.
    """
    placeholders = (model.x, model.label, model.keep_prob)
    values = (batch[0], batch[1], 1.0)
    return dict(zip(placeholders, values))


def run_train_step(model, sess, batch):
    """Run one optimisation step on ``batch``.

    Returns the dict produced by ``sess.run`` with the keys ``'train_op'``,
    ``'loss'`` and ``'global_step'``.
    """
    feed = make_train_feed_dict(model, batch)
    fetches = dict(
        train_op=model.train_op,
        loss=model.loss,
        global_step=model.global_step,
    )
    return sess.run(fetches, feed)


def run_eval_step(model, sess, batch):
    """Return the accuracy of the model's predictions on a single batch.

    Accuracy is the number of predictions equal to ``batch[1]`` divided by
    the number of predictions.
    """
    predicted = sess.run(model.prediction, make_test_feed_dict(model, batch))
    n_correct = np.sum(np.equal(predicted, batch[1]))
    return n_correct / len(predicted)


def get_attn_weight(model, sess, batch):
    """Return the attention weights ``model.alpha`` for ``batch``.

    This is an inspection (forward-only) pass, so the inference feed dict with
    dropout disabled is used rather than the training one.
    """
    feed_dict = make_test_feed_dict(model, batch)
    return sess.run(model.alpha, feed_dict)


In [29]:
# Tokenize the comment text and pad/truncate every sequence to 600 tokens.
x_train, x_test, vocab_size = data_preprocessing_v2(
    train=train_df['processed_comment_text'],
    test=test_df['processed_comment_text'],
    max_len=600,
)

In [30]:
# split dataset to test and dev
x_train, x_dev, y_train, y_dev, dev_size, train_size = \
    split_dataset(x_train, y_train, 0.1)
print("Validation Size: ", dev_size)

Validation Size:  180487


In [33]:
# Hyper-parameters for the model and the training loop.
config = dict(
    max_len=600,            # same length the sequences were padded to above
    hidden_size=200,
    vocab_size=vocab_size,
    embedding_size=128,
    n_class=2,
    learning_rate=1e-3,
    batch_size=512,
    train_epoch=1,
)

In [None]:
import os
import time
start = time.time()
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=True)
    session_conf.gpu_options.allow_growth = True
    # The session is deliberately left open: the checkpoint cell further down
    # reuses `sess`.
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        classifier = ABLSTM(config)
        classifier.build_graph()
        # Create the Saver only after the graph exists so that it picks up every
        # variable (it was previously never created, so saving raised NameError).
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        last_batch = None
        # Honour the configured number of epochs (the config sets it to 1).
        for _ in range(config["train_epoch"]):
            # Training loop. For each batch...
            for x_batch, y_batch in fill_feed_dict(x_train, y_train, config["batch_size"]):
                last_batch = (x_batch, y_batch)
                return_dict = run_train_step(classifier, sess, last_batch)
                if return_dict['global_step'] % 500 == 0:
                    # Every 500 steps: mean per-batch accuracy over the dev set.
                    accuracy = 0.0
                    iter_cnt = 0
                    for x_val, y_val in fill_feed_dict(x_dev, y_dev, config["batch_size"]):
                        dev_batch = (x_val, y_val)
                        dev_acc = run_eval_step(classifier, sess, dev_batch)
                        accuracy += dev_acc
                        iter_cnt += 1
                    if iter_cnt:  # dev set may be smaller than one batch
                        accuracy /= iter_cnt
                        print("validation accuracy: %.3f " % accuracy)
                    print("Current loss is {0}".format(return_dict['loss']))

        # Attention weights were previously recomputed (and overwritten) after
        # every training step. Only the value for the final batch survived, so
        # compute it once here instead of paying an extra forward pass per step.
        if last_batch is not None:
            attn = get_attn_weight(classifier, sess, last_batch)

        out_dir = os.path.abspath(os.path.join(os.path.curdir, "run/model.ckpt"))
        # Saver.save does not create missing parent directories.
        ckpt_dir = os.path.dirname(out_dir)
        if not os.path.isdir(ckpt_dir):
            os.makedirs(ckpt_dir)
        save_path = saver.save(sess, out_dir)

building graph
graph built successfully!
validation accuracy: 0.954 
Current loss is 0.13391779363155365
validation accuracy: 0.963 
Current loss is 0.0949307307600975
validation accuracy: 0.964 
Current loss is 0.09550566226243973
validation accuracy: 0.964 
Current loss is 0.12179908901453018


In [None]:
# Build a Saver bound to the graph that `sess` owns. Outside the training
# cell's `with tf.Graph().as_default()` block the default graph is a different,
# empty one, so the Saver has to be created under `sess.graph` explicitly.
with sess.graph.as_default():
    saver = tf.train.Saver()
save_path = saver.save(sess, "/tmp/model.ckpt")