# Sentence classification by MorphConv
An implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) for classifying the sentiment of movie reviews.

### Explanation of this notebook
* Dataset : [Naver sentiment movie corpus v1.0](https://github.com/e9t/nsmc)
    + train, validation : `ratings_train.txt` (150k reviews) is split into train (120k reviews) and validation (30k reviews)
    + test : `ratings_test.txt` (50k reviews)
* Preprocessing
    + Morphological analysis by Mecab wrapped by [konlpy](http://konlpy.org/en/latest/)
    + Using [FastText](https://arxiv.org/abs/1607.04606) embedding by [gluonnlp package](https://gluon-nlp.mxnet.io/)

### Setup

In [1]:
import os, sys
import konlpy
import gluonnlp as nlp
import numpy as np
import pandas as pd
import tensorflow as tf
import itertools
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from pprint import pprint

# Record the TensorFlow version this notebook was executed with
print(tf.__version__)

1.12.0


### Loading dataset

In [2]:
# Read the tab-separated corpus files and keep only the review text and its label
ratings = pd.read_csv('./data/ratings_train.txt', sep = '\t')[['document', 'label']]
ratings_tst = pd.read_csv('./data/ratings_test.txt', sep = '\t')[['document', 'label']]

# The document column of both frames contains NaN values; replace them with empty strings
print(sum(ratings.document.isna()), sum(ratings_tst.document.isna()))

# fillna returns a new frame, which avoids chained assignment
# (`frame.col[mask] = value`): that form raises SettingWithCopyWarning and is
# a silent no-op once pandas copy-on-write is enabled.
ratings = ratings.fillna({'document': ''})
ratings_tst = ratings_tst.fillna({'document': ''})

print(sum(ratings.document.isna()), sum(ratings_tst.document.isna()))

5 3
0 0


In [3]:
# Hold out 20% of the labelled reviews for validation.
# A dedicated, seeded generator makes the split identical on every
# Restart & Run All without touching numpy's global random state.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

n_reviews = ratings.shape[0]
val_indices = split_rng.choice(a = n_reviews, size = int(n_reviews * .2),
                               replace = False)
# Every row position that was not drawn for validation goes to training
tr_indices = np.delete(arr = np.arange(n_reviews), obj = val_indices, axis = 0)

ratings_tr = ratings.iloc[tr_indices,:]
ratings_val = ratings.iloc[val_indices,:]

print(ratings_tr.shape, ratings_val.shape, ratings_tst.shape)

(120000, 2) (30000, 2) (50000, 2)


### Preprocessing dataset

In [4]:
mecab = konlpy.tag.Mecab() # any morphological analyzer would work here

In [5]:
%%time
# Split every review into morphemes with mecab; labels become plain Python lists.

# train
X_tr = [mecab.morphs(review) for review in ratings_tr.document]
y_tr = ratings_tr.label.tolist()

# validation
X_val = [mecab.morphs(review) for review in ratings_val.document]
y_val = ratings_val.label.tolist()

# test
X_tst = [mecab.morphs(review) for review in ratings_tst.document]
y_tst = ratings_tst.label.tolist()

CPU times: user 13.3 s, sys: 100 ms, total: 13.4 s
Wall time: 13.5 s


#### Building vocabulary and connecting vocabulary with fasttext embedding
https://gluon-nlp.mxnet.io/examples/word_embedding/word_embedding.html

In [6]:
# Build the vocabulary from the training split only
token_stream = itertools.chain.from_iterable(X_tr)
counter = nlp.data.count_tokens(token_stream)
vocab = nlp.Vocab(counter, bos_token = None, eos_token = None, min_freq = 15)

In [7]:
# Load the fastText embedding (source 'wiki.ko') through gluonnlp
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.ko')

# Attach the embedding to the vocabulary
vocab.set_embedding(fasttext_simple)

In [8]:
%%time
# final preprocessing

# Fixed sequence length every review is padded / truncated to
MAX_LEN = 30
# Index of the vocabulary's padding token, looked up instead of hard-coded
PAD_IDX = vocab[vocab.padding_token]


def encode_and_pad(sentences, max_len = MAX_LEN, pad_idx = PAD_IDX):
    """Map tokenised sentences to vocabulary indices and pad them to a fixed length.

    Args:
        sentences: iterable of token lists (one list per review).
        max_len: length of every returned row.
        pad_idx: integer index written into the padded positions.

    Returns:
        The array produced by `pad_sequences` with `padding = 'post'`,
        one row per sentence.
    """
    indexed = [[vocab.token_to_idx[token] for token in sen] for sen in sentences]
    return pad_sequences(sequences = indexed, maxlen = max_len, padding = 'post',
                         value = pad_idx)


# NOTE: the token lists are overwritten by their index arrays, so this cell
# must not be re-run without re-running the tokenisation cell first.
X_tr = encode_and_pad(X_tr)
X_val = encode_and_pad(X_val)
X_tst = encode_and_pad(X_tst)

CPU times: user 2.84 s, sys: 15.8 ms, total: 2.85 s
Wall time: 2.85 s


### Define MorphConv class

In [9]:
class MorphConv:
    """Two-channel convolutional sentence classifier built as a TF1 graph.

    The same pre-trained embedding matrix initialises a frozen ("static") and
    a trainable ("non-static") lookup table. Three Conv1D layers (kernel sizes
    3, 4 and 5, 100 filters each) are applied to both channels, the two
    channel outputs are summed, max-pooled over axis 1 and concatenated before
    a dense scoring layer.

    Attributes:
        is_training: boolean placeholder that switches dropout on and off.
        total_loss: cross-entropy loss plus the collected regularization losses.
        prediction: argmax of the class scores along the last axis.
    """

    def __init__(self, X, y, n_of_classes, embedding):
        """Build the graph.

        Args:
            X: tensor of token ids used as `ids` in the embedding lookups
                (assumed to be (batch, sequence_length) -- TODO confirm).
            y: tensor of integer class labels for the sparse cross-entropy loss.
            n_of_classes: number of output units of the scoring layer.
            embedding: initial value for both embedding tables.
        """

        with tf.variable_scope('input_layer'):
            self.__X = X
            self.__y = y
            self.is_training = tf.placeholder(dtype = tf.bool)

        with tf.variable_scope('embedding_layer'):
            # Same initial values; only the non-static copy is updated by training
            static_embed = tf.get_variable(name = 'static', initializer = embedding,
                                           trainable = False)
            non_static_embed = tf.get_variable(name = 'non_static', initializer = embedding,
                                               trainable = True)
            static_batch = tf.nn.embedding_lookup(params = static_embed, ids = self.__X)
            non_static_batch = tf.nn.embedding_lookup(params = non_static_embed, ids = self.__X)

        with tf.variable_scope('convoluion_layer'):
            # Each Conv1D layer object is called on both channels, so its
            # filters are shared between the static and non-static inputs.
            with tf.variable_scope('tri_gram'):

                tri_gram = keras.layers.Conv1D(filters = 100, kernel_size = 3,
                                               activation = keras.activations.relu,
                                               kernel_initializer = 'he_uniform', padding = 'valid')
                static_3 = tri_gram(static_batch)
                non_static_3 = tri_gram(non_static_batch)

            with tf.variable_scope('tetra_gram'):
                tetra_gram = keras.layers.Conv1D(filters = 100, kernel_size = 4,
                                                 activation = keras.activations.relu,
                                                 kernel_initializer = 'he_uniform', padding = 'valid')

                static_4 = tetra_gram(static_batch)
                non_static_4 = tetra_gram(non_static_batch)

            with tf.variable_scope('penta_gram'):
                penta_gram = keras.layers.Conv1D(filters = 100, kernel_size = 5,
                                                 activation = keras.activations.relu,
                                                 kernel_initializer = 'he_uniform', padding = 'valid')

                static_5 = penta_gram(static_batch)
                non_static_5 = penta_gram(non_static_batch)

            # Sum the two channels, then take the maximum along axis 1
            fmap_3 = tf.reduce_max(static_3 + non_static_3, axis = 1)
            fmap_4 = tf.reduce_max(static_4 + non_static_4, axis = 1)
            fmap_5 = tf.reduce_max(static_5 + non_static_5, axis = 1)

        with tf.variable_scope('output_layer'):
            flattened = tf.concat([fmap_3, fmap_4, fmap_5], axis = -1)
            score = keras.layers.Dense(units = n_of_classes,
                                       kernel_regularizer = keras.regularizers.l2(.7))(flattened)

            # NOTE(review): dropout is applied to the class scores themselves,
            # not to the pooled features feeding the dense layer -- confirm
            # this placement is intended.
            self.__score = keras.layers.Dropout(rate = .5)(score, training = self.is_training)

        with tf.variable_scope('loss'):
            ce_loss = tf.losses.sparse_softmax_cross_entropy(labels = self.__y, logits = self.__score)
            # Sums every entry of the graph's REGULARIZATION_LOSSES collection
            reg_term = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
            self.total_loss = ce_loss + reg_term

        with tf.variable_scope('prediction'):
            self.prediction = tf.argmax(self.__score, axis = -1)

    # predict instance method for small dataset
    def predict(self, sess, x_data, is_training = False):
        """Run the prediction op on `x_data`.

        `x_data` is fed directly in place of the input tensor given to the
        constructor, so the whole input is processed in a single `sess.run`.

        Args:
            sess: session object whose `run` method evaluates the graph.
            x_data: value fed for the input tensor.
            is_training: value fed for the `is_training` placeholder.

        Returns:
            Whatever `sess.run` returns for the prediction op.
        """
        feed_prediction = {self.__X : x_data, self.is_training : is_training}
        return sess.run(self.prediction, feed_dict = feed_prediction)

### Create a model of MorphConv

In [10]:
# hyper-parameters
# learning rate handed to the Adam optimizer
lr = .003
# number of passes over the training dataset
epochs = 5
# mini-batch size used when batching the datasets
batch_size = 100
# number of batches per training epoch (printed for reference)
total_step = int(X_tr.shape[0] / batch_size)
print(total_step)

1200


In [11]:
# Training pipeline: slice per example, shuffle, then group into mini-batches
tr_dataset = (
    tf.data.Dataset.from_tensor_slices((X_tr, y_tr))
    .shuffle(buffer_size = 1000000)
    .batch(batch_size = batch_size)
)
tr_iterator = tr_dataset.make_initializable_iterator()

In [12]:
# Validation pipeline: same batching as training, but no shuffling
val_dataset = (
    tf.data.Dataset.from_tensor_slices((X_val, y_val))
    .batch(batch_size = batch_size)
)
val_iterator = val_dataset.make_initializable_iterator()

In [13]:
# Anonymous (feedable) iterator: the string fed into `handle` selects which
# concrete iterator supplies the next batch. Output types and shapes are
# taken from the training iterator.
handle = tf.placeholder(dtype = tf.string)
iterator = tf.data.Iterator.from_string_handle(string_handle = handle,
                                               output_types = tr_iterator.output_types,
                                               output_shapes = tr_iterator.output_shapes)
# Tensors for the next (inputs, labels) batch of the selected iterator
x_data, y_data = iterator.get_next()

In [14]:
# Build the model on the batch tensors of the feedable iterator; the embedding
# attached to the vocabulary is converted to a numpy array and used as the
# initial embedding value.
morph_conv = MorphConv(X = x_data, y = y_data, n_of_classes = 2,
                       embedding = vocab.embedding.idx_to_vec.asnumpy())

In [15]:
# Create the training op: Adam with learning rate `lr`, minimising the model's total loss
opt = tf.train.AdamOptimizer(learning_rate = lr)
training_op = opt.minimize(loss = morph_conv.total_loss)

### Training

In [16]:
# Session configured with the GPU option allow_growth enabled
sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config = sess_config)
sess.run(tf.global_variables_initializer())
# String handles of the concrete iterators; these are the values fed into `handle`
tr_handle, val_handle = sess.run(fetches = [tr_iterator.string_handle(), val_iterator.string_handle()])

In [17]:
%%time

# Mean loss of each epoch, one entry appended per epoch
tr_loss_hist, val_loss_hist = [], []

# The feeds never change between steps, so build them once up front
tr_feed = {handle : tr_handle, morph_conv.is_training : True}
val_feed = {handle : val_handle, morph_conv.is_training : False}

for epoch in tqdm(range(epochs)):

    avg_tr_loss = avg_val_loss = 0
    tr_step = val_step = 0

    # training pass: one optimiser step per mini-batch until the iterator is exhausted
    sess.run(tr_iterator.initializer)
    while True:
        try:
            _, tr_loss = sess.run(fetches = [training_op, morph_conv.total_loss],
                                  feed_dict = tr_feed)
        except tf.errors.OutOfRangeError:
            break
        avg_tr_loss += tr_loss
        tr_step += 1

    # validation pass: loss only, no parameter update
    sess.run(val_iterator.initializer)
    while True:
        try:
            val_loss = sess.run(fetches = morph_conv.total_loss, feed_dict = val_feed)
        except tf.errors.OutOfRangeError:
            break
        avg_val_loss += val_loss
        val_step += 1

    # turn the accumulated sums into per-batch means
    avg_tr_loss /= tr_step
    avg_val_loss /= val_step
    tr_loss_hist.append(avg_tr_loss)
    val_loss_hist.append(avg_val_loss)

    print('epoch : {:3}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, avg_tr_loss, avg_val_loss))

 20%|██        | 1/5 [00:10<00:40, 10.01s/it]

epoch :   1, tr_loss : 0.713, val_loss : 0.392


 40%|████      | 2/5 [00:18<00:28,  9.52s/it]

epoch :   2, tr_loss : 0.476, val_loss : 0.433


 60%|██████    | 3/5 [00:26<00:18,  9.04s/it]

epoch :   3, tr_loss : 0.434, val_loss : 0.535


 80%|████████  | 4/5 [00:35<00:08,  8.95s/it]

epoch :   4, tr_loss : 0.440, val_loss : 0.361


100%|██████████| 5/5 [00:42<00:00,  8.63s/it]

epoch :   5, tr_loss : 0.372, val_loss : 0.380
CPU times: user 47.3 s, sys: 8.72 s, total: 56 s
Wall time: 42.9 s





### Test

In [18]:
# Test pipeline: batched in the original order, no shuffling
tst_dataset = (
    tf.data.Dataset.from_tensor_slices((X_tst, y_tst))
    .batch(batch_size = batch_size)
)
tst_iterator = tst_dataset.make_initializable_iterator()

In [19]:
# String handle of the test iterator, fed into `handle` during prediction
tst_handle = sess.run(tst_iterator.string_handle())

In [20]:
# Collect the per-batch predictions in a list and join them once at the end.
# Growing an array with np.append copies it on every step (quadratic work),
# and seeding it with np.array([]) turns the integer labels into floats.
tst_batches = []
tst_feed = {handle : tst_handle, morph_conv.is_training : False}

sess.run(tst_iterator.initializer)

try:
    while True:
        y_tst_tmp = sess.run(morph_conv.prediction, feed_dict = tst_feed)
        tst_batches.append(y_tst_tmp)

except tf.errors.OutOfRangeError:
    # raised once the test iterator is exhausted
    pass

# np.concatenate rejects an empty list, so fall back to an empty integer array
y_tst_hat = np.concatenate(tst_batches) if tst_batches else np.array([], dtype = np.int64)

In [21]:
# Accuracy: fraction of test reviews whose predicted label equals the true label
print('test acc : {:.2%}'.format(np.mean(y_tst_hat == np.array(y_tst))))

test acc : 85.09%
