<a href="https://colab.research.google.com/github/onlyabhilash/NLP-Projects/blob/main/4_Text_classification/CNN_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text classification with Convolutional Neural Networks
### Model structure and dimensions
- Input embedding layer
    - `(batch_size, seq_length, embedding_size, 1)`
- `for f in filter_sizes`
    - Convolution
        - Filter shape: `(f, embedding_size, 1, num_filters)`
        - Output shape: `(batch_size, seq_length-f+1, 1, num_filters)` with `stride=1`
    - Max-pool
        - `ksize=[1, seq_length-f+1, 1, 1]`
        - Output shape: reshape `(batch_size, 1, 1, num_filters)` to `(batch_size, num_filters)`
- Concatenate outputs of all filters together
    - `tf.concat(xx, axis=-1)`
    - Output shape: `(batch_size, len(filter_sizes)*num_filters)`
- FC1 with drop-out
    - Output shape: `(batch_size, len(filter_sizes)*num_filters)`
- FC2
    - Output shape: `(batch_size, num_classes)`

### How to add He initialization?
- `he_init = tf.contrib.layers.variance_scaling_initializer()`
- `fc1 = tf.contrib.layers.fully_connected(inputs=conv_res, ..., weights_initializer=he_init)`

### How to perform L2 regularization using the higher-level API `tf.contrib.layers`?
- `tf.contrib.layers.fully_connected(inputs, num_outputs, weights_regularizer=tf.contrib.layers.l2_regularizer(scale))`
- `reg_ws = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)`
- `cost = cross_entropy_cost + tf.reduce_sum(reg_ws)` # adds to total cost

References:  
[1] https://stackoverflow.com/questions/37107223/how-to-add-regularizations-in-tensorflow

In [None]:
import os
import codecs
import itertools
from collections import Counter
from random import shuffle
import tensorflow as tf
import numpy as np
from sklearn import metrics
import pickle

Class `DataGenerator` reads the input files, converts words to indices and generates batches of training or testing data.

The CNN input has a fixed length, so `Arguments.MAX_SEQ_LENGTH` records the maximum sequence length found in the training data. All sequences shorter than `Arguments.MAX_SEQ_LENGTH` are padded to `Arguments.MAX_SEQ_LENGTH`, and longer sequences are truncated to it.

Two extra words are introduced, `PAD` for padding shorter sequences and `OOV` for representing out-of-vocabulary words.

In [None]:
class DataGenerator():
    """
    Read the per-category training/testing files, map words to integer ids
    and serve fixed-length samples and batches.

    ``args`` must provide FOLDER_PATH, BATCH_SIZE, VOCAB_SIZE, NUM_EPOCH and
    MAPFILE. While loading, two derived values are written back onto the same
    ``args`` object so that other components can read them:

    - ``args.MAX_SEQ_LENGTH``: length (in tokens) of the longest training article
    - ``args.TESTING_SIZE``: number of testing articles

    Two vocabulary entries are reserved: 'PAD' (id 0), used to fill short
    articles, and 'OOV' (id VOCAB_SIZE - 1), used for unknown words.

    NOTE(review): the vocabulary is counted over training AND testing
    articles, as in the original implementation.
    """

    def __init__(self, args):
        # Keep the config object itself. The methods below read and update it
        # through ``self.args`` instead of relying on a module-level variable
        # that merely happens to be called ``args``.
        self.args = args
        self.folder_path = args.FOLDER_PATH
        self.batch_size = args.BATCH_SIZE
        self.vocab_size = args.VOCAB_SIZE
        self.num_epoch = args.NUM_EPOCH
        # label id -> category name; files are numbered label id + 1
        self.label_dict = {0:'auto', 1:'business', 2:'IT', 3:'health', 4:'sports', 5:'yule'}
        self.read_build_input()
        self.single_generator_training = self.generate_sample_training()
        self.single_generator_testing = self.generate_sample_testing()

    def one_hot_encode(self, x, n_classes=6):
        """Return the one-hot row (length ``n_classes``) for class id ``x``."""
        return np.eye(n_classes)[[x]][0]

    def _read_labelled_file(self, prefix, category):
        """Return ``[(tokens, label)]`` for the file '<prefix>_<category>.cs'."""
        path = os.path.join(self.folder_path, prefix + '_' + str(category) + '.cs')
        # ``with`` guarantees the handle is closed even if decoding fails.
        with codecs.open(filename=path, mode='r', encoding='utf-8') as input_file:
            return [(line.split(), category - 1) for line in input_file]

    def _pad(self, ids):
        """Right-pad ``ids`` with the PAD id up to ``args.MAX_SEQ_LENGTH``."""
        missing = self.args.MAX_SEQ_LENGTH - len(ids)
        if missing > 0:
            return ids + [self.word2idx['PAD']] * missing
        return ids

    def read_build_input(self):
        """
        Load every category file, build (or reload) the word/id mapping and
        encode all articles.

        Sets ``self.training`` (unpadded id lists), ``self.testing_ori``
        (unpadded) and ``self.testing`` (padded), each a list of
        ``(ids, label)`` pairs truncated to ``args.MAX_SEQ_LENGTH`` tokens.

        Raises:
            ValueError: if the training files contain no articles.
        """
        training_src = []
        testing_src = []

        for cur_category in range(1, len(self.label_dict) + 1):

            print('parsing file >>>>>>>>>>>>>>> ', cur_category)
            print('-'*100)

            training_src.extend(self._read_labelled_file('training', cur_category))
            testing_src.extend(self._read_labelled_file('testing', cur_category))

        if not training_src:
            raise ValueError('No training articles found in {!r}'.format(self.folder_path))

        shuffle(training_src)
        shuffle(testing_src)

        # The fixed CNN input length is dictated by the longest training article.
        max_seq_length = max(len(tokens) for tokens, _ in training_src)
        self.args.MAX_SEQ_LENGTH = max_seq_length
        print('='*100)
        print('Size of training data:', len(training_src))
        print('Size of testing data:', len(testing_src))
        print('Maximum length of training articles', self.args.MAX_SEQ_LENGTH)

        self.TRAINING_SIZE = len(training_src)
        self.args.TESTING_SIZE = len(testing_src)

        print('='*100)
        if os.path.isfile(self.args.MAPFILE):
            print('Reload word2idx')
            # SECURITY: pickle can execute arbitrary code while loading; only
            # point MAPFILE at files produced by this program.
            with open(self.args.MAPFILE, 'rb') as f:
                self.word2idx, self.idx2word = pickle.load(f)
        else:
            # Count words only when the mapping actually has to be built.
            all_words = itertools.chain.from_iterable(
                tokens for tokens, _ in itertools.chain(training_src, testing_src))
            # two slots are reserved for PAD and OOV
            word_counter = Counter(all_words).most_common(self.vocab_size-2)
            print('top 10 frequent words:')
            print(word_counter[0:10])
            self.word2idx = {val[0]: idx+1 for idx, val in enumerate(word_counter)}
            self.word2idx['PAD'] = 0 # padding word
            self.word2idx['OOV'] = self.vocab_size - 1 # out-of-vocabulary
            self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys()))
            print('Total vocabulary size:{}'.format(len(self.word2idx)))

            with open(self.args.MAPFILE, 'wb') as f:
                pickle.dump([self.word2idx, self.idx2word], f)

        oov_id = self.word2idx['OOV']

        def encode(tokens):
            # unknown words collapse to OOV; over-long articles are cut here
            return [self.word2idx.get(w, oov_id) for w in tokens[0:max_seq_length]]

        self.training = [(encode(tokens), label) for tokens, label in training_src]
        self.testing_ori = [(encode(tokens), label) for tokens, label in testing_src]
        self.testing = [(self._pad(ids), label) for ids, label in self.testing_ori]

    def _generate_samples(self, pairs, tag):
        """
        Yield ``(padded_ids, label)`` for every pair in ``pairs``.

        The samples at positions 0 and 10 are printed (prefixed with ``tag``)
        as a quick sanity check of the encoding.
        """
        for outer_index, (ids, output_y) in enumerate(pairs):
            input_X = self._pad(ids)
            if outer_index in [0, 10]:
                print('='*100)
                print(tag + ' text:', ' '.join([self.idx2word[tmp_id] for tmp_id in input_X]))
                print(tag + ' text length:', len(input_X))
                print(tag + ' label:', self.label_dict[output_y])

            yield input_X, output_y

    def generate_sample_training(self):
        """
        Return an endless generator over the training articles.

        Articles shorter than ``args.MAX_SEQ_LENGTH`` are padded with the PAD
        id; truncation already happened in ``read_build_input``.
        """
        return self._generate_samples(itertools.cycle(self.training), 'Training')

    def generate_sample_testing(self):
        """
        Return a single-pass generator over the (already padded) testing
        articles; it is exhausted after one sweep.
        """
        return self._generate_samples(self.testing, 'Testing')

    def _next_batch(self, sample_generator):
        """Draw ``batch_size`` samples and return ``(X, one_hot_y)`` int32 arrays."""
        input_X_batch = []
        output_y_batch = []
        for _ in range(self.batch_size):
            tmp_X, tmp_y = next(sample_generator)
            input_X_batch.append(tmp_X)
            output_y_batch.append(self.one_hot_encode(tmp_y))
        return np.array(input_X_batch, dtype=np.int32), np.array(output_y_batch, dtype=np.int32)

    def next_batch_training(self):
        """Return the next training batch as ``(X, one_hot_y)`` int32 arrays."""
        return self._next_batch(self.single_generator_training)

    def next_testing(self):
        """Return the whole testing set as ``(X, y)`` with integer (not one-hot) labels."""
        testing_X = np.array([tmp_pair[0] for tmp_pair in self.testing], dtype=np.int32)
        testing_y = np.array([tmp_pair[1] for tmp_pair in self.testing], dtype=np.int32)
        return testing_X, testing_y

    def testing_data(self, limit=1000):
        """
        Return the first ``limit`` testing samples as ``(X, one_hot_y)``.

        The default of 1000 keeps evaluation within modest memory; pass
        ``limit=None`` to get the full testing set.
        """
        # Slice before encoding so only the requested samples are materialised.
        subset = self.testing[:limit]
        testing_X = np.array([tmp_pair[0] for tmp_pair in subset], dtype=np.int32)
        testing_y = np.array([self.one_hot_encode(tmp_pair[1]) for tmp_pair in subset], dtype=np.int32)
        return testing_X, testing_y

    def next_batch_testing(self):
        """
        Return the next testing batch as ``(X, one_hot_y)`` int32 arrays.

        The testing generator is single-pass: once fewer than ``batch_size``
        samples remain, ``StopIteration`` propagates to the caller.
        """
        return self._next_batch(self.single_generator_testing)

Hyper-parameters for this model.

In [None]:
class Arguments:
    """
    Shared configuration: hyper-parameters plus the file/directory names used
    by the data pipeline, the model and the train/test routines.

    DataGenerator attaches two further attributes at runtime
    (MAX_SEQ_LENGTH and TESTING_SIZE); they are not declared here.
    """
    # ---- data ----
    FOLDER_PATH = 'sogou_corpus'
    MAPFILE = 'cnn_maps.pkl'
    VOCAB_SIZE = 300000  # vocabulary size
    NUM_CLASSES = 6  # number of classes

    # ---- network ----
    EMBED_SIZE = 100  # embedding dimensions
    # [3, 4, 5] performs best according to http://ruder.io/deep-learning-nlp-best-practices/
    FILTER_SIZES = [3, 4, 5]
    NUM_FILTERS = 65

    # ---- optimisation ----
    BATCH_SIZE = 32
    NUM_EPOCH = 10
    KEEP_PROB = 0.5
    REGULARIZATION = 0.01

    # ---- outputs (names are suffixed with the regularization strength) ----
    CHECKPOINTS_DIR = ''.join(['text_classification_CNN_model', '_lambda_', str(REGULARIZATION)])
    LOGDIR = ''.join(['text_classification_CNN_logdir', '_lambda_', str(REGULARIZATION)])

### `conv = tf.nn.conv2d(input, filter, strides, padding, data_format='NHWC')`
- `conv` and `input` are both 4-D tensor of shape `(batch_size, in_height, in_width, in_channels)`.
- `filter` is a 4-D tensor of shape `(filter_height, filter_width, in_channels, out_channels)`.
- `strides` is a list of `ints`, `(batch_stride=1, vertical_stride, horizontal_stride, channels_stride=1)`
- `padding`: `SAME` or `VALID`.

### `pooled = tf.nn.max_pool(value, ksize, strides, padding, data_format='NHWC')`
- `pooled` and `value` are both 4-D tensor of `data_format`.
- `ksize` and `strides` are both a list of `ints`

In [None]:
class TextClassificationModel:
    """
    CNN text classifier: word embedding -> one conv/max-pool branch per filter
    size -> two fully connected layers.

    The whole TensorFlow graph is built in ``__init__``. The tensors and ops
    that callers feed or fetch are exposed as attributes: ``input_X``,
    ``output_y``, ``keep_prob``, ``scores``, ``predictions``, ``accuracy``,
    ``cost``, ``optimize``, ``summary_op``, ``global_step`` and
    ``best_accuracy``.

    reference: http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
    """
    def __init__(self, args):
        """
        Build the graph.

        ``args`` must provide REGULARIZATION, EMBED_SIZE, BATCH_SIZE,
        NUM_FILTERS, FILTER_SIZES, NUM_CLASSES, VOCAB_SIZE and MAX_SEQ_LENGTH.
        """
        self.l2_reg = args.REGULARIZATION
        self.embedding_size = args.EMBED_SIZE
        self.batch_size = args.BATCH_SIZE
        # for each filter size, how many filters we have
        self.num_filters = args.NUM_FILTERS
        self.filter_sizes = args.FILTER_SIZES
        self.num_classes = args.NUM_CLASSES
        # NOTE(review): the embedding table gets two rows more than
        # args.VOCAB_SIZE — presumably headroom for PAD/OOV; confirm, since
        # DataGenerator already places both ids inside VOCAB_SIZE.
        self.vocab_size = args.VOCAB_SIZE + 2
        self.seq_length = args.MAX_SEQ_LENGTH
        # non-trainable bookkeeping variables; both are stored in checkpoints
        self.global_step = tf.Variable(initial_value=0, dtype=tf.int32, trainable=False, name='global_step')
        self.best_accuracy = tf.Variable(initial_value=0.0, dtype=tf.float32, trainable=False, name='best_accuracy')
        
        # word ids, shape (batch, seq_length)
        self.input_X = tf.placeholder(dtype=tf.int32, shape=[None, self.seq_length], name='input_X')
        # labels, shape (batch, num_classes); accuracy takes argmax over axis 1
        self.output_y = tf.placeholder(dtype=tf.int32, shape = [None, self.num_classes], name='output_y')
        # when training, feed keep_prob; when testing, feed 1.0
        self.keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')
        
        # Layers are wired in this order; cost_layer also sets self.summary_op.
        self.embedding_inputs = self.embedding_layer(self.input_X)
        self.conv_res = self.conv_max_pool_layer(self.embedding_inputs)
        self.scores, self.predictions, self.accuracy = self.score_layer(self.conv_res)
        self.cost = self.cost_layer(self.scores)
        self.optimize = self.optimizer(self.cost)
        
    
    def embedding_layer(self, word_inputs):
        """
        Look up embeddings for ``word_inputs`` and append a channel axis.

        Returns a tensor of shape (None, seq_length, embedding_size, 1), the
        4-D layout consumed by ``tf.nn.conv2d`` in ``conv_max_pool_layer``.
        """
        # the embedding variable and lookup are pinned to the CPU
        with tf.variable_scope('word_embedding', initializer=tf.contrib.layers.xavier_initializer()), tf.device('/cpu:0'):
            embedding_matrix = tf.get_variable(name='embedding_matrix', shape=[self.vocab_size, self.embedding_size])
            inputs = tf.nn.embedding_lookup(params=embedding_matrix, ids=word_inputs, name='embed')
            print('inputs shape: ', inputs.get_shape()) # shape: (None, seq_length, embedding_size)
            # expand one dimension at last dimension: axis=-1
            inputs_expanded = tf.expand_dims(input=inputs, axis=-1)  
            print('inputs_expanded shape: ', inputs_expanded.get_shape()) # shape: (None, seq_length, embedding_size, 1)
        return inputs_expanded
    
    def conv_max_pool_layer(self, embed_input):
        """
        Apply one convolution + ReLU + max-pool branch per filter size and
        concatenate the results.

        Returns a tensor of shape (None, num_filters * len(filter_sizes)).
        """
        pooled_outputs = []
        for idx, filter_size in enumerate(self.filter_sizes):
            with tf.variable_scope('conv-maxpool-%s' % filter_size):
                # initialization from https://www.tensorflow.org/programmers_guide/variables
                # shape: (filter_height, filter_width, in_channels, out_channels)
                filter_W = tf.get_variable(name='filter_W', shape=[filter_size, self.embedding_size, 1, self.num_filters], initializer=tf.random_normal_initializer())
                filter_b = tf.get_variable(name='filter_b', shape=[self.num_filters], initializer=tf.constant_initializer(0.0))
                # each filter spans the full embedding width, so with VALID
                # padding the output width is 1 and the height is seq_length-filter_size+1
                conv = tf.nn.conv2d(input=embed_input, filter=filter_W, strides=[1, 1, 1, 1], padding='VALID', name='conv')
                conv_activation = tf.nn.relu(tf.nn.bias_add(conv, filter_b), name='relu')
                print('filter_size:{}, conv_activation shape:{}'.format(filter_size, conv_activation.get_shape()))
                # max-pooling
                # the pooling window covers every position, leaving one value per filter
                pooled = tf.nn.max_pool(value=conv_activation, ksize=[1, self.seq_length-filter_size+1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name='max-pool')
                print('filter_size:{}, pooled shape:{}'.format(filter_size, pooled.get_shape()))
                pooled_outputs.append(pooled)
        # join the branches along the filter (last) axis
        conv_pooled = tf.concat(values=pooled_outputs, axis=-1)
        print('='*100)
        print('conv_pooled shape: ', conv_pooled.get_shape()) # shape: (None, 1, 1, self.num_filters*len(self.filter_sizes))
        conv_pooled_flat = tf.reshape(conv_pooled, [-1, self.num_filters*len(self.filter_sizes)]) 
        print('conv_pooled_flat shape: ', conv_pooled_flat.get_shape()) # shape: (None, self.num_filters*len(self.filter_sizes))
        return conv_pooled_flat
            
    def score_layer(self, conv_res):
        """
        Map pooled features to class scores.

        Returns ``(scores, predictions, accuracy)``: unnormalised logits of
        shape (None, num_classes), the argmax class ids, and the mean accuracy
        against ``self.output_y``. Also registers an 'accuracy' summary.
        """
        with tf.variable_scope('score'):
            # fc1 with He initialization
            he_init = tf.contrib.layers.variance_scaling_initializer()
            # activation_fn is not overridden here, so fc1 uses the layer's default
            fc1 = tf.contrib.layers.fully_connected(inputs=conv_res, 
                                                    num_outputs=self.num_filters*len(self.filter_sizes),
                                                    weights_initializer=he_init,
                                                    weights_regularizer=tf.contrib.layers.l2_regularizer(scale=self.l2_reg))
            print('fc1 shape: ', fc1.get_shape()) # shape: (None, self.num_filters*len(self.filter_sizes))
                        
            # dropout strength is controlled by the fed keep_prob placeholder
            fc1_drop = tf.nn.dropout(fc1, self.keep_prob)
            
            # activation_fn=None: raw logits, softmax is applied separately
            scores = tf.contrib.layers.fully_connected(inputs=fc1_drop, 
                                                       num_outputs=self.num_classes, 
                                                       activation_fn=None,
                                                       weights_regularizer=tf.contrib.layers.l2_regularizer(scale=self.l2_reg))
            print('scores shape: ', scores.get_shape()) # shape: (None, self.num_classes)
            probs = tf.nn.softmax(scores)
            predictions = tf.argmax(probs, 1, name='predictions')
            print('predictions shape: ', predictions.get_shape()) # shape: (None, )
            accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions, tf.argmax(self.output_y, 1)), tf.float32))
            tf.summary.scalar(name='accuracy', tensor=accuracy)
        return scores, predictions, accuracy
        
    
    def cost_layer(self, scores):
        """
        Return mean softmax cross-entropy plus the summed regularization losses.

        Side effects: registers a 'loss' summary and stores the merged summary
        op in ``self.summary_op``.
        """
        with tf.variable_scope('cost'):
            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=self.output_y))
            # https://stackoverflow.com/questions/37107223/how-to-add-regularizations-in-tensorflow
            # the fully connected layers' weights_regularizer terms are collected here
            reg_ws = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            for w in reg_ws:
                shp = w.get_shape().as_list()
                print('{} shape: {} size: {}'.format(w.name, shp, np.prod(shp)))
            cost = cost + tf.reduce_sum(reg_ws)
            tf.summary.scalar(name='loss', tensor=cost)
            # merge_all picks up every summary registered so far ('accuracy' and 'loss')
            self.summary_op = tf.summary.merge_all()
        return cost
    
    def optimizer(self, cost):
        """Return the Adam (learning rate 1e-3) training op; each run also increments ``global_step``."""
        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
            optimize = optimizer.minimize(loss=cost, global_step=self.global_step)
        return optimize

The `train` function trains the model.

`train_writer` and `test_writer` record the `accuracy` and `loss` summaries for training and testing, respectively.

In [None]:
def train(data, model, args):
    """
    Train ``model`` on batches from ``data`` and checkpoint the best weights.

    Runs ``args.NUM_EPOCH * data.TRAINING_SIZE // args.BATCH_SIZE`` optimisation
    steps. Training resumes from the latest checkpoint in
    ``args.CHECKPOINTS_DIR`` when one exists. Every 50 steps the training
    loss/accuracy are printed; every 200 steps the model is evaluated on
    ``data.testing_data()`` and saved if the testing accuracy beats the best
    value stored in ``model.best_accuracy``.

    Summaries are written to ``args.LOGDIR + '/train'`` and
    ``args.LOGDIR + '/test'``.
    """
    saver = tf.train.Saver()

    # Build the "record new best accuracy" op once. Calling Variable.assign
    # inside the loop would add a fresh node to the graph on every improvement.
    new_best_accuracy = tf.placeholder(dtype=tf.float32, shape=[], name='new_best_accuracy')
    update_best_accuracy = model.best_accuracy.assign(new_best_accuracy)

    # Saver.save can fail when the target directory is missing, so create it.
    if not os.path.isdir(args.CHECKPOINTS_DIR):
        os.makedirs(args.CHECKPOINTS_DIR)
    checkpoint_path = os.path.join(args.CHECKPOINTS_DIR, 'text_classification_cnn.ckpt')

    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter(logdir=args.LOGDIR + '/train', graph=sess.graph)
        test_writer = tf.summary.FileWriter(logdir=args.LOGDIR + '/test')
        try:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir=args.CHECKPOINTS_DIR)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess=sess, save_path=ckpt.model_checkpoint_path)
                print(ckpt)

            # After a restore this holds the best accuracy of the earlier run.
            best_accuracy = sess.run(model.best_accuracy)

            max_iteration_num = args.NUM_EPOCH * data.TRAINING_SIZE // args.BATCH_SIZE
            # inclusive upper bound: run exactly max_iteration_num steps
            for idx in range(1, max_iteration_num + 1):
                batch_X, batch_y = data.next_batch_training()

                feed_dict = {model.input_X: batch_X, model.output_y: batch_y, model.keep_prob: args.KEEP_PROB}
                tmp_accuracy, tmp_cost, _, tmp_summary = sess.run([model.accuracy, model.cost, model.optimize, model.summary_op], feed_dict=feed_dict)
                # read the step after the update so it matches the new weights
                step = sess.run(model.global_step)
                train_writer.add_summary(summary=tmp_summary, global_step=step)

                if idx % 50 == 0:
                    print('='*100)
                    print('Step: {} / {}, training loss:{:4f}, training accuracy:{:4f}'.format(idx, max_iteration_num, tmp_cost, tmp_accuracy))

                if idx % 200 == 0:
                    test_batch_X, test_batch_y = data.testing_data()
                    # dropout is disabled for evaluation
                    test_feed_dict = {model.input_X: test_batch_X, model.output_y: test_batch_y, model.keep_prob:1.0}
                    test_tmp_cost, test_tmp_accuracy, test_tmp_summary = sess.run([model.cost, model.accuracy, model.summary_op], feed_dict=test_feed_dict)
                    test_writer.add_summary(summary=test_tmp_summary, global_step=step)
                    print('-'*100)
                    print('Step: {} / {}, testing loss:{:4f}, testing accuracy:{:4f}'.format(idx, max_iteration_num, test_tmp_cost, test_tmp_accuracy))
                    if test_tmp_accuracy > best_accuracy:
                        print('Best model accuracy: {:4f}'.format(test_tmp_accuracy))
                        # store the new best in the graph first so the checkpoint contains it
                        sess.run(update_best_accuracy, feed_dict={new_best_accuracy: test_tmp_accuracy})
                        best_accuracy = test_tmp_accuracy
                        saver.save(sess=sess, save_path=checkpoint_path, global_step=step)
                    else:
                        print('Best model accuracy: {:4f}, Current model accuracy: {:4f}'.format(best_accuracy, test_tmp_accuracy))
        finally:
            # flush pending summaries and release the event files
            train_writer.close()
            test_writer.close()
                    

The `test` function loads the trained model and evaluates it on the testing data.

In [None]:
def test(data, model, args):
    """
    Restore the latest checkpoint and report metrics on ``data.testing_data()``.

    Prints per-class precision/recall/F1 and the confusion matrix.

    Raises:
        RuntimeError: if ``args.CHECKPOINTS_DIR`` holds no checkpoint. Without
            one the variables would be uninitialized and the session run
            would fail with a much less helpful error.
    """
    # Category names ordered by label id, taken from the data generator so
    # both places cannot drift apart.
    categories = [data.label_dict[label_id] for label_id in sorted(data.label_dict)]
    label_ids = list(range(len(categories)))
    saver = tf.train.Saver()
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=args.CHECKPOINTS_DIR)
        if not (ckpt and ckpt.model_checkpoint_path):
            raise RuntimeError('No checkpoint found in {!r}; train the model first.'.format(args.CHECKPOINTS_DIR))
        saver.restore(sess=sess, save_path=ckpt.model_checkpoint_path)
        print(ckpt)
        test_X, test_y = data.testing_data()
        print('test_X shape: ', test_X.shape)
        print('test_y shape: ', test_y.shape)
        # dropout is disabled for evaluation
        test_feed_dict = {model.input_X: test_X, model.output_y:test_y, model.keep_prob:1.0}
        test_predictions = sess.run(model.predictions, feed_dict=test_feed_dict)
        print('Precision, Recall and F1-Score:')
        # back from one-hot rows to integer class ids
        test_y = np.argmax(test_y, 1)
        # Pass the labels explicitly: if a class is missing from the evaluated
        # subset, the names would otherwise no longer line up with the rows.
        print(metrics.classification_report(test_y, test_predictions, labels=label_ids, target_names=categories))

        print('Confusion matrix:')
        cm = metrics.confusion_matrix(test_y, test_predictions, labels=label_ids)
        print(cm)

In [None]:
if __name__ == '__main__':
    # Construction order matters: DataGenerator records MAX_SEQ_LENGTH on the
    # Arguments object while reading the corpus, and TextClassificationModel
    # reads it to size its input placeholder.
    args = Arguments()
    data = DataGenerator(args)
    model = TextClassificationModel(args)
    
    # for training
    #train(data, model, args)   
    
    # testing
    # evaluates the latest checkpoint found in args.CHECKPOINTS_DIR
    test(data, model, args)