<a href="https://colab.research.google.com/github/mesahwi/TextAnlaysis/blob/master/Naver_News_Analysis/Text_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Text CNN implementation on analyzing Korean news articles <br>
(In this particular example, we will tell '조선일보' and '한겨례' political articles apart)

Install konlpy

In [1]:
!apt-get install python3-dev; pip3 install konlpy

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python3-dev is already the newest version (3.6.7-1~18.04).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.


Mount Google drive

In [2]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Import necessary packages

In [3]:
import numpy as np
import tensorflow as tf
import gensim
import os
import datetime
import time
import sklearn
import nltk
nltk.download('punkt')

from gensim.models.word2vec import Word2Vec

import glob
import warnings, os
warnings.filterwarnings('ignore')

from konlpy.tag import *

# 아래와 같은 분석기가 있음.
hnm = Hannanum()
kkma = Kkma()
okt = Okt()

base_dir = 'gdrive/Shared drives/텍스트마이닝/News Analysis/'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Import Korean stop words, found in 'https://www.ranks.nl/stopwords/korean'

In [0]:
f = open(base_dir+'stopwords.txt', 'r')
stopwords = f.read()
stopwords = stopwords.split('\n')

Define functions and objects that will be used

In [0]:
# preprocess, single documents, not whole corpus.
# This function uses nouns. For a more sophisticated analysis, using morphs is recommended
def preprocess_single_doc(text, tokenizer_type = 1):
  
  #choose tokenizer
  if tokenizer_type == 1:
    tokenizer = Hannanum()
  elif tokenizer_type == 2:
    tokenizer = Kkma()
  elif tokenizer_type == 3:
    tokenizer = Okt()
    
  #tokenize
  tokens = tokenizer.nouns(text)
  
  #remove short words, but probably should not apply to Korean
  #tokens = [token for token in tokens if len(token) > 1] 
  
  #stop words
  my_stopwords = ['조선일보', '조선닷컴', '닷컴', 'Chosun', 'Copyrights', '&', '바로가기', '기자', '구독', '메인' 'ⓒ', '배포', '한겨례', '한겨례신문', '▶', '◀', '네이버', '[', ']', 'co', 'kr', 'hani']
  tokens = [token for token in tokens if token not in stopwords and token not in my_stopwords]
  
  #numbers are already left out, from tokenizer.nouns()
  #tokens = [word for word in tokens if not any(char.isdigit() for char in word)]

  preprocessed = ' '.join(tokens)
  return preprocessed

In [0]:
## Customize FLAGS, because I don't find the tf flags easy to use :-(
class Flags():
  def __init__(self):
    self.training_sample_percentage = float(0.7)
    self.max_doc_length = int(350)

    self.embedding_dim = int(64)
    self.filter_sizes = str('3,4,5')
    self.num_filters = int(128)
    self.dropout_keep_prob = float(0.5)
    self.l2_reg_lambda = float(0.0)
    self.learning_rate = float(1e-3)

    self.batch_size = int(64)
    self.num_epochs = int(50)
    
FLAGS = Flags()

In [0]:
# Naive implementation of reshaping y : 0,1,2 => [1,0,0], [0,1,0], [0,0,1]
def expand_y(yVec):
  nrow = len(yVec)
  y_unique = np.unique(yVec)
  ncol = len(y_unique)  
  out = np.zeros((nrow, ncol))
  
  for i, y in enumerate(yVec):
    for j, val in enumerate(y_unique):
      if y==val:
        out[i,j] = 1
        
  return np.array(out)

In [0]:
#Text CNN class. Embedding is done outside a Text CNN object
class TextCNN(object):
  def __init__(self, sequence_length, num_classes, embedding_size, filter_sizes, num_filters, learning_rate, l2_reg_lambda):
    self.input_x = tf.placeholder(tf.float32, [None, sequence_length, embedding_size], name='input_x')
    self.input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='prob')
    
    self.embedded_expanded = tf.expand_dims(self.input_x, -1)
    
    l2_loss = tf.constant(0.0)
    
    # Architecture
    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
      with tf.name_scope('conv-maxpool-%s' % filter_size):
        filter_shape = [filter_size, embedding_size, 1, num_filters]
        W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
        b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name='b')
        
        conv = tf.nn.conv2d(self.embedded_expanded, W, strides=[1,1,1,1], padding='VALID', name='conv')
        h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
        pooled = tf.nn.max_pool(h, ksize=[1, sequence_length-filter_size+1, 1, 1], strides=[1,1,1,1], padding='VALID', name='pool')
        
        pooled_outputs.append(pooled)
        
    num_filters_total = num_filters*len(filter_sizes)
    self.h_pool = tf.concat(pooled_outputs, 3)
    self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
    
    with tf.name_scope('dropout'):
      self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
      
    #Scores and predictions
    with tf.name_scope('output'):
      W = tf.get_variable('W', shape=[num_filters_total, num_classes], initializer=tf.contrib.layers.xavier_initializer())
      b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='b')
      l2_loss += (tf.nn.l2_loss(W) + tf.nn.l2_loss(b))
      self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name='scores')
      self.predictions = tf.argmax(self.scores, 1, name='predictions')
      
    #Loss : Cross Entropy
    with tf.name_scope('loss'):
      losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
      self.loss = tf.reduce_mean(losses) + l2_reg_lambda*l2_loss
      self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
      
    #Performance
    with tf.name_scope('accuracy'):
      correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
      self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'), name='accuracy')

In [0]:
# Training procedure
def train(x_train, y_train, num_epochs):
  
  print('Training Text CNN, with parameters \nnum_epochs:{:d}, \nlearning_rate:{:f}, \
  \nembedding_dim:{:d}, \nmax_doc_length:{:d}, \nfilter_sizes:{:s}, \nnum_filters:{:d}, \nbatch_size:{:d}\n\n'.format(\
        FLAGS.num_epochs, FLAGS.learning_rate, FLAGS.embedding_dim, FLAGS.max_doc_length, FLAGS.filter_sizes, FLAGS.num_filters, FLAGS.batch_size))
  
  with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
      cnn = TextCNN(sequence_length = FLAGS.max_doc_length,
                   num_classes = y_train.shape[1],
                   embedding_size = FLAGS.embedding_dim,
                   filter_sizes = list(map(int, FLAGS.filter_sizes.split(','))),
                   num_filters = FLAGS.num_filters,
                   learning_rate = FLAGS.learning_rate,
                   l2_reg_lambda = FLAGS.l2_reg_lambda)
      
      sess.run(tf.global_variables_initializer())
      
      data_size = len(y_train)
      num_batches_per_epoch = int((data_size-1)/FLAGS.batch_size) + 1
      
      for epoch in range(num_epochs):
        total_cost = 0
        total_accr = 0
        
        shuffled_indeces = np.random.permutation(np.arange(data_size))
        shuffled_x = [x_train[i] for i in shuffled_indeces]
        shuffled_y = y_train[shuffled_indeces]
        
        for batch_num in range(num_batches_per_epoch):
          start_idx = batch_num * FLAGS.batch_size
          end_idx = min((batch_num+1)*FLAGS.batch_size, data_size)
          batch_xs = shuffled_x[start_idx:end_idx]
          batch_ys = shuffled_y[start_idx:end_idx]
          
          feed_dict = {cnn.input_x:batch_xs, cnn.input_y:batch_ys, cnn.dropout_keep_prob:FLAGS.dropout_keep_prob}
          _, cost_val, accuracy = sess.run([cnn.optimizer, cnn.loss, cnn.accuracy], feed_dict)
          total_cost += cost_val
          total_accr += accuracy
          
        print('Epoch:', '%04d' % (epoch + 1), 'Avg. cost =', '{:.3f}'.format(total_cost / num_batches_per_epoch), 'Avg. accuracy =','{:.3f}'.format(total_accr / num_batches_per_epoch))
        
      print('Training Complete!')
      
    return sess, cnn
          

In [0]:
# Testing procedure
def test(x_test, y_test, sess, cnn):
  feed_dict = {cnn.input_x:x_test, cnn.input_y:y_test, cnn.dropout_keep_prob : 1.0}
  print('Test set Accuracy : ', sess.run(cnn.accuracy, feed_dict = feed_dict))

Now let's get started!! <br>
Say we want to see if the political articles in 조선일보 can be told apart from the political articles in 한겨례

First, we import data (500 per class in this example) and do some preprocessing

 - Prepare data

In [11]:
# prepare corpus  

nrow_per_type = 500
ntype = 2

data, label = np.empty([nrow_per_type * ntype,1], dtype=object), np.zeros((nrow_per_type * ntype, 1))

files_0 = glob.glob(base_dir + 'Chosun/politics/*.txt')
files_1 = glob.glob(base_dir + 'Han/politics/*.txt')

files_total = np.append(files_0[:nrow_per_type], files_1[:nrow_per_type])

print('reading data...')
for i, name in enumerate(files_total):
  with open(name,'r') as handle:
    data[i,0] = handle.read()
    label[i,0] = int(i/nrow_per_type)

reading data...


 - Shuffle and split the prepared data

In [0]:
# shuffle and split into train/test set

train_percentage = FLAGS.training_sample_percentage

data_size = len(label)
shuffled_indeces = np.random.permutation(np.arange(data_size))
shuffled_data = data[shuffled_indeces]
shuffled_label = label[shuffled_indeces]

split_idx = int(data_size*train_percentage)

data_train = shuffled_data[:split_idx]
data_test = shuffled_data[split_idx:]
label_train = shuffled_label[:split_idx]
label_test = shuffled_label[split_idx:]

del shuffled_data, shuffled_label, data, label

 - Preprocess the training data

In [13]:
print('processing training data(x)...')
x_in_train = [preprocess_single_doc(x[0], 3) for x in data_train]
print('processing training data(y)...')
y_train = expand_y(label_train)

processing training data(x)...
processing training data(y)...


 - Train word2vec for the training set

In [14]:
print('training word2vec..., for training set')
#Now want each word in x_in_train to be represented as vectors!
train_corpus = [nltk.word_tokenize(sentence) for sentence in x_in_train]
wv_model_trainset = Word2Vec(size=FLAGS.embedding_dim, min_count=2)
wv_model_trainset.build_vocab(train_corpus)
wv_model_trainset.train(train_corpus, total_examples=wv_model_trainset.corpus_count, epochs=40)

training word2vec..., for training set


(7231318, 7920320)

 - The length of each input must be fixed. If longer than 'max_doc_length', crop off the rest. If shorter, then add padding

In [0]:
max_doc_length = FLAGS.max_doc_length
x_train = []
for i, sent in enumerate(x_in_train):
  words = sent.split(' ')
  word_cnt = 0
  sentList = []
  for word in words:
    if word in wv_model_trainset.wv.vocab:
      if word_cnt < max_doc_length:
        wordVec = wv_model_trainset[word]
        sentList.append(wordVec)
        word_cnt += 1
      else:
        break
    
  if len(sentList) < max_doc_length:
    last_idx = len(sentList)-1
    for j in range(last_idx+1, max_doc_length):
      sentList.append(np.zeros(FLAGS.embedding_dim))
  
  x_train.append(sentList)

Train our model!

In [16]:
session, cnnModel = train(x_train, y_train, FLAGS.num_epochs)

W0712 08:57:57.745912 139960581597056 deprecation.py:506] From <ipython-input-8-56bcd47713fe>:30: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Training Text CNN, with parameters 
num_epochs:50, 
learning_rate:0.001000,   
embedding_dim:64, 
max_doc_length:350, 
filter_sizes:3,4,5, 
num_filters:128, 
batch_size:64




W0712 08:57:58.490174 139960581597056 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0712 08:57:58.516240 139960581597056 deprecation.py:323] From <ipython-input-8-56bcd47713fe>:42: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



Epoch: 0001 Avg. cost = 2.943 Avg. accuracy = 0.631
Epoch: 0002 Avg. cost = 1.295 Avg. accuracy = 0.846
Epoch: 0003 Avg. cost = 0.739 Avg. accuracy = 0.900
Epoch: 0004 Avg. cost = 0.609 Avg. accuracy = 0.914
Epoch: 0005 Avg. cost = 0.549 Avg. accuracy = 0.929
Epoch: 0006 Avg. cost = 0.464 Avg. accuracy = 0.929
Epoch: 0007 Avg. cost = 0.529 Avg. accuracy = 0.927
Epoch: 0008 Avg. cost = 0.346 Avg. accuracy = 0.956
Epoch: 0009 Avg. cost = 0.308 Avg. accuracy = 0.952
Epoch: 0010 Avg. cost = 0.283 Avg. accuracy = 0.951
Epoch: 0011 Avg. cost = 0.329 Avg. accuracy = 0.947
Epoch: 0012 Avg. cost = 0.241 Avg. accuracy = 0.964
Epoch: 0013 Avg. cost = 0.243 Avg. accuracy = 0.959
Epoch: 0014 Avg. cost = 0.191 Avg. accuracy = 0.963
Epoch: 0015 Avg. cost = 0.316 Avg. accuracy = 0.952
Epoch: 0016 Avg. cost = 0.179 Avg. accuracy = 0.961
Epoch: 0017 Avg. cost = 0.138 Avg. accuracy = 0.976
Epoch: 0018 Avg. cost = 0.140 Avg. accuracy = 0.971
Epoch: 0019 Avg. cost = 0.122 Avg. accuracy = 0.970
Epoch: 0020 

Training is complete, so now, test performance with test data set

 - Preprocess and reshape the test dataset in the same manner

In [17]:
print('processing test data(x)...')
x_in_test = [preprocess_single_doc(x[0], 3) for x in data_test]
print('processing test data(y)...')
y_test = expand_y(label_test)

processing test data(x)...
processing test data(y)...


In [18]:
print('training word2vec..., for test set')
#Now want each word in x_in_train to be represented as vectors!
test_corpus = [nltk.word_tokenize(sentence) for sentence in x_in_test]
wv_model_testset = Word2Vec(size=FLAGS.embedding_dim, min_count=2)
wv_model_testset.build_vocab(test_corpus)
wv_model_testset.train(test_corpus, total_examples=wv_model_testset.corpus_count, epochs=40)

training word2vec..., for test set


(3124965, 3487720)

In [0]:
max_doc_length = FLAGS.max_doc_length
x_test = []
for i, sent in enumerate(x_in_test):
  words = sent.split(' ')
  word_cnt = 0
  sentList = []
  for word in words:
    if word in wv_model_testset.wv.vocab:
      if word_cnt < max_doc_length:
        wordVec = wv_model_testset[word]
        sentList.append(wordVec)
        word_cnt += 1
      else:
        break
    
  if len(sentList) < max_doc_length:
    last_idx = len(sentList)-1
    for j in range(last_idx+1, max_doc_length):
      sentList.append(np.zeros(FLAGS.embedding_dim))
  
  x_test.append(sentList)

In [20]:
test(x_test, y_test, session, cnnModel)

Test set Accuracy :  0.95


Conclusion : An astounding proportion of 0.95 of the test dataset are correctly classified ('조선일보' / '한겨례')