<a href="https://colab.research.google.com/github/mesahwi/TextAnlaysis/blob/master/Naver_News_Analysis/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

RNN implementation on analyzing Korean news articles <br>
(In this particular example, we will tell '조선일보' and '한겨례' political articles apart)

Install konlpy

In [1]:
!apt-get install python3-dev; pip3 install konlpy

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python3-dev is already the newest version (3.6.7-1~18.04).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.


Mount Google drive

In [2]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Import necessary packages

In [0]:
import numpy as np
import tensorflow as tf
import gensim
import os
import datetime
import time
import sklearn
import nltk
nltk.download('punkt')

from gensim.models.word2vec import Word2Vec

import glob
import warnings, os
warnings.filterwarnings('ignore')

from konlpy.tag import *

# 아래와 같은 분석기가 있음.
hnm = Hannanum()
kkma = Kkma()
okt = Okt()

base_dir = 'gdrive/Shared drives/텍스트마이닝/News Analysis/'

Import Korean stop words, found in 'https://www.ranks.nl/stopwords/korean'

In [0]:
f = open(base_dir+'stopwords.txt', 'r')
stopwords = f.read()
stopwords = stopwords.split('\n')

Define functions and objects that will be used

In [0]:
# preprocess, single documents, not whole corpus.
# This function uses nouns. For a more sophisticated analysis, using morphs is recommended
def preprocess_single_doc(text, tokenizer_type = 1):
  
  #choose tokenizer
  if tokenizer_type == 1:
    tokenizer = Hannanum()
  elif tokenizer_type == 2:
    tokenizer = Kkma()
  elif tokenizer_type == 3:
    tokenizer = Okt()
    
  #tokenize
  tokens = tokenizer.nouns(text)
  
  #remove short words, but probably should not apply to Korean
  #tokens = [token for token in tokens if len(token) > 1] 

  #stop words
  my_stopwords = ['조선일보', '조선닷컴', '닷컴', 'Chosun', 'Copyrights', '&', '바로가기', '기자', '구독', '메인' 'ⓒ', '배포', '한겨례', '한겨례신문', '▶', '◀', '네이버', '[', ']', 'co', 'kr', 'hani']
  tokens = [token for token in tokens if token not in stopwords and token not in my_stopwords]
  
  #numbers are already left out, from tokenizer.nouns()
  #tokens = [word for word in tokens if not any(char.isdigit() for char in word)]

  preprocessed = ' '.join(tokens)
  return preprocessed

In [0]:
## Customize FLAGS, because I don't find the tf flags easy to use :-(
class Flags():
  def __init__(self):
    self.training_sample_percentage = float(0.7)
    
    self.n_step = int(350)  # how many words there will be per document
    self.n_hidden = int(32)  # number of recurrent steps
    
    self.embedding_dim = int(64)
    self.learning_rate = float(1e-3)
    self.batch_size = int(64)
    self.num_epochs = int(30)
    self.cell_type = int(1)
    
    
FLAGS = Flags()

In [0]:
# Naive implementation of reshaping y : 0,1,2 => [1,0,0], [0,1,0], [0,0,1]
def expand_y(yVec):
  nrow = len(yVec)
  y_unique = np.unique(yVec)
  ncol = len(y_unique)  
  out = np.zeros((nrow, ncol))
  
  for i, y in enumerate(yVec):
    for j, val in enumerate(y_unique):
      if y==val:
        out[i,j] = 1
        
  return np.array(out)

In [0]:
# RNN class. Embedding is done outside an RNN object, and fed in
class RNN(object):
  def __init__(self, learning_rate, embedding_dim, n_step, n_hidden, n_class, cell_type):
    self.input_x = tf.placeholder(tf.float32, [None, n_step, embedding_dim])
    self.input_y = tf.placeholder(tf.float32, [None, n_class])
    
    W = tf.Variable(tf.random_normal([n_hidden, n_class]))
    b = tf.Variable(tf.random_normal([n_class]))
    
    if cell_type==0:
      cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
    elif cell_type==1:
      cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
    elif cell_type==2:
      cell = tf.nn.rnn_cell.GRUCell(n_hidden)
    
    outputs, states = tf.nn.dynamic_rnn(cell, self.input_x, dtype=tf.float32)
    #outputs here has shape [batch_size, n_step, n_hidden]. So, reshpae to [batch_size, n_hidden]
    outputs = tf.transpose(outputs, [1,0,2])
    outputs = outputs[-1]
    
    self.model = tf.matmul(outputs, W) + b
    
    
    with tf.name_scope('output'):
      self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.model, labels=self.input_y))
      self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)

      is_correct = tf.equal(tf.argmax(self.model, 1), tf.argmax(self.input_y, 1))
      self.accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))


In [0]:
# Training procedure
def train(x_train, y_train, num_epochs):
  
  print('Training RNN, with parameters \nnum_epochs:{:d}, \nlearning_rate:{:f}, \
  \nembedding_dim:{:d}, \nn_step:{:d}, \nn_hidden:{:d}, \ncell_type:{:d}, \nbatch_size:{:d}\n\n'.format(\
        FLAGS.num_epochs, FLAGS.learning_rate, FLAGS.embedding_dim, FLAGS.n_step, FLAGS.n_hidden, FLAGS.cell_type, FLAGS.batch_size))
  
  with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
      rnn = RNN(learning_rate=FLAGS.learning_rate,
                embedding_dim=FLAGS.embedding_dim,
                n_step=FLAGS.n_step,
                n_hidden=FLAGS.n_hidden,
                n_class=y_train.shape[1], 
                cell_type=FLAGS.cell_type)
      
      sess.run(tf.global_variables_initializer())
      
      data_size = len(y_train)
      num_batches_per_epoch = int((len(y_train) - 1) / FLAGS.batch_size) + 1
      
      for epoch in range(num_epochs):
        total_cost = 0
        total_accr = 0
        
        shuffled_indeces = np.random.permutation(np.arange(len(y_train)))
        shuffled_x = [x_train[i] for i in shuffled_indeces]
        shuffled_y = y_train[shuffled_indeces]
        
        for batch_num in range(num_batches_per_epoch):
          start_idx = batch_num * FLAGS.batch_size
          end_idx = min((batch_num + 1) * FLAGS.batch_size, data_size)
          batch_xs = shuffled_x[start_idx:end_idx]
          batch_ys = shuffled_y[start_idx:end_idx]

          feed_dict = {rnn.input_x:batch_xs, rnn.input_y:batch_ys}

          _, cost_val, accuracy = sess.run([rnn.optimizer, rnn.cost, rnn.accuracy], feed_dict)
          total_cost += cost_val
          total_accr += accuracy

        print('Epoch:', '%04d' % (epoch + 1), 'Avg. cost =', '{:.3f}'.format(total_cost / num_batches_per_epoch), 'Avg. accuracy =','{:.3f}'.format(total_accr / num_batches_per_epoch))

      print('Training Complete!')


    return sess, rnn

In [0]:
# Testing procedure
def test(x_test, y_test, sess, rnn):
  feed_dict = {rnn.input_x : x_test, rnn.input_y : y_test}
  print('Test set Accuracy : ', sess.run(rnn.accuracy, feed_dict=feed_dict))

Now that we have defined the classes and functions, let's get it rolling<br>
Say we want to see if our model can tell the political articles apart, between '조선일보' and '한겨례'

First, we import data (500 per class in this example) and do some preprocessing. 

 - Prepare data

In [11]:
# prepare corpus  

nrow_per_type = 500
ntype = 2

data, label = np.empty([nrow_per_type * ntype,1], dtype=object), np.zeros((nrow_per_type * ntype, 1))

files_0 = glob.glob(base_dir + 'Chosun/politics/*.txt')
files_1 = glob.glob(base_dir + 'Han/politics/*.txt')

files_total = np.append(files_0[:nrow_per_type], files_1[:nrow_per_type])

print('reading data...')
for i, name in enumerate(files_total):
  with open(name,'r') as handle:
    data[i,0] = handle.read()
    label[i,0] = int(i/nrow_per_type)

reading data...


 - Shuffle and split the prepared data

In [0]:
# shuffle and split into train/test set

train_percentage = FLAGS.training_sample_percentage

data_size = len(label)
shuffled_indeces = np.random.permutation(np.arange(data_size))
shuffled_data = data[shuffled_indeces]
shuffled_label = label[shuffled_indeces]

split_idx = int(data_size*train_percentage)

data_train = shuffled_data[:split_idx]
data_test = shuffled_data[split_idx:]
label_train = shuffled_label[:split_idx]
label_test = shuffled_label[split_idx:]

del shuffled_data, shuffled_label, data, label

 - Preprocess the training data

In [13]:
print('processing training data(x)...')
x_in_train = [preprocess_single_doc(x[0], 3) for x in data_train]
print('processing training data(y)...')
y_train = expand_y(label_train)

processing training data(x)...
processing training data(y)...


 - Build a word2vec model from our training data

In [14]:
print('training word2vec..., for training set')
#Now want each word in x_in_train to be represented as vectors!
train_corpus = [nltk.word_tokenize(sentence) for sentence in x_in_train]
wv_model_trainset = Word2Vec(size=FLAGS.embedding_dim, min_count=2)
wv_model_trainset.build_vocab(train_corpus)
wv_model_trainset.train(train_corpus, total_examples=wv_model_trainset.corpus_count, epochs=40)

training word2vec..., for training set


(7321497, 8028760)

 - The length of each input must be fixed. If longer than 'n_step', crop off the rest. If shorter, then add padding

In [0]:
n_step = FLAGS.n_step
x_train = []
for i, sent in enumerate(x_in_train):
  words = sent.split(' ')
  word_cnt = 0
  sentList = []
  for word in words:
    if word in wv_model_trainset.wv.vocab:
      if word_cnt < n_step:
        wordVec = wv_model_trainset[word]
        sentList.append(wordVec)
        word_cnt += 1
      else:
        break
    
  if len(sentList) < n_step:
    last_idx = len(sentList)-1
    for j in range(last_idx+1, n_step):
      sentList.append(np.zeros(FLAGS.embedding_dim))
  
  x_train.append(sentList)

Train RNN model

In [16]:
session, rnnModel = train(x_train, y_train, FLAGS.num_epochs)

W0712 08:42:09.367014 140449591834496 deprecation.py:323] From <ipython-input-8-8ae6d72a753e>:12: BasicLSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W0712 08:42:09.369862 140449591834496 deprecation.py:323] From <ipython-input-8-8ae6d72a753e>:16: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
W0712 08:42:09.471462 140449591834496 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument in

Training RNN, with parameters 
num_epochs:30, 
learning_rate:0.001000,   
embedding_dim:64, 
n_step:350, 
n_hidden:32, 
cell_type:1, 
batch_size:64


Epoch: 0001 Avg. cost = 1.083 Avg. accuracy = 0.460
Epoch: 0002 Avg. cost = 0.807 Avg. accuracy = 0.503
Epoch: 0003 Avg. cost = 0.614 Avg. accuracy = 0.669
Epoch: 0004 Avg. cost = 0.562 Avg. accuracy = 0.708
Epoch: 0005 Avg. cost = 0.521 Avg. accuracy = 0.729
Epoch: 0006 Avg. cost = 0.471 Avg. accuracy = 0.765
Epoch: 0007 Avg. cost = 0.283 Avg. accuracy = 0.917
Epoch: 0008 Avg. cost = 0.045 Avg. accuracy = 0.990
Epoch: 0009 Avg. cost = 0.041 Avg. accuracy = 0.991
Epoch: 0010 Avg. cost = 0.025 Avg. accuracy = 0.999
Epoch: 0011 Avg. cost = 0.020 Avg. accuracy = 1.000
Epoch: 0012 Avg. cost = 0.017 Avg. accuracy = 1.000
Epoch: 0013 Avg. cost = 0.014 Avg. accuracy = 1.000
Epoch: 0014 Avg. cost = 0.012 Avg. accuracy = 1.000
Epoch: 0015 Avg. cost = 0.010 Avg. accuracy = 1.000
Epoch: 0016 Avg. cost = 0.009 Avg. accuracy = 1.000
Epoch: 0017 Avg. c

Training is complete, so we test the performance with our test set

 - Preprocess and reshpae test data as we did with the training set

In [17]:
print('processing test data(x)...')
x_in_test = [preprocess_single_doc(x[0], 3) for x in data_test]
print('processing test data(y)...')
y_test = expand_y(label_test)

processing test data(x)...
processing test data(y)...


In [18]:
print('training word2vec..., for test set')
#Now want each word in x_in_train to be represented as vectors!
test_corpus = [nltk.word_tokenize(sentence) for sentence in x_in_test]
wv_model_testset = Word2Vec(size=FLAGS.embedding_dim, min_count=2)
wv_model_testset.build_vocab(test_corpus)
wv_model_testset.train(test_corpus, total_examples=wv_model_testset.corpus_count, epochs=40)

training word2vec..., for test set


(3031714, 3379280)

In [0]:
n_step = FLAGS.n_step
x_test = []
for i, sent in enumerate(x_in_test):
  words = sent.split(' ')
  word_cnt = 0
  sentList = []
  for word in words:
    if word in wv_model_testset.wv.vocab:
      if word_cnt < n_step:
        wordVec = wv_model_testset[word]
        sentList.append(wordVec)
        word_cnt += 1
      else:
        break
    
  if len(sentList) < n_step:
    last_idx = len(sentList)-1
    for j in range(last_idx+1, n_step):
      sentList.append(np.zeros(FLAGS.embedding_dim))
  
  x_test.append(sentList)

Almost all done! Test the model now

In [20]:
test(x_test, y_test, session, rnnModel)

Test set Accuracy :  0.77666664


Using RNN with the above parameters (specified in the flag), 0.7767 of the test set were correctly classified 