In [5]:
# Colab-only setup: authenticate the current user, then mount Google Drive
# so the dataset and checkpoints under /content/gdrive are reachable.
from google.colab import auth, drive

auth.authenticate_user()
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
!pip install konlpy



In [2]:
%tensorflow_version 1.x
import os
import tensorflow as tf
from konlpy.tag import Okt

import gensim
import numpy as np

TensorFlow 1.x selected.


In [0]:
class Word2Vec():
    """Text-preparation helpers around a pre-trained gensim Word2Vec model.

    Covers tokenizing with KoNLPy's Okt tagger, reading a tab-separated data
    file, turning token lists into embedding vectors, zero-padding a batch to
    a fixed shape and one-hot encoding labels.
    """

    def __init__(self):
        # The Okt tagger is created on the first tokenize() call and reused
        # afterwards instead of being rebuilt for every document.
        self._pos_tagger = None

    def tokenize(self, doc):
        """Tokenize ``doc`` with Okt (normalised and stemmed).

        Returns a list of strings, one per item returned by ``Okt.pos``, with
        the parts of each item joined by '/' (part-of-speech tagged tokens).
        """
        tagger = getattr(self, '_pos_tagger', None)
        if tagger is None:
            tagger = Okt()
            self._pos_tagger = tagger
        return ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)]

    def read_data(self, filename):
        """Read a UTF-8, tab-separated file and return its rows as lists.

        The first line is dropped (presumably a header row).
        """
        with open(filename, 'r', encoding='utf-8') as f:
            data = [line.split('\t') for line in f.read().splitlines()]
        return data[1:]

    def Word2vec_model(self, model_name):
        """Load and return the gensim Word2Vec model stored at ``model_name``."""
        model = gensim.models.word2vec.Word2Vec.load(model_name)
        return model

    def Convert2Vec(self, model_name, doc):
        """Convert a corpus of token lists into lists of embedding vectors.

        Every token found in the model vocabulary is replaced by its trained
        vector; any other token (out of vocabulary) gets a random vector drawn
        uniformly from [-0.25, 0.25) with the same width as the model vectors.

        Args:
            model_name: path of a saved gensim Word2Vec model.
            doc: iterable of sentences, each an iterable of tokens.

        Returns:
            ``np.array`` of the per-sentence vector lists. When the sentences
            differ in length this is a 1-D object array whose elements are the
            per-sentence lists; when all have the same length it is a dense
            array, as ``np.array`` would build it.
        """
        model = gensim.models.word2vec.Word2Vec.load(model_name)
        keyed_vectors = model.wv
        # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases
        # only have `vocab`. Support both.
        vocab = getattr(keyed_vectors, 'key_to_index', None)
        if vocab is None:
            vocab = keyed_vectors.vocab
        # Take the OOV vector width from the model instead of assuming 300.
        vector_size = keyed_vectors.vector_size

        word_vec = []
        for sent in doc:
            sub = []
            for word in sent:
                if word in vocab:
                    sub.append(keyed_vectors[word])
                else:
                    # Out-of-vocabulary word: random uniform vector.
                    sub.append(np.random.uniform(-0.25, 0.25, vector_size))
            word_vec.append(sub)

        lengths = {len(sub) for sub in word_vec}
        if len(lengths) <= 1:
            return np.array(word_vec)

        # Sentences of different lengths: build the object array explicitly,
        # because implicit ragged-array creation is an error on recent NumPy.
        ragged = np.empty(len(word_vec), dtype=object)
        for i, sub in enumerate(word_vec):
            ragged[i] = sub
        return ragged

    def Zero_padding(self, train_batch_X, Batch_size, Maxseq_length, Vector_size):
        """Copy a batch of variable-length sequences into a zero-filled array.

        Args:
            train_batch_X: sequence of per-sentence vector lists/arrays, each
                of shape (steps, features).
            Batch_size: number of rows in the output.
            Maxseq_length: number of time steps in the output.
            Vector_size: number of features in the output.

        Returns:
            Float array of shape (Batch_size, Maxseq_length, Vector_size).
            Sentences longer than ``Maxseq_length`` are truncated, a batch with
            fewer than ``Batch_size`` sentences leaves the remaining rows at
            zero, and empty sentences stay all-zero.
        """
        zero_pad = np.zeros((Batch_size, Maxseq_length, Vector_size))
        rows = min(Batch_size, len(train_batch_X))
        for i in range(rows):
            seq = np.asarray(train_batch_X[i])
            if seq.ndim != 2 or seq.shape[0] == 0:
                # Empty sentence: nothing to copy.
                continue
            steps = min(seq.shape[0], Maxseq_length)
            # The feature axis is deliberately not truncated: a width larger
            # than Vector_size still raises, as it signals a config mismatch.
            zero_pad[i, :steps, :seq.shape[1]] = seq[:steps]

        return zero_pad

    def One_hot(self, data):
        """One-hot encode ``data``.

        Columns follow the sorted order of the distinct values, so the same
        label always maps to the same column regardless of hash ordering or of
        the order in which labels appear. If the values cannot be sorted, the
        order of first appearance is used instead.

        Returns:
            Array with one row per element of ``data`` and one column per
            distinct value.
        """
        try:
            classes = sorted(set(data))
        except TypeError:
            # Values of mixed, unorderable types.
            classes = list(dict.fromkeys(data))
        index_dict = {value: index for index, value in enumerate(classes)}

        result = []
        for value in data:
            one_hot = np.zeros(len(index_dict))
            one_hot[index_dict[value]] = 1
            result.append(one_hot)

        return np.array(result)

In [0]:
class Bi_LSTM():
    """Builds the pieces of a bidirectional LSTM classifier graph (TF 1.x).

    The constructor creates a forward and a backward LSTM cell, each wrapped
    with output dropout, plus the weight matrix ``W`` and bias ``b`` of the
    final fully connected layer.
    """

    def __init__(self, lstm_units, num_class, keep_prob):
        self.lstm_units = lstm_units

        # Forward cell first, then backward, each under its own variable scope.
        self.lstm_fw_cell = self._dropout_lstm_cell('forward', lstm_units, keep_prob)
        self.lstm_bw_cell = self._dropout_lstm_cell('backward', lstm_units, keep_prob)

        # Output layer parameters: the input width is 2 * lstm_units because
        # logits() concatenates the forward and backward final states.
        with tf.variable_scope('Weights', reuse=tf.AUTO_REUSE):
            self.W = tf.get_variable(
                name="W",
                shape=[2 * lstm_units, num_class],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(),
            )
            self.b = tf.get_variable(
                name="b",
                shape=[num_class],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
            )

    @staticmethod
    def _dropout_lstm_cell(scope_name, lstm_units, keep_prob):
        """Create an LSTM cell with output dropout inside ``scope_name``."""
        with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
            cell = tf.nn.rnn_cell.LSTMCell(lstm_units, forget_bias=1.0, state_is_tuple=True)
            return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

    def logits(self, X, W, b, seq_len):
        """Run the bidirectional RNN over ``X`` and return class logits.

        The second component of the forward and backward final states are
        concatenated along axis 1 and passed through the fully connected layer
        ``W``/``b``. The result is unnormalised: no softmax is applied here.
        """
        _, final_states = tf.nn.bidirectional_dynamic_rnn(
            self.lstm_fw_cell,
            self.lstm_bw_cell,
            dtype=tf.float32,
            inputs=X,
            sequence_length=seq_len,
        )
        fw_state, bw_state = final_states
        # Index 1 of each state tuple is the part used as the sentence feature.
        features = tf.concat([fw_state[1], bw_state[1]], axis=1)
        return tf.matmul(features, W) + b

    def model_build(self, logits, labels, learning_rate=0.001):
        """Create the mean softmax cross-entropy loss and an Adam train op.

        Returns:
            ``(loss, optimizer)`` where ``optimizer`` is the op returned by
            ``AdamOptimizer.minimize``.
        """
        with tf.variable_scope("loss"):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels)
            loss = tf.reduce_mean(cross_entropy)
            train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
        return loss, train_op

    def graph_build(self):
        """Create the placeholders and scalar summaries used for TensorBoard.

        Stores the placeholders as ``self.loss`` and ``self.acc`` and returns
        the op produced by ``tf.summary.merge_all()``.
        """
        self.loss = tf.placeholder(tf.float32)
        self.acc = tf.placeholder(tf.float32)
        tf.summary.scalar('Loss', self.loss)
        tf.summary.scalar('Accuracy', self.acc)
        return tf.summary.merge_all()

In [6]:
W2V = Word2Vec()
os.chdir("..")
test_data = W2V.read_data("/content/gdrive/My Drive/Colab Notebooks/GraduationProject/Word2Vec/Movie_rating_data/ratings_test.txt")

# Tokenize every review once (column 1 is the text, column 2 the label) and
# drop the reviews that produce no tokens. This is the slow step of the
# notebook, so each row must not be tokenized more than once.
print("Tokenize Start!\nCould take minutes...")
tokens = []
for row in test_data:
    morphemes = W2V.tokenize(row[1])
    if morphemes != []:
        tokens.append([morphemes, int(row[2])])
# Each row pairs a variable-length token list with an int label, so the array
# has to be an object array; say so explicitly instead of relying on inference.
tokens = np.array(tokens, dtype=object)
print("Tokenize Done!")

Tokenize Start!
Could take minutes...
Tokenize Done!


In [0]:
# Column 0 holds the token lists, column 1 the integer labels.
test_X = tokens[:,0]
test_Y = tokens[:,1]

# Hyper-parameters; they have to match the ones the checkpoint was trained with.
Batch_size = 32
Vector_size = 300
Maxseq_length = 95   ## Max length of training data
learning_rate = 0.001
lstm_units = 128
num_class = 2
keep_prob = 1.0

test_size = len(test_X)
test_batch = test_size // Batch_size  # number of full batches
# Maxseq_length comes from the training set, so a test sentence can be longer.
# Clip the lengths so they never exceed the padded time dimension fed to the RNN.
seq_length = [min(len(x), Maxseq_length) for x in test_X]

In [8]:
# Labels -> one-hot rows.
test_Y_ = W2V.One_hot(test_Y)
# Token lists -> embedding vectors, using the previously trained word2vec model.
test_X_ = W2V.Convert2Vec(
    "/content/gdrive/My Drive/Colab Notebooks/GraduationProject/Word2Vec/Word2vec.model",
    test_X,
)

# Graph inputs: padded embeddings, one-hot labels and the sequence lengths.
X = tf.placeholder(tf.float32, shape=[None, Maxseq_length, Vector_size], name='X')
Y = tf.placeholder(tf.float32, shape=[None, num_class], name='Y')
seq_len = tf.placeholder(tf.int32, shape=[None])

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [11]:
BiLSTM = Bi_LSTM(lstm_units, num_class, keep_prob)

# Build the network, loss and train op under the "loss" scope.
with tf.variable_scope("loss", reuse=tf.AUTO_REUSE):
    logits = BiLSTM.logits(X, BiLSTM.W, BiLSTM.b, seq_len)
    loss, optimizer = BiLSTM.model_build(logits, Y, learning_rate)

# Accuracy: fraction of rows whose arg-max class matches the one-hot label.
prediction = tf.nn.softmax(logits)
correct_pred = tf.equal(tf.argmax(prediction, axis=1), tf.argmax(Y, axis=1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Checkpoint to evaluate, plus the initializer and saver ops.
modelName = "/content/gdrive/My Drive/Colab Notebooks/GraduationProject/Bi_LSTM/BiLSTM_model_Epoch_13.ckpt"
init = tf.global_variables_initializer()
saver = tf.train.Saver()

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [12]:
with tf.Session() as sess:

    sess.run(init)
    # Load the trained variables from the checkpoint on disk.
    saver.restore(sess, modelName)
    print("Model restored")

    # Evaluate every example, including the final partial batch, and weight
    # each batch accuracy by its size so the total is a true per-example mean.
    weighted_acc_sum = 0.0
    num_steps = (test_size + Batch_size - 1) // Batch_size

    for step in range(num_steps):
        start = step * Batch_size
        end = start + Batch_size

        test_batch_Y = test_Y_[start:end]
        batch_seq_length = seq_length[start:end]
        current_size = len(test_batch_Y)  # smaller than Batch_size on the last step
        test_batch_X = W2V.Zero_padding(test_X_[start:end], current_size, Maxseq_length, Vector_size)

        acc = sess.run(accuracy , feed_dict={X: test_batch_X, Y: test_batch_Y, seq_len: batch_seq_length})
        print("step :{} Accuracy : {}".format(step+1,acc))
        weighted_acc_sum += acc * current_size

    # Guard against an empty test set.
    total_acc = weighted_acc_sum / test_size if test_size else 0.0
    print("Total Accuracy : {}".format(total_acc))

INFO:tensorflow:Restoring parameters from /content/gdrive/My Drive/Colab Notebooks/GraduationProject/Bi_LSTM/BiLSTM_model_Epoch_13.ckpt
Model restored
step :1 Accuracy : 0.84375
step :2 Accuracy : 0.8125
step :3 Accuracy : 0.875
step :4 Accuracy : 0.78125
step :5 Accuracy : 0.875
step :6 Accuracy : 0.875
step :7 Accuracy : 0.9375
step :8 Accuracy : 0.875
step :9 Accuracy : 0.84375
step :10 Accuracy : 0.84375
step :11 Accuracy : 0.75
step :12 Accuracy : 0.90625
step :13 Accuracy : 0.90625
step :14 Accuracy : 0.875
step :15 Accuracy : 0.78125
step :16 Accuracy : 0.71875
step :17 Accuracy : 0.78125
step :18 Accuracy : 0.9375
step :19 Accuracy : 0.875
step :20 Accuracy : 0.9375
step :21 Accuracy : 0.75
step :22 Accuracy : 0.8125
step :23 Accuracy : 0.84375
step :24 Accuracy : 0.8125
step :25 Accuracy : 0.875
step :26 Accuracy : 0.78125
step :27 Accuracy : 0.78125
step :28 Accuracy : 0.8125
step :29 Accuracy : 0.96875
step :30 Accuracy : 0.71875
step :31 Accuracy : 0.875
step :32 Accuracy :