In [1]:
import tensorflow as tf
import numpy as np
import pickle
tf.set_random_seed(10)

In [None]:
# 초성 리스트. 00 ~ 18 --> 19개
CHOSUNG_LIST = ['ᄀ', 'ᄁ', 'ᄂ', 'ᄃ', 'ᄄ', 'ᄅ', 'ᄆ', 'ᄇ', 'ᄈ', 'ᄉ', 'ᄊ', 'ᄋ', 'ᄌ', 'ᄍ', 'ᄎ', 'ᄏ', 'ᄐ', 'ᄑ', 'ᄒ']

# 중성 리스트. 00 ~ 20 --> 21개
JUNGSUNG_LIST = ['ᅡ', 'ᅢ', 'ᅣ', 'ᅤ', 'ᅥ', 'ᅦ', 'ᅧ', 'ᅨ', 'ᅩ', 'ᅪ', 'ᅫ', 'ᅬ', 'ᅭ', 'ᅮ', 'ᅯ', 'ᅰ', 'ᅱ', 'ᅲ', 'ᅳ', 'ᅴ',
                 'ᅵ']

# 종성 리스트. 00 ~ 27 + 1(1개는 종성없음코드) --> 28개
JONGSUNG_LIST = [' ', 'ᆨ', 'ᆩ', 'ᆪ', 'ᆫ', 'ᆬ', 'ᆭ', 'ᆮ', 'ᆯ', 'ᆰ', 'ᆱ', 'ᆲ', 'ᆳ', 'ᆴ', 'ᆵ', 'ᆶ', 'ᆷ', 'ᆸ', 'ᆹ', 'ᆺ',
                 'ᆻ', 'ᆼ', 'ᆽ', 'ᆾ', 'ᆿ', 'ᇀ', 'ᇁ', 'ᇂ']

# 독립 자소 리스트. --> 51개
INDI_LIST = ['ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅄ', 'ㅅ',
             'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ', 'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ',
             'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']

hangul_johab = range(44032,55204)
hangul_jaeum = range(12593,12623)
hangul_moeum = range(12623,12644)
hangul_chosung = range(4352,4371)
hangul_jungsung = range(4449,4470)
hangul_jongsung = range(4520,4547)
english1 = range(65,91)
english2 = range(97,123)
digit = range(48,58)
special_char = [ord('.'), ord('\''), ord('?'), ord(','), ord('!'), ord('%')] # 형태소 분석에 필요하다고 생각하는 특수문자 추가

def syllable(char):
    s = ord(char) - 44032
    cho = (s//21)//28
    jung = (s%(21*28))//28
    jong = (s%28)
    
    return CHOSUNG_LIST[cho], JUNGSUNG_LIST[jung], JONGSUNG_LIST[jong]

def read_data(file_path):
    sentence = []
    data = []
    label = []
    d_append = data.append
    with open(file_path,"r",encoding='utf-8') as f:
        for line in f.readlines():
            if line != '\n':
                w = line.split('\t')
                label.append(w[1].replace('\n',''))
                word = []
                w_append = word.append
                w_extend = word.extend
                for c in w[0]:
                    sign_unk = 0
                    
                    if ord(c) in hangul_johab or ord(c) in hangul_chosung or \
                       ord(c) in hangul_jungsung or ord(c) in hangul_jongsung or \
                       ord(c) in hangul_jaeum or ord(c) in hangul_moeum or \
                       ord(c) in english1 or ord(c) in english2 or \
                       ord(c) in digit or ord(c) in special_char: pass
                    else: sign_unk = 1 # 지정된 한글, 영어, 숫자, 특수문자 이외에 전부 UNK태그 지정

                    if sign_unk == 1:
                        w_append('<UNK>')
                    else:
                        if ord(c) in hangul_johab: # 조합형 한글은 자모를 분리
                            jaso = syllable(c)
                            w_extend(jaso)
                        else:
                            w_append(c) # 한글자모, 영어, 숫자는 그대로
                sentence.append((word,w[1].replace('\n',''))) # ([분리된 형태소],태그) 형태로 저장
            else:
                if sentence != []:
                    d_append(sentence) # sentence마다 구분지어서 저장
                    sentence = []
    return data,label

def make_dictionary(label):
    dictionary_char = dict()
    dictionary_label= dict()
    
    char_list = ['<PAD>','<UNK>']+CHOSUNG_LIST+JUNGSUNG_LIST+JONGSUNG_LIST+INDI_LIST+[chr(i) for i in english1]\
               + [chr(i) for i in english2] + [chr(i) for i in digit] + [chr(i) for i in special_char]

    for i in char_list:
        dictionary_char[i] = len(dictionary_char)
   
    label = sorted(list(set(label)))
    for i in label:
        dictionary_label[i] = len(dictionary_label)
    return dictionary_char, dictionary_label

# 데이터를 index로 치환
def make_dataSet(data, dictionary_char, dictionary_label):
    indexed_data = [] 
    d_append = indexed_data.append
    for sentence in data:
        sen = []
        s_append = sen.append
        for word in sentence:
            s_append(([dictionary_char[char] for char in word[0]], dictionary_label[word[1]]))
        d_append(sen)
    
    return indexed_data

data, label = read_data("data_v5_edit.txt") # data는 3차원 리스트로 
                                            # 전체 데이터 -> 문장 -> 형태소 순으로 저장됨
dictionary_char, dictionary_label = make_dictionary(label)
indexed_data = make_dataSet(data, dictionary_char, dictionary_label)

pickle.dump(indexed_data,open('indexed_data.pkl','wb'))
pickle.dump(dictionary_char,open('dictionary_char.pkl','wb'))
pickle.dump(dictionary_label,open('dictionary_label.pkl','wb'))

In [2]:
dictionary_char = pickle.load(open('dictionary_char.pkl', 'rb'))
dictionary_label = pickle.load(open('dictionary_label.pkl', 'rb'))
indexed_data = pickle.load(open('indexed_data.pkl', 'rb'))

In [3]:
dic_char_len = len(dictionary_char)
dic_label_len = len(dictionary_label)

word_max_len = 0
for sentence in indexed_data:
    for word in sentence:
        if word_max_len < len(word[0]): 
            word_max_len = len(word[0])

window_size = 7*2+1

In [4]:
# character composition model
# CNN
filter_num = 25
X = tf.placeholder(tf.float32, [None, window_size, word_max_len, dic_char_len, 1])
Y = tf.placeholder(tf.float32, [None, dic_label_len])
dropout_rate = tf.placeholder(tf.float32)

# filter 3
conv_layer1 = tf.layers.conv3d(X, filters=filter_num, kernel_size=[1, 3, dic_char_len], activation=tf.nn.relu, kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer1 = tf.layers.max_pooling3d(conv_layer1, pool_size=[1, word_max_len-3+1, 1], strides=1)
conv_layer1 = tf.layers.dropout(conv_layer1,rate = dropout_rate)
# filter 5
conv_layer2 = tf.layers.conv3d(X, filters=filter_num, kernel_size=[1, 5, dic_char_len], activation=tf.nn.relu, kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer2 = tf.layers.max_pooling3d(conv_layer2, pool_size=[1, word_max_len-5+1, 1], strides=1)
conv_layer2 = tf.layers.dropout(conv_layer2,rate = dropout_rate)
# filter 7
conv_layer3 = tf.layers.conv3d(X, filters=filter_num, kernel_size=[1, 7, dic_char_len], activation=tf.nn.relu, kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer3 = tf.layers.max_pooling3d(conv_layer3, pool_size=[1, word_max_len-7+1, 1], strides=1)
conv_layer3 = tf.layers.dropout(conv_layer3,rate = dropout_rate)
# filter 9
conv_layer4 = tf.layers.conv3d(X, filters=filter_num, kernel_size=[1, 9, dic_char_len], activation=tf.nn.relu, kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer4 = tf.layers.max_pooling3d(conv_layer4, pool_size=[1, word_max_len-9+1, 1], strides=1)
conv_layer4 = tf.layers.dropout(conv_layer4,rate = dropout_rate)
X

<tf.Tensor 'Placeholder:0' shape=(?, 15, 48, 189, 1) dtype=float32>

In [5]:
# character composition
char_conv_concat = tf.concat([conv_layer4, conv_layer3, conv_layer2, conv_layer1], 4)
char_conv_concat = tf.reshape(char_conv_concat,[-1, window_size, filter_num*4])
char_conv_concat

<tf.Tensor 'Reshape:0' shape=(?, 15, 100) dtype=float32>

In [6]:
# context encoding model
# binary features, position embedding
batch=100
bin_fea = [[[0],[0],[0],[0],[0],[0],[0],[1],[0],[0],[0],[0],[0],[0],[0]]]*batch
binary_features = tf.constant(bin_fea, dtype=tf.float32)

pos_emb = [[[1,0,0,1],
            [1,0,1,0],
            [1,0,1,1],
            [1,1,0,0],
            [1,1,0,1],
            [1,1,1,0],
            [1,1,1,1],
            [0,0,0,0],
            [0,0,0,1],
            [0,0,1,0],
            [0,0,1,1],
            [0,1,0,0],
            [0,1,0,1],
            [0,1,1,0],
            [0,1,1,1]]]*batch
position_embedding = tf.constant(pos_emb, dtype=tf.float32)

In [7]:
# context window
window = tf.concat([char_conv_concat, position_embedding, binary_features],2)
window = tf.expand_dims(window, -1)
window

<tf.Tensor 'ExpandDims:0' shape=(100, 15, 105, 1) dtype=float32>

In [8]:
# CNN
filter_num = 128

# filter 2
conv_layer1 = tf.layers.conv2d(window, filters=filter_num, kernel_size=[2, 105], activation=tf.nn.relu,kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer1 = tf.layers.dropout(conv_layer1,rate = dropout_rate)
conv_layer1 = tf.layers.conv2d(conv_layer1, filters=filter_num, kernel_size=[2, 1], activation=tf.nn.relu,kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer1 = tf.layers.max_pooling2d(conv_layer1, pool_size=[13, 1], strides=1)
conv_layer1 = tf.layers.dropout(conv_layer1,rate = dropout_rate)
# filter 3
conv_layer2 = tf.layers.conv2d(window, filters=filter_num, kernel_size=[3, 105], activation=tf.nn.relu,kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer2 = tf.layers.dropout(conv_layer2,rate = dropout_rate)
conv_layer2 = tf.layers.conv2d(conv_layer2, filters=filter_num, kernel_size=[3, 1], activation=tf.nn.relu,kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer2 = tf.layers.max_pooling2d(conv_layer2, pool_size=[11, 1], strides=1)
conv_layer2 = tf.layers.dropout(conv_layer2,rate = dropout_rate)
# filter 4
conv_layer3 = tf.layers.conv2d(window, filters=filter_num, kernel_size=[4, 105], activation=tf.nn.relu,kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer3 = tf.layers.dropout(conv_layer3,rate = dropout_rate)
conv_layer3 = tf.layers.conv2d(conv_layer3, filters=filter_num, kernel_size=[4, 1], activation=tf.nn.relu,kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer3 = tf.layers.max_pooling2d(conv_layer3, pool_size=[9, 1], strides=1)
conv_layer3 = tf.layers.dropout(conv_layer3,rate = dropout_rate)
# filter 5
conv_layer4 = tf.layers.conv2d(window, filters=filter_num, kernel_size=[5, 105], activation=tf.nn.relu,kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer4 = tf.layers.dropout(conv_layer4,rate = dropout_rate)
conv_layer4 = tf.layers.conv2d(conv_layer4, filters=filter_num, kernel_size=[5, 1], activation=tf.nn.relu,kernel_initializer=tf.contrib.layers.xavier_initializer())
conv_layer4 = tf.layers.max_pooling2d(conv_layer4, pool_size=[7, 1], strides=1)
conv_layer4 = tf.layers.dropout(conv_layer4,rate = dropout_rate)

In [9]:
context = tf.concat([conv_layer4, conv_layer3, conv_layer2, conv_layer1], 3)
context = tf.reshape(context, [-1,filter_num*4])
context

<tf.Tensor 'Reshape_1:0' shape=(100, 512) dtype=float32>

In [10]:
# Fully Connected Layer
W = tf.Variable(tf.truncated_normal([512, dic_label_len], stddev = 0.01))
b = tf.Variable(tf.zeros([dic_label_len]))

logit = tf.add(tf.matmul(context,W),b)
Y_ = tf.nn.softmax(logit)
Y_

<tf.Tensor 'Softmax:0' shape=(100, 41) dtype=float32>

In [11]:
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = Y, logits = logit))
prediction = tf.equal(tf.argmax(Y_,1),tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(prediction,tf.float32))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cross_entropy)

In [12]:
def make_input_data(indexed_data):
    for sentence in indexed_data:
        for w_index in range(len(sentence)):
            X = np.zeros([15,48,189],dtype=np.float32)
            Y = np.zeros([1,41], dtype=np.float32)
            word_index = 5 - w_index
            for word in sentence:
                word_index = word_index + 1
                if word_index >= 15 or word_index < 0: 
                    continue
                if word_index == 6: 
                    Y[0][word[1]] = 1
                if len(word[0]) % 2 == 0:
                    char_index = 48//2 - len(word[0]) // 2
                else:
                    char_index = 48//2 - (len(word[0]) // 2 + 1)
                
                for i, char in enumerate(word[0]):
                    X[word_index][int(i+char_index)][char] = 1
            X = np.reshape(X, [1, 15, 48, 189, 1])
            
            yield X, Y
            
def indexed_data_size(indexed_data):
    size = 0
    for sentence in indexed_data:
        size += len(sentence)
    return size

def make_batch_input(indexed_data, batch):
    indexed_data = indexed_data[:3000]
    total_size = indexed_data_size(indexed_data)
    sentence_input = make_input_data(indexed_data)
    for i in range(total_size//batch):
        X_input = []
        Y_input = []
        for j in range(batch):
            X, Y =next(sentence_input)
            X_input.append(X)
            Y_input.append(Y)
        X_input = np.concatenate(X_input, 0)
        Y_input = np.concatenate(Y_input, 0)
        yield X_input, Y_input

In [None]:
# trainging
batch = 100
train_size = indexed_data_size(indexed_data[:3000])

with tf.Session() as sess:
    tf.global_variables_initializer().run()
    step=100
    input_batch = make_batch_input(indexed_data, batch)
    a_batch = []
    c_batch = []

    for i in range(train_size//batch):
        X_input, Y_input = next(input_batch)
        a, c, _ = sess.run([accuracy, cross_entropy, optimizer],feed_dict={X:X_input, Y:Y_input, dropout_rate:0.5})
        a_batch.append(a)
        c_batch.append(c)
        if i % step == 0 and i != 0:
            print(i, ' : ', 'accuracy : {:0.4f}   loss : {:0.4f}'.format(sum(a_batch)/step, sum(c_batch)/step))
            a_batch = []
            c_batch = []

100  :  accuracy : 0.2968   loss : 2.8725
200  :  accuracy : 0.4394   loss : 2.0030
300  :  accuracy : 0.6662   loss : 1.1722
400  :  accuracy : 0.8066   loss : 0.6808
500  :  accuracy : 0.8475   loss : 0.5308
600  :  accuracy : 0.8886   loss : 0.3895
700  :  accuracy : 0.9086   loss : 0.3154
800  :  accuracy : 0.9210   loss : 0.2720
900  :  accuracy : 0.9213   loss : 0.2559
1000  :  accuracy : 0.9260   loss : 0.2414
1100  :  accuracy : 0.9385   loss : 0.2127
1200  :  accuracy : 0.9478   loss : 0.1784
1300  :  accuracy : 0.9193   loss : 0.2448
1400  :  accuracy : 0.9207   loss : 0.2869
1500  :  accuracy : 0.8725   loss : 0.4128
1600  :  accuracy : 0.9170   loss : 0.2749
1700  :  accuracy : 0.9336   loss : 0.2169
1800  :  accuracy : 0.9384   loss : 0.1979
1900  :  accuracy : 0.9457   loss : 0.1814
2000  :  accuracy : 0.9438   loss : 0.1905
2100  :  accuracy : 0.9365   loss : 0.2265
2200  :  accuracy : 0.9270   loss : 0.2466
2300  :  accuracy : 0.9390   loss : 0.2020
2400  :  accuracy : 