In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np 
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

df = pd.read_csv('https://raw.githubusercontent.com/HyeonhoonLee/Infra_Meditact_Meditact/master/data/preprocessed_clean20571.csv')
df

Unnamed: 0,nouns,label
0,이어폰 음량 귀,6
1,독감 예방주사,2
2,목 음식물 계속 물감,6
3,케겔 운동,10
4,등 통증 문의,11
...,...,...
20566,허리 통증 다리 저림,11
20567,척추 분리 증 글,11
20568,뼈 통증,17
20569,무릎 연골 수술 운동 방법,10


In [3]:
df['nounlist'] = df['nouns'].str.split()
df.dropna(inplace=True)
df

Unnamed: 0,nouns,label,nounlist
0,이어폰 음량 귀,6,"[이어폰, 음량, 귀]"
1,독감 예방주사,2,"[독감, 예방주사]"
2,목 음식물 계속 물감,6,"[목, 음식물, 계속, 물감]"
3,케겔 운동,10,"[케겔, 운동]"
4,등 통증 문의,11,"[등, 통증, 문의]"
...,...,...,...
20566,허리 통증 다리 저림,11,"[허리, 통증, 다리, 저림]"
20567,척추 분리 증 글,11,"[척추, 분리, 증, 글]"
20568,뼈 통증,17,"[뼈, 통증]"
20569,무릎 연골 수술 운동 방법,10,"[무릎, 연골, 수술, 운동, 방법]"


In [4]:
from sklearn.model_selection import train_test_split
x_train_, x_test_, y_train, y_test = train_test_split(
    df['nounlist'], df['label'], 
    test_size=0.2, random_state=1234, 
    stratify=df['label']
    )

x_train_.shape, x_test_.shape, y_train.shape, y_test.shape

((16336,), (4084,), (16336,), (4084,))

In [5]:
x_train_.head(10)

2760                        [안압]
12483                   [무대, 공포]
6704           [뇌출혈, 인지, 기능, 저하]
5829                    [요산, 수치]
4666             [대변, 볼때, 피, 가득]
17675    [냄새, 못, 얼굴, 경련, 증상, 검사]
20076                [추간판탈출증, 관]
4211                      [대상포진]
9775               [중학생, 식탐, 심해]
147                   [장, 위장, 것]
Name: nounlist, dtype: object

In [6]:
vocab_size = 8000
t = Tokenizer(num_words=vocab_size)
t.fit_on_texts(df['nounlist'])

t.word_index

{'통증': 1,
 '증상': 2,
 '무릎': 3,
 '허리': 4,
 '약': 5,
 '목': 6,
 '운동': 7,
 '치료': 8,
 '복용': 9,
 '수술': 10,
 '잠': 11,
 '왼쪽': 12,
 '우울증': 13,
 '머리': 14,
 '디스크': 15,
 '다리': 16,
 '어깨': 17,
 '가슴': 18,
 '스트레스': 19,
 '두통': 20,
 '검사': 21,
 '오른쪽': 22,
 '질문': 23,
 '증': 24,
 '골절': 25,
 '근육': 26,
 '수': 27,
 '방법': 28,
 '심장': 29,
 '발목': 30,
 '관절': 31,
 '부분': 32,
 '인대': 33,
 '몸': 34,
 '등': 35,
 '정신과': 36,
 '신경': 37,
 '불안': 38,
 '소리': 39,
 '뼈': 40,
 '병원': 41,
 '손가락': 42,
 '파열': 43,
 '불면증': 44,
 '수면': 45,
 '척추': 46,
 '문의': 47,
 '이상': 48,
 '공황장애': 49,
 '상담': 50,
 '주사': 51,
 '힘': 52,
 '요': 53,
 '발': 54,
 '술': 55,
 '손': 56,
 '걸': 57,
 '저림': 58,
 '염': 59,
 '연골': 60,
 '사람': 61,
 '장애': 62,
 '배': 63,
 '강박증': 64,
 '팔': 65,
 '눈': 66,
 '수치': 67,
 '증후군': 68,
 '부작용': 69,
 '건강': 70,
 '생각': 71,
 '진단': 72,
 '위': 73,
 '종아리': 74,
 '이유': 75,
 '감': 76,
 '번': 77,
 '걱정': 78,
 '호흡': 79,
 '과': 80,
 '살': 81,
 '골반': 82,
 '밤': 83,
 '정신': 84,
 '임신': 85,
 '아침': 86,
 '손목': 87,
 '생리': 88,
 '중': 89,
 '대한': 90,
 '허벅지': 91,
 '숨': 92,
 '개월': 9

In [7]:
x_train = t.texts_to_sequences(x_train_)
x_train

[[2033],
 [1889, 563],
 [380, 238, 217, 201],
 [945, 67],
 [423, 1298, 108, 4583],
 [365, 150, 157, 230, 2, 21],
 [471, 135],
 [720],
 [2030, 2200, 146],
 [575, 425, 446],
 [86, 538, 778],
 [297, 524, 358],
 [34],
 [248, 61, 4, 613],
 [54, 1157, 1884],
 [49, 8, 47],
 [37, 19, 589],
 [207],
 [18],
 [1225, 2],
 [384, 46, 59, 50],
 [49, 434, 752],
 [3371, 3372, 779, 9, 23],
 [1344],
 [1576, 1577, 2957, 2958, 704, 8, 89, 4006],
 [1199, 2883, 67],
 [109, 4557, 4558],
 [4232, 674],
 [13, 90],
 [516, 32, 5647, 2338, 829],
 [771, 72],
 [215, 17, 40, 1652],
 [3696, 58, 2],
 [19, 64, 149, 90],
 [475, 273],
 [66, 2875, 553, 89, 1293, 97, 1548, 76],
 [33, 323, 260],
 [5, 166, 1031],
 [608, 1],
 [88, 266, 2979, 1780, 463, 1569, 163],
 [144, 48, 2060],
 [12, 388, 17],
 [6284, 4, 7, 4, 39],
 [2860, 144, 37, 133],
 [18],
 [941, 3046, 108, 21, 125],
 [151, 1974, 88, 1234, 1543, 1286, 21, 991, 370],
 [29, 1231, 188, 97],
 [13, 908, 5784, 5785, 132],
 [996, 132],
 [215, 73, 1587, 1441, 1320, 984, 583],
 

In [8]:
sequence_length = 10
trunc_type = 'post'
padding_type = 'post'
padded_x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, truncating=trunc_type, padding=padding_type, maxlen=sequence_length)
padded_x_train.shape

(16336, 10)

In [9]:
x_test = t.texts_to_sequences(x_test_)
padded_x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, truncating=trunc_type, padding=padding_type, maxlen=sequence_length)
padded_x_test

array([[ 275, 4955,    0, ...,    0,    0,    0],
       [  30,  361,   10, ...,    0,    0,    0],
       [ 160,  524,   32, ...,    0,    0,    0],
       ...,
       [  16,    0,    0, ...,    0,    0,    0],
       [1796,    3,    0, ...,    0,    0,    0],
       [  28,    0,    0, ...,    0,    0,    0]], dtype=int32)

In [10]:
from sklearn.utils import class_weight
weight = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
weight = {i : weight[i] for i in range(26)}

In [11]:
np.unique(y_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25])

In [12]:
y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)
y_train.shape

(16336, 26)

In [13]:
class BahdanauAttention(tf.keras.Model):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = Dense(units)
    self.W2 = Dense(units)
    self.V = Dense(1)

  def call(self, values, query): # 단, key와 value는 같음
    # query shape == (batch_size, hidden size)
    # hidden_with_time_axis shape == (batch_size, 1, hidden size)
    # score 계산을 위해 뒤에서 할 덧셈을 위해서 차원을 변경해줍니다.
    hidden_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [14]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, Dropout, Concatenate

embedding_dim = 1024

## LSTM model
# model = keras.models.Sequential([
#     Embedding(vocab_size, embedding_dim),
#     #Bidirectional(LSTM(1024, return_sequences=True)),
#     Bidirectional(LSTM(512)),
#     #Dense(256, activation='relu'),
#     #Dropout(0.5),
#     Dense(256, activation='relu'),
#     Dropout(0.25),
#     Dense(64, activation='relu'),
#     Dropout(0.25),
#     Dense(26, activation='softmax')
# ])


# LSTM + Attention model
input = Input(shape=(sequence_length,))
embed = Embedding(vocab_size, embedding_dim, input_length=sequence_length, )(input)
lstm = Bidirectional(LSTM(512, dropout=0.25, return_sequences = True))(embed)
lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional \
  (LSTM(512, dropout=0.25, return_sequences=True, return_state=True))(lstm)
state_h = Concatenate()([forward_h, backward_h]) # 은닉 상태
state_c = Concatenate()([forward_c, backward_c]) # 셀 상태

## add attention layers
attention = BahdanauAttention(256) # 가중치 크기 정의
context_vector, attention_weights = attention(lstm, state_h)

x = Dense(256, activation='relu')(context_vector)
x = Dropout(0.25)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.25)(x)
output = Dense(26, activation='softmax')(x)

model = tf.keras.Model(input, output)


import datetime
dt = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/my_board/" + dt
tc = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = keras.callbacks.ModelCheckpoint('/content/drive/My Drive/Colab Notebooks/Meditact/model/lstm_20571_' + dt + '.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 10, 1024)     8192000     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 10, 1024)     6295552     embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) [(None, 10, 1024), ( 6295552     bidirectional[0][0]              
_______________________________________________________________________________________

In [15]:
history = model.fit(padded_x_train, y_train,
                    batch_size=128, epochs=30, 
                    callbacks=[mc, tc],
                    #class_weight=weight,
                    validation_data=(padded_x_test, y_test))

Epoch 1/30
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 00001: val_accuracy improved from -inf to 0.52057, saving model to /content/drive/My Drive/Colab Notebooks/Meditact/model/lstm_20571_20201020-045024.h5
Epoch 2/30
Epoch 00002: val_accuracy improved from 0.52057 to 0.63418, saving model to /content/drive/My Drive/Colab Notebooks/Meditact/model/lstm_20571_20201020-045024.h5
Epoch 3/30
Epoch 00003: val_accuracy improved from 0.63418 to 0.68487, saving model to /content/drive/My Drive/Colab Notebooks/Meditact/model/lstm_20571_20201020-045024.h5
Epoch 4/30
Epoch 00004: val_accuracy improved from 0.68487 to 0.70299, saving model to /content/drive/My Drive/Colab Notebooks/Meditact/model/lstm_20571_20201020-045024.h5
Epoch 5/30
Epoch 00005: val_accuracy improved from 0.70299 to 0.71131, saving model to /content/drive/My Drive/Colab Notebooks/Meditact/model/lstm_20571_20201020-045024.h5
Epoch 6/30
Epoch 00006: val_accuracy improved from 0.71131 to 0.71303, 

KeyboardInterrupt: ignored

In [None]:
from keras.models import load_model
model = load_model('/content/drive/My Drive/Colab Notebooks/Meditact/model/lstm_20571_20201016-132514.h5')



In [16]:
class_to_label = {'DERM': 0, 'GS': 1, 'IP': 2, 'GI':3, 'OPH':4,
                'NR': 5, 'ENT': 6, 'PSY': 7, 'HEON': 8, 'RHEU': 9,
                'REHM': 10, 'NS': 11, 'AN': 12, 'DENT': 13, 'PS': 14,
                'CS': 15, 'INFC': 16, 'OS': 17,
                'EMR': 18, 'ENDO': 19, 'CA': 20, 'KTM': 21, 'OBGY': 22,
                'URO': 23, 'ALL': 24, 'NPH': 25}
label_to_class = {value:key for key, value in class_to_label.items()}


In [20]:
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 161kB/s 
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 12.7MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/8b/f7/a368401e630f0e390dd0e62c39fb928e5b23741b53c2360ee7d376660927/JPype1-1.0.2-cp36-cp36m-manylinux2010_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 57.8MB/s 
[?25hCollecting tweepy>=3.7.0
  Downloading 

In [21]:
from konlpy.tag import Okt
def preprocess_sentence(sentence):
    twitter = Okt()
    nouns = twitter.nouns(sentence)
    stopwords = ['질문', '문의', '관련', '그대로', '계속', '답변', '선생님', '관련문의',
            '한지', '자주', '좀', '쪽', '자꾸', '요즘', '몇개', '무조건', '하나요',
            '안해','요', '경우', '최근', '및', '몇', '달', '일반', '전날', '저번',
            '말', '일어나지', '며칠', '먹기', '지난번', '글', '때문', '너', '무',
            '오늘', '시', '잔', '뒤', '지속', '막', '것', '이건', '뭔가', '다시', '그',
                '무슨', '안', '난', '도', '기', '후', '거리', '이', '뭘', '저', '뭐', '답젼',
                '평생', '회복', '반', '감사', '의사', '보험', '학생', '제발', '살짝',
                '느낌', '제', '대해','갑자기','문제', '전','정도', '왜', '거', '가요',
                '의심', '어제', '추천', '를', '지금', '무엇', '내일', '관해', '리', '세',
                 '로', '목적', '그냥', '거의', '고민', '다음', '이틀', '항상', '뭐', '때',
                '요', '가끔', '이후', '혹시', ]
    for word in nouns:
        if word in stopwords:
            while word in nouns:
                nouns.remove(word)
    return nouns

def predict(sen):
    global model
    test = preprocess_sentence(sen)
    test_batch = []
    test_batch.append(test)
    pre_test = t.texts_to_sequences(test_batch)
    padded_pre_test = tf.keras.preprocessing.sequence.pad_sequences(pre_test, truncating=trunc_type, padding=padding_type, maxlen=sequence_length)

    result = model.predict(padded_pre_test)
    sorted = result.argsort()
    sp = result.argmax()
    val = result.max()
    base = '{0}확률로 {1}과를 방문하셔야 합니다'
    return base.format(val, label_to_class[sp])

In [22]:
sentence = "잠이 너무 안와서 다음날 몽롱해요"
predict(sentence)

'0.9999998807907104확률로 PSY과를 방문하셔야 합니다'

In [23]:
sentence = "통풍으로 엄지발가락이 부었어요"
predict(sentence)

'0.8111918568611145확률로 OS과를 방문하셔야 합니다'

In [24]:
sentence = "오래된 이명과 비염이 있어요"
predict(sentence)

'0.9991840720176697확률로 ENT과를 방문하셔야 합니다'

In [25]:
sentence = "뇌경색 이후에 어떤 운동을 하는게 좋은가요"
predict(sentence)

'0.9163525104522705확률로 REHM과를 방문하셔야 합니다'

In [26]:
sentence = "항문 주변이 따가워요"
predict(sentence)

'0.976958692073822확률로 GS과를 방문하셔야 합니다'