In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
from tensorflow.keras.utils import to_categorical
import koreanize_matplotlib
from konlpy.tag import Okt


In [6]:
# Load the labelled hotel reviews: the training split is fetched from a raw
# GitHub URL, the test split is read from a local ./data copy.
train_data = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/haram4th/data4mdai/main/hotelscom_review_train.csv"
)
test_data = pd.read_csv(filepath_or_buffer="./data/hotelscom_review_test.csv")

In [6]:
# Preview the first five training rows.
train_data.head(n=5)

Unnamed: 0,description,isgood
0,사람이 너무 많고 작은 수영장과 조식 수용한계로 모두 포기하고 옆의 아이파크몰에서 ...,0
1,방도크고 깨끗하여 아주 좋았어요,1
2,매년여름휴가철마다 찾는곳이예요 너무 좋아요 점점 더 좋아지는듯 직원분들도 너무 친절...,1
3,여수에서는 제일 유명한 호텔이래요 호텔 stay 가 필요하다면 소노캄 여수도 괜찮은...,1
4,가격대비 훌륭하지만 아무래도 오래된 느낌이 많이 드네요겉이불은 세탁하니까 깨끗히나 ...,1


In [4]:
# Preview the first five test rows.
test_data.head(n=5)

Unnamed: 0,description,isgood
0,쾌적한 시설과 특히 금진온천은 저에게는 특별한추억이었습니다,1
1,관광호텔로 알고 예약 후 직접 방문시 주변이 모두 모텔주변에 위치하였으며 관광호텔급...,0
2,신축이라 그런지 아주 청결합니다 조명은 여러가지 색깔이 들어오는데 다 켜면 잘 조화...,1
3,맨몸으로들어간 호텔에 치솔과 면도기가 없어서 황당했습니다 해외에도 다있고 국내 모텔...,1
4,도어락이 제대로 작동되지 않았고텔레비전도 아주 낡아서 중간에 소리도 안 들리고 화면...,0


# 토큰화

In [7]:
# Review texts of the training set (the `description` column).
docs = train_data.loc[:, 'description']


In [8]:
# Create the Okt tagger and preview how `morphs` splits the review at index 0.
okt = Okt()
okt.morphs(docs.loc[0])

['사람',
 '이',
 '너무',
 '많고',
 '작은',
 '수영장',
 '과',
 '조식',
 '수',
 '용한',
 '계',
 '로',
 '모두',
 '포기',
 '하고',
 '옆',
 '의',
 '아이파크몰',
 '에서',
 '그냥',
 '식사',
 '함']

In [14]:
# Sanity check: show the container type of `docs` (the captured output reports a pandas Series).
type(docs)

pandas.core.series.Series

In [9]:
# Tokenise every review with Okt and keep the per-review token lists
# in tokenized_docs (same index as `docs`).
tokenized_docs = docs.map(okt.morphs)

In [15]:
# Build the word -> integer index from the tokenised reviews.
# lower=False leaves the tokens' case untouched.
token = Tokenizer(lower=False)
token.fit_on_texts(texts=tokenized_docs)
# Number of distinct tokens seen while fitting.
print(len(token.word_index))

63571


In [16]:
# Replace every token with its integer id from the fitted index,
# then print the first encoded review as a quick check.
X = token.texts_to_sequences(texts=tokenized_docs)
print(X[0])

[147, 1, 10, 362, 381, 124, 24, 22, 39, 8795, 2621, 20, 126, 2252, 36, 179, 13, 3939, 12, 145, 258, 190]


In [10]:
# Target labels for the training reviews (the `isgood` column).
y = train_data.loc[:, 'isgood']

In [18]:
# Length of the longest encoded review; used as the padding length.
max_len = max(map(len, X))
print("가장 긴 문장의 길이(패딩에 사용): ", max_len)

가장 긴 문장의 길이(패딩에 사용):  554


In [20]:
# Padding: append zeros (padding='post') so every sequence has length max_len.
X_padded = pad_sequences(sequences=X, padding='post', maxlen=max_len)
print(X_padded[0])

[ 147    1   10  362  381  124   24   22   39 8795 2621   20  126 2252
   36  179   13 3939   12  145  258  190    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [21]:
# 홀드아웃
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 30% of the padded reviews for validation, stratified on the
# label and with a fixed seed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_padded,
    y,
    test_size=0.3,
    stratify=y,
    random_state=10,
)

In [23]:
# Vocabulary size fed to the Embedding layer: one slot per indexed word
# plus one extra for index 0, which the padded sequences use as filler.
word_size = 1 + len(token.word_index)
print(word_size)

63572


# RNN 네트워크를 이용해 텍스트 분석 

In [2]:
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, Bidirectional 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

2024-09-13 16:52:46.119529: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-13 16:52:48.097774: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdirectml.d6f03b303ac3c4f2eeb8ca631688c9757b361310.so
2024-09-13 16:52:48.097879: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdxcore.so
2024-09-13 16:52:48.102457: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libd3d12.so
2024-09-13 16:52:48.565552: I tensorflow/c/logging.cc:34] DirectML device enumeration: found 1 compatible adapters.


In [1]:
# Bidirectional SimpleRNN binary classifier over the padded review sequences.
# NOTE: this cell needs `Sequential` and the layer classes from the import
# cells plus `word_size` / `max_len` from the preprocessing cells; running it
# on a fresh kernel before those cells raises NameError.
birnn = Sequential()
# mask_zero=True: the reviews are post-padded with 0 up to max_len, so without
# a mask the recurrent layers keep stepping through the padding and the final
# state is computed from pad tokens instead of the review text. Index 0 is
# never assigned to a word and is already counted in word_size.
birnn.add(Embedding(word_size, 64, input_length=max_len, mask_zero=True))
# First recurrent layer reads the sequence in both directions and returns the
# full sequence so that the next recurrent layer can consume it.
birnn.add(Bidirectional(SimpleRNN(128, return_sequences=True, activation='tanh')))
birnn.add(Dropout(0.5))
# Second recurrent layer collapses the sequence to a single 64-unit vector.
birnn.add(SimpleRNN(64, activation='tanh'))
birnn.add(Dropout(0.5))
birnn.add(Dense(32, activation='relu'))
# Single sigmoid unit: probability-like score for the binary `isgood` label.
birnn.add(Dense(1, activation='sigmoid'))
birnn.summary()

NameError: name 'Sequential' is not defined

In [33]:
# Binary cross-entropy with Adam, reporting accuracy during training.
birnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
modelpath = "./model/hotels_review_birnn.keras"
# Keep only the best model seen so far (by the callback's default monitored
# quantity) on disk.
checkpoint = ModelCheckpoint(filepath=modelpath, save_best_only=True)
# Stop once the monitored quantity has not improved for 10 epochs.
# restore_best_weights=True rolls the in-memory model back to its best epoch;
# without it `birnn` would keep the weights from 10 epochs past the best one
# and disagree with the checkpoint saved above.
earlystop = EarlyStopping(patience=10, restore_best_weights=True)

In [None]:
# Train with a generous epoch cap; the early-stopping callback is expected to
# end the run, and the checkpoint callback saves the best model along the way.
birnn_history = birnn.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    batch_size=64,
    epochs=1000,
    callbacks=[earlystop, checkpoint],
)

Epoch 1/1000


2024-09-13 16:46:06.420475: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-09-13 16:46:06.976920: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-09-13 16:46:06.976982: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14845 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-09-13 16:46:06.978826: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-09-13 16:46:06.978890: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f