In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
from tensorflow.keras.utils import to_categorical
import koreanize_matplotlib
from konlpy.tag import Mecab

In [3]:
# Training reviews are downloaded from GitHub; the test reviews are read from a local ./data copy.
# NOTE(review): in the cells visible here test_data is only previewed with head() and is not used afterwards.
train_data = pd.read_csv("https://raw.githubusercontent.com/haram4th/data4mdai/main/hotelscom_review_train.csv")
test_data = pd.read_csv("./data/hotelscom_review_test.csv")

In [4]:
# Preview the first rows of the training frame (columns: Unnamed: 0, description, isgood).
train_data.head()

Unnamed: 0,description,isgood
0,사람이 너무 많고 작은 수영장과 조식 수용한계로 모두 포기하고 옆의 아이파크몰에서 ...,0
1,방도크고 깨끗하여 아주 좋았어요,1
2,매년여름휴가철마다 찾는곳이예요 너무 좋아요 점점 더 좋아지는듯 직원분들도 너무 친절...,1
3,여수에서는 제일 유명한 호텔이래요 호텔 stay 가 필요하다면 소노캄 여수도 괜찮은...,1
4,가격대비 훌륭하지만 아무래도 오래된 느낌이 많이 드네요겉이불은 세탁하니까 깨끗히나 ...,1


In [5]:
# Preview the first rows of the test frame to confirm it has the same columns as the training frame.
test_data.head()

Unnamed: 0,description,isgood
0,쾌적한 시설과 특히 금진온천은 저에게는 특별한추억이었습니다,1
1,관광호텔로 알고 예약 후 직접 방문시 주변이 모두 모텔주변에 위치하였으며 관광호텔급...,0
2,신축이라 그런지 아주 청결합니다 조명은 여러가지 색깔이 들어오는데 다 켜면 잘 조화...,1
3,맨몸으로들어간 호텔에 치솔과 면도기가 없어서 황당했습니다 해외에도 다있고 국내 모텔...,1
4,도어락이 제대로 작동되지 않았고텔레비전도 아주 낡아서 중간에 소리도 안 들리고 화면...,0


# 토큰화

In [7]:
# Raw review text column (a pandas Series) that the tokenization cells operate on.
docs = train_data['description']


In [None]:
# Create the MeCab morphological analyzer once; later cells reuse this same instance.
mecab = Mecab()
# Sanity check: split the first review into morphemes and display the result.
mecab.morphs(docs[0])

In [14]:
# Confirm the container type before calling .apply on it.
type(docs)

pandas.core.series.Series

In [9]:
%%time
# Tokenize every review into a list of morphemes and store the result in tokenized_docs.
# (%%time has to stay on the first line of the cell; it reports how long the pass takes.)
tokenized_docs = docs.apply(mecab.morphs)

In [None]:
# import joblib
# joblib.dump(tokenized_docs, "./model/hotels_tokenized_docs")

In [None]:
# tokenized_docs = joblib.load("./model/hotels_tokenized_docs")

In [None]:
# tokenized_docs[0]

In [15]:
# Build the word index from the morpheme lists; lower=False disables lower-casing.
# NOTE(review): the index is fitted on every row of train_data, before the hold-out
# split further down, so validation rows also contribute to the vocabulary.
token = Tokenizer(lower=False)
token.fit_on_texts(tokenized_docs)
# Number of distinct tokens in the fitted index.
print(len(token.word_index))

63571


In [16]:
# Vectorize the reviews: replace each token with its integer id from the fitted word index.
X = token.texts_to_sequences(tokenized_docs)
# Show the encoded form of the first review.
print(X[0])

[147, 1, 10, 362, 381, 124, 24, 22, 39, 8795, 2621, 20, 126, 2252, 36, 179, 13, 3939, 12, 145, 258, 190]


In [10]:
# Target column (0/1 in the previewed rows); the bare expression on the last line displays it.
y = train_data['isgood']
y

In [18]:
# Length of the longest encoded review; this becomes the padding width.
max_len = max(map(len, X))
print("가장 긴 문장의 길이(패딩에 사용): ", max_len)

가장 긴 문장의 길이(패딩에 사용):  554


In [20]:
# Padding: append zeros at the end of each sequence (padding='post') up to max_len.
X_padded = pad_sequences(X, maxlen=max_len, padding='post')
# Show the padded form of the first review.
print(X_padded[0])

[ 147    1   10  362  381  124   24   22   39 8795 2621   20  126 2252
   36  179   13 3939   12  145  258  190    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [21]:
# 홀드아웃
from sklearn.model_selection import train_test_split

In [22]:
# Hold-out split: 30% of the rows go to validation, stratified on the label, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X_padded, y, stratify=y, random_state=10, test_size=0.3)

In [23]:
# Number of ids fed to Embedding(input_dim=...): the vocabulary size plus one,
# leaving room for id 0, which pad_sequences uses as filler.
word_size = len(token.word_index) + 1 
print(word_size)

63572


# 양방향 RNN 네트워크를 이용해 텍스트 분석 

In [2]:
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

2024-09-13 16:52:46.119529: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-13 16:52:48.097774: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdirectml.d6f03b303ac3c4f2eeb8ca631688c9757b361310.so
2024-09-13 16:52:48.097879: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdxcore.so
2024-09-13 16:52:48.102457: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libd3d12.so
2024-09-13 16:52:48.565552: I tensorflow/c/logging.cc:34] DirectML device enumeration: found 1 compatible adapters.


In [1]:
# Bidirectional SimpleRNN sentiment classifier (binary output through a sigmoid unit).
birnn = Sequential()
# mask_zero=True makes the Embedding emit a mask for id 0, the value pad_sequences appends
# with padding='post'. The recurrent layers then skip the padded timesteps; without the mask
# the final hidden state is computed after stepping through all trailing zeros of a review.
birnn.add(Embedding(input_dim=word_size, output_dim=64, mask_zero=True))
# return_sequences=True keeps one output per timestep so another recurrent layer can follow.
birnn.add(Bidirectional(SimpleRNN(128, return_sequences=True, activation='tanh')))
birnn.add(Dropout(0.5))
# Second recurrent layer returns only its last (unmasked) state.
birnn.add(SimpleRNN(64, activation='tanh'))
birnn.add(Dropout(0.5))
birnn.add(Dense(32, activation='relu'))
birnn.add(Dense(1, activation='sigmoid'))
# Model summary
birnn.build(input_shape=(None, max_len))  # declare the input shape before calling summary()
birnn.summary()

NameError: name 'Sequential' is not defined

In [33]:
# Binary cross-entropy matches the single sigmoid output unit of the model.
birnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
modelpath = "./model/hotels_review_birnn.keras"
# save_best_only=True: overwrite the file only when the monitored metric improves
# (no monitor is passed, so the Keras default applies).
checkpoint = ModelCheckpoint(filepath=modelpath, save_best_only=True)
# Stop training after 50 consecutive epochs without improvement of the monitored metric.
earlystop = EarlyStopping(patience=50)

In [None]:
# Train the bidirectional RNN. epochs=1000 is only an upper bound: the earlystop callback
# ends the run sooner, and checkpoint writes the best model to modelpath along the way.
birnn_history = birnn.fit(X_train, y_train, epochs=1000, batch_size=8,
                          validation_data=(X_valid, y_valid),
                          callbacks=[earlystop, checkpoint])

Epoch 1/1000


2024-09-13 16:46:06.420475: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-09-13 16:46:06.976920: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-09-13 16:46:06.976982: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14845 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-09-13 16:46:06.978826: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-09-13 16:46:06.978890: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f

In [None]:
def result_plot(x):
    """Plot the per-epoch training loss and, when available, the validation loss.

    Args:
        x: Object exposing a ``history`` dict, such as the value returned by
            ``model.fit``. ``history['loss']`` is required;
            ``history['val_loss']`` is drawn only when it is present.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    loss_log = x.history
    train_loss = loss_log['loss']
    # Missing when fit() was called without validation data.
    valid_loss = loss_log.get('val_loss')

    # One x position per recorded epoch.
    epochs = np.arange(len(train_loss))
    if valid_loss is not None:
        # val_loss is measured on the validation split, so the legend says so
        # (it was previously labelled as a test-set curve).
        plt.plot(epochs, valid_loss, marker='.', c="red", label='Validset_loss')
    plt.plot(epochs, train_loss, marker='.', c="blue", label='Trainset_loss')

    # Legend, grid and axis labels.
    plt.legend(loc='upper right')
    plt.grid()
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()

In [None]:
# Loss curves of the bidirectional RNN run.
result_plot(birnn_history)

# LSTM과 CNN 조합 모델로 분석

In [None]:
from tensorflow.keras.layers import Dropout, Activation, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [None]:
# Define the network: two Conv1D + MaxPooling stages followed by a bidirectional LSTM,
# a second LSTM and a small dense head ending in one sigmoid unit.
model = Sequential([
    Embedding(input_dim=word_size, output_dim=64),  # 64-dimensional token embeddings
    Dropout(0.5),
    Conv1D(128, 5, padding='valid', activation='relu', strides=1),  # first convolution stage
    MaxPooling1D(pool_size=4),
    Conv1D(128, 5, padding='valid', activation='relu', strides=1),  # second convolution stage
    MaxPooling1D(pool_size=4),
    Bidirectional(LSTM(256, return_sequences=True, activation='tanh')),  # per-step outputs for the next LSTM
    Dropout(0.5),
    LSTM(128, activation='tanh'),  # returns only the last state
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),  # output layer
])
# Model summary
model.build(input_shape=(None, max_len))  # declare the input shape before calling summary()
model.summary()

In [None]:
# Compile the CNN+LSTM model and set up its callbacks.
# NOTE(review): modelpath, checkpoint and earlystop rebind the names used for the earlier
# bidirectional RNN run.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
modelpath = "./model/hotels_review_LSTM_CNN.keras"
# Overwrite the checkpoint file only when the monitored metric improves.
checkpoint = ModelCheckpoint(filepath=modelpath, save_best_only=True)
# Stop after 50 consecutive epochs without improvement.
earlystop = EarlyStopping(patience=50)

In [None]:
# Train the CNN+LSTM model; epochs=1000 is an upper bound that earlystop cuts short.
# NOTE(review): `history` is assigned again by a later fit call, and no plot of this run
# appears before that reassignment.
history = model.fit(X_train, y_train, epochs=1000, batch_size=128,
                          validation_data=(X_valid, y_valid),
                          callbacks=[earlystop, checkpoint])

In [None]:
# !pip install attention

In [None]:
from attention import Attention

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Input, Flatten, GlobalAveragePooling1D
from tensorflow.keras.layers import Attention
from tensorflow.keras.models import Model

# Input layer: one padded id sequence of length max_len per review.
inputs = Input(shape=(max_len,))

# Embedding layer. mask_zero=True marks id 0 (the filler appended by pad_sequences) as
# masked, so the LSTMs, the attention layers and the final average pooling ignore padded
# positions instead of treating them as real tokens.
x = Embedding(input_dim=word_size, output_dim=64, mask_zero=True)(inputs)
x = Dropout(0.5)(x)

# Bidirectional LSTM followed by self-attention (query and value are the same tensor).
# `Attention` is the Keras layer imported at the top of this cell.
x = Bidirectional(LSTM(128, return_sequences=True))(x)
attention = Attention()([x, x])
x = Dropout(0.5)(attention)

# Second LSTM + self-attention stage; return_sequences=True keeps one vector per timestep.
x = LSTM(64, return_sequences=True)(x)
attention = Attention()([x, x])
x = Dropout(0.5)(attention)

# Collapse the time axis by averaging over the (unmasked) timesteps.
x = GlobalAveragePooling1D()(x)

# Dense layers
x = Dense(256, activation='relu')(x)
x = Dense(128, activation='relu')(x)
x = Dense(64, activation='relu')(x)
x = Dense(32, activation='relu')(x)

# Output layer: single sigmoid unit for the binary label.
outputs = Dense(1, activation='sigmoid')(x)

# Model definition. This rebinds `model`, which previously held the CNN+LSTM network.
model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

In [None]:
# Compile the attention model and set up its callbacks.
# NOTE(review): the cell that builds this model already calls compile with the same loss,
# optimizer and metric, so this call repeats it.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
modelpath = "./model/hotels_review_Attention.keras"
# Overwrite the checkpoint file only when the monitored metric improves.
checkpoint = ModelCheckpoint(filepath=modelpath, save_best_only=True)
# Stop after 50 consecutive epochs without improvement.
earlystop = EarlyStopping(patience=50)

In [None]:
# Train the attention model; this overwrites the `history` produced by the CNN+LSTM fit.
history = model.fit(X_train, y_train, epochs=1000, batch_size=128,
                          validation_data=(X_valid, y_valid),
                          callbacks=[earlystop, checkpoint])

In [None]:
# Loss curves of the most recent fit stored in `history` (the attention model).
result_plot(history)

# 머신러닝 나이브 베이즈 모델과의 비교 분석

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [None]:
def tokenizer(x):
    """Split the text ``x`` into morphemes using the notebook-level ``mecab`` tagger.

    Args:
        x: Text to analyze.

    Returns:
        Whatever ``mecab.morphs(x)`` returns for that text.
    """
    return mecab.morphs(x)

In [None]:
# Inputs for the Naive Bayes baseline: raw review text and the label column.
# NOTE(review): X is rebound here from the integer id sequences built earlier to raw text;
# re-running the earlier deep-learning cells after this one would see the new value.
X = train_data['description']
y = train_data['isgood']

In [None]:
# TF-IDF over MeCab morphemes using 1- to 3-grams; max_df=0.8 and min_df=4 prune terms by
# document frequency (too common / too rare).
tfidf_cv = TfidfVectorizer(tokenizer=tokenizer, max_df=0.8, min_df=4, ngram_range=(1,3))
# NOTE(review): X_tfidf is fitted on every row of X, so rows that are later held out
# contribute to this matrix's vocabulary and IDF weights.
X_tfidf = tfidf_cv.fit_transform(X)

In [None]:
# Split the raw text first (60% train / 20% valid / 20% test, fixed seeds), then fit TF-IDF
# on the training rows only. Fitting the vectorizer on all rows lets the validation and test
# reviews influence the vocabulary and IDF weights (data leakage), which inflates the scores.
X_train_text, X_holdout_text, y_train, y_holdout = train_test_split(X, y, test_size=0.4, random_state=7)
X_valid_text, X_test_text, y_valid, y_test = train_test_split(X_holdout_text, y_holdout, test_size=0.5, random_state=7)

# Learn vocabulary/IDF from the training split; held-out splits are only transformed.
X_train = tfidf_cv.fit_transform(X_train_text)
X_valid = tfidf_cv.transform(X_valid_text)
X_test = tfidf_cv.transform(X_test_text)

In [None]:
# Fit a multinomial Naive Bayes classifier on the training features.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
# Report precision/recall/F1 on the validation split.
pred = mnb.predict(X_valid)
print(classification_report(y_valid, pred))
# Same report on the held-out test split, under a separator line.
test_pred = mnb.predict(X_test)
print("="*40, "test result", "="*40)
print(classification_report(y_test, test_pred))

In [None]:
import joblib
# Persist the tagger, the Naive Bayes classifier and the fitted vectorizer under ./model.
joblib.dump(mecab, "./model/mecab_hotels_model")
joblib.dump(mnb, "./model/Naive Bayes_hotels_model")
# NOTE(review): tfidf_cv was built with the notebook-defined `tokenizer` function;
# presumably a process loading this file must define that function (and `mecab`) first — confirm.
joblib.dump(tfidf_cv, "./model/tfidf_cv_hotels_model")