# RNN_순환신경망
* 언어, 시계열 데이터 분석에 주로 사용
* 길이가 길어지면 기울기 소실문제 발생
* LSTM, GRU 등으로 문제를 보완

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
from tensorflow.keras.utils import to_categorical

import pandas as pd
import numpy as np

# 텐서플로에서 텍스트 전처리하기
* 토큰화: 문장을 단어 혹은 형태소로 쪼개는 것
* 원핫인코딩: 문자를 벡터화
* 임베딩(embedding): 벡터화 -> 원핫인코딩을 더 축소

In [2]:
text = "해보지 않으면 해낼 수 없다"

In [4]:
result = text_to_word_sequence(text)
result

['해보지', '않으면', '해낼', '수', '없다']

단어 빈도수 세기

In [6]:
docs = ['먼저 텍스트의 각 단어를 나누어 토큰화합니다', 
        '텍스트의 단어로 토큰화해야 딥러닝에서 인식됩니다', 
        '토큰화한 결과는 딥러닝에서 사용할 수 있습니다',
        '텍스트 전처리에는 벡터화 원핫인코딩 패딩으로 길이 맞추기 등이 필요합니다',
        '딥러닝 쉽지 않네요']

In [8]:
token = Tokenizer()
token.fit_on_texts(docs)
print("단어 카운트:\n ", token.word_counts)
print("문장 카운트:\n", token.document_count)
print("각 단어가 몇 개의 문장에 포함되어 있는지 계산:\n ", token.word_docs)
print("각 단어에 매겨진 인덱스 값:\n ", token.word_index)

단어 카운트:
  OrderedDict([('먼저', 1), ('텍스트의', 2), ('각', 1), ('단어를', 1), ('나누어', 1), ('토큰화합니다', 1), ('단어로', 1), ('토큰화해야', 1), ('딥러닝에서', 2), ('인식됩니다', 1), ('토큰화한', 1), ('결과는', 1), ('사용할', 1), ('수', 1), ('있습니다', 1)])
문장 카운트:
 3
각 단어가 몇 개의 문장에 포함되어 있는지 계산:
  defaultdict(<class 'int'>, {'단어를': 1, '먼저': 1, '나누어': 1, '텍스트의': 2, '각': 1, '토큰화합니다': 1, '단어로': 1, '토큰화해야': 1, '딥러닝에서': 2, '인식됩니다': 1, '있습니다': 1, '토큰화한': 1, '수': 1, '결과는': 1, '사용할': 1})
각 단어에 매겨진 인덱스 값:
  {'텍스트의': 1, '딥러닝에서': 2, '먼저': 3, '각': 4, '단어를': 5, '나누어': 6, '토큰화합니다': 7, '단어로': 8, '토큰화해야': 9, '인식됩니다': 10, '토큰화한': 11, '결과는': 12, '사용할': 13, '수': 14, '있습니다': 15}


In [13]:
x = token.texts_to_sequences(docs)
print(x)

[[3, 1, 4, 5, 6, 7], [1, 8, 9, 2, 10], [11, 12, 2, 13, 14, 15]]


In [15]:
max([len(i) for i in x])

6

In [16]:
# 문장의 길이를 맞추기 위한 패딩
# 가장 긴 문장 길이 +1의 길이로 패딩
# 문장 시작에는 0이 있어야 함
padded_x = pad_sequences(x,max([len(i) for i in x])+1)
padded_x

array([[ 0,  3,  1,  4,  5,  6,  7],
       [ 0,  0,  1,  8,  9,  2, 10],
       [ 0, 11, 12,  2, 13, 14, 15]], dtype=int32)

# 텍스트를 읽고 긍정, 부정 예측하기

In [17]:
docs2 = ["너무 재밌네요",
        "최고예요",
        "참 신기한 딥러닝이네요",
        "인공지능 칭찬합니다",
        "더 자세히 배우고 싶어요",
        "변화가 너무 빨라요",
        "GPT성능이 생각보다 별로네요",
        "제미나이보다는 낫죠",
        "나는 차라리 라마를 쓴다",
        "유료 결재 싫어요"]

In [18]:
classes = np.array([1,1,1,1,1,0,0,1,0,0])

In [19]:
token = Tokenizer()
token.fit_on_texts(docs2)
print(token.word_index)

{'너무': 1, '재밌네요': 2, '최고예요': 3, '참': 4, '신기한': 5, '딥러닝이네요': 6, '인공지능': 7, '칭찬합니다': 8, '더': 9, '자세히': 10, '배우고': 11, '싶어요': 12, '변화가': 13, '빨라요': 14, 'gpt성능이': 15, '생각보다': 16, '별로네요': 17, '제미나이보다는': 18, '낫죠': 19, '나는': 20, '차라리': 21, '라마를': 22, '쓴다': 23, '유료': 24, '결재': 25, '싫어요': 26}


In [20]:
x = token.texts_to_sequences(docs2)
print("토큰화 결과: \n", x)

토큰화 결과: 
 [[1, 2], [3], [4, 5, 6], [7, 8], [9, 10, 11, 12], [13, 1, 14], [15, 16, 17], [18, 19], [20, 21, 22, 23], [24, 25, 26]]


In [22]:
padding_x = pad_sequences(x,max([len(i) for i in x]))
padding_x

array([[ 0,  0,  1,  2],
       [ 0,  0,  0,  3],
       [ 0,  4,  5,  6],
       [ 0,  0,  7,  8],
       [ 9, 10, 11, 12],
       [ 0, 13,  1, 14],
       [ 0, 15, 16, 17],
       [ 0,  0, 18, 19],
       [20, 21, 22, 23],
       [ 0, 24, 25, 26]], dtype=int32)

# 임베딩

In [23]:
word_size = len(token.word_index) +1
word_size

27

In [24]:
model = Sequential()
model.add(Embedding(word_size, 8,input_length=4))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 8)              216       
                                                                 
 flatten (Flatten)           (None, 32)                0         
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 249
Trainable params: 249
Non-trainable params: 0
_________________________________________________________________


2024-09-13 15:03:32.166170: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-13 15:03:32.169803: I tensorflow/c/logging.cc:34] DirectML: creating device on adapter 0 (AMD Radeon(TM) Graphics)
2024-09-13 15:03:32.491729: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-09-13 15:03:32.491781: W tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.cc:28] Overriding allow_growth setting because force_memory_growth was requested by the device.
2024-09-13 15:03:32.491804: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f

In [25]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padding_x, classes, epochs=20)
print(model, evaluate(padding_x, classes)[1])

Epoch 1/20
Epoch 2/20
Epoch 3/20


2024-09-13 15:06:50.171529: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-09-13 15:06:50.248164: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-09-13 15:06:50.248225: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14845 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-09-13 15:06:50.252680: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-09-13 15:06:50.252742: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f

Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


NameError: name 'evaluate' is not defined