<a href="https://colab.research.google.com/github/minzzii-kim/machine-learing/blob/main/tensorflow_note_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# [문제4. NLP 자연어처리]
# Tokenizer : 문장에 대한 단어사전 만들기
# Embedding Layer : 차원축소 (단어사전은 원핫인코딩되어있기 때문에 대부분이 0으로 채워져있고, 차원의 저주로 인해 0으로 수렴할 수 있음.)

# RNN(Recurrent Neural Network) 은 gradient loss 문제가 있어 LSTM을 통해 개선
# 자연어처리는 Bidirectional(LSTM)을 통해 특성추출, 예측
# RNN 의 경우 many to one, many to many 모델이 있는데
# 4번 유형 -> many_to_one
# 5번 시계열 예측 -> many_to_many

# NLP문제의 학습파라미터는 이미지분류에 비해 훨씬 적지만, 순차적으로 가중치를 계산해 나가야하기때문에 학습속도가 생각보다 오래걸림


In [2]:
import json
import tensorflow as tf
import numpy as np
import urllib

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding,Bidirectional,LSTM,Dense,Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint


In [3]:
url = 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
urllib.request.urlretrieve(url, 'sarcasm.json')

('sarcasm.json', <http.client.HTTPMessage at 0x7f31ecf1fed0>)

In [4]:
# json 파일 로드
with open('sarcasm.json') as f:
  datas = json.load(f)

datas[:5]

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
  'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-w

In [16]:
# data 나누기

x=[]
y=[]

for data in datas:
  x.append(data['headline'])
  y.append(data['is_sarcastic'])

print(len(x)) #26709

TRAIN_SIZE = 20000
train_x = x[:TRAIN_SIZE]
train_y = y[:TRAIN_SIZE]

valid_x = x[TRAIN_SIZE:]
valid_y = y[TRAIN_SIZE:]


26709


In [17]:
# tokenizer 정의

vocab_size=1000
oov_token = '<oov>'

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)

In [19]:
# [토큰화 진행]

# 1. 단어사전 만들기
tokenizer.fit_on_texts(train_x)

word_dict = tokenizer.word_index
for k, v in word_dict.items():
  print(f"{k} : {v}")
  if v==10: break

# 2. 문장 => 수열로 치환
train_x = tokenizer.texts_to_sequences(train_x)
valid_x = tokenizer.texts_to_sequences(valid_x)

print(train_x[:2])

# 3. 시퀀스 길이 맞추기
max_length = 120
trunc_type='post'
padding_type='post'

train_padded = pad_sequences(train_x, maxlen=max_length, padding=padding_type, truncating=trunc_type)
valid_padded = pad_sequences(valid_x, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(train_padded.shape)

#4.label 값을 numpy array로 변환 -> 모델이 list를 받아들이지못함
print(train_y) # [] list
train_y = np.array(train_y)
valid_y = np.array(valid_y)



<oov> : 1
to : 2
of : 3
the : 4
in : 5
for : 6
a : 7
on : 8
and : 9
with : 10
[[328, 1, 799, 1, 1, 47, 389, 1, 1, 6, 1, 1], [4, 1, 1, 1, 23, 2, 161, 1, 390, 1, 6, 251, 9, 889]]
(20000, 120)
[0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [22]:
# Embedding Layer
test = np.array(train_padded[0])

embedding_size = 16
embedding = Embedding(vocab_size, embedding_size, input_length=max_length)
embedding(test)[0]

<tf.Tensor: shape=(16,), dtype=float32, numpy=
array([ 0.01732079, -0.0171303 , -0.01925035, -0.00301714, -0.04632512,
       -0.03322885,  0.00987216,  0.04219481,  0.04448987,  0.04554698,
       -0.00187035,  0.02541167,  0.02869941,  0.00420436, -0.02055799,
       -0.02504563], dtype=float32)>

In [23]:
# 모델정의
model = Sequential([
  Embedding(vocab_size, embedding_size, input_length=max_length),
  Bidirectional(LSTM(64, return_sequences=True)),
  Bidirectional(LSTM(64)),
  Dense(32, activation='relu'),
  Dense(16, activation='relu'),
  Dense(1, activation='sigmoid'),
])

model.summary()

# 모델생성
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 120, 16)           16000     
                                                                 
 bidirectional (Bidirectiona  (None, 120, 128)         41472     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dense_2 (Dense)             (None, 1)                 1

In [24]:
# 학습

checkpoint_path = 'my_checkpoint.ckpt'
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, save_weights_only=True,verbose=1)

model.fit(train_padded, train_y, 
          validation_data=(valid_padded, valid_y), 
          epochs=10, 
          callbacks=[checkpoint])


Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.40503, saving model to my_checkpoint.ckpt
Epoch 2/10
Epoch 00002: val_loss improved from 0.40503 to 0.38749, saving model to my_checkpoint.ckpt
Epoch 3/10
Epoch 00003: val_loss improved from 0.38749 to 0.38311, saving model to my_checkpoint.ckpt
Epoch 4/10
Epoch 00004: val_loss improved from 0.38311 to 0.37501, saving model to my_checkpoint.ckpt
Epoch 5/10
Epoch 00005: val_loss did not improve from 0.37501
Epoch 6/10
Epoch 00006: val_loss did not improve from 0.37501
Epoch 7/10
Epoch 00007: val_loss did not improve from 0.37501
Epoch 8/10
Epoch 00008: val_loss did not improve from 0.37501
Epoch 9/10
Epoch 00009: val_loss did not improve from 0.37501
Epoch 10/10
Epoch 00010: val_loss did not improve from 0.37501


<keras.callbacks.History at 0x7f31eb519510>

In [25]:
model.load_weights(checkpoint_path)
model.evaluate(valid_padded, valid_y)



[0.37500566244125366, 0.8321657180786133]