#자연어처리(Embedding)

In [1]:
corpus = {
    'I love my dog',
    'I love my cat',
    'You love my dog',
    'Do you think my dog is amazing'
}

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
tokenizer = Tokenizer(num_words = 100, oov_token = '<OOV>') #num_words는 Limit 값, oov_token -> Out of voca
tokenizer.fit_on_texts(corpus)

In [4]:
tokenizer.word_index  #단어가 많이 나온 순서대로 지정됨
#단어를 숫자로 바꿀 수 있음

{'<OOV>': 1,
 'my': 2,
 'dog': 3,
 'love': 4,
 'you': 5,
 'i': 6,
 'do': 7,
 'think': 8,
 'is': 9,
 'amazing': 10,
 'cat': 11}

In [6]:
sequences = tokenizer.texts_to_sequences(corpus)

In [7]:
sequences
#문장의 길이가 다름 -> 맞춰야함

[[7, 5, 8, 2, 3, 9, 10], [6, 4, 2, 3], [5, 4, 2, 3], [6, 4, 2, 11]]

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded = pad_sequences(sequences, maxlen = 6, padding = 'pre') #앞에다가 0을 채울거다

In [9]:
padded

array([[ 5,  8,  2,  3,  9, 10],
       [ 0,  0,  6,  4,  2,  3],
       [ 0,  0,  5,  4,  2,  3],
       [ 0,  0,  6,  4,  2, 11]], dtype=int32)

In [10]:
#현재 데이터 속 숫자 간의 관계가 없기 때문에 따로 Embedding 처리를 해줘야함 -> 원핫인코딩
#5와 8은 거리 관계 X

In [11]:
from tensorflow.keras.utils import to_categorical

padded_seq = to_categorical(padded)

In [12]:
padded_seq

array([[[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]],

       [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0

In [14]:
padded_seq.shape #4개의 문장은 6개의 Token으로 이루져있고 12개의 단어로 구성된다

(4, 6, 12)

#IMDB(리뷰 데이터) 긍정/부정 분류

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

##1. 데이터 준비

In [2]:
from tensorflow.keras.datasets import imdb
(train_input, train_target), (_, _) = imdb.load_data(num_words= 500)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [18]:
train_input.shape

(25000,)

In [20]:
train_input[:5]

array([list([1, 14, 22, 16, 43, 2, 2, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 2, 112, 50, 2, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 2, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 2, 38, 76, 15, 13, 2, 4, 22, 17, 2, 17, 12, 16, 2, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 2, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 2, 36, 71, 43, 2, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]),
       list([1, 194, 2, 194, 2, 78, 228, 5, 6, 2, 2, 2, 134, 26, 4, 2, 8, 118, 2, 14, 394, 20, 13, 119, 2, 189, 102, 5, 207, 110, 2, 21, 

In [3]:
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(train_input, train_target, test_size = 0.2)

In [22]:
train_input.shape

(20000,)

In [26]:
train_target[1]

0

In [27]:
imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,

##각 데이터의 길이를 맞춰야함

In [4]:
review_length = np.array([len(x) for x in train_input])

In [35]:
review_length.min(), review_length.max(), review_length.mean()

(12, 2494, 239.19925)

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
train_seq = pad_sequences(train_input, maxlen = 200, padding = 'pre')

In [41]:
train_seq.shape #2만개의 문장은 200개의 단어로 구성

(20000, 200)

In [7]:
from tensorflow.keras.utils import to_categorical

train_oh = to_categorical(train_seq)

In [44]:
train_oh.shape  #단어 하나를 500개로 표현하고, 200개의 단어로 구성된 리뷰가 20000개

(20000, 200, 500)

In [8]:
import tensorflow
from tensorflow import keras
from keras import layers

model = keras.Sequential([
    layers.LSTM(32, activation = 'tanh', input_shape = (200, 500), return_sequences = True), #LSTM은 Flatten된 상태로 결과가 나옴
    layers.LSTM(32, activation = 'tanh'),
    layers.Dense(1, activation = 'sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 200, 32)           68224     
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 76,577
Trainable params: 76,577
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.compile(
    optimizer = 'adam',
    loss = 'accuracy',
    metrics = 'accuracy'
)

In [None]:
Epochs = 5
Batch_size = 2000

history = model.fit(
    train_oh, train_target,
    epochs = Epochs,
    batch_size = Batch_size,
    validation_split = 0.2,
    verbose = 1
)

In [None]:
Y_pred = model.predict(test_oh).reshape(-1)

In [None]:
Y_pred = (Y_pred > 0.5).astype('int')

##Embeding layers 사용
- 원핫인코딩을 따로 처리하지 않고, Sequential 내부에 넣는다

In [None]:
import tensorflow
from tensorflow import keras
from keras import layers

model = keras.Sequential([
    layers.Embedding(500, 16, input_length = 200), # 500개의 원핫인코딩을 16개의 숫자로 변환
    layers.LSTM(32, activation = 'tanh', return_sequences = True), #LSTM은 Flatten된 상태로 결과가 나옴
    layers.LSTM(32, activation = 'tanh'),
    layers.Dense(1, activation = 'sigmoid')
])

model.summary()

In [None]:
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = 'acc'
)