In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Toy corpus: three short English sentences used to demonstrate
# tokenization and sequence padding in the cells below.
sentences = [
    "I like eggs and ham.",
    "I love chocolate and bunnies.",
    "I hate oniones.",
]

In [6]:
# Cap on the vocabulary size handed to the tokenizer.
MAX_VOCAB_SIZE = 20_000

# Restrict the tokenizer to the most frequently used words (capped by MAX_VOCAB_SIZE).
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)

# Build the word -> index vocabulary from the corpus.
tokenizer.fit_on_texts(sentences)

# Encode every sentence as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(sentences)

In [7]:
# Display the integer-encoded corpus: one list of word indices per sentence.
sequences

[[1, 3, 4, 2, 5], [1, 6, 7, 2, 8], [1, 9, 10]]

In [8]:
# Display the word -> index mapping learned by the tokenizer.
# In the output below the words appear lower-cased and without punctuation.
tokenizer.word_index

{'i': 1,
 'and': 2,
 'like': 3,
 'eggs': 4,
 'ham': 5,
 'love': 6,
 'chocolate': 7,
 'bunnies': 8,
 'hate': 9,
 'oniones': 10}

In [9]:
# Equalize the sequence lengths. No maxlen is given, so the output below is
# as wide as the longest sequence (5) and the shorter one gets leading zeros.
data = pad_sequences(sequences)
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1  9 10]]


In [10]:
# Explicit target length equal to the longest sequence (5 tokens);
# the printed result matches the default padding shown above.
MAX_SEQUENCE_LENGTH = 5
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1  9 10]]


In [11]:
# Same target length, but with padding='post' the filler zeros are
# appended at the end of short sequences instead of the front.
MAX_SEQUENCE_LENGTH = 5
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 1  9 10  0  0]]


In [12]:
# The target length is now larger than every sequence,
# so zeros show up at the front of every row.
MAX_SEQUENCE_LENGTH = 6
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)
print(data)

[[ 0  1  3  4  2  5]
 [ 0  1  6  7  2  8]
 [ 0  0  0  1  9 10]]


In [14]:
# Target length 6 with padding='post': the filler zeros are placed
# at the end of every row.
MAX_SEQUENCE_LENGTH = 6
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)
print(data)

[[ 1  3  4  2  5  0]
 [ 1  6  7  2  8  0]
 [ 1  9 10  0  0  0]]


In [15]:
# The target length is shorter than the 5-token sequences: tokens are
# dropped from the front. The 3-token sequence is still padded at the front.
MAX_SEQUENCE_LENGTH = 4
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)
print(data)

[[ 3  4  2  5]
 [ 6  7  2  8]
 [ 0  1  9 10]]


In [16]:
# With truncating='post', tokens are dropped from the end of over-long
# sequences. Padding of the short sequence still goes at the front.
MAX_SEQUENCE_LENGTH = 4
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    truncating='post',
)
print(data)

[[ 1  3  4  2]
 [ 1  6  7  2]
 [ 0  1  9 10]]
