In [1]:
# Toy example of how one sentence becomes an (input, target) training pair.
# <start> marks the beginning of a sentence. Given <start>, the recurrent
# network outputs "나는"; that word is fed back in as the next input, and the
# loop repeats until "먹었다" has been produced. Finally an <end> token is
# generated to mark the sentence as complete.
sentence = " 나는 밥을 먹었다 "
source_sentence = f"<start>{sentence}"
target_sentence = f"{sentence}<end>"

print("Source 문장:", source_sentence)  # Source 문장: <start> 나는 밥을 먹었다
print("Target 문장:", target_sentence)  # Target 문장:  나는 밥을 먹었다 <end>

Source 문장: <start> 나는 밥을 먹었다 
Target 문장:  나는 밥을 먹었다 <end>


In [2]:
import os, re
import numpy as np
import tensorflow as tf

In [3]:
os.getcwd()

'/Users/joowanha/PycharmProjects/AIFFEL/project'

In [6]:
# Location of the Shakespeare corpus that the following cells read.
# NOTE(review): this is an absolute path on the author's machine, so the
# notebook will not run elsewhere without editing this line.
file_path = '/Users/joowanha/PycharmProjects/AIFFEL/lyrics/shakespeare.txt'

In [7]:
# Read the whole corpus and split it into a list of lines (no trailing newlines).
# The encoding is explicit so decoding does not depend on the platform's locale
# default — assumes the corpus file is UTF-8 (plain ASCII also qualifies).
with open(file_path, 'r', encoding='utf-8') as f:
    raw_corpus = f.read().splitlines()

In [9]:
print(raw_corpus[:9])

['First Citizen:', 'Before we proceed any further, hear me speak.', '', 'All:', 'Speak, speak.', '', 'First Citizen:', 'You are all resolved rather to die than to famish?', '']


In [11]:
# Preview the usable lines near the top of the corpus. Blank lines and lines
# ending in ':' (speaker labels such as 'First Citizen:') are skipped; the
# loop stops at the first usable line whose index is greater than 9.
for idx, sentence in enumerate(raw_corpus):
    if not sentence or sentence.endswith(':'):
        continue
    if idx > 9:
        break
    print(sentence)

Before we proceed any further, hear me speak.
Speak, speak.
You are all resolved rather to die than to famish?


In [25]:
def preprocess_sentence(sentence):
    """Normalize one raw line and wrap it in <start> / <end> markers.

    Steps, in order:
      1. lowercase and trim surrounding whitespace;
      2. put a space on both sides of each of ? . ! ,
      3. collapse runs of spaces / double quotes into one space;
      4. replace every run of characters other than letters and ? . ! ,
         with one space (so apostrophes split words: "know't" -> "know t");
      5. trim again and add the '<start> ' prefix and ' <end>' suffix.

    Args:
        sentence: raw text line.

    Returns:
        The cleaned, space-separated string including the two markers.
    """
    text = sentence.lower().strip()

    # (pattern, replacement) pairs; the order matters.
    substitutions = (
        (r'([?.!,])', r' \1 '),
        (r'[" "]+', " "),
        (r'[^a-zA-Z?.!,]+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return '<start> ' + text.strip() + ' <end>'

In [26]:
print(preprocess_sentence("This @_is ;;;sample        sentence."))

<start> this is sample sentence . <end>


In [29]:
# Build the training corpus: drop blank lines and speaker labels (lines
# ending in ':'), then normalize every remaining line.
corpus = [
    preprocess_sentence(sentence)
    for sentence in raw_corpus
    if sentence and not sentence.endswith(':')
]

In [30]:
corpus[:10]

['<start> before we proceed any further , hear me speak . <end>',
 '<start> speak , speak . <end>',
 '<start> you are all resolved rather to die than to famish ? <end>',
 '<start> resolved . resolved . <end>',
 '<start> first , you know caius marcius is chief enemy to the people . <end>',
 '<start> we know t , we know t . <end>',
 '<start> let us kill him , and we ll have corn at our own price . <end>',
 '<start> is t a verdict ? <end>',
 '<start> no more talking on t let it be done away , away ! <end>',
 '<start> one word , good citizens . <end>']

In [65]:
len(raw_corpus)

40000

In [66]:
raw_corpus[0]

'First Citizen:'

In [35]:
def tokenize(corpus, num_words=7000):
    """Fit a word-level tokenizer on `corpus` and encode it as a padded id matrix.

    Args:
        corpus: list of preprocessed sentences (space-separated tokens).
        num_words: vocabulary cap passed to the Keras Tokenizer. Defaults to
            7000, the value that used to be hard-coded here.

    Returns:
        (tensor, tokenizer): an integer array with one row per sentence,
        padded with trailing zeros to a common length, and the fitted
        Tokenizer used to produce it.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        # Filter only spaces so ',' '.' '<start>' '<end>' survive as tokens.
        filters=' ',
        # Out-of-vocabulary words are encoded as this token.
        oov_token='<unk>'
    )
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)
    # padding='post' appends the zeros after the sentence rather than before it.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    # Debug print kept so the cell output stays the same as recorded.
    print(tensor, tokenizer)
    return tensor, tokenizer

In [36]:
tensor, tokenizer = tokenize(corpus)

[[   2  143   40 ...    0    0    0]
 [   2  110    4 ...    0    0    0]
 [   2   11   50 ...    0    0    0]
 ...
 [   2  149 4553 ...    0    0    0]
 [   2   34   71 ...    0    0    0]
 [   2  945   34 ...    0    0    0]] <keras.preprocessing.text.Tokenizer object at 0x7f8d621f5330>


In [39]:
print(tensor[0,0])

2


In [40]:
tensor[0]

array([  2, 143,  40, 933, 140, 591,   4, 124,  24, 110,   5,   3,   0,
         0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [44]:
print(tensor[:3, :14])

[[   2  143   40  933  140  591    4  124   24  110    5    3    0    0]
 [   2  110    4  110    5    3    0    0    0    0    0    0    0    0]
 [   2   11   50   43 1201  316    9  201   74    9 3034   15    3    0]]


In [45]:
print(tokenizer)

<keras.preprocessing.text.Tokenizer object at 0x7f8d621f5330>


In [50]:
print(dir(tokenizer))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_keras_api_names', '_keras_api_names_v1', 'analyzer', 'char_level', 'document_count', 'filters', 'fit_on_sequences', 'fit_on_texts', 'get_config', 'index_docs', 'index_word', 'lower', 'num_words', 'oov_token', 'sequences_to_matrix', 'sequences_to_texts', 'sequences_to_texts_generator', 'split', 'texts_to_matrix', 'texts_to_sequences', 'texts_to_sequences_generator', 'to_json', 'word_counts', 'word_docs', 'word_index']


In [51]:
# Show the vocabulary entries with ids 1 through 10.
for idx, word in tokenizer.index_word.items():
    print(idx, ":", word)
    if idx >= 10:
        break

1 : <unk>
2 : <start>
3 : <end>
4 : ,
5 : .
6 : the
7 : and
8 : i
9 : to
10 : of


In [53]:
# Model input: every row of the padded matrix without its last column.
src_input = tensor[:, :-1]

In [60]:
# Training target: the same rows without their first column, i.e. shifted
# left by one, so position t holds the token that follows src_input[:, t].
# Despite the name, this is the label fed to the loss, not a model input.
tgt_input = tensor[:, 1:]

In [56]:
print(src_input[0])

[  2 143  40 933 140 591   4 124  24 110   5   3   0   0   0   0   0   0
   0   0]


In [62]:
print(tgt_input[0])

[143  40 933 140 591   4 124  24 110   5   3   0   0   0   0   0   0   0
   0   0]


In [69]:
len(src_input)

24015

In [72]:
len(src_input[0])

20

In [63]:
# Shuffle buffer as large as the dataset (one entry per sentence).
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
# Number of whole batches per epoch.
# NOTE(review): not referenced anywhere else in the visible notebook.
steps_per_epoch = len(src_input) // BATCH_SIZE

In [64]:
# Size of the model's output layer: the tokenizer's vocabulary cap plus one.
# Presumably the +1 leaves room for the padding id 0 — TODO confirm.
# NOTE(review): the model cell repeats `tokenizer.num_words + 1` instead of
# using this constant.
VOCAB_SIZE = tokenizer.num_words + 1

In [67]:
# (input, target) pairs -> shuffled -> fixed-size batches. drop_remainder=True
# discards the final partial batch so every batch has exactly BATCH_SIZE rows.
dataset = (
    tf.data.Dataset.from_tensor_slices((src_input, tgt_input))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

2022-11-25 12:03:17.101469: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [68]:
dataset

<BatchDataset element_spec=(TensorSpec(shape=(256, 20), dtype=tf.int32, name=None), TensorSpec(shape=(256, 20), dtype=tf.int32, name=None))>

In [73]:
dir(dataset)

['_GeneratorState',
 '__abstractmethods__',
 '__bool__',
 '__class__',
 '__class_getitem__',
 '__debug_string__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__tf_tracing_type__',
 '__weakref__',
 '_abc_impl',
 '_add_trackable_child',
 '_add_variable_with_custom_getter',
 '_apply_debug_options',
 '_as_serialized_graph',
 '_batch_size',
 '_checkpoint_dependencies',
 '_common_args',
 '_consumers',
 '_deferred_dependencies',
 '_deserialization_dependencies',
 '_deserialize_from_proto',
 '_drop_remainder',
 '_export_to_saved_model_graph',
 '_flat_shapes',
 '_flat_structure',
 '_flat_types',
 '_functions',
 '_gather_saveables_for_checkpoint',
 

In [83]:
# Print only the first (input, target) batch of the dataset.
for sample in dataset.take(1):
    print(sample)

(<tf.Tensor: shape=(256, 20), dtype=int32, numpy=
array([[   2,   71,   24, ...,    0,    0,    0],
       [   2,   25,  159, ...,    0,    0,    0],
       [   2,  120,   46, ...,    0,    0,    0],
       ...,
       [   2,  314,    7, ...,    0,    0,    0],
       [   2,    9, 4276, ...,    0,    0,    0],
       [   2, 6048,    6, ...,    0,    0,    0]], dtype=int32)>, <tf.Tensor: shape=(256, 20), dtype=int32, numpy=
array([[  71,   24,   31, ...,    0,    0,    0],
       [  25,  159, 3646, ...,    0,    0,    0],
       [ 120,   46,   34, ...,    0,    0,    0],
       ...,
       [ 314,    7,   26, ...,    0,    0,    0],
       [   9, 4276,   87, ...,    0,    0,    0],
       [6048,    6,  438, ...,    0,    0,    0]], dtype=int32)>)


In [88]:
class TextGenerator(tf.keras.Model):
    """Next-token language model: embedding -> two stacked LSTMs -> dense layer.

    Given a batch of token-id sequences, it returns one score per vocabulary
    entry for every position of every sequence.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size):
        """Create the layers.

        Args:
            vocab_size: number of token ids; used for both the embedding
                table and the width of the output layer.
            embedding_size: dimension of each token embedding.
            hidden_size: number of units in each LSTM layer.
        """
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_size
        )
        # return_sequences=True: each LSTM emits an output at every time step,
        # so a prediction is produced for every position.
        self.rnn_1 = tf.keras.layers.LSTM(units=hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(units=hidden_size, return_sequences=True)
        # No activation is set here, so the model outputs unnormalized scores.
        self.linear = tf.keras.layers.Dense(units=vocab_size)

    def call(self, x):
        """Run token ids `x` through the layers and return per-position scores."""
        embedded = self.embedding(x)
        recurrent = self.rnn_2(self.rnn_1(embedded))
        return self.linear(recurrent)
    
# Hyperparameters: width of each token embedding and of each LSTM layer.
embedding_size = 256
hidden_size = 1024
model = TextGenerator(
    vocab_size=tokenizer.num_words + 1,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
)

In [89]:
# Grab a single (input, target) batch: the loop body breaks immediately, so
# src_sample / tgt_sample keep the values of the first batch.
for src_sample, tgt_sample in dataset.take(1): break

In [90]:
model(src_sample)

<tf.Tensor: shape=(256, 20, 7001), dtype=float32, numpy=
array([[[ 1.44534322e-04,  3.70757334e-04, -7.25606005e-05, ...,
         -1.19911841e-04, -2.34439867e-04, -3.27364651e-05],
        [ 2.86474329e-04,  6.70218898e-04, -4.39357333e-04, ...,
         -2.44967057e-04, -4.15275921e-04,  5.29773708e-04],
        [ 3.24533612e-04,  8.06174474e-04, -6.10205403e-04, ...,
         -7.07736821e-04, -4.78244852e-04,  1.01226510e-03],
        ...,
        [ 2.64809642e-04, -1.44217163e-03, -8.85048066e-04, ...,
         -1.74949190e-03, -3.02353117e-04,  1.50539482e-03],
        [ 4.88101738e-04, -2.16178363e-03, -1.36829633e-03, ...,
         -2.02134508e-03, -5.32328617e-04,  1.63685391e-03],
        [ 6.47747889e-04, -2.78236088e-03, -1.86683191e-03, ...,
         -2.29873718e-03, -7.34247966e-04,  1.78448693e-03]],

       [[ 1.44534322e-04,  3.70757334e-04, -7.25606005e-05, ...,
         -1.19911841e-04, -2.34439867e-04, -3.27364651e-05],
        [ 5.75172162e-05,  6.68061955e-04, -2.

In [91]:
model.summary()

Model: "text_generator_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     multiple                  1792256   
                                                                 
 lstm (LSTM)                 multiple                  5246976   
                                                                 
 lstm_1 (LSTM)               multiple                  8392704   
                                                                 
 dense (Dense)               multiple                  7176025   
                                                                 
Total params: 22,607,961
Trainable params: 22,607,961
Non-trainable params: 0
_________________________________________________________________


In [92]:
# `tf.test.is_gpu_available()` is deprecated; TensorFlow's own warning says to
# use `tf.config.list_physical_devices('GPU')` instead. An empty list means
# no GPU is visible.
bool(tf.config.list_physical_devices('GPU'))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


False

In [93]:
tf.config.list_physical_devices('GPU')

[]

In [98]:
# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()

# from_logits=True because the model's final Dense layer has no activation,
# so its outputs are raw scores rather than probabilities.
# reduction='none' asks the loss object not to reduce the per-element losses
# itself.
# NOTE(review): padding positions (id 0) in the targets are not masked
# anywhere, so they also contribute to the loss — confirm this is intended.
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

# Long-running cell: trains for 30 passes over the batched dataset.
model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset, epochs=30)

In [99]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    """Greedily generate text, one token at a time, starting from `init_sentence`.

    At every step the whole sequence so far is fed to `model`, the
    highest-scoring token at the last position is appended, and the loop
    stops once <end> is produced or the sequence reaches `max_len` tokens.
    At least one token is always generated.

    Args:
        model: callable mapping a (1, length) int tensor of token ids to
            per-position vocabulary scores.
        tokenizer: fitted tokenizer providing `texts_to_sequences`,
            `word_index` and `index_word`.
        init_sentence: seed text; it is tokenized with `tokenizer`.
        max_len: upper bound on the total number of tokens (seed included).

    Returns:
        The generated words joined by spaces, with a trailing space.
    """
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    while True:
        predict = model(test_tensor)
        # Fix: this used to call `tf.argmez`, which does not exist and raised
        # AttributeError on the first step. Take the arg-max over the
        # vocabulary axis and keep only the prediction for the last position.
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1]
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)

        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    for word_index in test_tensor[0].numpy():
        # Ids missing from index_word (notably the padding id 0, which the
        # tokenizer never assigns to a word) are skipped instead of raising
        # KeyError.
        word = tokenizer.index_word.get(word_index)
        if word is None:
            continue
        generated += word + " "

    return generated

In [None]:
import glob
import os

# os.getenv(name) returns the value of the environment variable `name` as a
# string, so with HOME=/root this becomes "/root/aiffel/lyricist/data/lyrics/*".
txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'

# Every path matching the pattern. Sorted because glob returns matches in an
# arbitrary, filesystem-dependent order, which would make the corpus order
# (and the examples printed below) differ from run to run.
txt_list = sorted(glob.glob(txt_file_path))

# NOTE(review): this reuses the name `raw_corpus`, replacing the corpus that
# was loaded earlier in the notebook.
raw_corpus = []

# Read every text file and collect all of their lines in raw_corpus.
for txt_file in txt_list:
    # Explicit encoding so decoding does not depend on the platform's locale
    # default — assumes the lyric files are UTF-8; confirm.
    with open(txt_file, "r", encoding="utf-8") as f:
        # read() returns the whole file as one string; splitlines() turns it
        # into a list with one entry per line.
        raw = f.read().splitlines()
        # extend() appends all of those lines to the running list.
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])