In [22]:
import glob
import os
import re
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

def load_lyrics_data(txt_file_path):
    txt_list = glob.glob(txt_file_path)

    raw_corpus = []
    for txt_file in txt_list:
        with open(txt_file, "r") as f:
            raw = f.read().splitlines()
            for sentence in raw:
                preprocessed_sentence = preprocess_sentence(sentence)
                if preprocessed_sentence:
                    raw_corpus.append(preprocessed_sentence)
    return raw_corpus

def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z가-힣?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    tokens = sentence.split(" ")
    if len(tokens) > 15:
        return None
    return " ".join(tokens)

def tokenize(corpus, num_words=12000):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, filters=' ', oov_token="<unk>")
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, tokenizer

def build_model(vocab_size, embedding_dim, hidden_dim):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.LSTM(hidden_dim, return_sequences=True, stateful=False),
        tf.keras.layers.Dense(vocab_size)
    ])
    model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none'))
    return model

def generate_lyrics(model, tokenizer, init_sentence="<start>", max_len=15):
    input_sequence = tokenizer.texts_to_sequences([init_sentence])
    input_sequence = tf.keras.preprocessing.sequence.pad_sequences(input_sequence, maxlen=max_len, padding='post')
    generated_sentence = []
    for _ in range(max_len):
        output = model.predict(input_sequence)
        output_word_id = np.argmax(output, axis=-1)[:, -1]
        if output_word_id[0] not in tokenizer.index_word:
            continue
        generated_word = tokenizer.index_word[output_word_id[0]]
        if generated_word == "<end>" or (len(generated_sentence) == 0 and generated_word == "<pad>"):
            break
        generated_sentence.append(generated_word)
        input_sequence = np.concatenate([input_sequence[:, 1:], output_word_id[:, None]], axis=-1)
    return " ".join(generated_sentence)

txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'
raw_corpus = load_lyrics_data(txt_file_path)

tensor, tokenizer = tokenize(raw_corpus)
x_train, x_eval, y_train, y_eval = train_test_split(tensor[:,:-1], tensor[:, 1:], test_size=0.2, random_state=42)

batch_size = 64
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(batch_size, drop_remainder=True)
eval_data = tf.data.Dataset.from_tensor_slices((x_eval, y_eval)).batch(batch_size, drop_remainder=True)

embedding_dim = 256
hidden_dim = 1024

model = build_model(len(tokenizer.word_index)+1, embedding_dim, hidden_dim)
model.fit(train_data, epochs=10, validation_data=eval_data)

seed_text = "<start>"
generated_lyrics_line = generate_lyrics(model, tokenizer, seed_text)
print("Generated lyrics line:", generated_lyrics_line)

first_word = generated_lyrics_line.split()[0]
print("First word of generated lyrics line:", first_word)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Generated lyrics line: 


IndexError: list index out of range