# [Exp-06]LyricistAI

## 라이브러리 불러오고 버전 확인

In [1]:
import tensorflow

print(tensorflow.__version__)

2.9.1


## 데이터 다운로드 및 불러오기

In [2]:
import glob
import os, re 
import numpy as np
import tensorflow as tf

txt_file_path = 'lyrics/*'

txt_list = glob.glob(txt_file_path)
# print(txt_list)
raw_corpus = []

# 여러개의 txt 파일을 모두 읽어서 raw_corpus 에 담습니다.
for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:10])

데이터 크기: 187088
Examples:
 ['THE QUEEN _of_ HEARTS', '  ', '', '    The Queen of Hearts she made some tarts,', "      All on a summer's day;", '', '    The Knave of Hearts he stole those tarts,', '      And took them clean away.', '', '    The King of Hearts called for those tarts,']


## 데이터 정제

정제 법칙
1. 불필요한 공백 제거
2. `를 제외한 모든 특수문자를 제거(^ a-zA-Z[]())
3. [글자],(글자) 형식을 제거(코러스 또는 표시용)
4. 문장 전체가 대문자인 경우 제거해줍니다.(동요에서 제목)
5. 모든 대문자를 소문자로 변경

정제 단계
1. 양쪽의 공백을 제거합니다.
2. 여러 개의 공백을 하나로 만듭니다.
3. 알파벳, 작은따옴표, 스페이스바와 나중에 제거할 특수문자를 제외한 문자를 제거합니다.
4. (문자), [문자]를 제거해줍니다.
5. setence의 모든 문자가 대문자인 경우 제거
6. 소문자로 만든 뒤 양쪽 공백을 제거해줍니다.
7. 빈 문장인 경우 start, end를 추가하지않습니다.

In [3]:
def preprocess_sentence(sentence):
    sentence = sentence.strip()
    sentence = re.sub('[" "]+', ' ', sentence)
    sentence = re.sub('[^a-zA-Z \'\(\)\[\]]+', '', sentence)
    sentence = re.sub('\([\w\d\s]*\)', '', sentence)
    sentence = re.sub('\[[\w\d\s-]*\]', '', sentence)
    if (str.isupper(sentence) == True):
        sentence = ""
    sentence = sentence.lower().strip()
    if len(sentence) != 0:
        sentence = '<start> ' + sentence + ' <end>'
    
    return sentence

# 테스트
test1 = "(Verse; 1) [Chorus:]  I'm movin' DING, DONG, BELL."
test2 = "UPPER YES"
test3 = "UPPER No"
test4 = "I CAN'T YOU"

print(preprocess_sentence(test1))
print(preprocess_sentence(test2))
print(preprocess_sentence(test3))
print(preprocess_sentence(test4))

<start> i'm movin' ding dong bell <end>

<start> upper no <end>



15개 이상 넘어가는 문장은 제외합니다.

In [4]:
corpus = []

for sentence in raw_corpus:
    if len(sentence) == 0:
        continue
    
    preprocessed_sentence = preprocess_sentence(sentence)
    if len(preprocessed_sentence) == 0:
        continue
    
    if preprocessed_sentence == '<start> the queen of hearts <end>':
        continue
    
    if len(preprocessed_sentence.split()) > 15:
        continue
    
    corpus.append(preprocessed_sentence)

corpus[:10]

['<start> the queen of hearts she made some tarts <end>',
 "<start> all on a summer's day <end>",
 '<start> the knave of hearts he stole those tarts <end>',
 '<start> and took them clean away <end>',
 '<start> the king of hearts called for those tarts <end>',
 '<start> and beat the knave full sore <end>',
 '<start> the knave of hearts brought back those tarts <end>',
 "<start> and vowed he'd steal no more <end>",
 "<start> st swithin's day if thou dost rain <end>",
 '<start> for forty days it will remain <end>']

## Tokenizer

In [5]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=12000, 
        filters=' ',
        oov_token="<unk>"
    )
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)   
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')  
    
    print(tensor,tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

[[    2     4   784 ...     0     0     0]
 [    2    21    15 ...     0     0     0]
 [    2     4 11137 ...     0     0     0]
 ...
 [    2   203     3 ...     0     0     0]
 [    2   425     9 ...     0     0     0]
 [    2     9  1466 ...     0     0     0]] <keras.preprocessing.text.Tokenizer object at 0x7f9232acfd60>


In [6]:
for k, v in tokenizer.word_counts.items():
    if len(k) == 1:
        print(k, v)

a 21566
i 37058
b 95
z 8
c 52
d 104
e 49
f 59
g 98
r 82
s 129
t 60
u 1814
w 4
x 127
y 54
o 103
h 6
j 29
k 38
l 26
m 46
n 132
p 149
q 9
v 21
' 59
) 16
[ 2
] 2
( 13


## 소스 및 타겟 문장 생성

In [7]:
src_input = tensor[:, :-1]
tgt_input = tensor[:, 1:]

print(raw_corpus[:9])
print(src_input[0])
print(tgt_input[0])

['THE QUEEN _of_ HEARTS', '  ', '', '    The Queen of Hearts she made some tarts,', "      All on a summer's day;", '', '    The Knave of Hearts he stole those tarts,', '      And took them clean away.', '']
[   2    4  784   17  943   45  215   87 9259    3    0    0    0    0]
[   4  784   17  943   45  215   87 9259    3    0    0    0    0    0]


## 평가 데이터셋 분리

* sklearn train_test_split() 함수를 사용해서 train, test, validation으로 나눕니다.
* 비율은 0.8, 0.2, 0.2 로 나눕니다.

In [8]:
from sklearn.model_selection import train_test_split

X_train_all, X_test, y_train_all, y_test = train_test_split(src_input, tgt_input, test_size = 0.2, random_state = 2022)
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size = 0.25, random_state = 2022)

print(X_train_all.shape)
print(X_test.shape)
print(X_val.shape)
print(y_train_all.shape)
print(y_test.shape)
print(y_val.shape)

(128269, 14)
(32068, 14)
(32068, 14)
(128269, 14)
(32068, 14)
(32068, 14)


## 인공지능 모델 만들기

In [9]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out

embedding_size = 256
hidden_size = 2048
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

2022-08-12 23:19:40.889878: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-12 23:19:40.919442: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-12 23:19:40.919655: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-12 23:19:40.920525: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [10]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

model.compile(loss=loss, optimizer=optimizer)

In [11]:
model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs=10, batch_size = 512)

Epoch 1/10


2022-08-12 23:19:57.262416: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8204


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f922bb52a10>

## 가사 만들기

In [12]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    while True:
        predict = model(test_tensor) 
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated

In [13]:
generate_text(model, tokenizer, init_sentence="<start> i love", max_len=20)

'<start> i love you <end> '

In [14]:
generate_text(model, tokenizer, init_sentence="<start> i", max_len=20)

"<start> i don't know what to do <end> "

In [15]:
generate_text(model, tokenizer, init_sentence="<start> he", max_len=20)

'<start> he said i like to be alone <end> '

In [16]:
generate_text(model, tokenizer, init_sentence="<start> you", max_len=20)

'<start> you know i love you <end> '

### 회고

* 이번 프로젝트에서 어려웠던 점
    - 데이터 전처리(정규식 표현 활용)
* 프로젝트를 진행하면서 알아낸 점 혹은 아직 모호한 점
  * 알게 된점
    - 정규표현식 활용법
    - 소스 문장과 타겟 문장 생성시 무조건 양끝을 제거하는 이유(모델계산과 행렬계산을 따로 생각해야함)
  * 아직 모호한 점
* 루브릭 평가를 맞추기 위해 시도한 점
    - 전처리를 위해 다양한 테스트 문장을 넣어보았고 대부분의 데이터를 육안으로 확인하였습니다.
    - loss 값을 낮추기 위해 pre 보단 post를 hidden size를 2배로 증가시켰습니다.
    - 여러개의 input을 넣었습니다.