# 데이터 불러와 전처리하기

In [None]:
# Mount Google Drive so files stored there are reachable under /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Load the preprocessed songs CSV (songs_preprocessed.csv) from the mounted Drive into a DataFrame
import pandas as pd
path = '/content/gdrive/My Drive/Colab Notebooks/더조은-딥러닝/data/songs_preprocessed.csv'
songs = pd.read_csv(path)

In [None]:
# Snapshot the raw genre labels as a plain Python list for one-hot encoding.
# NOTE: a later cell overwrites songs['genre'] with integer ids, so this must run
# before that cell; otherwise category_encode() sees integers instead of genre names.
categories = songs['genre'].to_list()

def category_encode(category):
    """Return a 4-element one-hot list for a genre label.

    Args:
        category: Genre label to encode.

    Returns:
        A new list of four ints. '발라드', '댄스' and '힙합' light up slots
        0, 1 and 2 respectively; every other value lights up slot 3.
    """
    known_genres = ('발라드', '댄스', '힙합')
    one_hot = [0, 0, 0, 0]
    for slot, genre in enumerate(known_genres):
        if category == genre:
            one_hot[slot] = 1
            return one_hot
    # Anything that is not one of the three known genres shares the last slot.
    one_hot[3] = 1
    return one_hot

In [None]:
# One-hot encode every genre label, preserving the row order of `categories`
encoded_category = list(map(category_encode, categories))
encoded_category

In [None]:
# Replace the genre names with integer class ids.
# Series.map leaves NaN for any genre that is not a key of the lookup table.
genre_to_id = {'발라드': 0, '댄스': 1, '힙합': 2, '트로트': 3}
songs['genre'] = songs['genre'].map(genre_to_id)
songs.head()

In [None]:
# Split the frame into model inputs (lyrics) and labels (genre column)
data = songs['lyric']
target = songs['genre']

# 전처리 함수 만들기

In [None]:
# Install KoNLPy, which provides the Okt tagger imported in the next cell
# (the leading `!` runs the line as a shell command in the notebook)
!pip install konlpy

In [None]:
import tensorflow as tf
from tensorflow import keras
import konlpy
from konlpy.tag import Okt
# Single shared Okt tagger instance; preprocess() calls okt.morphs() on it
okt = Okt()

In [None]:
def preprocess(text):
    """Tokenize `text` with the module-level Okt tagger.

    Args:
        text: The lyric string to tokenize.

    Returns:
        The result of `okt.morphs(text)`, i.e. the morpheme tokens of `text`.
    """
    morphemes = okt.morphs(text)
    return morphemes

In [None]:
# Sanity check: show the first five tokens produced for the first lyric
first_lyric_tokens = preprocess(data[0])
first_lyric_tokens[:5]

# 사전 만들기

In [None]:
# Tokenize every lyric; `data` becomes a list of token lists
data = list(map(preprocess, data))

In [None]:
# Build the vocabulary statistics needed for indexing, padding and the embedding size.
from collections import Counter

# Frequency of every distinct token across all lyrics.
token_counts = Counter()
# Token count of the longest lyric (used later as the padding length).
max_length = 0

for lyric in data:
    max_length = max(max_length, len(lyric))
    token_counts.update(lyric)

# (token, frequency) pairs, most frequent first; ties keep first-seen order.
tokens = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)

# Largest integer id the vocabulary will use: the indexing cell numbers the tokens
# with frequency > 1 as 1..k and sends the rest to 0, so k is the count of such tokens.
# (Previously this read the loop counter `i` of a later cell, which is undefined on a
# fresh top-to-bottom run.)
max_i = sum(1 for _, frequency in tokens if frequency > 1)

print(max_length)
print(max_i)


In [None]:
# Peek at the five most frequent (token, frequency) pairs
tokens[0:5]

# 정수로 변환

In [None]:
# Assign integer ids to tokens: tokens seen more than once get 1, 2, 3, ... in
# the order of `tokens`; tokens seen only once all collapse to id 0.
token_to_index = {}
i = 1  # next free id
for token, frequency in tokens:
    if frequency <= 1:
        # Frequency-1 tokens are folded into id 0
        token_to_index[token] = 0
        continue
    token_to_index[token] = i
    i += 1

In [None]:
# Replace every token with its integer id, lyric by lyric
data_indexed = [
    [token_to_index[tok] for tok in lyric_tokens]
    for lyric_tokens in data
]

In [None]:
# Right-pad every sequence shorter than max_length with zeros
data_padded = keras.preprocessing.sequence.pad_sequences(
    data_indexed,
    maxlen=max_length,
    padding='post',
)

In [None]:
# Display the padded id sequences
data_padded

In [None]:
# Display the one-hot encoded genre labels
encoded_category

# 모델 구현

In [None]:
# Display the padded sequence length that the model below receives as input_length
max_length

In [None]:
# Lyric classifier: embedding -> LSTM -> dense hidden layer -> 4-way softmax
model = keras.Sequential([
    # Token ids run from 0 to max_i, hence max_i + 1 embedding rows;
    # mask_zero=True makes downstream layers skip positions holding id 0.
    keras.layers.Embedding(
        input_dim=max_i + 1,
        output_dim=64,
        input_length=max_length,
        mask_zero=True,
    ),
    keras.layers.LSTM(64),
    keras.layers.Dense(32, activation="relu"),
    # One output unit per genre class
    keras.layers.Dense(4, activation='softmax'),
])
model.summary()

In [None]:
# Check the shape of the one-hot label array.
# numpy is imported here because this cell runs before the notebook's numpy import cell.
import numpy as np
np.array(encoded_category).shape

In [None]:
# Check the shape of the padded input array.
# numpy is imported here because this cell runs before the notebook's numpy import cell.
import numpy as np
np.array(data_padded).shape

In [None]:
import numpy as np

In [None]:
# Configure training. The sparse loss takes the integer class ids held in `target`;
# feeding the one-hot `encoded_category` instead would call for
# 'categorical_crossentropy'.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 20 epochs; `hist.history` records the per-epoch loss and accuracy.
hist = model.fit(
    x=np.array(data_padded),
    y=np.array(target),
    epochs=20,
)

In [None]:
# Plot the training accuracy recorded for each epoch.
# matplotlib is imported here because this cell runs before any other cell imports it.
import matplotlib.pyplot as plt

plt.plot(hist.history['accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Plot the loss recorded for each epoch.
# fit() returned `hist`; its per-epoch metrics live in the `hist.history` dict.
history = hist.history

plt.plot(history['loss'])
legend_labels = ['train']
# 'val_loss' is only recorded when fit() is given validation data, so plot it
# only when present instead of failing with a KeyError.
if 'val_loss' in history:
    plt.plot(history['val_loss'])
    legend_labels.append('valid')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(legend_labels, loc='upper left')
plt.show()
