In [1]:
import copy
import os
import json
from torch.utils.data import DataLoader

In [2]:
# Load the three raw JSON corpora used by the preprocessing cell below.
# The encoding is pinned to UTF-8 (the encoding JSON text is specified to use)
# so decoding does not depend on the platform's default locale encoding.
# The paths are relative to the notebook's working directory.
with open('../data/data_lyrics.json', 'r', encoding='utf-8') as fp:
    lyrics_data = json.load(fp)
with open('../data/data_news.json', 'r', encoding='utf-8') as fp:
    news_data = json.load(fp)
with open('../data/data_rap.json', 'r', encoding='utf-8') as fp:
    raw_rap_data = json.load(fp)

In [3]:
from data_utils import (
    add_some_music,
    add_some_news,
    dis_pre_data_preprocession,
    gen_pre_data_preprocession,
    get_dev_data,
)

# Tunable knobs for this cell. How each value is interpreted is decided by the
# data_utils helpers they are passed to.
gen_percentage = 0.7
music_percentage = 0.1
pre_dev_percentage = 0.1
gen_clean_control = dict(lemmatize=True, stop_words=True, remove_number=True)

# Generator data: run the rap and lyrics corpora through add_some_music, then
# gen_pre_data_preprocession, which yields three outputs (final training data,
# generator pre-training data, and raw data handed on to the discriminator path).
rap_music = add_some_music(raw_rap_data, lyrics_data, music_percentage)
final_train, gen_pre, dis_rap_raw = gen_pre_data_preprocession(
    rap_music, gen_percentage, gen_clean_control
)

# Discriminator data: built from the generator step's third output plus the
# news corpus.
rap_news = add_some_news(dis_rap_raw, news_data)
dis_pre = dis_pre_data_preprocession(rap_news)

# Validation data: get_dev_data returns a pair; the first element replaces the
# pre-training set and the second is kept as the dev set.
gen_pre, gen_pre_dev = get_dev_data(gen_pre, pre_dev_percentage)
dis_pre, dis_pre_dev = get_dev_data(dis_pre, pre_dev_percentage)

In [4]:
from sentence_transformers import SentenceTransformer

# Identifier of the pretrained sentence-embedding model to load; the resulting
# encoder is handed to DISDataset in the dataset cell.
SENTENCE_MODEL_NAME = 'bert-base-nli-mean-tokens'
sen_embed = SentenceTransformer(SENTENCE_MODEL_NAME)

In [5]:
from dataset import GENDataset, DISDataset, basic_collate_fn
import argparse  # not used by this cell; left in scope in case other cells rely on it

# Settings are plain dicts (indexed by key below) rather than parsed CLI
# arguments, since a notebook kernel has no meaningful command line to parse.
gen_args = {"pre_train_epochs": 30, "batch_size": 256, "sequence_length": 5}
dis_args = {"pre_train_epochs": 30}

dis_batch_size = 32

# Generator datasets: pre-training, dev and final training splits.
gen_pre_data = GENDataset(gen_args, gen_pre)
gen_pre_dev_data = GENDataset(gen_args, gen_pre_dev)
final_train_data = GENDataset(gen_args, final_train)

# Discriminator datasets: each one is given the sentence encoder.
dis_pre_data = DISDataset(dis_pre, sen_embed)
dis_pre_dev_data = DISDataset(dis_pre_dev, sen_embed)

gen_dataloader = DataLoader(gen_pre_data, batch_size=gen_args["batch_size"])
gen_dev_loader = DataLoader(gen_pre_dev_data, batch_size=gen_args["batch_size"])
final_loader = DataLoader(final_train_data, batch_size=gen_args["batch_size"])

# Both discriminator loaders wrap the same dataset type, so they must batch the
# same way: same collate function and the discriminator's own batch size.
# The dev loader is not shuffled because evaluation order does not need to vary.
dis_dataloader = DataLoader(dis_pre_data, batch_size=dis_batch_size, collate_fn=basic_collate_fn, shuffle=True)
dis_dev_loader = DataLoader(dis_pre_dev_data, batch_size=dis_batch_size, collate_fn=basic_collate_fn)

{'pre_train_epochs': 30, 'batch_size': 256, 'sequence_length': 5}
{'pre_train_epochs': 30, 'batch_size': 256, 'sequence_length': 5}
{'pre_train_epochs': 30, 'batch_size': 256, 'sequence_length': 5}


In [6]:
from model.generator import Generator
from model.discriminator import Discriminator

# Generator hyperparameters, passed positionally to Generator below.
lstm_input_size = 128
num_layers = 2
lstm_hidden_dim = 32
dropout = 0.1

# Discriminator hyperparameter.
dis_hidden_dim = 1024

# The generator also receives the pre-training dataset object.
generator = Generator(
    gen_pre_data, lstm_input_size, num_layers, lstm_hidden_dim, dropout
)
discriminator = Discriminator(dis_hidden_dim)

In [7]:
from train import pre_train_generator, pre_train_discriminator

# Settings shared by both pre-training runs.
device = 'cpu'
pre_patience = 10

# Generator pre-training settings.
# NOTE(review): the output recorded for this cell is a ValueError about a
# target of size [256, 5] versus an input of size [256, 25973, 5]. That looks
# like the 'bce' loss not matching the generator's output shape -- confirm
# which loss types the train module supports and pick a suitable one.
gen_loss_type = 'bce'
gen_optim_type = 'adam'
g_lr = 0.001
g_weight_decay = 0.00001

# Discriminator pre-training settings.
dis_loss_type = 'bce'
dis_optim_type = 'adam'
d_lr = 0.001
d_weight_decay = 0.00001

# Each call returns a model, which replaces the one passed in.
generator = pre_train_generator(
    gen_args, generator, gen_dataloader, gen_dev_loader,
    gen_loss_type, gen_optim_type, g_lr, g_weight_decay,
    pre_patience, device,
)
discriminator = pre_train_discriminator(
    dis_args, discriminator, dis_dataloader, dis_dev_loader,
    dis_loss_type, dis_optim_type, d_lr, d_weight_decay,
    pre_patience, device,
)

ValueError: Target size (torch.Size([256, 5])) must be the same as input size (torch.Size([256, 25973, 5]))

In [None]:
def get_hyper_parameters():
    """Return the fixed settings unpacked by the training cell.

    Returns:
        tuple: ``(g_para_list, d_para_list, num_epoch, patience, max_words,
        device)`` where the two lists each hold one optimizer-settings dict
        (separate dict objects with equal contents), followed by the values
        40, 10, 10 and ``'cpu'``.
    """
    def _candidate():
        # A new dict per call, so the generator and discriminator lists never
        # share one mutable object.
        # NOTE(review): "weight_deacy" looks like a misspelling of
        # "weight_decay"; it is kept as-is because the consumer of this key is
        # not visible here -- confirm before renaming.
        return {"optim_type": 'adam', 'lr': 0.01, "weight_deacy": 1e-4}

    return [_candidate()], [_candidate()], 40, 10, 10, 'cpu'

In [None]:
import itertools
from train import train_model
from data_utils import plot_loss

g_para_list, d_para_list, num_epoch, patience, max_words, device = get_hyper_parameters()

# Search over every (generator settings, discriminator settings) pair and keep
# the best result.
best_gen, best_dis, best_stats = None, None, None
best_g_para, best_d_para = None, None
best_dis_loss, best_gen_loss = 0, 100
for g_para, d_para in itertools.product(g_para_list, d_para_list):
    # Train copies of the pre-trained model *instances* (not the Generator /
    # Discriminator classes). Copying means every candidate starts from the
    # same pre-trained weights and the class names are never rebound.
    trained_gen, trained_dis, stats = train_model(
        copy.deepcopy(generator), copy.deepcopy(discriminator),
        final_loader, final_train_data, num_epoch, g_para, d_para,
        gen_dev_loader, patience, max_words, device,
    )

    # The first candidate is always accepted so a best model exists even when
    # its losses fall outside the initial thresholds. Later candidates must
    # beat the current best on both losses.
    improves = stats['dis_loss'] > best_dis_loss and stats['gen_loss'] < best_gen_loss
    if best_stats is None or improves:
        best_dis_loss = stats['dis_loss']
        best_gen_loss = stats['gen_loss']
        best_gen, best_dis, best_stats = trained_gen, trained_dis, copy.deepcopy(stats)
        best_g_para, best_d_para = g_para, d_para

if best_stats is None:
    # Only reachable when one of the parameter lists is empty.
    raise ValueError("No hyper-parameter combination was trained; check get_hyper_parameters().")

# Report the settings that were actually selected, once, after the search.
print("\n\nBest generator parameters: {}, best discriminator parameters: {}".format(
    best_g_para, best_d_para))
print("Generator loss: {:.4f}".format(best_gen_loss))
print("Discriminator loss: {:.4f}".format(best_dis_loss))
plot_loss(best_stats)

In [None]:
from generate_rap import generate_rap

# Generation settings. Note that max_words is assigned again here, replacing
# the value unpacked in the training cell.
sen_input = "I build a castle"
num_sentences, max_words = 10, 10

# Generate from the selected generator and print each returned item on its
# own line.
lyrics = generate_rap(
    best_gen, sen_input, num_sentences, max_words, final_train_data
)
for sen in lyrics:
    print(sen)