In [1]:
from google.colab import drive
import os
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/embedding-GAN-gpu')

Mounted at /content/drive


In [2]:
import copy
import os
import json
from torch.utils.data import DataLoader
%load_ext autoreload
%autoreload 2

In [3]:
from data_utils import merge_lists, gen_clean, dis_clean

with open(os.path.join('../data/data_lyrics.json'), 'r') as fp:
    lyrics_data = json.load(fp)
with open(os.path.join('../data/data_news.json'), 'r') as fp:
    news_data = json.load(fp)
with open(os.path.join('../data/data_rap.json'), 'r') as fp:
    raw_rap_data = json.load(fp)

print(sum([len(raw_rap_data) for i in range(len(raw_rap_data))]))

gen_clean_control = {'lemmatize': True, 'stop_words': False, 'remove_number': True,}
all_raw = merge_lists(lyrics_data) + merge_lists(news_data) + merge_lists(raw_rap_data)
all_data = gen_clean(all_raw, gen_clean_control)
dis_all_data = dis_clean(all_raw, gen_clean_control)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
1444


In [15]:
batch_num = 3000

gen_args = {"pre_train_epochs": 20, "batch_size": 64, "sequence_length": 10}

In [16]:
from data_utils import add_some_music, gen_pre_data_preprocession, add_some_news, dis_pre_data_preprocession, get_dev_data

gen_percentage = 0.3 # percentage for generator pretraining from rap data
music_percentage = 0.1 # percentage of music added into rap lyrics


# generator data preprocessing
rap_music = add_some_music(raw_rap_data, lyrics_data, music_percentage)
final_train, gen_pre_cv, dis_rap_raw = gen_pre_data_preprocession(rap_music, gen_percentage, gen_clean_control)

gen_pre = gen_pre_cv[0:(batch_num + 1)*gen_args['batch_size']]

# discriminator data preprocessing
merge_len = 50
rap_news = add_some_news(dis_rap_raw, news_data, merge_len)
dis_pre = dis_pre_data_preprocession(rap_news)

# dis_pre = dis_pre[0:64*1000] + dis_pre[-64*1000:]

# get validation data
pre_dev_percentage = 0.1
gen_pre, gen_pre_dev = get_dev_data(gen_pre, pre_dev_percentage)
dis_pre, dis_pre_dev = get_dev_data(dis_pre, pre_dev_percentage)

with open('a.txt', 'w') as infile:
  for i in dis_pre:
    infile.write(str(i['y']))
    infile.write('    ')
    infile.write(i['X'])
    infile.write('\n')

20934


In [None]:
%pip install sentence_transformers
from sentence_transformers import SentenceTransformer
sen_embed = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:

!pip install --upgrade gensim
import gensim.downloader
embed = gensim.downloader.load('word2vec-google-news-300')

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x for x in dis_all_data])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))



In [10]:
from dataset import GENDataset, DISDataset, basic_collate_fn
import argparse

# parser = argparse.ArgumentParser()
# parser.add_argument('--pre-train-epochs', type=int, default=30)
# parser.add_argument('--batch-size', type=int, default=256)
# parser.add_argument('--sequence-length', type=int, default=5)
# gen_args = parser.parse_args()

# new_parser = argparse.ArgumentParser()
# new_parser.add_argument('--pre-train-epochs', type=int, default=30)
# dis_args = new_parser.parse_args()

dis_batch_size = 64

# gen_pre = gen_pre[0:512]
# gen_pre_dev = gen_pre[0:16]
# dis_pre_dev = dis_pre[0:16] + dis_pre[-16:]
# final_train = gen_pre[0:16]


gen_pre_data = GENDataset(gen_args, gen_pre, all_data, embed)
gen_pre_dev_data = GENDataset(gen_args, gen_pre_dev, all_data, embed)
final_train_data = GENDataset(gen_args, final_train, all_data, embed)

dis_pre_data = DISDataset(dis_pre, embed, tfidf)
dis_pre_dev_data = DISDataset(dis_pre_dev, embed, tfidf)

gen_dataloader = DataLoader(gen_pre_data, batch_size=gen_args["batch_size"])
gen_dev_loader = DataLoader(gen_pre_dev_data, batch_size=1)
final_loader = DataLoader(final_train_data, batch_size=1)

dis_dataloader = DataLoader(dis_pre_data, batch_size=dis_batch_size, collate_fn=basic_collate_fn, shuffle=True)
dis_dev_loader = DataLoader(dis_pre_dev_data, batch_size=gen_args["batch_size"])

In [11]:
import torch

print(torch.cuda.is_available())

True


In [12]:
from model.generator import Generator
from model.discriminator import Discriminator
import torch

print(torch.cuda.is_available())

lstm_input_size, num_layers, lstm_hidden_dim, dropout = 300, 2, 128, 0.1
dis_hidden_dim = 1024

generator = Generator(gen_pre_data, lstm_input_size, num_layers, lstm_hidden_dim, dropout).cuda()
discriminator = Discriminator(dis_hidden_dim)

True


In [17]:
from train import pre_train_generator, pre_train_discriminator
import matplotlib.pyplot as plt
import numpy as np

unk = np.mean([embed[word] for word in embed.index_to_key], axis=0)

# 16 min/epoch
device = 'cuda'
pre_patience = 50

gen_loss_type, gen_optim_type = 'mse', 'adam'
g_lr, g_weight_decay = 0.01, 0.00001

dis_loss_type, dis_optim_type = 'bce', 'adam'
d_lr, d_weight_decay = 0.001, 0.0001

# 8/49 per epoch, 64-17000

generator, plot_s = pre_train_generator(gen_args, generator, gen_dataloader, gen_dev_loader, gen_loss_type, gen_optim_type, g_lr, g_weight_decay, pre_patience, device, embed, unk)
# discriminator = pre_train_discriminator(dis_args, discriminator, dis_dataloader, dis_dev_loader, dis_loss_type, dis_optim_type, d_lr, d_weight_decay, pre_patience, device)

epoch = gen_args["pre_train_epochs"]
fig, axs = plt.subplots(2, 2, constrained_layout=True)
fig.tight_layout()
axs[0, 0].plot(list(range(1,epoch + 1)), plot_s['t_acc'])
axs[0, 0].set_title('Training Accuracy')
axs[0, 1].plot(list(range(1,epoch + 1)), plot_s['t_loss'])
axs[0, 1].set_title('Training Loss')
axs[1, 0].plot(list(range(1,epoch + 1)), plot_s['v_acc'])
axs[1, 0].set_title('Validation Accuracy')
axs[1, 1].plot(list(range(1,epoch + 1)), plot_s['v_loss'])
axs[1, 1].set_title('Validation Loss')

for ax in axs.flat:
    ax.set(xlabel='epochs', ylabel='performance')

plt.show()

KeyboardInterrupt: ignored

In [None]:
print(plot_s)

In [None]:
from train import pre_train_generator, pre_train_discriminator
import matplotlib.pyplot as plt
# 16 min/epoch
device = 'cuda'
pre_patience = 50

dis_args = {"pre_train_epochs": 50}

gen_loss_type, gen_optim_type = 'cross', 'adam'
g_lr, g_weight_decay = 0.003, 0.0003

dis_loss_type, dis_optim_type = 'bce', 'adam'
d_lr, d_weight_decay = 0.002, 0.0003
discriminator, plot_d = pre_train_discriminator(dis_args, discriminator, dis_dataloader, dis_dev_loader, dis_loss_type, dis_optim_type, d_lr, d_weight_decay, pre_patience, device)



epoch = dis_args["pre_train_epochs"]
fig, axs = plt.subplots(2, 2, constrained_layout=True)
fig.tight_layout()
axs[0, 0].plot(list(range(1,epoch + 1)), plot_d['t_acc'])
axs[0, 0].set_title('Training Accuracy')
axs[0, 1].plot(list(range(1,epoch + 1)), plot_d['t_loss'])
axs[0, 1].set_title('Training Loss')
axs[1, 0].plot(list(range(1,epoch + 1)), plot_d['v_acc'])
axs[1, 0].set_title('Validation Accuracy')
axs[1, 1].plot(list(range(1,epoch + 1)), plot_d['v_loss'])
axs[1, 1].set_title('Validation Loss')

for ax in axs.flat:
    ax.set(xlabel='epochs', ylabel='performance')

plt.show()

Batch: 500 0.010964691638946533
Batch: 1000 0.1511344611644745
Batch: 1500 0.012981433421373367
Batch: 2000 0.15391527116298676
{'epoch': 1, 'train_loss': 0.07576533281924169, 'train_acc': 0.9732142686843872, 'val_loss': 0.08290345353172877, 'val_acc': tensor(0.9682, device='cuda:0')}
Batch: 500 0.001788556226529181
Batch: 1000 0.012208241038024426
Batch: 1500 0.008604406379163265
Batch: 2000 0.07626893371343613
{'epoch': 2, 'train_loss': 0.0665300453322242, 'train_acc': 0.9762037396430969, 'val_loss': 0.06822687919126251, 'val_acc': tensor(0.9736, device='cuda:0')}
Batch: 500 0.057420000433921814
Batch: 1000 0.013866656459867954
Batch: 1500 0.02475588582456112
Batch: 2000 0.005385030061006546
{'epoch': 3, 'train_loss': 0.06130877907840045, 'train_acc': 0.9780063033103943, 'val_loss': 0.06544683723388096, 'val_acc': tensor(0.9743, device='cuda:0')}
Batch: 500 0.004867812152951956
Batch: 1000 0.021916940808296204
Batch: 1500 0.010877422988414764
Batch: 2000 0.036872729659080505


KeyboardInterrupt: ignored

In [None]:
print(plot_d['t_acc'])

In [None]:
def get_hyper_parameters():
    _g_para_list = [{"optim_type": 'adam', 'lr': 0.01, "weight_decay": 1e-4}]
    _d_para_list = [{"optim_type": 'adam', 'lr': 0.01, "weight_decay": 1e-4}]
    _num_epoch = 1
    _patience = 10
    _max_words = 10
    _device = 'cpu'
    return _g_para_list, _d_para_list, _num_epoch, _patience, _max_words, _device

In [None]:
import itertools
from train import train_model
from data_utils import plot_loss
from generate_rap import generate_rap
import numpy as np
from generate_rap import generate_rap

g_para_list, d_para_list, num_epoch, patience, max_words, device = get_hyper_parameters()

# model training
best_gen, best_dis, best_stats = copy.deepcopy(generator), copy.deepcopy(discriminator), None
best_lr, best_wd, best_bs, best_hd, best_lt, best_om = 0, 0, 0, 0, '', ''
best_dis_loss, best_gen_loss = float('-inf'), float('inf')
for g_para, d_para in itertools.product(g_para_list, d_para_list):
    g, d, stats = train_model(generator, discriminator, final_loader, final_train_data, num_epoch, g_para, d_para, gen_dev_loader, patience, max_words, device)

    # update best parameters if needed
    if np.mean(stats['dis_loss']) > best_dis_loss and np.mean(stats['gen_loss']) < best_gen_loss:
        best_dis_loss = np.mean(stats['dis_loss'])
        best_gen_loss = np.mean(stats['gen_loss'])
        best_gen, best_dis, best_stats = copy.deepcopy(g), copy.deepcopy(d) , copy.deepcopy(stats)
        best_g_para, best_d_para = g_para, d_para

    print("\n\nBest hidden dimension: {}, Best learning rate: {}, best weight_decay: {}, best batch_size: {}, best loss type： {}, best optimizer: {}".format(
    best_hd, best_lr, best_wd, best_bs, best_lt, best_om))
print("Generator loss: {:.4f}".format(best_gen_loss))
print("Discriminator loss: {:.4f}".format(best_dis_loss))
plot_loss(best_stats)

sen_input = "i build a castle"
num_sentences = 100
max_words = 10

lyrics = generate_rap(best_gen, sen_input, num_sentences, max_words, final_train_data)
with open('a.txt', 'w') as out:
  for sen in lyrics:
      out.write(sen + '.' + '\n')

In [None]:
from generate_rap import generate_rap

sen_input = "i build a castle"
sen_input = "just remember when you come up"
num_sentences = 100
max_words = 7

lyrics = generate_rap(generator, sen_input, num_sentences, max_words, final_train_data)
for sen in lyrics:
    print(sen + '.')

In [None]:
def plot_word_cloud(terms):
    text = terms.index
    text = ' '.join(list(text))
    # lower max_font_size
    wordcloud = WordCloud(max_font_size=40).generate(text)
    plt.figure(figsize=(25, 25))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
tfidf2 = pd.DataFrame(columns=['tfidf']).from_dict(dict(tfidf), orient='index')
tfidf2.columns = ['tfidf']
plot_word_cloud(tfidf2.sort_values(by=['tfidf'], ascending=True).head(100))