In [208]:
# pip install tensorboardX
# pip install jupyter_tensorboard
pip install -q tensorboard

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import pandas as pd
import numpy as np
import import_ipynb
from sklearn.model_selection import train_test_split, ParameterGrid
from torchtext.data import get_tokenizer
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import gensim.downloader
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import torch
from tensorboardX import SummaryWriter

# ipynb
from Preprocessing import *
from DataLoader_class import *
from ass3_deep import *
from RNN import *

importing Jupyter notebook from Preprocessing.ipynb
importing Jupyter notebook from DataLoader_class.ipynb
importing Jupyter notebook from ass3_deep.ipynb
PyTorch Version: 1.13.1+cu117

Python 3.7.11 (default, Jul 27 2021, 14:32:16) 
[GCC 7.5.0]
Pandas 1.3.5
Scikit-Learn 1.0.2
GPU is available
importing Jupyter notebook from RNN.ipynb


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [3]:
# Paths
DATA_PATH = 'ass3_file'
MIDI_PATH =  os.path.join(DATA_PATH,'midi_files/')
TRAIN_PATH =  os.path.join(DATA_PATH,'lyrics_train_set.csv')
TEST_PATH =  os.path.join(DATA_PATH,'lyrics_test_set.csv')
PICK_PATH = os.path.join(DATA_PATH,'pickle_file')
TRAIN_PKL_PATH = os.path.join(PICK_PATH,'train_df.pkl')
TEST_PKL_PATH = os.path.join(PICK_PATH,'test_df.pkl')
PATH_LOG = os.path.join(DATA_PATH,'logs')
WORD2VEC_name = 'glove-wiki-gigaword-300'
word2vec = gensim.downloader.load(WORD2VEC_name)

# Variables
VECTOR_SIZE = 300

In [22]:
# Open pikcle files
train_df = pd.read_pickle(TRAIN_PKL_PATH)
test_df = pd.read_pickle(TEST_PKL_PATH)

In [21]:
def main():
    if not os.path.exists(TRAIN_PKL_PATH):
        # Rename to lower case
        for file in os.listdir(MIDI_PATH):
            os.rename(MIDI_PATH + file, MIDI_PATH + file.lower())
                 
        # Dataframes
        train_df = (pd.read_csv(TRAIN_PATH, header = None)
                .rename(columns={0:'artist',1:'song',2:'lyrics'})
                .drop(columns=[3,4,5,6], axis=1))

        test_df = (pd.read_csv(TEST_PATH, header = None)
                .rename(columns={0:'artist',1:'song',2:'lyrics'}))
        preprocess(train_df, test_df, word2vec)
    
main()

5it [00:02,  2.39it/s]


In [29]:
tokens = list(train_df['tokens'])+list(test_df['tokens'])
tokens_lst = list(set([word for lst in tokens for word in lst]))
total_words = len(tokens_lst)
index2word = {i:tokens_lst[i] for i in range(len(tokens_lst))}
word2index = {tokens_lst[i]:i for i in range(len(tokens_lst))}

# Convert list of tokens to list of indexes
train_df['tokens'] = train_df['tokens'].apply(lambda x: text2index(x, word2index))
test_df['tokens'] = test_df['tokens'].apply(lambda x: text2index(x, word2index))

word2vec_matrix = get_word2vec_matrix(total_words=len(tokens_lst),
                                      index2word=index2word,
                                      word2vec=word2vec,
                                      vector_size=VECTOR_SIZE)

all_tokens = [token for sublist in list(train_df['tokens']) for token in sublist]
tf_tokens = Counter(all_tokens)
#aplly log on each freq
tf_tokens = {key:np.log2(val+1) for key, val in tf_tokens.items()}

## Parmeters

In [64]:
NUM_EPOCH = 50
# params = {'learning_rate' : [0.001, 0.01], 'batch_size' : [32, 64], 'num_layers': [1,2], 'units' : [256,512], 'seq_length':[1,5,9]}

# best params
params = {'learning_rate' : [0.001], 'batch_size' : [32], 'num_layers': [2], 'units' : [512], 'seq_length':[9]} 

# Create a SummaryWriter object
# log_writer = SummaryWriter(log_dir=PATH_LOG)

## Train function

In [9]:
def train(parameters, DataLoader_train, size, model, optimizer, loss_fun):
    for epoch in range(NUM_EPOCH):
        # print(f'Epoch : {epoch}')
        current_loss = 0
        for step, (X, y, tf, features) in enumerate(DataLoader_train):
            input_sequence, output_sequence, tf, features = X.to(device), y.to(device), tf.to(device), features.to(device)
            pred = model(input_sequence, features)
            # loss
            loss = loss_fun(pred, output_sequence, tf)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            current_loss += loss.item()
            
            # Log scalars
            log_writer.add_scalar('train/loss', loss.item(), epoch * len(DataLoader_train) + step)
            
        if epoch == 0:
            first_loss = current_loss 
        if epoch == NUM_EPOCH - 1:
            final_loss = current_loss 


    final_loss = final_loss/size 
    print(f'batch_size: {parameters["batch_size"]}, num_layers: {parameters["num_layers"]}, units: {parameters["units"]}, lr: {parameters["learning_rate"]}, seq_len: {parameters["seq_length"]}')
    print(f'first_loss: {first_loss/size}, final_Loss: {final_loss}')
    print("---------------------------------------------------------------")
    return model, final_loss

## Experiments

In [14]:
results = []
for parameters in tqdm(ParameterGrid(params)):

    # Create sequences
    encoded_lyrics_list = list(train_df['tokens'])
    features_list = list(train_df['feature_method1'])
    
    input_sequences, word2vec_next_words, tokens_tf_array, input_features = create_sequences(encoded_lyrics_list, features_list, total_words, parameters['seq_length'], word2vec_matrix , VECTOR_SIZE,tf_tokens)
    X_train, y_train = input_sequences, word2vec_next_words
    
    # Train DataLoader
    SongDataset_train = SongDataset(X_train,  y_train, tokens_tf_array, input_features)
    DataLoader_train = DataLoader(SongDataset_train, batch_size=parameters['batch_size'], shuffle=True)
    size = len(DataLoader_train)

    # Initialize the LSTM Network
    model = LSTMLyrics(total_words=total_words, vector_size=VECTOR_SIZE, word2vec_matrix=word2vec_matrix,num_layers = parameters['num_layers'], units=parameters['units'], features_size=2).to(device)

    # Adam Optimizer
    optimizer = optim.Adam(model.parameters(), lr=parameters['learning_rate'])

    # Loss - cross entropy loss
    cos = Custom_L1_Loss()    
    
    start = time()
    model.train()
    model, loss = train(parameters, DataLoader_train, size, model, optimizer, cos)
    train_time = time() - start
    
    #result
    temp_result = list(parameters.values())
    temp_result += [round(train_time,3),loss]
    results += [temp_result]
    


100%|██████████| 1/1 [03:20<00:00, 200.42s/it]

batch_size: 32, num_layers: 2, units: 512, lr: 0.001, seq_len: 9
first_loss: 0.03486593927882144, final_Loss: 0.022081862190279408
---------------------------------------------------------------





In [63]:
test_df

Unnamed: 0,tokens,feature_method1
0,"[1791, 1348, 2157, 1681, 1874, 1348, 6395, 670...","[[3, 3], [3, 4], [2, 4], [3, 12], [2, 11], [3,..."
1,"[4456, 4073, 5699, 4272, 964, 2209, 321, 1263,...","[[2, 0], [1, 0], [1, 0], [1, 0], [2, 0], [1, 0..."
2,"[5139, 6554, 2531, 2984, 4642, 1350, 6162, 407...","[[2, 11], [1, 6], [1, 6], [1, 5], [2, 9], [1, ..."
3,"[72, 285, 3078, 6744, 2562, 2650, 4073, 6342, ...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0..."
4,"[147, 2056, 2723, 5413, 2477, 6147, 1801, 2054...","[[3, 0], [2, 30], [2, 31], [3, 41], [2, 39], [..."


# Test

In [30]:
def create_sequences(encoded_lyrics_list, features_list, total_words, seq_length, word2vec, vector_size, tokens_tf,test = False,):
    """
    This function creates sequences from the lyrics
    :param encoded_lyrics_list: A list representing all the songs in the dataset (615 songs). Each cell contains a list
    of ints, where each int corresponds to the lyrics in that song. "I'm a barbie girl" --> [23, 52, 189, 792] etc.
    :param total_words: Number of words in our word2vec dictionary.
    :param seq_length: Number of words predating the word to be predicted.
    :return: (1) A numpy array containing all the sequences seen, concatenated.
             (2) A 2d numpy array where each row represents a word and the columns are the possible words in the
             vocabulary. There is a '1' in the corresponding word (e.g, word number '20,392' in the dataset is word
              number '39' in the vocab.
    """
    input_sequences = []
    next_words = []
    next_tf = []
    lst_features = []
    for idx, song_sequence in enumerate(encoded_lyrics_list):  # iterate over songs
        feature_sequence = features_list[idx]
        for i in range(seq_length, len(song_sequence), seq_length):  # iterate from minimal sequence length (number of words) to
            start_index = i - seq_length  # number of words in the song
            end_index = i
            # Slice the list into the desired sequence length
            sequence = song_sequence[start_index:end_index]
            features = feature_sequence[start_index:end_index]
            next_word = song_sequence[start_index+1:end_index+1]
            
            # append to lists
            input_sequences.append(sequence)
            lst_features.append(features)
            next_words.append(next_word)
            if test : 
                break
            next_tf.append([tokens_tf[tf] for tf in next_word])
            
    input_sequences = np.array(input_sequences)
    input_features = np.array(lst_features)
    word2vec_next_word = convert_to_word2vec(word2vec, next_words, vector_size, seq_length)
    return input_sequences, word2vec_next_word, np.array(next_tf), input_features

In [79]:
encoded_lyrics_list = list(test_df['tokens'])
words = [word[1] for word in encoded_lyrics_list]
index2word[4073]

'you'

In [83]:
seq_length =  1
length_lst = [55,95,77,93,58]
# Create sequences
encoded_lyrics_list = list(test_df['tokens'])
wordss = [word[2] for word in encoded_lyrics_list]

features_list = list(test_df['feature_method1'])[0]

input_sequences, word2vec_next_words, _ , input_features = create_sequences(encoded_lyrics_list, features_list, total_words, seq_length, word2vec_matrix , VECTOR_SIZE,None, True)
X_test, y_test = input_sequences, word2vec_next_words
SongDataset_test = SongDataset(X_test,  y_test, None, input_features)

DataLoader_test = DataLoader(SongDataset_test, batch_size= 1, shuffle=False)

for  idx, (X,y, _, _) in enumerate(DataLoader_test):
    # Begin of the song
    z = wordss[idx]
    print(f'start: {index2word[z]}')
    # print(f'start: {[index2word[x] for x in X.to("cpu").detach().numpy()[0]]}')
    print("------------------------")
    
    # List of tokens for adding every step the predicted next token in the sequence
    x_cpu = [z]
            
    WORDS_NUM = length_lst[idx]
    for i in range(WORDS_NUM):
        # Next sequence for the model
        if len(x_cpu) <= 5:
            seq = x_cpu[:len(x_cpu)]
            features_seq = features_list[:len(x_cpu)]
        else:
            seq = x_cpu[i-5:i]
            features_seq = features_list[i-5:i]

        # reshape with batch_size=1
        reshape_seq = np.array(seq).reshape((1,len(seq)))
        reshape_features = np.array(features_seq).reshape((1,len(seq),2))

        
        input_sequence = torch.tensor(reshape_seq, dtype=torch.long).to(device)
        input_features = torch.tensor(reshape_features, dtype=torch.long).to(device)
        
        vector_pred = model(input_sequence, input_features).to('cpu').detach().numpy()[0]
        
        top_n = 20
                
        # Sampling
        closest_words = word2vec.most_similar(vector_pred[-1],topn=top_n)
        closest_words = [tup for tup in closest_words if tup[0] in word2index]
        words = np.array([word for word, similarity in closest_words])
        similarities = np.array([similarity for word, similarity in closest_words])
        weights = softmax_stable(similarities)
        # Sample
        while True:
            sample = np.random.choice(words, p=weights)
            if sample != index2word[x_cpu[-1]]:
                break
        
        x_cpu.append(word2index[sample])
    
    words = [index2word[x] for x in x_cpu]
    print(" ".join(words))

start: eyes
------------------------
eyes however know not can take go so only even because so , way going just off not what so come why others both only would what can just let we because think when because but though although just you go sure even time if both do should take get just get out even both but
start: search
------------------------
search even what not but they know sure well way be what both you would get do so did come now only but know same when just going ? put it come it going so all if just know up because also so get younger only not well feel even so you thing if make really me thing think but so though this think . , though not going you to think ? but however think things when well not way as although your go feelings you because what well i so think come well but
start: fear
------------------------
fear even think its if going i instead need it but others if so things so could but been so because one in only one something get everyone going when then if what w

In [176]:
" ".join([index2word[x] for x in list(test_df['tokens'].iloc[3])])

'hiya barbie hi ken ! do you want to go for a ride ? sure ken jump in im a barbie girl in a barbie world life in plastic its fantastic you can brush my hair undress me everywhere imagination life is your creation come on barbie lets go party ! im a barbie girl in a barbie world life in plastic its fantastic you can brush my hair undress me everywhere imagination life is your creation im a blond bimbo girl in a fantasy world dress me up make it tight im your dolly youre my doll rocknroll feel the glamor in pink kiss me here touch me there hanky panky you can touch you can play if you say im always yours im a barbie girl in a barbie world life in plastic its fantastic you can brush my hair undress me everywhere imagination life is your creation come on barbie lets go party ! ( ah ah ah yeah ) come on barbie lets go party ! ( oh oh ) come on barbie lets go party ! ( ah ah ah yeah ) come on barbie lets go party ! ( oh oh ) make me walk make me talk do whatever you please i can act like a s

In [38]:
def softmax_stable(x):
    return(np.exp(x - np.max(x)) / np.exp(x - np.max(x)).sum())

In [None]:
# the lstm neural network
# the output in create sequences