# Create combined Dataset

With this notebook, a dataset combining log mel-spectrograms and tokenized lyrics is created. Attention: you need to calculate the log mel-spectrograms first and store them in a pkl file (see the `CalculateSpectrograms.ipynb` notebook)! They are needed to run this notebook correctly :)

The resulting dataset can be used for AudioNet, LyricsNet and FusionNet models for training and testing.

# Libraries

In [None]:
import pandas as pd # handling csv data
import pickle as pkl # handle pkl data
import numpy as np # numerical operations
from tqdm import tqdm # progress bar
import time # measure processing time
from google.colab import drive # connect to Google Drive
import warnings
import datetime

!pip install transformers
!pip install sentencepiece # needed for XLNet
# XLNet 
from transformers import XLNetTokenizer, XLNetForSequenceClassification, XLNetModel, XLNetConfig
# Padding/Truncating sequence to MAX_LEN
from keras.preprocessing.sequence import pad_sequences

# Helper Functions

In [None]:
def load_pkl_data(pkl_dir):
  """Load a pickled object from disk and report progress and timing.

  Args:
    pkl_dir: Path of the pickle file to read.

  Returns:
    The object stored in the pickle file.

  Raises:
    Exception: Any error raised while opening or unpickling the file is
      printed and then re-raised, so the caller sees the real cause instead
      of a follow-up error about an undefined result.
  """
  t_start = time.time()
  print('\nLoading data...')
  try:
    # NOTE(security): unpickling can execute arbitrary code - only load
    # files that come from a trusted source.
    with open(pkl_dir, "rb") as f:
      data = pkl.load(f)
  except Exception as e:
    print(e)
    raise

  print('\nData loaded successfully from \n{}!'.format(pkl_dir))
  # Only sized objects can report an entry count; anything else is still
  # returned to the caller unchanged.
  if hasattr(data, '__len__'):
    print('\nNumber of data entries: {}'.format(len(data)))

  t_elapsed = time.time() - t_start
  print('\nTime elapsed: {} seconds.'.format(np.round(t_elapsed,2)))

  return data


def load_csv_data(csv_dir):
  """Load a CSV file into a DataFrame and report progress and timing.

  The first CSV column is used as the DataFrame index (`index_col=0`).

  Args:
    csv_dir: Path of the CSV file to read.

  Returns:
    pandas.DataFrame with the contents of the CSV file.

  Raises:
    Exception: Any error raised by `pd.read_csv` is printed and re-raised.
  """
  t_start = time.time()
  print('\nLoading data...')
  try:
    # Read the file exactly once, inside the timed and guarded section.
    data = pd.read_csv(csv_dir,index_col=0)
  except Exception as e:
    print(e)
    raise

  print('\nData loaded successfully from \n{}!'.format(csv_dir))
  print('\nNumber of data entries: {}'.format(data.shape[0]))

  t_elapsed = time.time() - t_start
  print('\nTime elapsed: {} seconds.'.format(np.round(t_elapsed,2)))

  return data

# Pre-processing data

In [None]:
# Load tokenizer
def get_tokenizer(transformer_name):
  """Return the pretrained tokenizer that belongs to the chosen transformer.

  Args:
    transformer_name: Either "XLNet" (loads 'xlnet-base-cased') or "BERT"
      (loads 'bert-base-uncased').

  Returns:
    The tokenizer instance created by `from_pretrained`.

  Raises:
    ValueError: If `transformer_name` is neither "XLNet" nor "BERT".
  """
  if transformer_name == "XLNet":
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)
  elif transformer_name == "BERT":
    # BertTokenizer is not part of the notebook's import cell; import it here
    # so that this branch does not fail with a NameError.
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
  else:
    raise ValueError("transformer_name must be set to 'XLNet' or 'BERT'!")
  return tokenizer


def preprocess_lyrics(lyrics,tokenizer,transformer_name,MAX_LEN):
  """Turn raw lyrics into fixed-length token-id sequences plus attention masks.

  Args:
    lyrics: Iterable of lyric strings, one per song.
    tokenizer: Tokenizer matching `transformer_name`.
    transformer_name: "XLNet" or "BERT"; selects the tokenization route.
    MAX_LEN: Length every id sequence is padded/truncated to (at the end).

  Returns:
    Tuple `(input_ids, attention_masks)`: the padded id sequences returned by
    `pad_sequences` and the masks built from them by `create_attention_masks`.

  Raises:
    ValueError: If `transformer_name` is neither "XLNet" nor "BERT".
  """
  # Reject unsupported transformers before doing any work
  if transformer_name not in ("XLNet", "BERT"):
    raise ValueError("transformer_name must be set to 'XLNet' or 'BERT'!")

  if transformer_name == "XLNet":
    # Two passes: split every text into tokens, then map tokens to ids
    token_lists = [tokenizer.tokenize(text) for text in lyrics]
    id_sequences = [tokenizer.convert_tokens_to_ids(tokens) for tokens in token_lists]
  else:
    # BERT: encode directly, including the special tokens
    id_sequences = [tokenizer.encode(text,add_special_tokens=True) for text in lyrics]

  # Pad with zeros / cut off at the end so every sequence has MAX_LEN entries
  input_ids = pad_sequences(
      id_sequences,
      maxlen=MAX_LEN,
      dtype="long",
      truncating="post",
      padding="post",
  )

  # Masks are derived from the padded ids
  return input_ids, create_attention_masks(input_ids)


# Attention Mask for XLNet
def create_attention_masks(input_ids):
  """Build one attention mask per id sequence.

  Every id greater than 0 is marked with 1.0, every other id (such as the
  zeros used for padding) with 0.0.

  Args:
    input_ids: Iterable of id sequences.

  Returns:
    List of lists of floats, same layout as `input_ids`.
  """
  return [[float(token_id > 0) for token_id in sequence] for sequence in input_ids]


# Normalize each channel of the spectrogram to [0, 1] values
def normalize_specs(spec):
  """Min-max normalize each channel of a spectrogram to the range [0, 1].

  Every channel is scaled independently: its minimum is subtracted and the
  result is divided by the remaining maximum. A constant channel (maximum
  equals minimum) has no range to scale by and is mapped to all zeros
  instead of producing NaN values through a division by zero.

  Args:
    spec: Iterable of channels, each an array-like of numeric values.

  Returns:
    numpy.ndarray holding the normalized channels in their original order.
  """
  spec_norm = []
  for channel in spec:
    shifted = channel - np.min(channel)
    value_range = np.max(shifted)
    # For a constant channel `shifted` is all zeros; dividing by 1 keeps it
    # that way and yields the same dtype as the regular division.
    scale = value_range if value_range > 0 else 1
    spec_norm.append(shifted / scale)

  return np.asarray(spec_norm)

# Script

In [None]:
# Connect to google drive (mounted at /content/gdrive)
drive.mount('/content/gdrive')

# Input files: CSV with the lyrics and pkl file with the log mel-spectrograms.
# NOTE(review): both look like placeholder paths - adjust before running.
dataset_csv = '/path/to/lyrics_lemma_no_sw.csv'
dataset_pkl = '/path/to/log_melspectrograms.pkl'

# Load lyrics data
data_csv = load_csv_data(dataset_csv)
# Load log mel-spectrograms data
data_pkl = load_pkl_data(dataset_pkl)


# Pre-process Lyrics
# transformer_name selects the tokenizer ('XLNet' or 'BERT');
# MAX_LEN is the length every token-id sequence is padded/truncated to.
transformer_name = 'XLNet'
MAX_LEN = 160

# Get lyrics from CSV dataset (column 'lyrics_lemma_no_sw') as a plain list
lyrics = data_csv['lyrics_lemma_no_sw'].tolist()

# Load tokenizer
tokenizer = get_tokenizer(transformer_name)

# Pre-processing lyrics: token ids and attention masks, one entry per lyric
input_ids, attention_masks = preprocess_lyrics(lyrics,tokenizer,transformer_name,MAX_LEN)


# Combine data to a single dataset (combined Dataset)
#
# ATTENTION:
# Index for filename in row might differ
# depending on the dataset used!
# (itertuples() puts the DataFrame index at row[0], so row[1] is the first
# CSV column after the index column.)

# CSV rows and spectrogram entries are paired purely by position, so both
# sources must contain the same number of entries. Fail early instead of
# saving a partially combined (or misaligned) dataset.
if len(data_csv) != len(data_pkl):
  raise ValueError(
      'Cannot combine datasets: {} lyrics rows vs. {} spectrogram entries.'.format(
          len(data_csv), len(data_pkl)))

for idx, row in enumerate(data_csv.itertuples()):
  entry = data_pkl[idx]

  # Load name of audiofile from metadata set
  audiofile_name = str(row[1])

  # Add audiofile name to combined dataset
  entry['filename'] = audiofile_name
  # Add normalized spectrograms
  entry['spec_values'] = normalize_specs(entry['spec_values'])
  # Add input IDS (lyrics)
  entry['input_ids'] = input_ids[idx]
  # Add attention masks (lyrics)
  entry['attention_mask'] = attention_masks[idx]
  # Add name of tokenizer (lyrics) - the one that was actually used above
  entry['tokenizer'] = transformer_name
  # Add maximum length of token sequence (lyrics)
  entry['MAX_LEN'] = MAX_LEN

out_pkl_file = '/path/to/store/combined_dataset.pkl'

# Save dataset (log mel-spectrograms and tokenized lyrics) to pkl-file.
# The success message is printed only if writing actually succeeded; a
# failure is printed and re-raised so it cannot go unnoticed.
try:
  with open(out_pkl_file,"wb") as handler:
    pkl.dump(data_pkl, handler, protocol=pkl.HIGHEST_PROTOCOL)
except Exception as e:
  print(e)
  raise
else:
  print('\nFile saved to {}\n'.format(out_pkl_file))