## Evaluation Dataset Preparation

In [1]:
import pandas as pd
import numpy as np
import random
import ast
import nlpaug.augmenter.word as naw
from math import sqrt
from sklearn.model_selection import train_test_split

2023-03-14 20:32:30.331542: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


2023-03-14 20:32:31.897564: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-14 20:32:31.897763: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
# Load the raw lyrics table from disk into `df`.
df = pd.read_csv(filepath_or_buffer='../data/lyrics.csv')

In [3]:
# Peek at the first five rows to confirm the load looks sane.
df.head(n=5)

Unnamed: 0,lyrics,genre,id
0,"['So long', 'My sweet love', 'I miss you', ""Mo...",pop,0
1,"[""It won't be long before I'm leaving here"", ""...",pop,1
2,"['Are we crazy?', 'Living our lives through a ...",pop,2
3,"['When did it go wrong, I will never know', 'I...",pop,3
4,"[""I've waited all this time"", 'Counting minute...",pop,4


In [4]:
# Inspect column dtypes, non-null counts and the memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36000 entries, 0 to 35999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   lyrics  36000 non-null  object
 1   genre   36000 non-null  object
 2   id      36000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 843.9+ KB


In [5]:
# Each `lyrics` cell is read from the CSV as the string repr of a Python list of
# verses; turn it back into a real list.
# Only string cells are parsed, so re-running this cell on rows that were already
# converted is a no-op instead of a ValueError from ast.literal_eval.
df['lyrics'] = df['lyrics'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [6]:
# Keep an independent snapshot of the full corpus before `df` is reduced below.
df0 = df.copy(deep=True)

In [7]:
# Stratified evaluation subset: 50 songs per genre, drawn with a fixed seed.
# Sampling from the untouched snapshot `df0` (identical to `df` at this point on a
# fresh run) keeps the cell re-runnable: a second execution no longer samples from
# the already-reduced frame, nor collides with the `index` column that
# reset_index() adds.
df = (
    df0.groupby("genre")
    .sample(n=50, random_state=42)
    .reset_index()
)

In [8]:
# Pick one song at random as a running example and display its verses.
eg_lyrics = random.choice(df['lyrics'].tolist())
eg_lyrics

['You give me the shakes',
 'You give me the cold sweats',
 "You're giving me reasons to turn my tear drops",
 'Into death threats',
 "You're going straight to machine",
 'So I kiss my brothers',
 'I never took off my chains',
 'They never took my colors',
 "Take 'em on in the morning luv",
 "I'm digging deep for my midas touch",
 'We could be chasing the waves',
 "I know it isn't much",
 'Let it drop. Let it drop',
 "People don't make it easy",
 'Let it drop. Let it drop',
 "Isn't it strange",
 'The way you move me so',
 'You love me with hot hammers',
 'And then your blood runs cold',
 'For my next trick',
 'Gonna be like: where she go?',
 'Make an exit',
 'like adios amigos',
 'It got too complicated',
 'Cool it down and refridgerate it',
 'We could be lassoing the stars',
 "But people don't make it easy",
 'Let it drop. Let it drop',
 'Let it drop',
 "Let it drop. People don't make it easy",
 'Let it drop. Let it drop',
 'Let it drop']

In [9]:
# Word-level contextual augmenter, configured with the `roberta-base` model and
# the "substitute" action.
aug = naw.ContextualWordEmbsAug(
    model_path='roberta-base',
    action="substitute",
)

In [10]:
def misalign_lyrics(lyrics, repetitions, repeat_verse):
    """Return a copy of ``lyrics`` in which some randomly chosen verses are repeated.

    The relative order of verses is preserved; each selected verse simply
    appears ``repeat_verse`` times in a row, which shifts the position of every
    verse after it.

    Parameters
    ----------
    lyrics : sequence of str
        The verses of one song, in order.
    repetitions : int
        How many distinct verses to select for repetition. Values larger than
        ``len(lyrics)`` are clamped so that every verse is selected.
    repeat_verse : int
        Total number of consecutive copies emitted for each selected verse
        (2 doubles the verse; 0 drops it).

    Returns
    -------
    list
        A new list; ``lyrics`` itself is not modified.

    Raises
    ------
    ValueError
        If ``repetitions`` is negative (raised by ``random.sample``).
    """
    # random.sample refuses to draw more items than the population holds, so cap
    # the request at the number of verses available.
    n_selected = min(repetitions, len(lyrics))
    # A set gives O(1) membership checks inside the loop below.
    verses2repeat = set(random.sample(range(len(lyrics)), n_selected))

    misaligned_lyrics = []
    for idx, verse in enumerate(lyrics):
        copies = repeat_verse if idx in verses2repeat else 1
        misaligned_lyrics.extend([verse] * copies)
    return misaligned_lyrics

In [11]:
# Ids of the songs that were sampled into the evaluation set.
evaluation_ids = list(df['id'])

# Pool of lyrics that are NOT in the evaluation set (source of "random" distractors).
# Filter on the `id` column itself rather than passing ids to drop(index=...), which
# is only correct while ids happen to equal the index labels of `df0`.
# The index is reset so the pool is contiguous 0..n-1 and positional access
# (e.g. random.choice on the Series) cannot hit a missing label.
held_out_mask = ~df0['id'].isin(evaluation_ids)
all_lyrics = df0.loc[held_out_mask, 'lyrics'].reset_index(drop=True)

In [12]:
# Show what the augmenter does to a single, randomly chosen verse of the example song.
eg_verse = random.choice(eg_lyrics)
print('Original verse:', eg_verse, sep='\n')

print('Augmented:')
# Left as the last expression so the notebook displays the augmenter's return value.
aug.augment(eg_verse)

Original verse:
Cool it down and refridgerate it
Augmented:


['Cool ice completely and refridgerate later']

In [13]:
# Walk one randomly chosen evaluation song through every stage so each output
# can be checked by eye.
eg_lyrics = random.choice(df['lyrics'])

# Stage 1: contextual word substitution applied to the song's verses.
eg_aug_lyrics = aug.augment(eg_lyrics)

# Stage 2: repeat about sqrt(n) randomly chosen verses twice. The count is derived
# from the list that is actually being misaligned, matching the batch cell below.
eg_aug_misl_lyrics = misalign_lyrics(eg_aug_lyrics,
                                     repetitions=round(sqrt(len(eg_aug_lyrics))),
                                     repeat_verse=2)

# Unrelated song used as a negative example.
# random.choice indexes its argument with an integer; on a Series whose integer
# index has gaps (rows were dropped to build `all_lyrics`) that is a label lookup
# which can raise KeyError and can never reach the highest labels. Drawing from a
# plain list makes every held-out song equally likely.
eg_fake_lyrics = random.choice(all_lyrics.tolist())

print('ORIGINAL lyrics:')
print(eg_lyrics)
print()
print('AUGMENTED lyrics:')
print('Aligned:')
print(eg_aug_lyrics)
print()
print('Misaligned:')
print(eg_aug_misl_lyrics)
print()
print('RANDOM lyrics:')
print(eg_fake_lyrics)

ORIGINAL lyrics:
['She was from San Mateo', 'Her dad was in the CIA-o', 'She was on her way to Pasadena', "But nobody says they've seen her", "She's a teenage runaway", 'He works at the record place', "Now she's got his acid baby", 'Tonight, tonight, hey hey hey', "She don't know what to do now", 'The baby always has the flu now', 'No more looking like a queen', 'Overdosed on Afro-sheen.', "She's a teenage runaway", "Vegas ain't no place to stay", 'She just had to get away', 'Tonight, tonight', 'She was from San Mateo', 'Her dad was in the CIA-o', 'She was on her way to Pasadena', "But nobody says they've seen her", "She's a teenage runaway", 'He works at the record place', "Now she's got his acid baby", 'Tonight, tonight, hey hey hey', 'Hey hey hey', 'Hey hey Hey', 'Hey hey hey']

AUGMENTED lyrics:
Aligned:
['She was from Saint Miguel', 'Her mother was probably a CIA-o', 'She was on another road towards Pasadena', "But nobody said they've tried enough", "She's another teen runaway", '

In [14]:
# Seed Python's RNG (same value as the `random_state` used for sampling) so the
# verse repetition and the distractor draw below are repeatable across runs.
# NOTE(review): this does not control any randomness inside the augmenter itself;
# seed that separately if fully reproducible output is required.
random.seed(42)

# Augment every evaluation song (slow: runs the language model once per song).
aug_lyrics = df['lyrics'].apply(aug.augment)

# Misalign each augmented song: repeat about sqrt(n) of its verses, each either
# two or three times (chosen per song).
misl_aug_lyrics = [
    misalign_lyrics(aug_lyr,
                    repetitions=round(sqrt(len(aug_lyr))),
                    repeat_verse=random.choice([2, 3]))
    for aug_lyr in aug_lyrics
]

# Positive pair: augmented + misaligned version of the same song.
df['similar_lyrics'] = misl_aug_lyrics
# Negative pair: a distinct held-out song per row, drawn without replacement.
df['random_lyrics'] = random.sample(list(all_lyrics), len(df['lyrics']))

In [15]:
# Check the first five rows now that the `similar_lyrics` / `random_lyrics` columns exist.
df.head(n=5)

Unnamed: 0,index,lyrics,genre,id,similar_lyrics,random_lyrics
0,1935,"[Well, I've been waitin' ever since eight, Gue...",pop,1935,"[Well, after just waitin' to … eight, Guess wh...","[The world is a vampire, Sent to drain, Secret..."
1,6494,"[This kind of love, Is more than a lifeline, F...",pop,6494,"[This did of this, Is these question a lifelin...","[Just look over your shoulders, honey!, It's b..."
2,1720,"[I could tell by the look in her eyes, Maybe I...",pop,1720,"[I seldom tell by I look in serve eyes, Maybe ...","[She is the queen of barefoot, She goes walk o..."
3,9120,"[Don't Tell me you love me when I don't, You k...",pop,9120,"[Don't Tell me you Found me Twice but One, You...","[Listen Baby, When I\'m all alone, cause you\'..."
4,360,"[Who's the man? who's the boss?, who's the nig...",pop,360,"[Who's We man? who's p dress?, who's<unk> butt...","[DJ Khaled!, From the mud to the marble floors..."


In [16]:
# Persist the evaluation set; the DataFrame index is not written to the file.
df.to_csv(
    path_or_buf='../data/evaluation/lyrics_eval.csv',
    index=False,
)