In [1]:
import pandas as pd
import numpy as np
import re
import csv
from string import ascii_lowercase
import random

In [2]:
subset_punk_df = pd.read_csv('../data/subset_punk_bands.csv')
subset_punk_df.drop('Unnamed: 0', axis = 1, inplace = True)

## Creating a model

### Setting up the data

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Input, Embedding, Dropout, concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import Model

In [4]:
# "randomly" choose one band to test if model works
yellowcard = subset_punk_df.LYRICS[subset_punk_df.ARTIST_NAME == 'yellowcard']


# for full model
# take out spaces in the artist names
new_artist_names = [re.sub(r'\W', '', string = subset_punk_df.ARTIST_NAME[w]) for w in range(len(subset_punk_df.ARTIST_NAME))]

tokenizer = Tokenizer()

tokenizer.fit_on_texts([str(lyr) for lyr in subset_punk_df.LYRICS])

token_seq = tokenizer.texts_to_sequences([str(lyr) for lyr in subset_punk_df.LYRICS])
tokenizer.fit_on_texts(new_artist_names)
artist_seq = tokenizer.texts_to_sequences(new_artist_names)

n_gram_seq = []
artists = []
# for every line in tokenized sequences
for line, band in zip(token_seq, artist_seq):
    # used to append the token_seq starting from 0th element to 1st element
    for length in range(2, len(line)):
        n_gram_seq.append(line[:length])
        artists.append(band)
        
artists = np.array(artists)

In [5]:
# create padded sequences
n_gram_seq_padded = pad_sequences(n_gram_seq, maxlen = 11)

In [6]:
n_gram_seq_padded[:11]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,  700, 1615],
       [   0,    0,    0,    0,    0,    0,    0,    0,  700, 1615,   17],
       [   0,    0,    0,    0,    0,    0,    0,  700, 1615,   17,    3],
       [   0,    0,    0,    0,    0,    0,  700, 1615,   17,    3, 1452],
       [   0,    0,    0,    0,    0,  700, 1615,   17,    3, 1452,   90],
       [   0,    0,    0,    0,  700, 1615,   17,    3, 1452,   90,   20],
       [   0,    0,    0,  700, 1615,   17,    3, 1452,   90,   20,  155],
       [   0,    0,  700, 1615,   17,    3, 1452,   90,   20,  155,    7],
       [   0,  700, 1615,   17,    3, 1452,   90,   20,  155,    7,  425],
       [ 700, 1615,   17,    3, 1452,   90,   20,  155,    7,  425,    2],
       [1615,   17,    3, 1452,   90,   20,  155,    7,  425,    2,  407]],
      dtype=int32)

In [9]:
subset_punk_df

Unnamed: 0,ARTIST_NAME,ARTIST_URL,SONG_NAME,SONG_URL,LYRICS
0,all time low,https://www.azlyrics.com/a/alltimelow.html,i can't do the one-two step,https://www.azlyrics.com/lyrics/alltimelow/ica...,front page of the magazine said don't believe ...
1,all time low,https://www.azlyrics.com/a/alltimelow.html,the girl's a straight-up hustler,https://www.azlyrics.com/lyrics/alltimelow/the...,lipstick has a way of leaving more than just a...
2,all time low,https://www.azlyrics.com/a/alltimelow.html,"sticks, stones, and techno",https://www.azlyrics.com/lyrics/alltimelow/sti...,you spin your words like a record in motion st...
3,all time low,https://www.azlyrics.com/a/alltimelow.html,coffee shop soundtrack,https://www.azlyrics.com/lyrics/alltimelow/cof...,should i write myself out of the history books...
4,all time low,https://www.azlyrics.com/a/alltimelow.html,break out! break out!,https://www.azlyrics.com/lyrics/alltimelow/bre...,luck loves me not tonight i'm running out this...
...,...,...,...,...,...
544,yellowcard,https://www.azlyrics.com/y/yellowcard.html,what appears,https://www.azlyrics.com/lyrics/yellowcard/wha...,slow steady hands waving their last goodbye th...
545,yellowcard,https://www.azlyrics.com/y/yellowcard.html,got yours,https://www.azlyrics.com/lyrics/yellowcard/got...,stacking bricks on broken ground building towe...
546,yellowcard,https://www.azlyrics.com/y/yellowcard.html,a place we set afire,https://www.azlyrics.com/lyrics/yellowcard/apl...,you feel it you boxed it by the youth you left...
547,yellowcard,https://www.azlyrics.com/y/yellowcard.html,leave a light on,https://www.azlyrics.com/lyrics/yellowcard/lea...,so where are you and how's it been how's the w...


In [50]:
# create labels by using One Hot Encoding 
labels = to_categorical(n_gram_seq_padded[:,-1:])
X = n_gram_seq_padded[:,:-1]

train_size = round(n_gram_seq_padded.shape[0]*0.8)
ids = np.random.choice(range(len(artists)), train_size, replace = False)
test_ids = np.delete(X, ids, axis = 0)

# create test and train
y_train = labels[ids]
y_test = labels[test_ids]

lyrics_train = X[ids]
lyrics_test = np.delete(X, ids, axis = 0)

artist_train = artists[ids]
artist_test = np.delete(artists, ids, axis = 0)

# find largest vocab size in padded sequence; this is input size
vocab_size = max([w for sentence in n_gram_seq_padded for w in sentence]) + 1
artist_size = max([len(art) for art in artists]) + 1

### Creating the model

In [56]:
tf.config.run_functions_eagerly(True)

In [62]:
artist_train.shape

(107361, 1)

In [63]:
# define the two inputs
inputA = Input(shape = (10, ))
inputB = Input(shape = (1,))

# first branch for first input
lyrics = Embedding(input_dim = vocab_size, output_dim = 64, input_length = 10)(inputA)
lyrics = Bidirectional(LSTM(128, return_sequences = True))(lyrics)
lyrics = Dropout(0.2)(lyrics)
lyrics = LSTM(64)(lyrics)
lyrics = Dense(round(vocab_size/2), activation = 'relu')(lyrics)
lyrics = Dense(vocab_size, activation = 'softmax')(lyrics)
lyrics = Model(inputs = inputA, outputs = lyrics)

# second branch for second input
artist = Dense(64, activation = 'relu')(inputB)
artist = Dense(10, activation = 'relu')(artist)
artist = Dense(vocab_size, activation = 'relu')(artist)
artist = Model(inputs = inputB, outputs = artist)


# combine output of branches
combined = concatenate([lyrics.output, artist.output])


z = Dense(2, activation = 'relu')(combined)
z = Dense(vocab_size, activation = 'softmax')(z)
model = keras.Model(inputs = [lyrics.input, artist.input], outputs = z)

#compile model
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = 'accuracy')

In [58]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 10, 64)       382656      input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 10, 256)      197632      embedding[0][0]                  
__________________________________________________________________________________________________
dropout (Dropout)               (None, 10, 256)      0           bidirectional[0][0]              
____________________________________________________________________________________________

In [65]:
model.fit([lyrics_train, artist_train], y_train, epochs = 8, batch_size = 32, validation_data = ([lyrics_test, artist_test], y_test))

 154/3356 [>.............................] - ETA: 31:22 - loss: 6.2514 - accuracy: 0.0347

KeyboardInterrupt: 

In [None]:
def generate_lyrics(prompt, author, length):
    '''
    prompt: string of lyrics
    length: length of lyrics that is wanted (includes prompt)
    '''
    # edge case; if prompt is as long as the length wanted
    if len(prompt.split(' ')) == length:
        return prompt
    else:
        a = [re.sub(r'\W', '', string = author)]
        a = tokenizer.texts_to_sequences(a)
        for _ in range(length - len(prompt.split(' '))):
            
            token_list = tokenizer.texts_to_sequences([prompt])[0]
            token_padded = pad_sequences([token_list], maxlen = 11)
            
            # get predicted probability for each word
            predicted_probs = model.predict(token_padded, a)[0]
            
            # find max probability of the each word
            word_choice = predicted_probs.argmax()
            
            # find out what the word is
            output_word = tokenizer.index_word[word_choice]
            
            # add word to the prompt
            prompt += ' ' + output_word
        return prompt

In [None]:
generate_lyrics("eyes are feeling heavy but they never seem to close", 'yellowcard', 20)

In [None]:
generate_lyrics("there's a place off ocean avenue", 20)

In [None]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts([str(lyr) for lyr in punk_df.LYRICS])

token_seq = tokenizer.texts_to_sequences([str(lyr) for lyr in punk_df.LYRICS])

In [None]:
model.predict([lyrics_test[0], artist_test][0])

In [None]:
lyrics_test[0]

In [None]:
artist_train

In [None]:
artists = np.array(artists)

In [None]:
artist_train