In [1]:
import pandas as pd
import numpy as np
import re
import csv
from string import ascii_lowercase
import random

In [2]:
subset_punk_df = pd.read_csv('../data/subset_punk_bands.csv')
subset_punk_df.drop('Unnamed: 0', axis = 1, inplace = True)

In [3]:
subset_punk_df

Unnamed: 0,ARTIST_NAME,ARTIST_URL,SONG_NAME,SONG_URL,LYRICS
0,5 seconds of summer,https://www.azlyrics.com/19/5secondsofsummer.html,gotta get out,https://www.azlyrics.com/lyrics/5secondsofsumm...,even when the sky is falling down even when th...
1,5 seconds of summer,https://www.azlyrics.com/19/5secondsofsummer.html,better man,https://www.azlyrics.com/lyrics/5secondsofsumm...,find me at a quarter to three cigarette in my ...
2,5 seconds of summer,https://www.azlyrics.com/19/5secondsofsummer.html,more,https://www.azlyrics.com/lyrics/5secondsofsumm...,if me and you are living in the same place why...
3,5 seconds of summer,https://www.azlyrics.com/19/5secondsofsummer.html,why won't you love me,https://www.azlyrics.com/lyrics/5secondsofsumm...,switching into airplane mode again we're not ...
4,5 seconds of summer,https://www.azlyrics.com/19/5secondsofsummer.html,woke up in japan,https://www.azlyrics.com/lyrics/5secondsofsumm...,i woke up in japan feeling low feeling lonely ...
...,...,...,...,...,...
2534,zebrahead,https://www.azlyrics.com/z/zebrahead.html,out of control,https://www.azlyrics.com/lyrics/zebrahead/outo...,i'm a mad man with a mission like a nightmare ...
2535,zebrahead,https://www.azlyrics.com/z/zebrahead.html,photographs,https://www.azlyrics.com/lyrics/zebrahead/phot...,i am alive i am awake not like i'm not aware i...
2536,zebrahead,https://www.azlyrics.com/z/zebrahead.html,politics,https://www.azlyrics.com/lyrics/zebrahead/poli...,firewall school halls cop cars protecting you ...
2537,zebrahead,https://www.azlyrics.com/z/zebrahead.html,with legs like that,https://www.azlyrics.com/lyrics/zebrahead/with...,here she comes again like good medicine every ...


## Creating a model

### Setting up the data

In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Input, Embedding, Dropout, concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import Model

In [5]:
# "randomly" choose one band to test if model works
yellowcard = punk_df.LYRICS[punk_df.ARTIST_NAME == 'yellowcard']


# for full model
# take out spaces in the artist names
new_artist_names = [re.sub(r'\W', '', string = punk_df.ARTIST_NAME[w]) for w in range(len(punk_df.ARTIST_NAME))]

tokenizer = Tokenizer()

tokenizer.fit_on_texts([str(lyr) for lyr in punk_df.LYRICS])

token_seq = tokenizer.texts_to_sequences([str(lyr) for lyr in punk_df.LYRICS])
tokenizer.fit_on_texts(new_artist_names)
artist_seq = tokenizer.texts_to_sequences(new_artist_names)

n_gram_seq = []
artists = []
# for every line in tokenized sequences
for line, band in zip(token_seq, artist_seq):
    # used to append the token_seq starting from 0th element to 1st element
    for length in range(2, len(line)):
        n_gram_seq.append(line[:length])
        artists.append(band)
        
artists = np.array(artists)

In [6]:
# create padded sequences
n_gram_seq_padded = pad_sequences(n_gram_seq, maxlen = 11)

In [7]:
n_gram_seq_padded[:11]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0, 189,  33],
       [  0,   0,   0,   0,   0,   0,   0,   0, 189,  33,   3],
       [  0,   0,   0,   0,   0,   0,   0, 189,  33,   3, 366],
       [  0,   0,   0,   0,   0,   0, 189,  33,   3, 366,  22],
       [  0,   0,   0,   0,   0, 189,  33,   3, 366,  22, 247],
       [  0,   0,   0,   0, 189,  33,   3, 366,  22, 247,  45],
       [  0,   0,   0, 189,  33,   3, 366,  22, 247,  45, 189],
       [  0,   0, 189,  33,   3, 366,  22, 247,  45, 189,  33],
       [  0, 189,  33,   3, 366,  22, 247,  45, 189,  33,   3],
       [189,  33,   3, 366,  22, 247,  45, 189,  33,   3, 893],
       [ 33,   3, 366,  22, 247,  45, 189,  33,   3, 893,  22]],
      dtype=int32)

In [33]:
# create labels by using One Hot Encoding 
labels = to_categorical(n_gram_seq_padded[:,-1:])
X = n_gram_seq_padded[:,:-1]

train_size = round(n_gram_seq_padded.shape[0]*0.8)

# create test and train
y_train = labels[:train_size:]
y_test = labels[train_size:,:]

lyrics_train = X[:train_size, :]
lyrics_test = X[train_size:,:]

artist_train = artists[:train_size, :]
artist_test = artists[train_size:,:]

# find largest vocab size in padded sequence; this is input size
vocab_size = max([w for sentence in n_gram_seq_padded for w in sentence]) + 1
artist_size = max([len(art) for art in artists]) + 1

### Creating the model

In [9]:
tf.config.run_functions_eagerly(True)

In [10]:
# define the two inputs
inputA = Input(shape = (10, ))
inputB = Input(shape = (0,))

# first branch for first input
lyrics = Embedding(input_dim = vocab_size, output_dim = 64, input_length = 10)(inputA)
lyrics = Bidirectional(LSTM(128, return_sequences = True))(lyrics)
lyrics = Dropout(0.2)(lyrics)
lyrics = LSTM(64)(lyrics)
lyrics = Dense(round(vocab_size/2), activation = 'relu')(lyrics)
lyrics = Dense(vocab_size, activation = 'softmax')(lyrics)
lyrics = Model(inputs = inputA, outputs = lyrics)

# second branch for second input
artist = Dense(64, activation = 'relu')(inputB)
artist = Dense(10, activation = 'relu')(artist)
artist = Dense(vocab_size, activation = 'relu')(artist)
artist = Model(inputs = inputB, outputs = artist)


# combine output of branches
combined = concatenate([lyrics.output, artist.output])


z = Dense(2, activation = 'relu')(combined)
z = Dense(vocab_size, activation = 'softmax')(z)
model = keras.Model(inputs = [lyrics.input, artist.input], outputs = z)

#compile model
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = 'accuracy')

In [11]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 10, 64)       914432      input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 10, 256)      197632      embedding[0][0]                  
__________________________________________________________________________________________________
dropout (Dropout)               (None, 10, 256)      0           bidirectional[0][0]              
____________________________________________________________________________________________

In [13]:
model.fit([lyrics_train, artist_train], y_train, epochs = 1, validation_data = ([lyrics_test, artist_test], y_test))



<tensorflow.python.keras.callbacks.History at 0x7fbbda647a50>

In [15]:
def generate_lyrics(prompt, author, length):
    '''
    prompt: string of lyrics
    length: length of lyrics that is wanted (includes prompt)
    '''
    # edge case; if prompt is as long as the length wanted
    if len(prompt.split(' ')) == length:
        return prompt
    else:
        a = [re.sub(r'\W', '', string = author)]
        a = tokenizer.texts_to_sequences(a)
        for _ in range(length - len(prompt.split(' '))):
            
            token_list = tokenizer.texts_to_sequences([prompt])[0]
            token_padded = pad_sequences([token_list], maxlen = 11)
            
            # get predicted probability for each word
            predicted_probs = model.predict(token_padded, a)[0]
            
            # find max probability of the each word
            word_choice = predicted_probs.argmax()
            
            # find out what the word is
            output_word = tokenizer.index_word[word_choice]
            
            # add word to the prompt
            prompt += ' ' + output_word
        return prompt

In [16]:
generate_lyrics("eyes are feeling heavy but they never seem to close", 'yellowcard', 20)

TypeError: unsupported operand type(s) for /: 'int' and 'list'

In [None]:
generate_lyrics("there's a place off ocean avenue", 20)

In [None]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts([str(lyr) for lyr in punk_df.LYRICS])

token_seq = tokenizer.texts_to_sequences([str(lyr) for lyr in punk_df.LYRICS])

In [22]:
model.predict([lyrics_test[0], artist_test][0])

ValueError: Layer model_2 expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor: shape=(10, 1), dtype=int32, numpy=
array([[359],
       [  3],
       [162],
       [ 50],
       [  2],
       [ 26],
       [ 67],
       [ 85],
       [940],
       [ 41]], dtype=int32)>]

In [24]:
lyrics_test[0]

array([359,   3, 162,  50,   2,  26,  67,  85, 940,  41], dtype=int32)

In [27]:
artist_train

array([], shape=(480638, 0), dtype=int64)

In [29]:
artists = np.array(artists)

In [34]:
artist_train

array([[ 898],
       [ 898],
       [ 898],
       ...,
       [3791],
       [3791],
       [3791]])