In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import SimpleRNN , LSTM , Dense , Activation , Dropout
import random

Using TensorFlow backend.


In [2]:
df = pd.read_csv("LYRICS_DATASET.csv")["Lyrics"]
df = df.sample(frac=1)

### GIT CODE

In [8]:
text = df.str.cat(sep='\n').lower()
print('corpus length:', len(text))

# # Create a sorted list of the characters
chars = sorted(list(set(text)))
print('total chars:', len(chars))

corpus length: 430994
total chars: 73


In [12]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [16]:
maxlen = 40 # The window size
step = 3 # The steps between the windows
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen]) # range from current index i for max length characters 
    next_chars.append(text[i + maxlen]) # the next character after that 
sentences = np.array(sentences)
next_chars = np.array(next_chars)
print('Number of sequences:', len(sentences))

Number of sequences: 143652


In [17]:
def getdata(sentences, next_chars):
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
    length = len(sentences)
    index = 0
    for i in range(len(sentences)):
        sentence = sentences[i]
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1
    return X, y

def generator(sentences, next_chars, batch_size):
    X = np.zeros((batch_size, maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((batch_size, len(chars)), dtype=np.bool)
    length = len(sentences)
    index = 0
    while True:
        if index + batch_size >= length:
            index = 0
        X.fill(0)
        y.fill(0)
        for i in range(batch_size):
            sentence = sentences[index]
            for t, char in enumerate(sentence):
                X[i, t, char_indices[char]] = 1
            y[i, char_indices[next_chars[i]]] = 1
            index = index + 1
        yield X, y

In [18]:
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

print("Compiling model complete...")

Build model...
Compiling model complete...


In [19]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [21]:
# Get data
X, y = getdata(sentences, next_chars)

# The training
print('Training...')
batch_size = 128

# Use the below command if you want to use the generator
# history = model.fit_generator(generator(sentences, next_chars, batch_size),steps_per_epoch=12800, epochs=10)

# Use this if they all fit into memory
history = model.fit(X, y,batch_size=128, epochs=3)

Training...
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [22]:
sentence = 'dance all night\ndance all day\ndance away'
x = np.zeros((1, maxlen, len(chars)))
for t, char in enumerate(sentence):
    x[0, t, char_indices[char]] = 1.
    
print(model.predict(x, verbose=0)[0])

[2.46973545e-03 7.41822541e-01 1.34595466e-04 1.04182679e-03
 5.37435881e-06 8.24598689e-03 6.14967212e-05 4.44581034e-03
 1.19459185e-06 8.84427205e-02 2.49962509e-03 1.58483069e-03
 2.17153556e-05 2.44890362e-05 5.03982665e-05 1.71404099e-05
 9.51096899e-06 1.15685634e-05 1.90241790e-05 5.66784229e-06
 1.57579016e-05 1.91032814e-05 1.26093812e-06 4.44709694e-05
 8.60144282e-05 6.23757113e-03 1.11675433e-06 2.48044240e-03
 2.12513097e-03 1.13536348e-03 7.11568445e-03 1.80438645e-02
 8.41407862e-04 2.12555658e-03 2.08464637e-03 8.26658402e-03
 1.17076044e-04 1.73337420e-03 3.31662316e-03 9.34394891e-04
 6.11539744e-03 1.87885966e-02 1.71499036e-03 6.63707033e-05
 4.77941800e-03 3.41566615e-02 1.86528452e-02 2.90284195e-04
 3.49330221e-04 9.21573781e-04 1.36823684e-04 4.73968545e-03
 1.95179513e-04 5.07472066e-07 4.12755435e-06 7.81688357e-07
 6.39239033e-07 5.98795305e-06 8.54322298e-06 2.73509886e-05
 2.04082899e-05 5.19566629e-06 1.57186165e-04 4.17540423e-06
 2.44236056e-04 1.598035

In [24]:
variance = 0.25
print('Variance: ', variance)

generated = ''
original = sentence
window = sentence
# Predict the next 400 characters based on the seed
for i in range(400):
    x = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(window):
        x[0, t, char_indices[char]] = 1.

    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, variance)
    next_char = indices_char[next_index]

    generated += next_char
    window = window[1:] + next_char

print(original + generated)

Variance:  0.25
[2.46973545e-03 7.41822541e-01 1.34595466e-04 1.04182679e-03
 5.37435881e-06 8.24598689e-03 6.14967212e-05 4.44581034e-03
 1.19459185e-06 8.84427205e-02 2.49962509e-03 1.58483069e-03
 2.17153556e-05 2.44890362e-05 5.03982665e-05 1.71404099e-05
 9.51096899e-06 1.15685634e-05 1.90241790e-05 5.66784229e-06
 1.57579016e-05 1.91032814e-05 1.26093812e-06 4.44709694e-05
 8.60144282e-05 6.23757113e-03 1.11675433e-06 2.48044240e-03
 2.12513097e-03 1.13536348e-03 7.11568445e-03 1.80438645e-02
 8.41407862e-04 2.12555658e-03 2.08464637e-03 8.26658402e-03
 1.17076044e-04 1.73337420e-03 3.31662316e-03 9.34394891e-04
 6.11539744e-03 1.87885966e-02 1.71499036e-03 6.63707033e-05
 4.77941800e-03 3.41566615e-02 1.86528452e-02 2.90284195e-04
 3.49330221e-04 9.21573781e-04 1.36823684e-04 4.73968545e-03
 1.95179513e-04 5.07472066e-07 4.12755435e-06 7.81688357e-07
 6.39239033e-07 5.98795305e-06 8.54322298e-06 2.73509886e-05
 2.04082899e-05 5.19566629e-06 1.57186165e-04 4.17540423e-06
 2.44236

## OLD CODE

In [314]:
lyr_alpha = data
features = []
i=0
for curSong in lyr_alpha:
    curSong = str(curSong).lower().split()
    features.append(curSong)
len(features)

347

In [316]:
features_ord = []
for curSong in features:
    words_ord = []
    for word in curSong:
        s = [ord(ch) for ch in word]
        words_ord.append(s)
    features_ord.append(words_ord)
len(features)
features[0]

['when',
 'the',
 'light',
 'revealed',
 'the',
 'dream',
 'opened',
 'the',
 'door',
 'hoping',
 'to',
 'lean',
 'and',
 'i',
 'gave',
 'my',
 'own',
 'life',
 'and',
 'i',
 'gave',
 'it',
 'all',
 'to',
 'the',
 'wind',
 'all',
 'for',
 'love',
 'gave',
 'that',
 'a',
 'try',
 'head',
 'on',
 'the',
 'floor',
 'most',
 'of',
 'the',
 'time',
 'and',
 'i',
 'gave',
 'my',
 'own',
 'life',
 'and',
 'i',
 'gave',
 'it',
 'all',
 'to',
 'the',
 'wind',
 'love',
 'loving',
 'is',
 'a',
 'new',
 'part',
 'of',
 'and',
 'up',
 'until',
 'today',
 'i',
 'seemed',
 'to',
 'get',
 'away',
 'i',
 'can',
 'see',
 'the',
 'stars',
 'above',
 'swirling',
 'around',
 'to',
 'say',
 'out',
 'went',
 'the',
 'light',
 'as',
 'i',
 'leapt',
 'through',
 'the',
 'door',
 'and',
 'i',
 'followed',
 'it',
 'home',
 'and',
 'i',
 "didn't",
 'know',
 'why',
 'i',
 "didn't",
 'know',
 'why',
 'all',
 'the',
 'nights',
 "i'd",
 'been',
 'afraid',
 'living',
 'life',
 'just',
 'to',
 'fade',
 'away',
 'but',


In [317]:
vocab = []
for song in features:
    for w in range(len(song)):
        vocab.append(song[w])
    for w in range(len(song)-1):
        vocab.append(song[w] + " " + song[w+1])
    for w in range(len(song)-2):
        vocab.append(song[w] + " " + song[w+1] + " " + song[w+2])
    for w in range(len(song)-3):
        vocab.append(song[w] + " " + song[w+1] + " " + song[w+2] + " " + song[w+3])
word_set = set(vocab)

In [318]:
vocab_dict = {}
c = 1
for w in word_set:
    vocab_dict[w] = c
    c += 1
len(vocab_dict)

160655

In [381]:
def train_test(data , window , target):
    dataX , dataY = [] , []
    for song in range(len(data)):
        x = []
        y = []
        for word in range(len(data[song]) - window):
            tr = "".join(i+" " for i in data[song][word : word+window]).strip()
            te = "".join(i+" " for i in data[song][word+window : word+window+target]).strip()
            
            x.append(vocab_dict[tr])
            y.append(vocab_dict[te])

        dataX.append(x)
        dataY.append(y)
    
    return dataX , dataY

In [385]:
window = 4
target = 2
train , test = train_test(features , window , target)
for i in range(len(train)):
    for j in range(len(train[i])):
        train[i][j] = [train[i][j]]
for i in range(len(test)):
    for j in range(len(train[i])):
        train[i][j] = [train[i][j]]
train[0]

[[[147872]],
 [[30016]],
 [[90795]],
 [[70505]],
 [[122425]],
 [[67682]],
 [[21279]],
 [[139969]],
 [[102517]],
 [[108350]],
 [[27692]],
 [[44302]],
 [[71679]],
 [[150060]],
 [[68088]],
 [[83226]],
 [[93830]],
 [[13963]],
 [[106712]],
 [[112762]],
 [[86860]],
 [[26462]],
 [[77626]],
 [[32086]],
 [[23711]],
 [[103173]],
 [[106453]],
 [[57231]],
 [[62317]],
 [[51079]],
 [[44552]],
 [[59926]],
 [[74699]],
 [[155420]],
 [[73498]],
 [[160020]],
 [[97643]],
 [[17345]],
 [[21624]],
 [[103066]],
 [[77355]],
 [[71679]],
 [[150060]],
 [[68088]],
 [[83226]],
 [[93830]],
 [[13963]],
 [[106712]],
 [[112762]],
 [[86860]],
 [[26462]],
 [[77626]],
 [[3148]],
 [[101097]],
 [[94474]],
 [[82182]],
 [[16225]],
 [[59164]],
 [[131276]],
 [[21372]],
 [[48107]],
 [[3806]],
 [[10995]],
 [[10531]],
 [[15769]],
 [[55743]],
 [[48725]],
 [[42163]],
 [[128403]],
 [[90454]],
 [[99707]],
 [[117653]],
 [[50696]],
 [[106596]],
 [[107991]],
 [[81129]],
 [[113998]],
 [[74790]],
 [[145875]],
 [[45108]],
 [[11900]],
 [[684

In [369]:
clipped_train = []
for i in train:
    clipped_train.append(np.array(i[:50]))
clipped_train = np.array(clipped_train)

clipped_test = []
for i in test:
    clipped_test.append(np.array(i[:50]))
clipped_test = np.array(clipped_test)


  after removing the cwd from sys.path.
  if __name__ == '__main__':


(347, 347)

In [379]:
np.array([[[1],[-1]],[[2],[-1]]]).shape

(2, 2, 1)

In [321]:
xtr = np.reshape(train , (train.shape[0] , 4))
ytr = np.reshape(test , (test.shape[0] , 2))
xtr[0]

ValueError: cannot reshape array of size 347 into shape (347,4)

In [371]:
model = Sequential()

model.add( SimpleRNN(4 , input_shape=(1 , 1)) )

model.add( Dense(1) )

model.compile(loss="mse" , optimizer="adam" , metrics=["accuracy"])

model.fit(clipped_train[0] , clipped_test[0] , epochs=20 , batch_size=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f0897544898>