### Import Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

  from ._conv import register_converters as _register_converters


### Import Dataset

In [2]:
df = pd.read_csv('data/arijit_songs_with_lyrics.csv')
df.head()

Unnamed: 0,name,from,href,lyrics
0,Aa Jao Na,"[from ""Veere Di Wedding"" soundtrack]",/lyrics/bollywood/aajaona.html,\n\r\nTum thhe yahin\nPhir bhi tum gum thhe\nA...
1,Aaj Phir,"[from ""Hate Story 2"" soundtrack]",/lyrics/arijitsingh/aajphir.html,\n\r\nAaj phir tumpe pyar aaya hai\nAaj phir t...
2,Aaj Se Teri,"[from ""Padman"" soundtrack]",/lyrics/arijitsingh/aajseteri.html,\n\r\nAaj se teri saari galiyan meri ho gayi\n...
3,Aasan Nahin Yahan,"[from ""Aashiqui 2"" soundtrack]",/lyrics/arijitsingh/aasannahinyahan.html,\n\r\nWo o o o...\n\nAasaan nahi yahaan aashiq...
4,Ab Raat,"[from ""Dobaara"" soundtrack]",/lyrics/arijitsingh/abraat.html,\n\r\nChaand ki aankhein bhaari si hain\nRaat ...


### Preprocessing Lyrics 
1. Get all the lyrics into one single string(for simplicity).
2. Divide the string into characters. 
3. Group characters of size given by timesteps together for input.
4. Output will be the character just following the input.
5. Slide the window of size timestep by number of steps(=1) to make other inputs and outputs.

In [3]:
# 1. Get all the lyrics into one single string(for simplicity)
text = '' # will hold all the text data
for idx, row in df['lyrics'].iteritems():
    text = text + row

In [4]:
lower_text = text.lower()

In [5]:
# playground code - can be removed
str('abc')
sorted(list(set('abc\rd[ax2]b\n')))

['\n', '\r', '2', '[', ']', 'a', 'b', 'c', 'd', 'x']

#### Convert text to lower case 

In [9]:
# 2. Divide the string into characters. 
list_chars = sorted(list(set(lower_text)))
len(list_chars)

55

In [10]:
import collections
chars = collections.Counter(lower_text)
chars

Counter({'\n': 6493,
         '\r': 129,
         't': 4250,
         'u': 3617,
         'm': 3860,
         ' ': 19854,
         'h': 9034,
         'e': 8915,
         'y': 2584,
         'a': 21016,
         'i': 8612,
         'n': 6862,
         'p': 1359,
         'r': 5118,
         'b': 2151,
         'g': 1448,
         'l': 2593,
         'j': 2024,
         'o': 4723,
         's': 3392,
         '[': 141,
         'x': 140,
         '2': 108,
         ']': 141,
         'd': 3055,
         'w': 625,
         'k': 3936,
         'c': 1000,
         '?': 24,
         '…': 340,
         'v': 294,
         'z': 574,
         'q': 211,
         '-': 164,
         'f': 311,
         ',': 487,
         '.': 1130,
         '(': 65,
         ')': 65,
         '!': 49,
         "'": 38,
         '4': 16,
         '3': 7,
         '’': 9,
         '8': 3,
         ':': 7,
         'é': 10,
         '*': 1,
         '‘': 3,
         '1': 6,
         '6': 2,
         '0': 6,
         '

In [11]:
# 3. Divide the strings into size timesteps size
# 4.
# 5.
timesteps = 20
sentences = []
outputs = []
for i in range(0, len(lower_text)-timesteps):
    sentences.append(lower_text[i: i + timesteps])
    outputs.append(lower_text[i + timesteps])

In [12]:
print(sentences[0:2])
print(len(sentences))

['\n\r\ntum thhe yahin\nph', '\r\ntum thhe yahin\nphi']
130987


In [13]:
outputs[0:2]

['i', 'r']

In [14]:
# 6. Tokenize array of strings to chars 
tokenized_sentences = [list(sentence) for sentence in sentences]
tokenized_outputs = [list(output) for output in outputs]

In [15]:
# 7. Create character-to-index and index-to-character mappings
char_index = {char: idx for idx, char in enumerate(list_chars)}
index_char = [char for char in list_chars]

In [16]:
print('Character to Index: - ', char_index)
print('Index to Character: - ', index_char)

Character to Index: -  {'\n': 0, '\r': 1, ' ': 2, '!': 3, "'": 4, '(': 5, ')': 6, '*': 7, ',': 8, '-': 9, '.': 10, '0': 11, '1': 12, '2': 13, '3': 14, '4': 15, '6': 16, '8': 17, ':': 18, '?': 19, '[': 20, ']': 21, 'a': 22, 'b': 23, 'c': 24, 'd': 25, 'e': 26, 'f': 27, 'g': 28, 'h': 29, 'i': 30, 'j': 31, 'k': 32, 'l': 33, 'm': 34, 'n': 35, 'o': 36, 'p': 37, 'q': 38, 'r': 39, 's': 40, 't': 41, 'u': 42, 'v': 43, 'w': 44, 'x': 45, 'y': 46, 'z': 47, 'é': 48, '–': 49, '‘': 50, '’': 51, '“': 52, '”': 53, '…': 54}
Index to Character: -  ['\n', '\r', ' ', '!', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '6', '8', ':', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'é', '–', '‘', '’', '“', '”', '…']


In [17]:
# 8. Label encoding
le_tokenized_sentences = [[char_index[char] for char in sent] for sent in tokenized_sentences]
le_tokenized_outputs = [[char_index[char] for char in output] for output in tokenized_outputs]

In [18]:
# 9. Convert to one hot encodings
for i, sent in enumerate(le_tokenized_sentences):
    for j, index in enumerate(sent):
        le_tokenized_sentences[i][j] = np.zeros(len(list_chars))
        le_tokenized_sentences[i][j][index] = 1

In [19]:
le_tokenized_sentences[0]

[array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.

In [20]:
for i, output in enumerate(le_tokenized_outputs):
    for j, index in enumerate(output):
        le_tokenized_outputs[i][j] = np.zeros(len(list_chars))
        le_tokenized_outputs[i][j][index] = 1

In [21]:
le_tokenized_outputs[0]

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])]

### Model Training
We will create a model to train it with the above sentences. One hot encoding of the sentences will be done here itself.
1. Timesteps is 20
2. Step is 1


In [22]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [23]:
model = Sequential()
model.add(LSTM(128, input_shape=(timesteps, len(list_chars))))

model.add(Dense(len(chars)))
model.add(Activation('softmax'))

# compile the model and pick the loss and optimizer
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))

In [24]:
le_tokenized_outputs = np.asanyarray(le_tokenized_outputs)
le_tokenized_sentences = np.asanyarray(le_tokenized_sentences)

In [30]:
le_tokenized_outputs = le_tokenized_outputs.reshape((-1, 55))

In [31]:
le_tokenized_sentences.shape
le_tokenized_outputs.shape

(130987, 55)

In [32]:
# train the model
model.fit(le_tokenized_sentences, le_tokenized_outputs, batch_size=128, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f442320a940>

In [33]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               94208     
_________________________________________________________________
dense_1 (Dense)              (None, 55)                7095      
_________________________________________________________________
activation_1 (Activation)    (None, 55)                0         
Total params: 101,303
Trainable params: 101,303
Non-trainable params: 0
_________________________________________________________________
