# Pre-processing 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
df = pd.read_csv('data/arijit_songs_with_lyrics.csv')
df.head()

Unnamed: 0,name,from,href,lyrics
0,Aa Jao Na,"[from ""Veere Di Wedding"" soundtrack]",/lyrics/bollywood/aajaona.html,\n\r\nTum thhe yahin\nPhir bhi tum gum thhe\nA...
1,Aaj Phir,"[from ""Hate Story 2"" soundtrack]",/lyrics/arijitsingh/aajphir.html,\n\r\nAaj phir tumpe pyar aaya hai\nAaj phir t...
2,Aaj Se Teri,"[from ""Padman"" soundtrack]",/lyrics/arijitsingh/aajseteri.html,\n\r\nAaj se teri saari galiyan meri ho gayi\n...
3,Aasan Nahin Yahan,"[from ""Aashiqui 2"" soundtrack]",/lyrics/arijitsingh/aasannahinyahan.html,\n\r\nWo o o o...\n\nAasaan nahi yahaan aashiq...
4,Ab Raat,"[from ""Dobaara"" soundtrack]",/lyrics/arijitsingh/abraat.html,\n\r\nChaand ki aankhein bhaari si hain\nRaat ...


### Preprocessing Lyrics 
1. Get all the lyrics into one single string(for simplicity).
2. Divide the string into characters. 
3. Group characters of size given by timesteps together for input.
4. Output will be the character just following the input.
5. Slide the window of size timestep by number of steps(=1) to make other inputs and outputs.

In [3]:
# 1. Get all the lyrics into one single string(for simplicity)
text = '' # will hold all the text data
for idx, row in df['lyrics'].iteritems():
    text = text + row

In [4]:
# Convert text to lower case and use only lower case characters
lower_text = text.lower()

In [5]:
# playground code - can be removed
str('abc')
sorted(list(set('abc\rd[ax2]b\n')))

['\n', '\r', '2', '[', ']', 'a', 'b', 'c', 'd', 'x']

In [6]:
# 2. Divide the string into characters. 
list_chars = sorted(list(set(lower_text)))
print("Total characters:-", len(list_chars))

Total characters:- 55


In [7]:
import collections
chars = collections.Counter(lower_text)
print("Most common 30 characters (array of tuples):-", chars.most_common(30))

dict_chars_collections = dict(chars.most_common(30))
print("\nMost common 30 characters (dictionary):-", dict_chars_collections)

list_chars = list(dict_chars_collections.keys())
print("\nFinal list of characters:-", list_chars)

list_chars_not = list(dict(set(chars.most_common()) - set(chars.most_common(30))))
print("\nCharacters not in the final list:-", list_chars_not)

Most common 30 characters (array of tuples):- [('a', 21016), (' ', 19854), ('h', 9034), ('e', 8915), ('i', 8612), ('n', 6862), ('\n', 6493), ('r', 5118), ('o', 4723), ('t', 4250), ('k', 3936), ('m', 3860), ('u', 3617), ('s', 3392), ('d', 3055), ('l', 2593), ('y', 2584), ('b', 2151), ('j', 2024), ('g', 1448), ('p', 1359), ('.', 1130), ('c', 1000), ('w', 625), ('z', 574), (',', 487), ('…', 340), ('f', 311), ('v', 294), ('q', 211)]

Most common 30 characters (dictionary):- {'a': 21016, ' ': 19854, 'h': 9034, 'e': 8915, 'i': 8612, 'n': 6862, '\n': 6493, 'r': 5118, 'o': 4723, 't': 4250, 'k': 3936, 'm': 3860, 'u': 3617, 's': 3392, 'd': 3055, 'l': 2593, 'y': 2584, 'b': 2151, 'j': 2024, 'g': 1448, 'p': 1359, '.': 1130, 'c': 1000, 'w': 625, 'z': 574, ',': 487, '…': 340, 'f': 311, 'v': 294, 'q': 211}

Final list of characters:- ['a', ' ', 'h', 'e', 'i', 'n', '\n', 'r', 'o', 't', 'k', 'm', 'u', 's', 'd', 'l', 'y', 'b', 'j', 'g', 'p', '.', 'c', 'w', 'z', ',', '…', 'f', 'v', 'q']

Characters not in

In [8]:
# replace all chars not present in the list
import re 

print("Replacing:")
for char in list_chars_not:
    print(char,end='| ')
    lower_text = lower_text.replace(char, "")
    
print(lower_text)

Replacing:
3| –| x| ]| 0| 1| é| )| ?| ’| 6| *| “| [| :| ‘| !| 2| ”| 8| (| 4| | '| -| 

tum thhe yahin
phir bhi tum gum thhe
aur main laapata
ab jo mile ho toh phir
saath hi mein reh jaao na 

thode thode se poore
aur thode adhoore
ye waade rahe kya pataa
poore honge kayi khwab
reh jaayenge kuch adhoore
abhi kya pataa

aa jao na…
aa jao na…
aa jao itna bhi kya sochna
aa jaao na
aa jaao na
aa jaao itna bhi kya sochna

thoda sahi ik doosre mein
aa reh le kahin
aa jee bhi le
kab kyun kahaan kaise
soche nahin…

aa chal waadon ke bhatke huve
jugnuon ko dikha dein sahi raasta
aa chal sotey sitaaron ko haule se
sehla ke roshan karein aasmaan

aa jao na…
aa jao na…
aa jao itna bhi kya sochna
aa jaao na
aa jaao na
aa jaao itna bhi kya sochna…

hahmmm…

shaamein keyi hongi thehri hui
baatein keyi hongi roothi hui
chhoti si zidd hogi
lambi si raatein
phir bhi pyaar reh jaayega
rehta hamesha toh kuch bhi nahin
phir bhi naa jaane kyun mujhko yaqeen
sab beetne par bhi
sab chhutne par bhi
yeh pyaar r

In [9]:
# 3. Divide the strings into size 'timesteps'.
# 4. Output will be the character just following the input.
# 5. Slide the window of size 'timesteps' by number of steps(=1) to make other inputs and outputs.
timesteps = 20
sentences = []
outputs = []

for i in range(0, len(lower_text)-timesteps):
    sentences.append(lower_text[i: i + timesteps])
    outputs.append(lower_text[i + timesteps])

In [10]:
print("Sample sentences: ", sentences[0:3])
print("Corresponding sample outputs: ", outputs[0:3])
print("Total sentences: ", len(sentences))

Sample sentences:  ['\n\ntum thhe yahin\nphi', '\ntum thhe yahin\nphir', 'tum thhe yahin\nphir ']
Corresponding sample outputs:  ['r', ' ', 'b']
Total sentences:  129848


In [11]:
# 6. Tokenize array of strings to chars 
tkn_sentences = [list(sent) for sent in sentences]
tkn_outputs = [list(out) for out in outputs]

In [12]:
# 7. Create character-to-index and index-to-character mappings
char_index = {char: idx for idx, char in enumerate(list_chars)}
index_char = [char for char in list_chars]

print('Character to Index: - ', char_index)
print('Index to Character: - ', index_char)

Character to Index: -  {'a': 0, ' ': 1, 'h': 2, 'e': 3, 'i': 4, 'n': 5, '\n': 6, 'r': 7, 'o': 8, 't': 9, 'k': 10, 'm': 11, 'u': 12, 's': 13, 'd': 14, 'l': 15, 'y': 16, 'b': 17, 'j': 18, 'g': 19, 'p': 20, '.': 21, 'c': 22, 'w': 23, 'z': 24, ',': 25, '…': 26, 'f': 27, 'v': 28, 'q': 29}
Index to Character: -  ['a', ' ', 'h', 'e', 'i', 'n', '\n', 'r', 'o', 't', 'k', 'm', 'u', 's', 'd', 'l', 'y', 'b', 'j', 'g', 'p', '.', 'c', 'w', 'z', ',', '…', 'f', 'v', 'q']


In [13]:
# 8. Label encoding
le_tkn_sentences = [[char_index[char] for char in sent] for sent in tkn_sentences]
le_tkn_outputs = [[char_index[char] for char in out] for out in tkn_outputs]

print("Tokenized sentence:", le_tkn_sentences[0:10])
print("Tokenized output:", le_tkn_outputs[0:10])

Tokenized sentence: [[6, 6, 9, 12, 11, 1, 9, 2, 2, 3, 1, 16, 0, 2, 4, 5, 6, 20, 2, 4], [6, 9, 12, 11, 1, 9, 2, 2, 3, 1, 16, 0, 2, 4, 5, 6, 20, 2, 4, 7], [9, 12, 11, 1, 9, 2, 2, 3, 1, 16, 0, 2, 4, 5, 6, 20, 2, 4, 7, 1], [12, 11, 1, 9, 2, 2, 3, 1, 16, 0, 2, 4, 5, 6, 20, 2, 4, 7, 1, 17], [11, 1, 9, 2, 2, 3, 1, 16, 0, 2, 4, 5, 6, 20, 2, 4, 7, 1, 17, 2], [1, 9, 2, 2, 3, 1, 16, 0, 2, 4, 5, 6, 20, 2, 4, 7, 1, 17, 2, 4], [9, 2, 2, 3, 1, 16, 0, 2, 4, 5, 6, 20, 2, 4, 7, 1, 17, 2, 4, 1], [2, 2, 3, 1, 16, 0, 2, 4, 5, 6, 20, 2, 4, 7, 1, 17, 2, 4, 1, 9], [2, 3, 1, 16, 0, 2, 4, 5, 6, 20, 2, 4, 7, 1, 17, 2, 4, 1, 9, 12], [3, 1, 16, 0, 2, 4, 5, 6, 20, 2, 4, 7, 1, 17, 2, 4, 1, 9, 12, 11]]
Tokenized output: [[7], [1], [17], [2], [4], [1], [9], [12], [11], [1]]


In [14]:
# 9. Convert to one hot encodings
ohe_tkn_sentences = le_tkn_sentences
ohe_tkn_outputs = le_tkn_outputs

for i, sent in enumerate(ohe_tkn_sentences):
    for j, index in enumerate(sent):
        ohe_tkn_sentences[i][j] = np.zeros(len(list_chars))
        ohe_tkn_sentences[i][j][index] = 1
        
for i, output in enumerate(ohe_tkn_outputs):
    for j, index in enumerate(output):
        ohe_tkn_outputs[i][j] = np.zeros(len(list_chars))
        ohe_tkn_outputs[i][j][index] = 1

In [15]:
print("One hot encoded sentence sample:", ohe_tkn_sentences[0:10])
print("One hot encoded output sample:", ohe_tkn_outputs[0:10])

One hot encoded sentence sample: [[array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0.

### Model Training
We will create a model to train it with the above sentences. One hot encoding of the sentences will be done here itself.
1. Timesteps is 20
2. Step is 1


In [16]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [17]:
model = Sequential()
model.add(LSTM(128, input_shape=(timesteps, len(list_chars))))

model.add(Dense(len(chars)))
model.add(Activation('softmax'))

# compile the model and pick the loss and optimizer
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))

In [None]:
le_tokenized_outputs = np.asanyarray(le_tokenized_outputs)
le_tokenized_sentences = np.asanyarray(le_tokenized_sentences)

In [None]:
le_tokenized_outputs = le_tokenized_outputs.reshape((-1, 55))

In [None]:
le_tokenized_sentences.shape
le_tokenized_outputs.shape

In [None]:
# train the model
history = model.fit(le_tokenized_sentences, le_tokenized_outputs, batch_size=128, epochs=10)

In [None]:
model.summary()

In [28]:
from keras.models import load_model 
import random
import sys

In [19]:
model = load_model('./generate-songs-1.h5')

In [44]:
def generate_output(model, steps=100):
    rand_idx = random.randint(0, len(lower_text) - timesteps)
    print('Random index: ', rand_idx)
    
    sentence = lower_text[rand_idx:rand_idx + timesteps]
    
    print('Seed: ', sentence)
    sys.stdout.write(sentence)
    
    for i in range(steps):
        x_pred = np.zeros((1, timesteps, len(list_chars)), dtype='int')
        
        # sentence to one hot encoded vector
        for j, char in enumerate(sentence):
            x_pred[0, j, char_index[char]] = 1
            
        # model prediction
        y_pred = model.predict(x_pred)[0]
        next_char = np.random.choice(np.asarray(list_chars), p = y_pred)
        
        # append new character to the sentence
        sentence = sentence[1:] + next_char
        
        sys.stdout.write(next_char)
    
    print('\n\nDone!')

In [45]:
generate_output(model, steps=500)

Random index:  63649
Seed:  t main jaandi aan
ha
t main jaandi aan
hasamosh hoond toh chittisirog, maange pishaayein...

chup rahe hum mein sabhi ke liye
teri tere hi khidda ki main
ya yeh kahen jaao mere
peene kadi kar ke
chute jaan
humdard ki hir jiyaan hai

aa jadoiya meri baat aar
hmai na paas nak rahe
tumko pe chaahte hok ekdhein

tu nghiyan hai..

aa tujhko rogh pe jaayegi
phir mushkhdan mein tera, dil de chaahe do
harhta khujan sehna
ye duaane pyar ki sua jiya haina
gin se judhaadunk mastabisan ve tera


bachp jaan meri
yeh ishq mein tu meri yaar na ho tu 

Done!
