### Import 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

### Import Dataset

In [2]:
df = pd.read_csv('data/arijit_songs_with_lyrics.csv')
df.head()

Unnamed: 0,name,from,href,lyrics
0,Aa Jao Na,"[from ""Veere Di Wedding"" soundtrack]",/lyrics/bollywood/aajaona.html,\n\r\nTum thhe yahin\nPhir bhi tum gum thhe\nA...
1,Aaj Phir,"[from ""Hate Story 2"" soundtrack]",/lyrics/arijitsingh/aajphir.html,\n\r\nAaj phir tumpe pyar aaya hai\nAaj phir t...
2,Aaj Se Teri,"[from ""Padman"" soundtrack]",/lyrics/arijitsingh/aajseteri.html,\n\r\nAaj se teri saari galiyan meri ho gayi\n...
3,Aasan Nahin Yahan,"[from ""Aashiqui 2"" soundtrack]",/lyrics/arijitsingh/aasannahinyahan.html,\n\r\nWo o o o...\n\nAasaan nahi yahaan aashiq...
4,Ab Raat,"[from ""Dobaara"" soundtrack]",/lyrics/arijitsingh/abraat.html,\n\r\nChaand ki aankhein bhaari si hain\nRaat ...


### Preprocessing Lyrics 
1. Get all the lyrics into one single string(for simplicity).
2. Divide the string into characters. 
3. Group characters of size given by timesteps together for input.
4. Output will be the character just following the input.
5. Slide the window of size timestep by number of steps(=1) to make other inputs and outputs.

In [3]:
# 1. Get all the lyrics into one single string(for simplicity)
text = '' # will hold all the text data
for idx, row in df['lyrics'].iteritems():
    text = text + row

In [4]:
lower_text = text.lower()

In [5]:
# playground code - can be remove
str('abc')
sorted(list(set('abc\rd[ax2]b\n')))

['\n', '\r', '2', '[', ']', 'a', 'b', 'c', 'd', 'x']

#### Convert text to lower case 

In [6]:
# 2. Divide the string into characters. 
list_chars = sorted(list(set(text)))
len(list_chars)

80

In [7]:
import collections
chars = collections.Counter(text.lower())
chars

Counter({'\n': 6493,
         '\r': 129,
         't': 4250,
         'u': 3617,
         'm': 3860,
         ' ': 19854,
         'h': 9034,
         'e': 8915,
         'y': 2584,
         'a': 21016,
         'i': 8612,
         'n': 6862,
         'p': 1359,
         'r': 5118,
         'b': 2151,
         'g': 1448,
         'l': 2593,
         'j': 2024,
         'o': 4723,
         's': 3392,
         '[': 141,
         'x': 140,
         '2': 108,
         ']': 141,
         'd': 3055,
         'w': 625,
         'k': 3936,
         'c': 1000,
         '?': 24,
         '…': 340,
         'v': 294,
         'z': 574,
         'q': 211,
         '-': 164,
         'f': 311,
         ',': 487,
         '.': 1130,
         '(': 65,
         ')': 65,
         '!': 49,
         "'": 38,
         '4': 16,
         '3': 7,
         '’': 9,
         '8': 3,
         ':': 7,
         'é': 10,
         '*': 1,
         '‘': 3,
         '1': 6,
         '6': 2,
         '0': 6,
         '

In [8]:
# 3. Divide the strings into size timesteps size
# 4.
# 5.
timesteps = 20
sentences = []
outputs = []
for i in range(0, len(lower_text)-timesteps):
    sentences.append(lower_text[i: i + timesteps])
    outputs.append(lower_text[i + timesteps])

In [12]:
print(sentences[0:2])
print(len(sentences))

['\n\r\ntum thhe yahin\nph', '\r\ntum thhe yahin\nphi']
130987


In [13]:
outputs[0:2]

['i', 'r']

In [14]:
# 6. Tokenize array of strings to chars 
tokenized_sentences = [list(sentence) for sentence in sentences]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [16]:
# 7. Create character-to-index and index-to-character mappings
char_index = {char: idx for idx, char in enumerate(list_chars)}
index_char = {idx: char for idx, char in enumerate(list_chars)}

In [20]:
print('Character to Index: - ', char_index)
print('Index to Character: - ', index_char)

Character to Index: -  {'\n': 0, '\r': 1, ' ': 2, '!': 3, "'": 4, '(': 5, ')': 6, '*': 7, ',': 8, '-': 9, '.': 10, '0': 11, '1': 12, '2': 13, '3': 14, '4': 15, '6': 16, '8': 17, ':': 18, '?': 19, 'A': 20, 'B': 21, 'C': 22, 'D': 23, 'E': 24, 'F': 25, 'G': 26, 'H': 27, 'I': 28, 'J': 29, 'K': 30, 'L': 31, 'M': 32, 'N': 33, 'O': 34, 'P': 35, 'Q': 36, 'R': 37, 'S': 38, 'T': 39, 'U': 40, 'V': 41, 'W': 42, 'Y': 43, 'Z': 44, '[': 45, ']': 46, 'a': 47, 'b': 48, 'c': 49, 'd': 50, 'e': 51, 'f': 52, 'g': 53, 'h': 54, 'i': 55, 'j': 56, 'k': 57, 'l': 58, 'm': 59, 'n': 60, 'o': 61, 'p': 62, 'q': 63, 'r': 64, 's': 65, 't': 66, 'u': 67, 'v': 68, 'w': 69, 'x': 70, 'y': 71, 'z': 72, 'é': 73, '–': 74, '‘': 75, '’': 76, '“': 77, '”': 78, '…': 79}
Index to Character: -  {0: '\n', 1: '\r', 2: ' ', 3: '!', 4: "'", 5: '(', 6: ')', 7: '*', 8: ',', 9: '-', 10: '.', 11: '0', 12: '1', 13: '2', 14: '3', 15: '4', 16: '6', 17: '8', 18: ':', 19: '?', 20: 'A', 21: 'B', 22: 'C', 23: 'D', 24: 'E', 25: 'F', 26: 'G', 27: '