In [1]:
import pandas as pd
import numpy as np
import os, string
from sklearn.model_selection import train_test_split
from keras.layers import LSTM, Embedding, Input, Dropout, Dense, Activation
from keras.models import Model, Sequential
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
data = pd.read_csv("playlists.csv", encoding = "ISO-8859-1")
data = data.drop('Unnamed: 0', axis=1)
data.head()

Unnamed: 0,Playlist Title,Track List
0,newwwww,"['Linger', 'Best Friend', 'Wash It Away', 'Fre..."
1,junior year,"['Dreams - 2004 Remaster', 'Go Your Own Way - ..."
2,running away,"['Closing Time', 'Call Me Maybe', ""Honey, I'm ..."
3,jam,"['Go Your Own Way - 2004 Remaster', 'You Reall..."
4,cc/summer'18,"['Waves', 'Needles & Pins', 'Firework', 'Ring'..."


In [3]:
exclude = set(string.punctuation)
data['Playlist Title'] = data['Playlist Title'].apply(lambda x : ''.join(ch for ch in x if ch not in exclude))
data['Track List'] = data['Track List'].apply(lambda x : ''.join(ch for ch in x if ch not in exclude))

In [4]:
data['Playlist Title'] = data['Playlist Title'].apply(lambda x : 'START_ ' + x + ' _END')

In [5]:
titles = data['Playlist Title'].to_list()
tracks = data['Track List'].to_list()

In [6]:
title_words = set()
for title in titles:
    for word in title.split():
        title_words.add(word)

track_words = set()
for track in tracks:
    for word in track.split():
        track_words.add(word)

In [7]:
# Max Length of track list
max_len_tracks = 0
for l in data['Track List']:
    if len(l.split(' ')) > max_len_tracks:
        max_len_tracks = len(l.split(' '))

In [8]:
# Max Length of title
max_len_title = 0
for l in data['Track List']:
    if len(l.split(' ')) > max_len_title:
        max_len_title = len(l.split(' '))

In [9]:
input_words = sorted(list(track_words))
target_words = sorted(list(title_words))
num_encoder_tokens = len(track_words)
num_decoder_tokens = len(title_words)+1

In [10]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [11]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [12]:
x,y = data['Track List'], data['Playlist Title']
x_train,x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
x_train.shape, x_test.shape

((9021,), (1003,))

In [13]:
def gen_batch(x=x_train, y=y_train, batch_size=128):
    while True:
        for i in range(0,len(x),batch_size):
            encoder_input_data = np.zeros((batch_size, max_len_tracks), dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_len_title), dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_len_title, num_decoder_tokens), dtype='float32')
            for j,(input_text, target_text) in enumerate(zip(x[i:i+batch_size], y[i:i+batch_size])):
                for t,word in enumerate(input_text.split()):
                    encoder_input_data[i,t]=input_token_index[word]
                for t,word in enumerate(target_text.split()):
                    if t < len(target_text.split())-1:
                        decoder_input_data[i,t]=target_token_index[word]
                    if t > 0:
                        decoder_target_data[i, t-1, target_token_index[word]]=1
            yield([encoder_input_data, decoder_input_data],decoder_target_data)

In [14]:
#encoder
latent_dim = 64
encoder_inputs = Input(shape=(None,))
enc_embed = Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state = True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_embed)
encoder_states = [state_h, state_c]

In [15]:
#decoder
decoder_inputs = Input(shape=(None,))
dec_embed_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)
dec_embed = dec_embed_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_embed, initial_state=encoder_states)
decoder_dense_layer = Dense(num_decoder_tokens, activation = 'softmax')
decoder_outputs = decoder_dense_layer(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [16]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [25]:
train_samples = len(x_train)
val_samples = len(x_test)
batch_size = 128
epochs = 10

model.fit_generator(generator = gen_batch(x_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = gen_batch(x_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/50
 1/70 [..............................] - ETA: 8:52:07 - loss: 0.0139 - acc: 0.0000e+00

IndexError: index 128 is out of bounds for axis 0 with size 128

In [None]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_states_inputs = [Input(shape=(latent_dim,)),Input(shape=(latent_dim,))]
dec_embed2=dec_embed_layer(decoder_inputs)

decoder_outputs2,state_h2,state_c2 = decoder_lstm(dec_embed2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2,state_c2]
decoder_outputs2 = decoder_dense_layer(decoder_outputs2)

decoder_model = Model([decoder_inputs]+decoder_states_inputs,[decoder_outputs2]+decoder_states2)

In [None]:
def decode_seq(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0,0] = target_token_index['START_']
    
    stop = False
    decoded = ''
    
    while not stop:
        output_tokens,h,c = decoder_model.predict([target_seq]+states_value)
        
        sample_token_index = np.argmax(output_tokens[0,-1,:])
        sample_char = reverse_target_char_index[sample_token_index]
        decoded += ' ' + sample_char
        
        if (sample_char=='_END' or len(decoded)>50):
            stop = True
        
        target_seq = np.zeros((1,1))
        target_seq[0,0] = sample_token_index
        
        states_value=[h,c]
        
    return decoded

In [None]:
train_gen = gen_batch(x_train,y_train,batch_size=1)
k=-1

In [None]:
k+=1
(input_seq,actual_output),_=next(train_gen)
decoded=decode_seq(input_seq)
print('Tracks:', x_train[k:k+1].values[0])
print('Actual Playlist Title:', y_train[k:k+1].values[0][6:-4])
print('Predicted Playlist Title:', decoded[:-4])