# Generate Embeddings

This script contains code snippets to outline the process of generating embeddings for various fields in the songs.csv - songs, artist, composer, lyricist etc. We generate embeddings using the following method: Taking the one of the columns as input, we try to predict the output of the other 3 columns. There are two ways we can take the individual rows of the input column, as input - (1) char-rnn (2) A one hot encoding with each unique input element will be considered different from each other. Advantage of (1) is that it will capture textual level similarity between the names whereas (2) will be faster to train and will avoid capturing misleading features

In [1]:
from random import shuffle

import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding
import pandas as pd
import numpy as np
import scipy.sparse

Using TensorFlow backend.


## Loading Data

In [2]:
data = pd.read_csv('new_data/shortlisted_song.csv').sample(n=1000).fillna('')
print("Data Loaded")

Data Loaded


  interactivity=interactivity, compiler=compiler, result=result)


Lets start by looking at number of distinct characters and number of distinct units in each columns. This will (hopefully) help in deciding which of the two approaches to choose from

In [3]:
def get_unique_chars(data, column):
    char_set = set([c for (i, row) in data.iterrows() for c in str(row[column])])
    return len(char_set)

# Some of the rows corresponding to a column have multiple values separated by '|'
# character. We need to split and separate these multiple values

def get_unique_entities(data, column):
    data[column] = data[column].apply(str)
    unique = set([name.strip() for row in data[column].str.split(r'[/|]') for name in row if name.strip() is not ''])
    return unique

In [4]:
#num_chars_artist_name = get_unique_chars(data, 'artist_name')
#num_chars_composer = get_unique_chars(data, 'composer')
#num_chars_lyricist = get_unique_chars(data, 'lyricist')
#num_chars_song_id = get_unique_chars(data, 'song_id')

In [5]:
#unique_artists = get_unique_entities(data, 'artist_name')
#unique_composers = get_unique_entities(data, 'composer')
#unique_lyricists = get_unique_entities(data, 'lyricist')
#unique_songs = get_unique_entities(data, 'song_id')
#print("Unique elements identified")

In [6]:
def to_one_hot(batch_rows, lengths):
    one_hot = [None]*len(batch_rows[0])
    for i in range(len(one_hot)):
        one_hot[i] = np.zeros((batch_size, lengths[i]))
    
    for (row_num, row) in enumerate(batch_rows):
        for (i, element) in enumerate(row):
            for part in element:
                one_hot[i][row_num][part] = 1
            
    return one_hot

def generate_mapper(data, column):
    unique_elements = get_unique_entities(data, column)
    mapper = dict()
    #mapper['<unk>'] = 0
    for u in unique_elements:
        mapper[u] = len(mapper)
    return mapper

In [7]:
artist_mapper = generate_mapper(data, 'artist_name')
composer_mapper = generate_mapper(data, 'composer')
lyricist_mapper = generate_mapper(data, 'lyricist')
song_mapper = data.song_id.to_dict()
mappers = [artist_mapper, composer_mapper, lyricist_mapper, song_mapper]

In [8]:
#oh_artist = to_one_hot(data.artist_name, artist_mapper)
#oh_composer = to_one_hot(data.composer, composer_mapper)
#oh_lyricist = to_one_hot(data.lyricist, lyricist_mapper)
#oh_song = to_one_hot(data.song_id, song_mapper)
#print("Input-output matrices generated")

## Creating the model


We will start by creating a simple MLP model with one hidden layer. This corresponds to idea (2).

Changeable parameters:

* `num_hidden_units`
* `hidden_activation`
* `dropout`
* `batch_size`

In [32]:
def batch_generator(data, input_columns, target_columns, mappers, batch_size):
    num_rows = data.shape[0]
    num_inputs = len(input_columns)
    num_outputs = len(target_columns)
    all_columns = target_columns
    encoded_data = [None]*num_rows
    
    for i, e in enumerate(encoded_data):
        encoded_data[i] = [None]*(num_inputs+num_outputs)
        
    row_num = 0
    for (_, row) in data[all_columns].iterrows():
        for (col_id, element) in enumerate(row):
            parts = [p.strip() for p in str(element).split(r'[/|]')]
            encoded_data[row_num][col_id] = []
            for p in parts:
                if p in mappers[col_id]:
                    encoded_data[row_num][col_id].append(mappers[col_id][p])
                #else:
                #    encoded_data[row_num][col_id].append(mappers[col_id]['<unk>'])
        row_num += 1
        
    lengths = list(map(len, mappers))    
    
    while True:
        shuffle(encoded_data)
        count = 0

        while count<=int(data.shape[0]/batch_size):
            idxs = slice(count*batch_size, min((count+1)*batch_size, num_rows))
            batch = encoded_data[idxs]
            count += 1
            yield (data[input_columns].iloc[idxs], to_one_hot(batch, lengths))
        

In [33]:
batch_size=64
input_generator = batch_generator(data, ['song_id'],
                    ['composer', 'lyricist', 'artist_name'],
                    mappers, batch_size)

In [20]:
input_col = 'song_id'
input_shape = len(mappers[0])
output_shapes = [len(mappers[1]), len(mappers[2]), len(mappers[3])]
num_hidden_units = 100
hidden_activation = 'relu'
dropout = 1.0

input_features = Input(shape = (1,))
embeddings = Embedding(output_dim = num_hidden_units, input_dim = input_shape)(input_features)
output_0 = Dense(output_shapes[0], activation='softmax')(embeddings)
output_1 = Dense(output_shapes[1], activation='softmax')(embeddings)
output_2 = Dense(output_shapes[2], activation='softmax')(embeddings)

model = keras.models.Model(inputs = [input_features],
                           outputs = [output_0, output_1, output_2])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print("model compiled")

model compiled


In [34]:
next(input_generator)

TypeError: 'NoneType' object is not iterable

In [21]:
import IPython.display
from keras.utils import plot_model
plot_model(model, to_file='./model.png')

![model-visualization](./model.png)

In [22]:
model.fit_generator(input_generator, int(data.shape[0]/batch_size)+1, epochs=1)

Epoch 1/1


ValueError: Error when checking input: expected input_6 to have shape (None, 1) but got array with shape (64, 904)

In [23]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_6 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_6 (Embedding)          (None, 1, 100)        90400       input_6[0][0]                    
____________________________________________________________________________________________________
dense_6 (Dense)                  (None, 1, 957)        96657       embedding_6[0][0]                
____________________________________________________________________________________________________
dense_7 (Dense)                  (None, 1, 371)        37471       embedding_6[0][0]                
___________________________________________________________________________________________

### Todo
* Make code efficient for large scale
* CSR matrix