# Building Model 2
The architecture of the second model is based on Neural Machine Translation model.<br>
The encoder input will be the melody input and the output will be melody sequence, hidden state and cell state<br>
The decoder input will be the encoder's output and the sequence input.
The last layers are attention layer and softmax layer that outputs probabilities vector.
- **melody input** - will receive a matrix consisting the information of chroma and piano, based on sampling each 1 second for 221 seconds (Median of all the melodies length) 
- **sequence input** - will receive a list of sequence consisting of 10 words, and embed them.

In [1]:
import Model2Base as mb
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.preprocessing.text import Tokenizer
from gensim.models import KeyedVectors

Using TensorFlow backend.


## Loading lyrics data

In [2]:
df = pd.read_csv("data/lyrics_train_set.csv",header=None)
df = df.fillna('')
df[2] = df[2] + df[3] + df[4] + df[5] + df[6] 
df=df.drop([3,4,5,6],axis=1)
df.columns=['singer','song','lyrics']

df['song_num']=df.index

## Creating midi matrices

In [3]:
midis_vec = mb.create_midis_vector(df)
number_of_sequences = mb.get_median_length(midis_vec)
songs_dict = {}
for midi in midis_vec:
    midi_file = midi[1]
    mat = mb.create_midi_matrix(midi_file, number_of_sequences)
    songs_dict[midi[0]] = mat
    
df=df[df.song_num.isin(songs_dict.keys())]
df['clean_lyrics'] = df.apply(lambda row: mb.clean_text(row.lyrics),axis=1)
df['singer_song']=df.apply(lambda row: mb.clean_singer_song(row['singer'],row['song']),axis=1)



## Tokenizing words

In [4]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
df["tokens"] = df["clean_lyrics"].apply(tokenizer.tokenize)
df['ln']= df["tokens"].str.len()

## Creating Vocabulary

In [5]:
all_words = [word for tokens in df["tokens"] for word in tokens]
sentence_lengths = [len(tokens) for tokens in df["tokens"]]
VOCAB = sorted(list(set(all_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))
VOCAB_SIZE = len(VOCAB)

180248 words total, with a vocabulary size of 7525
Max sentence length is 1481


## Creating Sequences

In [6]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df["clean_lyrics"].tolist())
sequences = tokenizer.texts_to_sequences(df["clean_lyrics"].tolist())

In [None]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

### Creating Song sequences

In [8]:
MAX_SEQUENCE_LENGTH = 10
song_index=[]
sequences_list=[]
song_num=df.song_num.tolist()
for i,seq in enumerate(sequences):
    for j in range(1, len(seq)):
        for z in range(MAX_SEQUENCE_LENGTH):
            sequence = seq[j:j+z+2]
            sequences_list.append(np.array(sequence))
            song_index.append(song_num[i])
print('Total Sequences: %d' % len(sequences_list))

Total Sequences: 1796570


### Padding sequences according the max_length

In [9]:
from keras.preprocessing.sequence import pad_sequences

max_length = max([len(seq) for seq in sequences_list])
sequences_pad = pad_sequences(sequences_list, maxlen=max_length, padding='pre')

## Rearranging data for X_train and y_train data

In [10]:
data = np.array(sequences_pad)
song_index =np.array(song_index)
X=data[:,:-1]
Y=data[:,-1]


## Embedding the words for the model

In [11]:
word2vec = KeyedVectors.load_word2vec_format('data/wiki-news-300d-1M.vec')


In [12]:
EMBEDDING_DIM=300

embedding_weights = np.zeros((len(word_index)+1, EMBEDDING_DIM))
for word,index in word_index.items():
    embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.uniform(-1,1,EMBEDDING_DIM)
print(embedding_weights.shape)

(7526, 300)


### Creating the training data

In [13]:
x_train_list, midi_list, y_train_list, x_test_list, midi_test_list, y_test_list = mb.create_training_data(song_index, songs_dict, X, Y)

In [14]:
x_test=np.array(x_test_list)
midi_test=np.array(midi_test_list)
y_test=np.array(y_test_list)

### **Model Creation**

In [15]:
model = mb.build_model(word_index, embedding_weights)

In [16]:
batch_size=256
gen= mb.gendata(x_train_list,midi_list,y_train_list,batch_size) 

## **Model Training** 
#### 5 epochs each time, batch size 256, data is matched by the generator (gen)

In [18]:
history=model.fit_generator(gen,
                            steps_per_epoch=len(y_train_list)//batch_size,
                            epochs=5,
                            validation_data=([midi_test,x_test], y_test))

Epoch 1/1
