## Modeling

**Imports**

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
np.random.seed(42)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

import matplotlib.pyplot as plt
import seaborn as sns

**Reading in Data**

In [2]:
model_sets = np.load('../data/model_sets.npy')
model_sets.shape

(36003, 21)

In [3]:
#remember, I am modeling for song n + 1 based on songs n previously played:
X_rr, y_rr = model_sets[:, :-1], model_sets[:, -1] 

In [4]:
X_rr.shape, y_rr.shape

((36003, 20), (36003,))

**Train-Test Split**

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_rr, y_rr, 
                                                    test_size = 0.2, 
                                                    random_state = 42)

In [6]:
print(f'X_train and y_train: {X_train.shape}, {y_train.shape}')
print(f'X_test and y_test: {X_test.shape}, {y_test.shape}')

X_train and y_train: (28802, 20), (28802,)
X_test and y_test: (7201, 20), (7201,)


**Modeling**

In [7]:
#REVISIT
#REVISIT
#REVISIT

#consider using stringlookup instead of previously encoding:
# ids_from_chars = tf.keras.layers.StringLookup(
#     vocabulary=list(vocab), mask_token=None)

In [7]:
#https://www.tensorflow.org/text/tutorials/text_generation
songs = np.array(model_sets).tolist()
songs = [item for sublist in songs for item in sublist]

vocab = set(songs)
print(f'{len(vocab)} unique songs!')

970 unique songs!


In [8]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 970

# Number of RNN units
rnn_units = 516

In [10]:
#https://www.geeksforgeeks.org/python-keras-keras-utils-to_categorical/
#train labels:
X_train_ohe = np.array([to_categorical(x, num_classes = vocab_size) for x in X_train])
y_train_ohe = to_categorical(y_train, num_classes = vocab_size)

#test labels:
X_test_ohe = np.array([to_categorical(x, num_classes = vocab_size) for x in X_test])
y_test_ohe = to_categorical(y_test, num_classes = vocab_size)

In [11]:
print(f'Train shape: {X_train_ohe.shape}, Test shape: {X_test_ohe.shape}')
print(f'Train shape: {y_train_ohe.shape}, Test shape: {y_test_ohe.shape}')

Train shape: (28802, 20, 970), Test shape: (7201, 20, 970)
Train shape: (28802, 970), Test shape: (7201, 970)


In [12]:
#https://medium.com/analytics-vidhya/understanding-embedding-layer-in-keras-bbe3ff1327ce
#https://www.tensorflow.org/text/tutorials/text_generation
#https://www.analyticsvidhya.com/blog/2021/08/predict-the-next-word-of-your-text-using-long-short-term-memory-lstm/
#https://stackoverflow.com/questions/55774632/gridsearchcv-randomizedsearchcv-with-lstm

model = Sequential()

#first layer, embedding:
model.add(Embedding(vocab_size, embedding_dim))

#second layer, LSTM:
model.add(LSTM(rnn_units))

#final dense layer with softmax activation:
model.add(Dense(embedding_dim, activation ='softmax'))

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 970)         940900    
                                                                 
 lstm (LSTM)                 (None, 516)               3069168   
                                                                 
 dense (Dense)               (None, 970)               501490    
                                                                 
Total params: 4,511,558
Trainable params: 4,511,558
Non-trainable params: 0
_________________________________________________________________


In [15]:
opt = Adam(learning_rate = 0.01)
model.compile(loss = 'categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])

In [16]:
history = model.fit(X_train, y_train_ohe,
                    batch_size = 128, 
                    epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [17]:
# plt.plot(history.history['acc'])
# plt.plot(history.history['val_acc'])
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc = 'upper left')

In [17]:
#https://medium.com/analytics-vidhya/understanding-embedding-layer-in-keras-bbe3ff1327ce
#https://www.tensorflow.org/text/tutorials/text_generation
#https://www.analyticsvidhya.com/blog/2021/08/predict-the-next-word-of-your-text-using-long-short-term-memory-lstm/
#https://stackoverflow.com/questions/55774632/gridsearchcv-randomizedsearchcv-with-lstm

def model_func(vocab_size = len(vocab),
               embedding_dim = 970,
               rnn_units = 516,
               dropout = 0.10,
               learningrate = 0.01):

    model = Sequential()

    #first layer, embedding:
    model.add(Embedding(vocab_size, embedding_dim))

    #second layer, LSTM:
    model.add(LSTM(rnn_units))

    #final dense layer with softmax activation:
    model.add(Dense(embedding_dim, activation ='softmax'))
    
    #setting optimizer:
    opt = Adam(learningrate)
    
    #compiling:
    model.compile(optimizer = opt, loss = 'mse', metrics = ['accuracy'])

    return model

In [18]:
rnn = KerasRegressor(build_fn = model_func, batch_size = 128, verbose = 2)

  rnn = KerasRegressor(build_fn = model_func, batch_size = 128, verbose = 0)


In [20]:
param_grid = {
    'epochs' : [10, 20],
    'rnn_units' : [256, 516, 1024],
    #'dropout' : [0.05, 0.10],
    'learningrate': [0.01, 0.05]}

#gridsearch:
gs = GridSearchCV(rnn, param_grid = param_grid, cv = 3)

In [None]:
gs.fit(X_train, y_train, verbose = 2)

Epoch 1/10
151/151 - 20s - loss: 322490.5000 - accuracy: 9.3745e-04 - 20s/epoch - 133ms/step
Epoch 2/10
151/151 - 19s - loss: 322490.5312 - accuracy: 9.8953e-04 - 19s/epoch - 124ms/step
Epoch 3/10
151/151 - 19s - loss: 322490.4375 - accuracy: 9.8953e-04 - 19s/epoch - 124ms/step
Epoch 4/10
151/151 - 18s - loss: 322490.4688 - accuracy: 9.8953e-04 - 18s/epoch - 122ms/step
Epoch 5/10
151/151 - 19s - loss: 322490.5938 - accuracy: 0.0010 - 19s/epoch - 124ms/step
Epoch 6/10
151/151 - 19s - loss: 322490.6250 - accuracy: 0.0010 - 19s/epoch - 124ms/step
Epoch 7/10
151/151 - 19s - loss: 322490.5625 - accuracy: 0.0010 - 19s/epoch - 125ms/step
Epoch 8/10
151/151 - 19s - loss: 322490.4375 - accuracy: 0.0010 - 19s/epoch - 124ms/step
Epoch 9/10
151/151 - 19s - loss: 322490.5000 - accuracy: 0.0010 - 19s/epoch - 123ms/step
Epoch 10/10
151/151 - 19s - loss: 322490.4375 - accuracy: 0.0010 - 19s/epoch - 123ms/step
Epoch 1/10
151/151 - 19s - loss: 321922.7812 - accuracy: 8.3329e-04 - 19s/epoch - 128ms/step


In [None]:
print(gs.best_score_)
gs.best_params_