## Modeling

**Imports**

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
np.random.seed(42)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

import matplotlib.pyplot as plt
import seaborn as sns

**Reading in Data**

In [2]:
model_sets = np.load('../data/model_sets.npy')
model_sets.shape

(36003, 21)

In [3]:
#remember, I am modeling for song n + 1 based on songs n previously played:
X_rr, y_rr = model_sets[:, :-1], model_sets[:, -1] 

In [4]:
X_rr.shape, y_rr.shape

((36003, 20), (36003,))

**Train-Test Split**

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_rr, y_rr, 
                                                    test_size = 0.2, 
                                                    random_state = 42)

In [6]:
print(f'X_train and y_train: {X_train.shape}, {y_train.shape}')
print(f'X_test and y_test: {X_test.shape}, {y_test.shape}')

X_train and y_train: (28802, 20), (28802,)
X_test and y_test: (7201, 20), (7201,)


**Modeling - Second Attempt, with GridSearchCV**

In [8]:
#https://www.tensorflow.org/text/tutorials/text_generation
songs = np.array(model_sets).tolist()
songs = [item for sublist in songs for item in sublist]

vocab = set(songs)
print(f'{len(vocab)} unique songs!')

970 unique songs!


In [9]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 970

# Number of RNN units
rnn_units = 516

In [10]:
#https://www.geeksforgeeks.org/python-keras-keras-utils-to_categorical/
#train labels:
X_train_ohe = np.array([to_categorical(x, num_classes = vocab_size) for x in X_train])
y_train_ohe = to_categorical(y_train, num_classes = vocab_size)

#test labels:
X_test_ohe = np.array([to_categorical(x, num_classes = vocab_size) for x in X_test])
y_test_ohe = to_categorical(y_test, num_classes = vocab_size)

In [11]:
print(f'Train shape: {X_train_ohe.shape}, Test shape: {X_test_ohe.shape}')
print(f'Train shape: {y_train_ohe.shape}, Test shape: {y_test_ohe.shape}')

Train shape: (28802, 20, 970), Test shape: (7201, 20, 970)
Train shape: (28802, 970), Test shape: (7201, 970)


In [21]:
#https://medium.com/analytics-vidhya/understanding-embedding-layer-in-keras-bbe3ff1327ce
#https://www.tensorflow.org/text/tutorials/text_generation
#https://www.analyticsvidhya.com/blog/2021/08/predict-the-next-word-of-your-text-using-long-short-term-memory-lstm/
#https://stackoverflow.com/questions/55774632/gridsearchcv-randomizedsearchcv-with-lstm

def model_func(vocab_size = len(vocab),
               embedding_dim = 970,
               rnn_units = 516,
               #dropout = 0.10,
               learningrate = 0.01):

    model = Sequential()

    #first layer, embedding:
    model.add(Embedding(vocab_size, embedding_dim))

    #second layer, LSTM:
    model.add(LSTM(rnn_units))

    #final dense layer with softmax activation:
    model.add(Dense(embedding_dim, activation ='softmax'))
    
    #setting optimizer:
    opt = Adam(learningrate)
    
    #compiling:
    model.compile(optimizer = opt, loss = 'categorical_crossentropy', metrics = ['accuracy'])

    return model

In [22]:
rnn = KerasRegressor(build_fn = model_func, batch_size = 128, verbose = 2)

  rnn = KerasRegressor(build_fn = model_func, batch_size = 128, verbose = 2)


In [23]:
param_grid = {
    'epochs' : [20, 30],
    'rnn_units' : [256, 516],
    #'dropout' : [0.05, 0.10],
    'learningrate': [0.01, 0.05]}

#gridsearch:
gs = GridSearchCV(rnn, param_grid = param_grid, cv = 3)

In [24]:
gs.fit(X_train, y_train_ohe,
       validation_data = (X_test, y_test_ohe),
       verbose = 2)

Epoch 1/20
151/151 - 45s - loss: 5.2243 - accuracy: 0.0667 - val_loss: 4.8848 - val_accuracy: 0.0965 - 45s/epoch - 300ms/step
Epoch 2/20
151/151 - 40s - loss: 4.6777 - accuracy: 0.0993 - val_loss: 4.8472 - val_accuracy: 0.0937 - 40s/epoch - 262ms/step
Epoch 3/20
151/151 - 40s - loss: 4.4585 - accuracy: 0.1145 - val_loss: 4.9336 - val_accuracy: 0.0976 - 40s/epoch - 266ms/step
Epoch 4/20
151/151 - 39s - loss: 4.2515 - accuracy: 0.1270 - val_loss: 5.0319 - val_accuracy: 0.0957 - 39s/epoch - 259ms/step
Epoch 5/20
151/151 - 39s - loss: 4.0811 - accuracy: 0.1434 - val_loss: 5.1527 - val_accuracy: 0.0962 - 39s/epoch - 258ms/step
Epoch 6/20
151/151 - 40s - loss: 3.9155 - accuracy: 0.1561 - val_loss: 5.2616 - val_accuracy: 0.0971 - 40s/epoch - 262ms/step
Epoch 7/20
151/151 - 39s - loss: 3.7873 - accuracy: 0.1724 - val_loss: 5.3919 - val_accuracy: 0.0939 - 39s/epoch - 257ms/step
Epoch 8/20
151/151 - 40s - loss: 3.6777 - accuracy: 0.1811 - val_loss: 5.5018 - val_accuracy: 0.0936 - 40s/epoch - 263

GridSearchCV(cv=3,
             estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x000002113F643FD0>,
             param_grid={'epochs': [20, 30], 'learningrate': [0.01, 0.05],
                         'rnn_units': [256, 516]})

In [25]:
print(gs.best_score_)
gs.best_params_

-6.530418395996094


{'epochs': 20, 'learningrate': 0.01, 'rnn_units': 256}