In [None]:
# import necessary modules 

import tensorflow as tf
import json
import numpy as np
from tensorflow import keras as tfk
import os
from pymongo import MongoClient

In [None]:
# connect to MongoDB and grab the collections that the model will learn from
# (fill in the database / collection names below before running)
#
# NOTE: the client is intentionally NOT closed in this cell. The following cell
# still queries `genes_pod`, and PyMongo 4+ raises InvalidOperation when a
# closed MongoClient is used. Call client.close() only after the lyrics have
# been read into memory.

client = MongoClient('localhost', 27017)
db = client['']
genes_pod = db['']
gene = db['']

# originally done with MongoDB shown above; used .json file for flask app

In [None]:
# gathers the lyric lines of every document in the collection into one list
# (ilk_sng), joins them into a single lowercase text with '\n' tokens separating
# the lines so that the line structure is kept, and splits that text into word
# tokens with parentheses, double quotes and commas stripped out

ilk_sng = []
for doc in genes_pod.find({}):
    ilk_sng += doc['lyrics']

# drop the first empty entry when there is one; list.remove raises ValueError
# if the value is absent, so the call has to be guarded
if '' in ilk_sng:
    ilk_sng.remove('')

sng_gwd = ' \n '.join(ilk_sng).lower().split(' ')

# one translation table removes all unwanted characters in a single pass
_strip_punct = str.maketrans('', '', '()",')
sng_gud = [tok.translate(_strip_punct) for tok in sng_gwd]

In [None]:
# build the sorted vocabulary, then print the number of unique words followed
# by the total number of words

vocab = list(set(sng_gud))
vocab.sort()
print('{} unique \n{} total'.format(len(vocab), len(sng_gud)))

In [None]:
# wrd_num: word -> integer id lookup
# num_wrd: numpy array of the vocabulary, so an id indexes back to its word
# sng_int: numpy array holding the id of every word in the corpus, in order

wrd_num = dict(zip(vocab, range(len(vocab))))
num_wrd = np.array(vocab)
sng_int = np.array(list(map(wrd_num.__getitem__, sng_gud)))

In [None]:
# number of words in each input sequence; used further down to size the chunks
# the model learns from
seq_length = 55

# wrap the integer-encoded corpus in a tensorflow dataset (sliced along the
# first axis of sng_int)
wrd_dataset = tf.data.Dataset.from_tensor_slices(tensors=sng_int)

In [None]:
# cut the word dataset into chunks of seq_length + 1 ids; the extra id is what
# allows each chunk to be split into an input and a target shifted by one word.
# a final chunk that is too short is dropped

seqs = wrd_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

# uncomment the lines below to print the first chunk decoded back into words
# and see how long a training sequence is

# for i in seqs.take(1):
#     print((' '.join(num_wrd[i.numpy()])))

In [None]:
# splits each chunk of the dataset into two pieces, one used as the input and
# one used as the target, and forms a new dataset from those pairs

def chnky_mnky(chawnk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that follows
    input[i] in the original chunk.
    """
    return chawnk[:-1], chawnk[1:]

# apply the input/target split to every chunk
dataset = seqs.map(map_func=chnky_mnky)

In [None]:
# batch_size: number of sequences looked at per training step
# buff_size: size of the buffer the dataset is shuffled through
# the dataset is shuffled first and then grouped into full batches (a final
# batch that is too small is dropped)

batch_size = 3
buff_size = 840
dataset = (
    dataset
    .shuffle(buffer_size=buff_size)
    .batch(batch_size=batch_size, drop_remainder=True)
)
# number of full batches that fit into the corpus
ex_per_ep = len(sng_gud) // (seq_length + 1) // batch_size
dataset

In [None]:
# hyperparameters of the neural network
# these will be altered depending on the dataset being used

# found that song lyrics require a higher embedding dimension than
# novels, yet lower sequence length

vocab_size = len(vocab)
embed_dim, rnn_units = 128, 512

def build_model(vocab_size, embed_dim, rnn_units, batch_size):
    """Assemble the word-level network: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: size of the embedding input and of the softmax output layer.
        embed_dim: output dimension of the embedding layer.
        rnn_units: number of units in the LSTM layer.
        batch_size: fixed batch dimension of the input; the second input
            dimension is left as None.

    Returns:
        An uncompiled tf.keras.Sequential model whose final layer applies a
        softmax activation.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embed_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size, activation='softmax')
    return tf.keras.Sequential([embedding, recurrent, projection])

In [None]:
# instantiate the training model from the vocabulary size and the
# hyperparameters defined earlier

model = build_model(len(vocab), embed_dim, rnn_units, batch_size)

In [None]:
# loss function used for training

def winnawinna(nochkn, nodinna):
    """Sparse categorical cross-entropy between targets and predictions.

    `nochkn` is passed as the true labels and `nodinna` as the predictions.
    The predictions are treated as probabilities rather than logits
    (from_logits=False) and the loss is computed over the last axis.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=nochkn,
        y_pred=nodinna,
        from_logits=False,
        axis=-1,
    )

In [None]:
# attach the Adam optimizer and the loss function so the model is ready to be
# trained on the data

model.compile(loss=winnawinna, optimizer='adam')

In [None]:
# location where the training checkpoints are stored and read back from later

# directory the checkpoints are saved in
checkpoint_dir = ''
# name pattern of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# callback that saves only the model weights to that pattern
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [None]:
# eps is the number of epochs, i.e. how many times the model is trained on the
# whole dataset

eps = 5
history = model.fit(x=dataset, callbacks=[checkpoint_callback], epochs=eps)

In [None]:
# rebuild the model with a batch size of one because only a single sequence is
# generated at a time, then restore the weights from the most recent checkpoint
# found in the directory the checkpoints were saved in

model = build_model(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    rnn_units=rnn_units,
    batch_size=1,
)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(input_shape=tf.TensorShape([1, None]))

In [None]:
# settings for text generation
# bars: how many words to generate when the caller does not say otherwise
# shf: sampling temperature; values below 1.0 make the sampling stick closer to
#      the model's most likely words, values above 1.0 make it more random

shf = 1.0
bars = 55

def bouttaend_urwhole_carreer(model, start_string, bars=None):
    """Generate text word by word from a trained model.

    Args:
        model: model built with batch size 1 whose output is a softmax
            distribution over the vocabulary for every input position.
        start_string: seed text; it is split on single spaces and every
            resulting token must be a key of `wrd_num`.
        bars: number of words to generate. When None, the module-level
            `bars` setting is used.

    Returns:
        The seed text followed by the generated words joined with spaces.

    Raises:
        KeyError: if a seed token is not in the vocabulary.
    """
    # the parameter shadows the module-level `bars`, so look the setting up
    # explicitly; range(None) would otherwise raise TypeError
    num_generate = globals().get('bars', 55) if bars is None else bars

    start_str = start_string.split(' ')
    unknown = [s for s in start_str if s not in wrd_num]
    if unknown:
        raise KeyError('seed words not in vocabulary: {}'.format(unknown))
    input_eval = tf.expand_dims([wrd_num[s] for s in start_str], 0)

    text_generated = []
    temp = shf
    model.reset_states()
    for _ in range(num_generate):
        predictions = tf.squeeze(model(input_eval), 0)
        # the model emits probabilities, while tf.random.categorical expects
        # unnormalized log-probabilities: convert before applying temperature
        logits = tf.math.log(predictions) / temp
        predicted_id = tf.random.categorical(logits, num_samples=1)[-1, 0].numpy()
        # feed only the new word back in; the stateful LSTM keeps the context
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(num_wrd[predicted_id])

    # keep the seed and the first generated word from running together
    separator = '' if (not start_string or start_string[-1].isspace()) else ' '
    return start_string + separator + ' '.join(text_generated)

In [None]:
# generates and prints the text; a seed can be supplied through the
# start_string keyword argument, and the number of words to generate is passed
# explicitly because the function's own default for `bars` is None

lmno = bouttaend_urwhole_carreer(model, start_string=''' ''', bars=bars)
print(lmno+' \n \n ')