# In [None]:


"""
In this mission, you will use a text from Shakespeare (Shakespeare's first sonnet) to train a recurrent neural network (RNN) for language modelling.
You can use any editor, such as Jupyter Notebook or Spyder, to complete this mission on your computer.
Write code that uses the Keras library to build, train, and test an RNN with the architecture described.
Recommendations:
 Preprocessing: Tokenization, lowercasing, and removing punctuation might help, though maintaining the structure could be beneficial.
 Data Augmentation: Consider supplementing with other poetic or classical text sources.
 Model Choice: Use LSTMs or GRUs instead of simple RNNs to handle long-term dependencies.
 Regularization: Apply dropout to mitigate overfitting.
 Evaluation: Use perplexity or BLEU scores to assess generated text quality.


"""

# Read the text:
# to load the text file of lines from Shakespeare's plays as separate lines of
# text, use `with open('data.txt', 'r') as file:` and split what is read on
# newlines.

import numpy as np


def _prefix_subsequences(sequences):
    """Expand each token sequence into all of its prefixes of length >= 2.

    A sequence ``[a, b, c]`` yields ``[a, b]`` and ``[a, b, c]``. Sequences
    with fewer than two tokens contribute nothing.
    """
    return [
        sequence[:end]
        for sequence in sequences
        for end in range(2, len(sequence) + 1)
    ]


def _build_model(vocabulary_size, input_length):
    """Return the (uncompiled) Embedding -> LSTM -> Dropout -> Dense model.

    Layers:
      * Embedding: input dimension ``vocabulary_size``, output dimension 100,
        input shape ``(input_length,)``.
      * LSTM with 100 units.
      * Dropout with a rate of 10%.
      * Dense softmax layer with ``vocabulary_size`` units, i.e. one
        probability per vocabulary index for being the next word.
    """
    from keras.layers import LSTM, Dense, Dropout, Embedding
    from keras.models import Sequential

    model = Sequential()
    model.add(Embedding(input_dim=vocabulary_size,
                        output_dim=100,
                        input_shape=(input_length,)))
    model.add(LSTM(units=100))
    model.add(Dropout(rate=0.1))
    model.add(Dense(units=vocabulary_size, activation='softmax'))
    return model


def main() -> None:
    """Train an LSTM next-word model on ``data.txt`` and print one prediction.

    Steps: read and lowercase the text, tokenize it, expand every line into
    its prefixes, left-pad the prefixes to a common length, use the last
    column as the one-hot target and the rest as input, then build, compile,
    fit (500 epochs), evaluate on the same data, and print the predicted next
    word for the sample at index 3.

    Raises:
        ValueError: if no line of ``data.txt`` has at least two words, since
            no (input, target) pair can be formed.
    """
    import pandas as pd
    from keras.preprocessing.sequence import pad_sequences
    from keras.src.legacy.preprocessing.text import text_to_word_sequence, Tokenizer
    from keras.utils import to_categorical
    from livelossplot import PlotLossesKeras

    # NOTE(review): no explicit encoding is given, so the platform default is
    # used -- confirm the encoding of data.txt.
    with open('data.txt', 'r') as file:
        text = file.read()
    lines = text.lower().split('\n')
    print("length of lines", len(lines))
    print("Text", text[0:200])
    print("Lines", lines)

    # Define words, vocabulary size and sequences of words as lines.
    words = text_to_word_sequence(text)
    print("words", words)
    print("these many wordds", len(words))

    # Count the occurrences of each unique word and show them as a DataFrame.
    word_counts = pd.Series(words).value_counts()
    word_df = pd.DataFrame({'Word': word_counts.index, 'Count': word_counts.values})
    print("words data frame" ,word_df)

    # Assign every word an integer index.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(words)
    tokens = tokenizer.word_index
    print("tokens", tokens)

    # The +1 leaves room for index 0: pad_sequences below pads with 0, so the
    # one-hot targets need a column for every value from 0 up to the largest
    # word index.
    vocabulary_size = len(tokens) + 1
    print("vocabulary size", vocabulary_size)
    sequences = tokenizer.texts_to_sequences(lines)
    print ("sequences", sequences)

    # Find subsequences and visualize the new set-up.
    subsequences = _prefix_subsequences(sequences)
    print("subsequences", subsequences)
    if not subsequences:
        # Fail early with a clear message instead of an obscure shape error
        # further down.
        raise ValueError(
            "data.txt must contain at least one line with two or more words"
        )

    # Display the words stored at specific indices of the word index; indices
    # that are not in the vocabulary simply print nothing.
    wanted_indices = {26, 189, 581}
    for word, index in tokens.items():
        if index in wanted_indices:
            print(index, word)

    # Find the index of specific words.
    wanted_words = {'trees', 'thou'}
    for word, index in tokens.items():
        if word in wanted_words:
            print(word, index)

    # Training needs equal-length sequences, so left-pad every subsequence
    # with zeros up to the length of the longest line.
    sequence_length = max(len(sequence) for sequence in sequences)
    padded = pad_sequences(subsequences, maxlen=sequence_length, padding='pre')
    print("sequences", padded)
    print("sequences shape", padded.shape)

    # Split into input and output. Because padding is on the left, the last
    # word of every subsequence sits in the last column: that column is the
    # label and everything before it is the input.
    x, y = padded[:, :-1], padded[:, -1]
    print("x.shape,y.shape...", x.shape,y.shape)
    print(x[0:11])
    print(y)
    print(np.unique(y))
    print(len(np.unique(y)))

    # One-hot encode the target labels.
    y = to_categorical(y, num_classes=vocabulary_size)
    print("y shape...", y.shape)
    print("y label...",y)

    model = _build_model(vocabulary_size, sequence_length - 1)

    # summary() prints the table itself and returns None, so it must not be
    # passed to print().
    print("model summary")
    model.summary()

    # Compile with the adam optimizer, categorical cross-entropy loss and
    # the accuracy metric.
    model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

    model.fit(x, y,
              callbacks=[PlotLossesKeras()],
              epochs=500)

    # Evaluated on the training data itself; there is no held-out set.
    scores = model.evaluate(x, y, verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

    # Map the most probable output index for the sample at index 3 back to
    # its word.
    predicted_index = np.argmax(model.predict(x[3:4]))
    for word, index in tokens.items():
        if index == predicted_index:
            print("The next word is: ",word)

# Run the training pipeline only when executed as a script (or notebook cell),
# not when this module is imported.
if __name__ == "__main__":
    main()