In [308]:
import numpy as np
import pandas as pd
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.utils import to_categorical
from random import randint
import re

In [309]:
import wandb

wandb.init(project="hp-cl", entity="nak-develops")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnataliekraft5426[0m ([33mnak-develops[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Gather and preprocess text data 

In [310]:
hp_acl = pd.read_csv("../data/harrypotter_acl.csv")
hp_acl = hp_acl[~pd.isna(hp_acl['sentence_clean'])]
hp_acl.shape

# hp_cl = pd.read_csv("../data/harrypotter_cl.csv")
# hp_cl = hp_cl[~pd.isna(hp_cl['sentence_clean'])]
# hp_cl.shape

(61151, 8)

## Tokenize Input Data 

In [311]:
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize

# anti curriculum learning
hp_text_words = " ".join(hp_acl['sentence_clean'])

hp_text_words = (word_tokenize(hp_text_words))
n_words = len(hp_text_words)
unique_words = len(set(hp_text_words))

print('Total Words: %d' % n_words)
print('Unique Words: %d' % unique_words)

tokenizer = Tokenizer(num_words=n_words)
tokenizer.fit_on_texts(hp_text_words)

Total Words: 622359
Unique Words: 20888


In [312]:
vocab_size = len(tokenizer.word_index) + 1
word_2_index = tokenizer.word_index
print("Vocab Size: " + str(vocab_size))

Vocab Size: 20889


## Modify the shape of the data 

In [313]:
input_sequence = []
output_words = []
input_seq_length = 10

for i in range(0, n_words - input_seq_length , 1):
    in_seq = hp_text_words[i:i + input_seq_length]
    out_seq = hp_text_words[i + input_seq_length]
    input_sequence.append([word_2_index[word] for word in in_seq])
    output_words.append(word_2_index[out_seq])

print(input_sequence[0])

[90, 14487, 192, 276, 2765, 1051, 1837, 3510, 64, 136]


In [314]:
X = np.reshape(input_sequence, (len(input_sequence), input_seq_length, 1))
X = X / float(vocab_size)

y = to_categorical(output_words)

In [315]:
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (622349, 10, 1)
y shape: (622349, 20889)


## Training the Model

In [316]:
model = Sequential()
model.add(LSTM(800, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(LSTM(800, return_sequences=True))
model.add(LSTM(800))
model.add(Dense(y.shape[1], activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam')

wandb.watch(model, log_freq=100)

model.fit(X, y, batch_size=80, epochs=4, verbose=1)
wandb.log

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_39 (LSTM)              (None, 10, 800)           2566400   
                                                                 
 lstm_40 (LSTM)              (None, 10, 800)           5123200   
                                                                 
 lstm_41 (LSTM)              (None, 800)               5123200   
                                                                 
 dense_13 (Dense)            (None, 20889)             16732089  
                                                                 
Total params: 29,544,889
Trainable params: 29,544,889
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4
1497/7780 [====>.........................] - ETA: 40:23 - loss: 7.9967

wandb: Network error (ConnectTimeout), entering retry loop.


Epoch 2/4
1360/7780 [====>.........................] - ETA: 59:36 - loss: 7.8211

wandb: Network error (ConnectionError), entering retry loop.


Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x3193d2bb0>

## Experiment 

In [320]:
predicted_w = []
def prediction(sent):
    random_seq = [word_2_index[i] for i in sent]
    print(random_seq)
    index_2_word = dict(map(reversed, word_2_index.items()))

    word_sequence = [index_2_word[value] for value in random_seq]

    print(' '.join(word_sequence))

    [predicted_w.append(t) for t in random_seq]
    for i in range(10):
        int_sample = np.reshape(random_seq, (1, len(random_seq), 1))
        int_sample = int_sample / float(vocab_size)

        predicted_word_index = model.predict(int_sample, verbose=0)
        aa = pd.DataFrame(predicted_word_index).transpose().reset_index()
        aa = aa.loc[~aa.index.isin(predicted_w)]
        predicted_word_id = aa.sort_values(by=[0], ascending=False).reset_index()['index'][0]

        predicted_w.append(predicted_word_id)
        seq_in = [index_2_word[index] for index in random_seq]
        word_sequence.append(index_2_word[ predicted_word_id])

        random_seq.append(predicted_word_id)
        random_seq = random_seq[1:len(random_seq)]


    final_output = ""
    for word in word_sequence:
        final_output = final_output + " " + word

    print(final_output)

In [321]:
trans1 = ['harry', 'went', 'to', 'hermione', 'with', 'only', 'one', 'thing', 'to', 'say']
trans2 = ['harry', 'loved', 'hedwig', 'and', 'wanted', 'to', 'protect', 'him', 'but', 'voldemort']
trans3 = ['he', 'was', 'an', 'excellent', 'quidditch', 'player', 'himself', 'you', 'are', 'joking']
trans4 = ['the', 'dursleys', 'knew', 'that', 'potters', 'had', 'small', 'son', 'too', 'but']
trans5 = ['he', 'was', 'expelled', 'and', 'that', 'he', 'was', 'now', 'facing', 'three']

In [322]:
prediction(trans1)

[1, 131, 10, 4, 134, 880, 11, 143, 10, 95]
harry went to hermione with only one thing to say
 harry went to hermione with only one thing to say said ron the dumbledore back and see could right would


In [323]:
prediction(trans2)

[1, 2191, 557, 9, 165, 10, 2037, 121, 149, 52]
harry loved hedwig and wanted to protect him but voldemort
 harry loved hedwig and wanted to protect him but voldemort like it got know looked weasley around hagrid professor time


In [324]:
prediction(trans3)

[17, 36, 447, 984, 241, 2737, 1155, 42, 570, 3214]
he was an excellent quidditch player himself you are joking
 he was an excellent quidditch player himself you are joking well snape mr still though in face knew his eyes


In [325]:
prediction(trans4)


[5, 516, 76, 75, 2427, 83, 199, 610, 977, 149]
the dursleys knew that potters had small son too but
 the dursleys knew that potters had small son too but get think looking of little head room wand people us


In [326]:
prediction(trans5)

[17, 36, 1371, 9, 75, 17, 36, 399, 1120, 111]
he was expelled and that he was now facing three
 he was expelled and that he was now facing three going toward something sirius behind look heard come really seemed


wandb: Network error (ConnectionError), entering retry loop.
