In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.utils import to_categorical
from random import randint
import re

In [2]:
import wandb

wandb.init(project="hp-cl", entity="nak-develops")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnataliekraft5426[0m ([33mnak-develops[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Gather and preprocess text data 

In [3]:
# mi_acl = pd.read_csv("../data/harrypotter_acl.csv")
# mi_acl = hp_acl[~pd.isna(hp_acl['sentence_clean'])]
# mi_acl.shape

mi_cl = pd.read_csv("../data/mind_cl.csv")
mi_cl = mi_cl[~pd.isna(mi_cl['sentence_clean'])]
mi_cl.shape

(22770, 7)

## Tokenize Input Data 

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize

# anti curriculum learning
hp_text_words = " ".join(mi_cl['sentence_clean'])

hp_text_words = (word_tokenize(hp_text_words))
n_words = len(hp_text_words)
unique_words = len(set(hp_text_words))

print('Total Words: %d' % n_words)
print('Unique Words: %d' % unique_words)

tokenizer = Tokenizer(num_words=n_words)
tokenizer.fit_on_texts(hp_text_words)

Total Words: 179988
Unique Words: 20901


In [5]:
vocab_size = len(tokenizer.word_index) + 1
word_2_index = tokenizer.word_index
print("Vocab Size: " + str(vocab_size))

Vocab Size: 20902


## Modify the shape of the data 

In [6]:
input_sequence = []
output_words = []
input_seq_length = 10

for i in range(0, n_words - input_seq_length , 1):
    in_seq = hp_text_words[i:i + input_seq_length]
    out_seq = hp_text_words[i + input_seq_length]
    input_sequence.append([word_2_index[word] for word in in_seq])
    output_words.append(word_2_index[out_seq])

print(input_sequence[0])

[1424, 1735, 7153, 3037, 7154, 3854, 2005, 3037, 3038, 3037]


In [7]:
X = np.reshape(input_sequence, (len(input_sequence), input_seq_length, 1))
X = X / float(vocab_size)

y = to_categorical(output_words)

In [8]:
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (179978, 10, 1)
y shape: (179978, 20902)


## Training the Model

In [9]:
model = Sequential()
model.add(LSTM(800, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(LSTM(800, return_sequences=True))
model.add(LSTM(800))
model.add(Dense(y.shape[1], activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam')

model.fit(X, y, batch_size=80, epochs=4, verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 10, 800)           2566400   
                                                                 
 lstm_1 (LSTM)               (None, 10, 800)           5123200   
                                                                 
 lstm_2 (LSTM)               (None, 800)               5123200   
                                                                 
 dense (Dense)               (None, 20902)             16742502  
                                                                 
Total params: 29,555,302
Trainable params: 29,555,302
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4


2022-12-13 00:51:38.705662: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


  76/2250 [>.............................] - ETA: 22:11 - loss: 9.3774

wandb: Network error (ConnectTimeout), entering retry loop.


Epoch 2/4
Epoch 3/4
Epoch 4/4

wandb: Network error (ConnectionError), entering retry loop.




<keras.callbacks.History at 0x659d5dcd0>

## Experiment 

In [25]:
predicted_w = []
def prediction(sent):
    random_seq = [word_2_index[i] for i in sent]
    print(random_seq)
    index_2_word = dict(map(reversed, word_2_index.items()))

    word_sequence = [index_2_word[value] for value in random_seq]

    print(' '.join(word_sequence))

    [predicted_w.append(t) for t in random_seq]
    for i in range(10):
        int_sample = np.reshape(random_seq, (1, len(random_seq), 1))
        int_sample = int_sample / float(vocab_size)
        predicted_word_index = model.predict(int_sample, verbose=0)
        aa = pd.DataFrame(predicted_word_index).transpose().reset_index()
        aa = aa.loc[~aa.index.isin(predicted_w)]
        predicted_word_id = aa.sort_values(by=[0], ascending=False).reset_index()['index'][0]

        predicted_w.append(predicted_word_id)
        seq_in = [index_2_word[index] for index in random_seq]
        word_sequence.append(index_2_word[ predicted_word_id])

        random_seq.append(predicted_word_id)
        random_seq = random_seq[1:len(random_seq)]


    final_output = ""
    for word in word_sequence:
        final_output = final_output + " " + word

    print(final_output)

In [34]:
trans1 = ['team', 'won', 'against', 'messi', 'he', 'went', 'all', 'out', 'on', 'team']
trans2 = ['marc', 'blanchette', 'holds', 'his', 'ellsworth', 'city', 'council', 'seat', 'after']
trans3 = ['republicans', 'argue', 'impeachment', 'case', 'falls', 'short', 'of', 'proving', 'trump', 'misconduct']
trans4 = ['miami', 'dade', 'mayor', 'carlos', 'gimenez', 'vetoes', 'resolution', 'that', 'would', 'have']
trans5 = ['chemical', 'plant', 'sued', 'for', 'poor', 'care', 'of', 'toxic', 'materials', 'throughout']

In [27]:
prediction(trans1)

[62, 2018, 1412, 10350, 1061, 929, 1280, 544, 61, 62]
team won against messi he went all out on team
 team won against messi he went all out on team new state trump to says school week man in best


In [28]:
prediction(trans2)

[10789, 15695, 1156, 846, 15696, 32, 525, 936, 156]
marc blanchette holds his ellsworth city council seat after
 marc blanchette holds his ellsworth city council seat after vs football police impeachment the day year home first county


In [29]:
prediction(trans3)

[599, 6223, 17, 217, 408, 682, 33, 6160, 2, 2825]
republicans argue impeachment case falls short of proving trump misconduct
 republicans argue impeachment case falls short of proving trump misconduct high get game veterans top win one season back fire


In [30]:
prediction(trans4)


[121, 1719, 265, 2732, 18557, 18558, 2736, 561, 221, 981]
miami dade mayor carlos gimenez vetoes resolution that would have
 miami dade mayor carlos gimenez vetoes resolution that would have report crash woman time old weather st make could shooting


In [35]:
prediction(trans5)

[4414, 1541, 3660, 29, 2087, 426, 33, 3088, 10892, 4203]
chemical plant sued for poor care of toxic materials throughout
 chemical plant sued for poor care of toxic materials throughout snow cold watch nfl killed us say world family found
