In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting pyarrow-hotfix (from datasets>=2.0.0-

In [2]:
!pip install datasets



In [140]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from evaluate import load

In [141]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import categorical_crossentropy

In [142]:
from sklearn.preprocessing import LabelBinarizer

In [143]:
import nltk

In [144]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [145]:
from nltk.tokenize import word_tokenize

# Data Preparation

In [316]:
val_y_path = "/content/drive/MyDrive/NLP Labs/data/shakespeare/valid.original.nltktok"
val_x_path = "/content/drive/MyDrive/NLP Labs/data/shakespeare/valid.modern.nltktok"
train_y_path = "/content/drive/MyDrive/NLP Labs/data/shakespeare/train.original.nltktok"
train_x_path = "/content/drive/MyDrive/NLP Labs/data/shakespeare/train.modern.nltktok"
test_y_path = "/content/drive/MyDrive/NLP Labs/data/shakespeare/test.original.nltktok"
test_x_path = "/content/drive/MyDrive/NLP Labs/data/shakespeare/test.modern.nltktok"

In [317]:
val_y = pd.read_table(val_y_path, header=None, names=["Sentences"])
val_x = pd.read_table(val_x_path, header=None, names=["Sentences"])
test_y = pd.read_table(test_y_path, header=None, names=["Sentences"])
test_x = pd.read_table(test_x_path, header=None, names=["Sentences"])
train_y = pd.read_table(train_y_path, header=None, names=["Sentences"])
train_x = pd.read_table(train_x_path, header=None, names=["Sentences"])

In [318]:
df_x = pd.concat([val_x, pd.concat([test_x, test_x])])

In [319]:
df_y = pd.concat([val_y, pd.concat([test_y, test_y])])

# Define Functions

In [150]:
def tokenize(data):
    data['Sentences_Tokens'] = data['Sentences'].apply(lambda x: word_tokenize(x.lower(), language='english'))

In [151]:
def append_start_end_token(data):
    data['Sentences_Tokens'] = data['Sentences_Tokens'].apply(lambda x: np.concatenate((['<START>'], x, ['<END>'])))

In [152]:
def map_to_index(data, w_to_i):
    data['Sentences_Index'] = data['Sentences_Tokens'].apply(lambda x: [w_to_i[word] for word in x])

In [153]:
def create_vocabulary(sentences):
    vocab = set()
    for sentence in sentences:
        vocab.update(sentence)
    vocab = list(vocab)
    w_to_i = {word: index for index, word in enumerate(vocab)}
    i_to_w = {index: word for index, word in enumerate(vocab)}

    return vocab, w_to_i, i_to_w

In [154]:
def create_model(padding_size, vocabulary_size_x, vocabulary_size_y, embedding_size):
    encoder_inputs = Input(shape=(padding_size,))
    encoder_embedding = Embedding(input_dim=vocabulary_size_x,
                                  output_dim=embedding_size)(encoder_inputs)
    encoder = LSTM(128, return_state=True)
    _, state_h, state_c = encoder(encoder_embedding)
    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(padding_size,))
    decoder_embedding = Embedding(input_dim=vocabulary_size_y, output_dim=embedding_size,
                                  trainable=False)(decoder_inputs)
    decoder = LSTM(128, return_state=True)
    decoder_outputs, _, _ = decoder(decoder_embedding,
                                    initial_state=encoder_states)

    decoder_outputs = Dense(vocabulary_size_y, activation='softmax')(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs],
                  decoder_outputs)

    model.compile(optimizer=Adam(learning_rate=0.01),
                  loss=categorical_crossentropy)

    return model

In [155]:
def convert(sentences, id_to_word):
    out_sentences = []

    for sent in sentences:
        out_sentences.append(' '.join([id_to_word[s] for s in sent]))

    return out_sentences

In [156]:
def create_train_data(sentences, translations):
    input_sentences, input_translations, next_words = [], [], []
    for sentence, rephrase in zip(sentences, translations):
        for i in range(1, len(rephrase)):
            input_sentences.append(sentence)
            input_translations.append(rephrase[:i])
            next_words.append(rephrase[i])
    return input_sentences, input_translations, next_words

In [505]:
def decode(model, input_sent, word_to_id, padding_size):
    generated_sent = [word_to_id['<START>']]

    for i in range(padding_size):
        output_sent = pad_sequences([generated_sent], padding_size)
        predictions = model.predict([np.expand_dims(input_sent, axis=0), output_sent])
        next_word = np.argmax(predictions)
        generated_sent.append(next_word)

    return generated_sent

In [158]:
def remove_non_alphabetic(text):
    clean_text = ''.join(char for char in text if char.isalpha() or char.isspace())
    return clean_text

# Create Vocabulary

In [320]:
df_x = df_x.applymap(remove_non_alphabetic)
df_y = df_y.applymap(remove_non_alphabetic)

In [321]:
tokenize(df_x)
tokenize(df_y)

In [322]:
append_start_end_token(df_x)
append_start_end_token(df_y)

In [323]:
df_x["Sentences_Tokens"]

0       [<START>, now, you, lie, there, on, the, path,...
1       [<START>, she, said, if, she, were, interested...
2       [<START>, besides, she, treats, me, more, resp...
3       [<START>, whats, the, obvious, conclusion, fro...
4       [<START>, just, think, i, could, be, count, ma...
                              ...                        
1457               [<START>, thats, good, my, boy, <END>]
1458        [<START>, but, where, have, you, been, <END>]
1459    [<START>, ill, tell, you, before, you, have, t...
1460    [<START>, you, have, the, sacred, power, to, c...
1461    [<START>, i, carry, no, hatred, holy, man, bec...
Name: Sentences_Tokens, Length: 4142, dtype: object

In [324]:
vocab_x, w_to_i_x, i_to_w_x = create_vocabulary(df_x['Sentences_Tokens'].values.tolist())
vocab_y, w_to_i_y, i_to_w_y = create_vocabulary(df_y['Sentences_Tokens'].values.tolist())

In [325]:
# w_to_i_x

In [326]:
map_to_index(df_x, w_to_i_x)
map_to_index(df_y, w_to_i_y)
indices_x = df_x['Sentences_Index'].values.tolist()
indices_y = df_y['Sentences_Index'].values.tolist()

In [327]:
# indices_x

In [328]:
from sklearn.model_selection import train_test_split

In [329]:
train_x, test_x, train_y, test_y = train_test_split(indices_x, indices_y,
                                                            test_size=0.1, random_state=0)

In [330]:
mean_word_length = sum(len(sentence) for sentence in df_x['Sentences_Tokens']) / len(df_x['Sentences_Tokens'])

In [331]:
mean_word_length

10.420328343795267

In [332]:
padding_size = 10

In [333]:
input_x, input_y, next_words = create_train_data(train_x, train_y)

In [334]:
input_x_padded = pad_sequences(input_x, padding_size)
input_y_padded = pad_sequences(input_y, padding_size)

In [335]:
input_y_padded[:10]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0, 2652],
       [   0,    0,    0,    0,    0,    0,    0,    0, 2652, 1879],
       [   0,    0,    0,    0,    0,    0,    0, 2652, 1879, 1394],
       [   0,    0,    0,    0,    0,    0, 2652, 1879, 1394, 3064],
       [   0,    0,    0,    0,    0, 2652, 1879, 1394, 3064,  938],
       [   0,    0,    0,    0, 2652, 1879, 1394, 3064,  938, 2424],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0, 2652],
       [   0,    0,    0,    0,    0,    0,    0,    0, 2652, 2099],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0, 2652],
       [   0,    0,    0,    0,    0,    0,    0,    0, 2652, 2184]],
      dtype=int32)

In [336]:
next_words[:5]

[1879, 1394, 3064, 938, 2424]

# Create Model

In [337]:
embedding_size = 512

In [338]:
model = create_model(padding_size, len(vocab_x), len(vocab_y), embedding_size)

In [339]:
label_binarizer = LabelBinarizer()
label_binarizer.fit_transform(list(w_to_i_y.values()))
next_words = label_binarizer.transform(next_words)

In [340]:
input_y_padded[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0, 2652],
       [   0,    0,    0,    0,    0,    0,    0,    0, 2652, 1879],
       [   0,    0,    0,    0,    0,    0,    0, 2652, 1879, 1394],
       [   0,    0,    0,    0,    0,    0, 2652, 1879, 1394, 3064],
       [   0,    0,    0,    0,    0, 2652, 1879, 1394, 3064,  938]],
      dtype=int32)

- Decided on batch_size of 64 as I read that its used as a general value, but after reading: https://wandb.ai/ayush-thakur/dl-question-bank/reports/What-s-the-Optimal-Batch-Size-to-Train-a-Neural-Network---VmlldzoyMDkyNDU, I decided to try the value of 16. Event though 16 will take double the time of 64, this paper suggests the best results and I want to test that.
- Started with 10 epochs, it was clear that it was a very far from the ideal number of epochs, so I increased to 50
- 50 showed improvements, but the predictions were still incoherent.
- Decided to go with 500 epochs to test what happens with even bigger values for epoch size. After failing mid way through the execution which was supposed to last 3 and a half hours, I decided to stay try with 100 epochs.
- After trying with 100 epochs, the performance was proven better with 50 epochs.

In [388]:
# model.fit([input_x_padded, input_y_padded],
#               next_words,
#               epochs=100, batch_size=16)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7e2398d62590>

In [389]:
# model.save_weights('/content/drive/MyDrive/NLP Labs/models/EncoderDecoder_13.h5')

In [458]:
model.load_weights('/content/drive/MyDrive/NLP Labs/models/EncoderDecoder_12.h5')

# Decode

In [510]:
output_y = []

In [511]:
padded_sentences_x = pad_sequences(test_x, padding_size)

Reduced the number of predicted sentences to 20 for debugging purposes

In [512]:
for sentence in padded_sentences_x[:20]:
        pred = decode(model, sentence, w_to_i_y, padding_size)
        output_y.append(pred)



In [513]:
input_x = convert(test_x, i_to_w_x)
output_y_gt = convert(test_y, i_to_w_y)
output_y_pred = convert(output_y, i_to_w_y)
# output_y_pred = [[i_to_w_y[w] for w in sent] for sent in output_y]

In [516]:
# for in_x, gt_y, pred_y in zip(input_x, output_y_gt, output_y_pred):
#         print(f'Input sentence: {in_x}')
#         print(f'GT translation: {gt_y}')
#         print(f'Pred translation: {pred_y}')
#         print()

In [514]:
from evaluate import load

In [517]:
metric = load('bleu')
results = metric.compute(predictions=output_y_pred, references=output_y_gt[:20])
score = results['bleu']
print(f'BLEU score: {score}')

BLEU score: 0.07271793025092739


In [518]:
from nltk.translate.meteor_score import single_meteor_score

In [526]:
from nltk.translate import meteor
from nltk import word_tokenize

In [528]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [529]:
m_score = 0.0
for hyp, ref in zip(output_y_gt[:20], output_y_pred):
        m_score += round(meteor([word_tokenize(hyp)], word_tokenize(ref)), 4)

In [531]:
print(m_score)

5.977399999999999
