# Libraries

In [None]:
!pip install datasets
from datasets import load_dataset

import pandas as pd
import numpy as np
import statistics

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from scipy.sparse import lil_matrix

from keras.layers import Dense, LSTM, Embedding, TimeDistributed, Bidirectional, Concatenate, Attention
from keras import Input, Model
from keras.callbacks import EarlyStopping
from keras.optimizers import RMSprop, Adam

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

# Load Data

In [None]:
# Fetch the SQuAD dataset through the Hugging Face `datasets` loader.
# (An earlier revision read local parquet copies of the train/validation
# files with pd.read_parquet; that path is no longer used.)
datasets = load_dataset(path="squad")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# Bind the two splits to short names. The id/title/context columns are left
# in place (a previous pandas-based version dropped them at this point).
train_data, test_data = (datasets[split] for split in ('train', 'validation'))

# Input/Output

In [None]:
def _wrap_question(question):
    """Wrap a question in <bos>/<eos> markers, dropping a trailing '?' if present.

    The previous `question[:-1]` removed the last character unconditionally,
    which cut a real character off any question that does not end in '?'.
    """
    text = question.strip()
    if text.endswith('?'):
        text = text[:-1].rstrip()
    return '<bos> ' + text + ' <eos>'


# Model inputs: the first 8000 training questions, wrapped in start/end markers.
x_train = np.array([_wrap_question(question) for question in train_data['question'][:8000]])
# An example can carry several reference answers; only the first one is used as the target.
y_train = np.array([answer['text'][0] for answer in train_data['answers'][:8000]])

# Same preparation for the first 2000 validation examples.
x_test = np.array([_wrap_question(question) for question in test_data['question'][:2000]])
y_test = np.array([answer['text'][0] for answer in test_data['answers'][:2000]])

# Using the maximum length gave very bad results, so sequence lengths are capped
# at the 90th percentile of the whitespace-token counts instead:
# quantiles(n=20) returns 19 cut points in 5% steps, and index 17 is the 90% one.
length_x = int(statistics.quantiles([len(sent.split()) for sent in x_train], n=20)[17])
length_y = int(statistics.quantiles([len(sent.split()) for sent in y_train], n=20)[17])

# Max-length alternative, kept for reference:
# length_x = max(len(sent.split()) for sent in x_train)
# length_y = max(len(sent.split()) for sent in y_train)

# Tokenization/Padding

In [None]:
# One tokenizer is fitted on the training questions and answers together, so
# both sides of the model share a single vocabulary; '<oov>' is the token
# configured for out-of-vocabulary words.
tok = Tokenizer(oov_token='<oov>')
tok.fit_on_texts(np.concatenate([x_train, y_train]))

# Convert every text collection to lists of integer token ids.
train_seq_x, test_seq_x, train_seq_y, test_seq_y = (
    tok.texts_to_sequences(texts) for texts in (x_train, x_test, y_train, y_test)
)
print(f'size of vocab is {len(tok.word_index)}')

size of vocab is 11462


In [None]:
def _pad(sequences, maxlen):
    """Post-pad / post-truncate lists of token ids to `maxlen`, as a float32 array."""
    return pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post', dtype="float32")


# Encoder inputs (questions) and plain padded answers.
train_padded_x = _pad(train_seq_x, length_x)
test_padded_x = _pad(test_seq_x, length_x)

train_padded_y = _pad(train_seq_y, length_y)
test_padded_y = _pad(test_seq_y, length_y)

# Resolve the start/end marker ids through the fitted tokenizer instead of
# hard-coding 2 and 3: the ids the tokenizer assigns depend on the data it was
# fitted on, so fixed numbers silently break when the data changes.
bos_id, eos_id = tok.texts_to_sequences(['<bos> <eos>'])[0]

# Teacher-forcing pair, both of length length_y + 1:
#   decoder input  = <bos> + answer tokens
#   decoder target = answer tokens + <eos>
# The markers are added BEFORE padding. The previous version appended the end
# id to already-padded rows, which placed it after the zero padding (at the
# last position) rather than directly after the final answer token.
train_in = _pad([[bos_id] + seq[:length_y] for seq in train_seq_y], length_y + 1)
test_in = _pad([[bos_id] + seq[:length_y] for seq in test_seq_y], length_y + 1)

train_out = _pad([seq[:length_y] + [eos_id] for seq in train_seq_y], length_y + 1)
test_out = _pad([seq[:length_y] + [eos_id] for seq in test_seq_y], length_y + 1)

# Index

In [None]:
# word -> id mapping produced by the tokenizer.
dictionary = tok.word_index

# Number of output classes: one slot per known word plus one extra for id 0,
# which the padded arrays use as filler.
vocab = len(dictionary) + 1

# Reverse mapping (id -> word) for turning predictions back into text.
idx2word = {index: word for word, index in dictionary.items()}

# One-hot

In [None]:
def _token_count_matrix(sequences, num_classes):
    """Return a sparse (len(sequences), num_classes) matrix of per-sequence token counts.

    Each sequence is one-hot encoded step by step and the steps are summed, so
    a row records how often each token id occurs in the sequence but not the
    position at which it occurred.
    """
    counts = lil_matrix((len(sequences), num_classes))
    for position, token_ids in enumerate(sequences):
        counts[position] = to_categorical(token_ids, num_classes=num_classes).sum(axis=0)
    return counts


sparse_train = _token_count_matrix(train_out, vocab)
sparse_test = _token_count_matrix(test_out, vocab)

In [None]:
def _one_hot_targets(token_ids, num_classes):
    """One-hot encode a (n_samples, n_steps) array of token ids, keeping step order.

    Returns a float32 array of shape (n_samples, n_steps, num_classes) where
    entry [sample, step, token_ids[sample, step]] is 1 and everything else 0.
    """
    ids = token_ids.astype("int64")
    targets = np.zeros(ids.shape + (num_classes,), dtype="float32")
    for sample, sequence in enumerate(ids):
        for step, token in enumerate(sequence):
            targets[sample, step, token] = 1
    return targets


# Targets are built straight from the padded target sequences. Two defects of
# the previous version are fixed here:
#   * the test targets were read from `sparse_train.rows` instead of the test data;
#   * rebuilding from `lil_matrix.rows` yields the sorted, de-duplicated column
#     indices of each row, so the model was trained on token ids in ascending
#     order (repeats dropped) rather than in the order they appear in the answer.
one_hot_train = _one_hot_targets(train_out, vocab)
one_hot_test = _one_hot_targets(test_out, vocab)

# Model

In [None]:
# Encoder-decoder with attention: a bidirectional LSTM reads the question, and
# an LSTM decoder (initialised with the encoder's final states) produces the
# answer one token per step.
embd_size = 32
unit_size = 128

# ----- Encoder -----
encoder_inputs = Input(shape=(length_x, ), dtype='float32',)
encoder_embedding_layer = Embedding(input_dim=vocab, output_dim=embd_size, input_length=length_x, mask_zero=True)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
# return_sequences=True is required for attention: the Attention layer expects
# a value tensor of shape (batch, Tv, dim), i.e. one encoder vector per
# question timestep. Without it the encoder only emitted a single
# (batch, 2*unit_size) vector and there was nothing to attend over.
encoder_LSTM = Bidirectional(LSTM(unit_size, return_sequences=True, return_state=True, dropout=0.5))
encoder_outputs, state_hf, state_cf, state_hb, state_cb = encoder_LSTM(encoder_embedding)

# Join the forward and backward final states so they match the decoder's
# state size of unit_size*2.
final_enc_h = Concatenate()([state_hf, state_hb])
final_enc_c = Concatenate()([state_cf, state_cb])
encoder_states = [final_enc_h, final_enc_c]

# ----- Decoder -----
decoder_inputs = Input(shape=(length_y+1, ), dtype='float32',)
decoder_embedding_layer = Embedding(input_dim=vocab, output_dim=embd_size, input_length=length_y+1, mask_zero=True)
decoder_embedding = decoder_embedding_layer(decoder_inputs)

decoder_LSTM = LSTM(unit_size*2, return_state=True, return_sequences=True, dropout=0.35)
decoder_outputs, _, _ = decoder_LSTM(decoder_embedding, initial_state=encoder_states)

# ----- Attention: decoder steps are the queries, encoder steps the values -----
attention_layer = Attention()
attention_result = attention_layer([decoder_outputs, encoder_outputs])
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_result])

# ----- Per-step softmax over the vocabulary -----
logit_layer = Dense(units=vocab, activation='softmax')
# outputs = logit_layer(decoder_concat_input)
outputs = TimeDistributed(logit_layer)(decoder_concat_input)
model = Model([encoder_inputs, decoder_inputs], outputs)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 17)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 17, 32)               366816    ['input_1[0][0]']             
                                                                                                  
 input_2 (InputLayer)        [(None, 6)]                  0         []                            
                                                                                                  
 bidirectional (Bidirection  [(None, 256),                164864    ['embedding[0][0]']           
 al)                          (None, 128),                                                    

In [None]:
# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# optimizer = Adam(learning_rate = 0.002, clipvalue=1)
optimizer = RMSprop(learning_rate = 0.001, clipvalue=1)
# `metrics` is documented as a list of metrics; a bare string is not accepted
# by every Keras version, so pass it as a one-element list.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Teacher forcing: the decoder receives the start-prefixed answer (train_in)
# and is scored against the one-hot targets. 200 is only an upper bound on the
# epoch count; early stopping normally ends training sooner.
history = model.fit([train_padded_x, train_in], one_hot_train,
                    batch_size=32,
                    validation_data=([test_padded_x, test_in], one_hot_test),
                    epochs=200,
                    callbacks=[early_stopping])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200


# Testing Model

In [None]:
# Teacher-forced predictions on the training set.
predictions = model.predict([train_padded_x, train_in], batch_size=64)

# The argmax over the last (vocabulary) axis is the predicted token id itself.
predicted_indices = np.argmax(predictions, axis=-1)

# Decode ids to words. The class index equals the token id, so no offset is
# applied: the previous `idx2word[idx+1]` printed the word one id later for
# every prediction and raised KeyError for the highest class index.
# Id 0 has no entry in idx2word (it is the padding value) and is shown as '<pad>'.
predicted_train = [
    [idx2word.get(int(idx), '<pad>') for idx in sample_indices]
    for sample_indices in predicted_indices
]



In [None]:
# Spot-check five distinct training examples. For each one, print the question
# (with the 6-character start/end markers sliced off), the reference answer,
# and the decoded prediction, one per line.
random_indexes = np.random.choice(len(train_padded_x), size=5, replace=False)
for i in random_indexes:
    print(x_train[i][6:-6], y_train[i], predicted_train[i], sep='\n')

All of his pieces include what instrument
piano
['<oov>', 'the', 'the', 'the', 'the', 'the']
What is Mr. White dying of
thallium poisoning.
['<oov>', 'the', 'a', 'solar', 'solar', 'solar']
What is the primary seminary of the Congregation of the Holy Cross
Moreau Seminary
['<oov>', 'the', 'a', 'solar', 'solar', 'solar']
What related fields do linguistic anthropologists draw on
sociolinguistics, pragmatics, cognitive linguistics, semiotics, discourse analysis, and narrative analysis
['<oov>', 'the', 'a', 'music', 'island', 'island']
What can happen when antibiotics are used with other drugs
Additional side-effects
['<oov>', 'the', 'a', 'music', 'video', 'video']


In [None]:
# Teacher-forced predictions on the held-out examples.
predictions = model.predict([test_padded_x, test_in], batch_size=64)

# The argmax over the last (vocabulary) axis is the predicted token id itself.
predicted_indices = np.argmax(predictions, axis=-1)

# Decode ids to words. The class index equals the token id, so no offset is
# applied: the previous `idx2word[idx+1]` printed the word one id later for
# every prediction and raised KeyError for the highest class index.
# Id 0 has no entry in idx2word (it is the padding value) and is shown as '<pad>'.
predicted_words = [
    [idx2word.get(int(idx), '<pad>') for idx in sample_indices]
    for sample_indices in predicted_indices
]



In [None]:
# Spot-check five distinct held-out examples. For each one, print the question
# (with the 6-character start/end markers sliced off), the reference answer,
# and the decoded prediction, one per line.
random_indexes = np.random.choice(len(test_padded_x), size=5, replace=False)
for i in random_indexes:
    print(x_test[i][6:-6], y_test[i], predicted_words[i], sep='\n')

What was Kenneth Swezey's job
journalist
['<oov>', 'the', 'the', 'the', 'the', 'the']
What one word did the NFL commissioner use to describe what Super Bowl 50 was intended to be
spectacular
['<oov>', 'the', 'the', 'the', 'the', 'the']
What city, raided by clans and dukes, preceded the founding of Warszowa
Jazdów
['<oov>', 'the', 'the', 'the', 'the', 'the']
Who was Tesla's nephew
Sava Kosanović
['<oov>', 'the', 'a', 'solar', 'solar', 'solar']
Why did the police bring Tesla back to Gospic
for not having a residence permit.
['<oov>', 'the', 'a', 'music', 'island', 'island']
