In [1]:
# Install necessary libraries
!pip install datasets pydot graphviz nltk tensorflow keras

# Load necessary libraries
from datasets import load_dataset
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout, Attention
from tensorflow.keras.optimizers import Adam
import nltk
import string
from nltk.corpus import stopwords

# Download necessary NLTK data
# 'stopwords' backs stopwords.words('english'); 'punkt' is the tokenizer model
# used by older NLTK releases, while recent releases (3.9+) make
# nltk.word_tokenize load 'punkt_tab' instead. NLTK is installed unpinned, so
# fetch both to keep tokenization working on a fresh kernel.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load dataset
dataset = load_dataset("rajpurkar/squad")

# Convert dataset to DataFrame
def dataset_to_dataframe(dataset_split):
    """Materialise one dataset split as a pandas DataFrame.

    Args:
        dataset_split: An iterable of row mappings that also exposes a
            ``features`` mapping; the keys of ``features`` name the columns.

    Returns:
        A ``pd.DataFrame`` with one row per item yielded by ``dataset_split``
        and one column per key of ``dataset_split.features``.
    """
    column_names = dataset_split.features.keys()
    records = list(dataset_split)
    return pd.DataFrame(records, columns=column_names)

def _load_clean_split(split_name):
    """Load one split of ``dataset`` and apply the shared cleaning steps.

    Steps, in order: convert to a DataFrame, drop the ``id`` and ``title``
    columns, replace each ``answers`` dict by its first ``text`` entry,
    lowercase the three text columns, and drop duplicate rows.
    """
    frame = dataset_to_dataframe(dataset[split_name])

    # Drop unnecessary columns
    frame = frame.drop(columns=['id', 'title'])

    # Extract text answers from dictionaries (only the first listed answer is kept)
    frame['answers'] = [entry['text'][0] for entry in frame['answers']]

    # Convert text to lowercase
    for column in ('context', 'question', 'answers'):
        frame[column] = frame[column].str.lower()

    # Drop duplicates
    return frame.drop_duplicates()


train = _load_clean_split("train")
val = _load_clean_split("validation")

# Preprocess text
stop_words = stopwords.words('english')
# Frozen-set copy of the stop-word list so the per-token membership test in
# preprocess_text is a hash lookup instead of a scan of the whole list.
_stop_word_set = frozenset(stop_words)
# Translation table that deletes every character in string.punctuation.
_punctuation_table = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Strip punctuation and English stop words from ``text``.

    Args:
        text: The string to clean. Callers in this notebook lowercase it first;
            the stop-word comparison itself is case-sensitive.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Punctuation characters are deleted (not replaced by spaces), so e.g.
    # "u.s." becomes "us".
    text = text.translate(_punctuation_table)
    tokens = nltk.word_tokenize(text)
    return " ".join(token for token in tokens if token not in _stop_word_set)

# Limit the dataset for quicker testing
max_rows = 10000  # number of leading rows kept from each split

# Take explicit copies of the row slices: assigning a column on a bare slice
# of a DataFrame triggers pandas' SettingWithCopyWarning and leaves it unclear
# whether the parent frame is modified.
train = train.iloc[:max_rows].copy()
val = val.iloc[:max_rows].copy()

# Only the context is stripped of punctuation and stop words; questions are
# left as they are.
train['context'] = train['context'].apply(preprocess_text)
val['context'] = val['context'].apply(preprocess_text)

# Combine context & question
combined_text = [f"{c}, {q}" for c, q in zip(train['context'], train['question'])]
combined_text_val = [f"{c}, {q}" for c, q in zip(val['context'], val['question'])]

# Tokenization & Padding
max_words = 21877  # Vocabulary cap passed to the Tokenizer; adjust based on your dataset size
max_len = 340  # Fixed sequence length after padding/truncation; adjust based on your dataset

tok = Tokenizer(num_words=max_words, char_level=False, oov_token='UNK')
# Fit on everything that is encoded below (context + question, and answers).
# Fitting on the stop-word-stripped contexts alone would leave question words
# such as "what"/"where" and many answer tokens out of the vocabulary, so they
# would all be encoded as the 'UNK' token. If the vocabulary exceeds max_words,
# the Tokenizer keeps only the most frequent words when encoding.
tok.fit_on_texts(list(combined_text) + list(train['answers']))


def _encode(texts):
    """Convert texts to token-id sequences of length ``max_len`` (zero-padded at the end)."""
    return pad_sequences(tok.texts_to_sequences(texts), maxlen=max_len, padding='post')


answer_seq = _encode(train['answers'])
comb_seq = _encode(combined_text)

# Validation sequences
answer_seq_val = _encode(val['answers'])
comb_seq_val = _encode(combined_text_val)

# Build the model
# Input: a padded sequence of max_len token ids (context + question).
# Output: for each of the max_len positions, a probability distribution over
# the max_words token ids, trained against the padded answer token ids.
embedding_dim = 128
combined_seq = Input(shape=(max_len,), name='input')
comb_embedding = Embedding(input_dim=max_words, output_dim=embedding_dim)(combined_seq)
comb_lstm = Bidirectional(LSTM(256, return_sequences=True))(comb_embedding)
comb_lstm = Bidirectional(LSTM(256, return_sequences=True))(comb_lstm)
dropout_layer1 = Dropout(rate=0.5)(comb_lstm)
comb_lstm = Bidirectional(LSTM(128, return_sequences=True))(dropout_layer1)
# Self-attention: the same tensor is used as query and value.
attention_layer = Attention()([comb_lstm, comb_lstm])
# One unit per vocabulary id. A softmax over a single unit always outputs 1.0,
# which makes the loss constant and the model untrainable.
# NOTE: the output is (batch, max_len, max_words); lower batch_size in fit()
# if this does not fit in memory.
output = Dense(max_words, activation='softmax')(attention_layer)

model = Model(inputs=combined_seq, outputs=output)
# Targets are integer token ids of shape (batch, max_len), not one-hot vectors,
# so the sparse variant of categorical cross-entropy is required.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

# Train the model
# Validate on the encoded SQuAD validation split (comb_seq_val / answer_seq_val)
# rather than holding back part of the training rows with validation_split;
# those validation sequences were otherwise computed but never used.
model.fit(
    comb_seq,
    answer_seq,
    epochs=10,
    batch_size=32,
    validation_data=(comb_seq_val, answer_seq_val),
)

# Visualize the model
plot_model(model, to_file="plot.png", show_shapes=True, show_layer_names=True)

# Inference examples
# Both demo questions are asked about the same one-sentence context.
input_context_1 = input_context_2 = "i’m mohammd and i live in jordan"

input_question_1 = "what's my name?"
input_question_2 = "where do you live?"

# Model input format: "<context>, <question>"
input_text_1 = ", ".join((input_context_1, input_question_1))
input_text_2 = ", ".join((input_context_2, input_question_2))

def get_prediction(input_text):
    """Predict an answer string for one "<context>, <question>" input.

    Args:
        input_text: The combined context/question text to encode with ``tok``.

    Returns:
        The predicted tokens joined by spaces, or "UNK" when no position
        decodes to a vocabulary word.
    """
    input_seq = tok.texts_to_sequences([input_text])
    input_pad = pad_sequences(input_seq, maxlen=max_len, padding='post')
    # Scores for the single example in the batch: one row per sequence position.
    position_scores = model.predict(input_pad)[0]
    # Pick the best-scoring id at every position. An argmax over the flattened
    # array would instead return a flat offset, not a token id.
    token_ids = np.argmax(position_scores, axis=-1)
    # Id 0 is the padding value and has no entry in the index, so it is skipped.
    words = [
        tok.index_word[int(token_id)]
        for token_id in token_ids
        if int(token_id) in tok.index_word
    ]
    return " ".join(words) if words else "UNK"

# Get predictions
# Run both example inputs through the model, in order.
predicted_answer_1, predicted_answer_2 = map(get_prediction, (input_text_1, input_text_2))

# Report the decoded answer for each question
print("Predicted Answer for Question 1:", predicted_answer_1)
print("Predicted Answer for Question 2:", predicted_answer_2)


Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input (InputLayer)          [(None, 340)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 340, 128)             2800256   ['input[0][0]']               
                                                                                                  
 bidirectional (Bidirection  (None, 340, 512)             788480    ['embedding[0][0]']           
 al)                                                                                              
                                                                                                  
 bidirectional_1 (Bidirecti  (None, 340, 512)             1574912   ['bidirectional[0][0]']   