# Import Required Libraries
Import the necessary libraries, including TensorFlow, pandas, and numpy.

In [36]:
# Import the necessary libraries
import tensorflow as tf
import pandas as pd
import numpy as np

# Load the Dataset
Load the question-answer dataset from the provided URL or local file.

In [38]:
dataset = pd.read_csv("question_answer_pairs.csv")
dataset.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,Alessandro Volta was not a professor of chemis...,easy,easy,data/set4/a10
1,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,No,easy,hard,data/set4/a10
2,Alessandro_Volta,Did Alessandro Volta invent the remotely opera...,Alessandro Volta did invent the remotely opera...,easy,easy,data/set4/a10
3,Alessandro_Volta,Did Alessandro Volta invent the remotely opera...,Yes,easy,easy,data/set4/a10
4,Alessandro_Volta,Was Alessandro Volta taught in public schools?,Volta was taught in public schools.,easy,easy,data/set4/a10


# Explore the Dataset
Perform basic exploration of the dataset, including checking the number of samples, columns, and data types.

In [39]:
# Check the number of samples in the dataset
num_samples = dataset.shape[0]
print(f"Number of samples: {num_samples}")

Number of samples: 1458


In [40]:
# Check the columns in the dataset
columns = dataset.columns
print(f"Columns: {columns}")

Columns: Index(['ArticleTitle', 'Question', 'Answer', 'DifficultyFromQuestioner',
       'DifficultyFromAnswerer', 'ArticleFile'],
      dtype='object')


In [41]:
# Check the data types of the columns
data_types = dataset.dtypes
print(f"Data types:\n{data_types}")

Data types:
ArticleTitle                object
Question                    object
Answer                      object
DifficultyFromQuestioner    object
DifficultyFromAnswerer      object
ArticleFile                 object
dtype: object


In [42]:
# Display basic statistics of the dataset
dataset.describe(include='all')

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
count,1458,1440,1222,1262,1222,1458
unique,56,831,759,3,5,56
top,Alessandro_Volta,Is the drum a member of the percussion group?,Yes,medium,easy,data/set4/a10
freq,44,4,145,432,473,44


In [43]:
# Checking for any missing values
dataset.isnull().sum()

ArticleTitle                  0
Question                     18
Answer                      236
DifficultyFromQuestioner    196
DifficultyFromAnswerer      236
ArticleFile                   0
dtype: int64

In [44]:
# Check for duplicates
duplicates = dataset.duplicated()

# Print the number of duplicate rows
print(f"Number of duplicate rows: {duplicates.sum()}")

Number of duplicate rows: 113


In [45]:
# Unique values
print("Unique questions:", dataset['Question'].nunique())
print("Unique answers:", dataset['Answer'].nunique())

Unique questions: 831
Unique answers: 759


In [46]:
# Drop duplicates
dataset.drop_duplicates(inplace=True)

In [47]:
# Drop null values
dataset.dropna(inplace=True)

# Sample 10% of the Data
Randomly sample 10% of the dataset to test training speed and feasibility.

In [48]:
# Sample 10% of the Data

# Calculate the number of samples to take (10% of the dataset)
sample_size = int(0.1 * num_samples)

# Randomly sample 10% of the dataset
sampled_dataset = dataset.sample(n=sample_size, random_state=42)

# Check the number of samples in the sampled dataset
sampled_num_samples = sampled_dataset.shape[0]
print(f"Number of samples in the sampled dataset: {sampled_num_samples}")

# Check the columns in the sampled dataset
sampled_columns = sampled_dataset.columns
print(f"Columns in the sampled dataset: {sampled_columns}")

# Check the data types of the columns in the sampled dataset
sampled_data_types = sampled_dataset.dtypes
print(f"Data types in the sampled dataset:\n{sampled_data_types}")

# Display basic statistics of the sampled dataset
sampled_dataset.describe(include='all')

Number of samples in the sampled dataset: 145
Columns in the sampled dataset: Index(['ArticleTitle', 'Question', 'Answer', 'DifficultyFromQuestioner',
       'DifficultyFromAnswerer', 'ArticleFile'],
      dtype='object')
Data types in the sampled dataset:
ArticleTitle                object
Question                    object
Answer                      object
DifficultyFromQuestioner    object
DifficultyFromAnswerer      object
ArticleFile                 object
dtype: object


Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
count,145,145,145,145,145,145
unique,43,141,110,3,4,43
top,Piano,What are modern guitar strings constructed of?,yes,easy,medium,data/set2/a1
freq,7,2,17,54,58,7


In [49]:
# Display the first few rows of the sampled dataset to verify
sampled_dataset.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
504,Giant_Panda,What family is the panda a part of?,Ursidae (bear),medium,medium,data/set1/a4
544,Guitar,What are modern guitar strings constructed of?,"Metal, polymers, or animal or plant product ma...",medium,medium,data/set2/a7
511,Giant_Panda,What foods do pandas eat?,"bamboo, honeys, eggs, fish, yams, shrub leaves...",hard,medium,data/set1/a4
1194,San_Francisco,What makes San Francisco among the top-ten Nor...,San Francisco has a large hotel infrastructure...,medium,medium,data/set3/a8
436,Eel,Where is smoked eel considered a delicacy?,"Northern Germany, The Netherlands, Denmark, Sw...",medium,medium,data/set1/a8


# Preprocess the Data
Preprocess the sampled data, including tokenization and padding of text sequences.

In [52]:
# Preprocess the Data

# Tokenize the text sequences
tokenizer = tf.keras.preprocessing.text.Tokenizer()

# Fit the tokenizer on the questions and answers
tokenizer.fit_on_texts(sampled_dataset['Question']) 
tokenizer.fit_on_texts(sampled_dataset['Answer'])

# Convert the text sequences to integer sequences
question_sequences = tokenizer.texts_to_sequences(sampled_dataset['Question'])
answer_sequences = tokenizer.texts_to_sequences(sampled_dataset['Answer'])

# Pad the sequences to ensure uniform length
max_length = max(max(len(seq) for seq in question_sequences), max(len(seq) for seq in answer_sequences))
padded_question_sequences = tf.keras.preprocessing.sequence.pad_sequences(question_sequences, maxlen=max_length, padding='post')
padded_answer_sequences = tf.keras.preprocessing.sequence.pad_sequences(answer_sequences, maxlen=max_length, padding='post')

# Display the first few padded sequences to verify
print("Padded Question Sequences:\n", padded_question_sequences[:5])
print("Padded Answer Sequences:\n", padded_answer_sequences[:5])

Padded Question Sequences:
 [[  7 132   3   1 284   4 133   2   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  7  11  42  57  86 134   2   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  7 135  15 136  87   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  7  88 137 138 285   1 286 287 288 289 290  58 291  10 292   0   0   0
    0   0   0   0   0]
 [ 19   3 293 294 295   4 296   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]]
Padded Answer Sequences:
 [[538 539   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [261 262  21 263  21 264 265 266   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [267 540 541 542 543 544 545 546  10 547   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [137 138  20   4 250 548 549  10   4 171 550 551 552   5   1 553 554   0
    0   0   0   0   0]
 [ 32  73   1 555 556 557   0   0   0   0   0   0   0   0   0   0   0   

# Build a Simple TensorFlow Model
Build a simple TensorFlow model suitable for question-answering tasks.

In [56]:
# Build a Simple TensorFlow Model

# Define the model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

# Create a binary label column for demonstration purposes
sampled_dataset['Label'] = np.random.randint(0, 2, size=sampled_dataset.shape[0])

# Prepare the labels (assuming binary classification for simplicity)
labels = sampled_dataset['Label'].values

# Train the model on the sampled dataset
history = model.fit(padded_question_sequences, labels, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model on the training data
loss, accuracy = model.evaluate(padded_question_sequences, labels)
print(f"Training Loss: {loss}")
print(f"Training Accuracy: {accuracy}")

Epoch 1/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - accuracy: 0.5188 - loss: 0.6931 - val_accuracy: 0.4828 - val_loss: 0.6934
Epoch 2/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5305 - loss: 0.6902 - val_accuracy: 0.4828 - val_loss: 0.6932
Epoch 3/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5694 - loss: 0.6876 - val_accuracy: 0.4483 - val_loss: 0.6927
Epoch 4/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7389 - loss: 0.6824 - val_accuracy: 0.4828 - val_loss: 0.6926
Epoch 5/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7722 - loss: 0.6729 - val_accuracy: 0.4828 - val_loss: 0.6928
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7873 - loss: 0.6644
Training Loss: 0.6670048236846924
Training Accuracy: 0.7448275685310364


# Train the Model on Sample Data
Train the TensorFlow model on the sampled data and record the training time.

In [57]:
# Train the Model on Sample Data

import time

# Start the timer
start_time = time.time()

# Train the model on the sampled dataset
history = model.fit(padded_question_sequences, labels, epochs=5, batch_size=32, validation_split=0.2)

# Stop the timer
end_time = time.time()

# Calculate the training time
training_time = end_time - start_time
print(f"Training Time: {training_time} seconds")

# Evaluate the model on the training data
loss, accuracy = model.evaluate(padded_question_sequences, labels)
print(f"Training Loss: {loss}")
print(f"Training Accuracy: {accuracy}")

Epoch 1/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.7970 - loss: 0.6607 - val_accuracy: 0.4828 - val_loss: 0.6921
Epoch 2/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8665 - loss: 0.6321 - val_accuracy: 0.4828 - val_loss: 0.6947
Epoch 3/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8718 - loss: 0.5909 - val_accuracy: 0.5172 - val_loss: 0.6932
Epoch 4/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9239 - loss: 0.5205 - val_accuracy: 0.5517 - val_loss: 0.7049
Epoch 5/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9287 - loss: 0.4031 - val_accuracy: 0.5517 - val_loss: 0.8256
Training Time: 0.31087493896484375 seconds
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8340 - loss: 0.3733
Training Loss: 0.465528279542923
Training Accuracy: 0.77931034

# Evaluate Training Speed and Feasibility
Evaluate the training speed and feasibility based on the recorded training time and model performance.

In [58]:
# Evaluate Training Speed and Feasibility

# Record the training time and model performance
import time

# Start the timer
start_time = time.time()

# Train the model on the sampled dataset
history = model.fit(padded_question_sequences, labels, epochs=5, batch_size=32, validation_split=0.2)

# Stop the timer
end_time = time.time()

# Calculate the training time
training_time = end_time - start_time
print(f"Training Time: {training_time} seconds")

# Evaluate the model on the training data
loss, accuracy = model.evaluate(padded_question_sequences, labels)
print(f"Training Loss: {loss}")
print(f"Training Accuracy: {accuracy}")

# Record the results
results = {
    "Training Time (seconds)": training_time,
    "Training Loss": loss,
    "Training Accuracy": accuracy
}

# Display the results
print("Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value}")

Epoch 1/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.9360 - loss: 0.2725 - val_accuracy: 0.5862 - val_loss: 0.7855
Epoch 2/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8141 - loss: 0.3991 - val_accuracy: 0.5517 - val_loss: 1.0732
Epoch 3/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.8278 - loss: 0.3320 - val_accuracy: 0.5172 - val_loss: 0.7795
Epoch 4/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9803 - loss: 0.1830 - val_accuracy: 0.5517 - val_loss: 0.8611
Epoch 5/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9411 - loss: 0.1966 - val_accuracy: 0.4828 - val_loss: 0.8691
Training Time: 0.32560300827026367 seconds
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9457 - loss: 0.1964
Training Loss: 0.2928147315979004
Training Accuracy: 0.8827586