In [126]:
# Import required libraries, modules and functions

# Import pandas library for data frame processing
import pandas as pd

# Import matplotlib library for visualisation
import matplotlib.pyplot as plt

# Import spaCy Natural Language Processing Library
import spacy

# Import the Counter class
from collections import Counter

# Import Numpy for array processing of features
import numpy as np

# Import the PyTorch library
import torch

import nltk
from nltk.probability import FreqDist

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


In [61]:
# Load the WikiQA training split from 'WikiQA-train.csv' into the dataframe 'WikiQATrainingRawDF'
WikiQATrainingRawDF = pd.read_csv(filepath_or_buffer="WikiQA-train.csv")

In [62]:
# Preview the raw data: displaying the frame shows its first and last rows (not a random sample)
WikiQATrainingRawDF

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed?,D1,Glacier cave,D1-0,A partly submerged glacier cave on Perito More...,0
1,Q1,how are glacier caves formed?,D1,Glacier cave,D1-1,The ice facade is approximately 60 m high,0
2,Q1,how are glacier caves formed?,D1,Glacier cave,D1-2,Ice formations in the Titlis glacier cave,0
3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,A glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed?,D1,Glacier cave,D1-4,"Glacier caves are often called ice caves , but...",0
...,...,...,...,...,...,...,...
20342,Q3043,what is section eight housing,D2807,Section 8 (housing),D2807-8,A tenant who leaves a subsidized project will ...,0
20343,Q3043,what is section eight housing,D2807,Section 8 (housing),D2807-9,The United States Department of Housing and Ur...,0
20344,Q3044,what is the main type of restaurant,D2808,Category:Types of restaurants,D2808-0,Restaurants categorized by type and informatio...,0
20345,Q3046,what is us dollar worth based on,D2810,History of the United States dollar,D2810-0,U.S. Federal Reserve notes in the mid-1990s,0


In [63]:
# Collapse the raw rows to one row per distinct 'Question': all of its 'Sentence'
# values are concatenated with a single space (no filtering on 'Label' happens here),
# and the resulting 'Sentence' column is renamed to 'Answer'.
WikiQATrainingDF = (
    WikiQATrainingRawDF
    .groupby('Question')['Sentence']
    .agg(' '.join)
    .reset_index()
    .rename(columns={'Sentence': 'Answer'})
)


In [64]:
# Inspect the grouped frame: one row per distinct question with its concatenated answer text
WikiQATrainingDF

Unnamed: 0,Question,Answer
0,HOW MANY BROTHELS WERE THERE IN THE UNITED STA...,The regulation of prostitution in the United S...
1,HOW MANY STRIPES ARE ON THE AMERICAN FLAG,The national flag of the United States of Amer...
2,How Did Paul Bragg Die,"Paul Bragg (February 6, 1895 â€“ December 7, 1..."
3,How Do You Find the mean of the squares of the...,"In mathematics , a square number or perfect sq..."
4,How Do You Get Hepatitis C,Hepatitis C is an infectious disease affecting...
...,...,...
2112,who wrote the song cocaine,Cocaine is a song written and recorded by JJ C...
2113,who wrote the song feelin alright,"Feelin' Alright? also known as ""Feeling Alrigh..."
2114,who wrote the song in the mood,In the Mood is a big band era #1 hit recorded ...
2115,who wrote what's my name rihanna,What's My Name? is a song recorded by Barbadia...


In [65]:
# Export the grouped (not yet normalized) question/answer pairs from 'WikiQATrainingDF' to CSV
WikiQATrainingDF.to_csv(path_or_buf='GroupedWIKIQATraining.csv', index=False)

In [66]:
# Load spaCy's small English pipeline and keep it in the notebook-level object "nlp",
# which the text-processing functions below read as a global.
nlp = spacy.load(name='en_core_web_sm')

In [67]:
# preprocess_text is a text normalizing function: it lowercases the text and removes
# punctuation, whitespace tokens and very short tokens (one or two characters).
# This will standardize the text for the feature extraction step.
def preprocess_text(text):
    """Return `text` lowercased, without punctuation, whitespace or short tokens.

    The text is split with the tokenizer of the notebook-level spaCy pipeline
    `nlp`. A token is kept only if it is not punctuation, not whitespace and
    is longer than two characters, so short words such as "of", "is" or a
    single letter are dropped as well.

    Args:
        text: the string to normalize.

    Returns:
        The kept tokens, lowercased and joined with single spaces.
    """
    # Only tokenizer-level flags (is_punct / is_space) are read below, so run the
    # tokenizer alone instead of the full pipeline (tagger, parser, NER, ...).
    doc = nlp.make_doc(text)
    tokens = [
        token.text.lower()
        for token in doc
        if not (token.is_punct or token.is_space) and len(token.text) > 2
    ]
    return ' '.join(tokens)

In [68]:
# Normalize the text of both the 'Question' and the 'Answer' column of
# 'WikiQATrainingDF' with preprocess_text; each column is overwritten with its
# normalized version.
for text_column in ('Question', 'Answer'):
    WikiQATrainingDF[text_column] = WikiQATrainingDF[text_column].apply(preprocess_text)

In [69]:
# Check the content of the 'WikiQATrainingDF' dataframe after text normalization
WikiQATrainingDF

Unnamed: 0,Question,Answer
0,how many brothels were there the united states...,the regulation prostitution the united states ...
1,how many stripes are the american flag,the national flag the united states america of...
2,how did paul bragg die,paul bragg february 1895 december 1976 nutriti...
3,how you find the mean the squares the first co...,mathematics square number perfect square integ...
4,how you get hepatitis,hepatitis infectious disease affecting primari...
...,...,...
2112,who wrote the song cocaine,cocaine song written and recorded cale 1976 bu...
2113,who wrote the song feelin alright,feelin alright also known feeling alright song...
2114,who wrote the song the mood,the mood big band era hit recorded american ba...
2115,who wrote what name rihanna,what name song recorded barbadian recording ar...


In [70]:
# Export the normalized question/answer pairs held in 'WikiQATrainingDF' to CSV
WikiQATrainingDF.to_csv(path_or_buf='ProcessedWikiQATraining.csv', index=False)

In [71]:
# Length of the longest question, counted in whitespace-separated words
question_word_counts = WikiQATrainingDF['Question'].str.split().str.len()
QuestionMaxWords = question_word_counts.max()
print('Maximum number of words in "Question" columns is ' + str(QuestionMaxWords))


Maximum number of words in "Question" columns is 18


In [72]:
# Length of the longest answer, counted in whitespace-separated words
answer_word_counts = WikiQATrainingDF['Answer'].str.split().str.len()
AnswerMaxWords = answer_word_counts.max()
print('Maximum number of words in "Answer" columns is ' + str(AnswerMaxWords))

Maximum number of words in "Answer" columns is 706


In [73]:
# Tokenization function
def tokenize_text(text, max_words=None):
    """Split `text` into spaCy tokens and keep at most `max_words` of them.

    Args:
        text: the string to tokenize with the notebook-level pipeline `nlp`.
        max_words: maximum number of tokens to return; None keeps every token.

    Returns:
        A list with the text of each kept token, in order.
    """
    # Tokenization is all that is needed here, so call the tokenizer alone
    # (nlp.make_doc) instead of running the full pipeline on every text.
    doc = nlp.make_doc(text)
    # Tokenize and truncate
    tokens = [token.text for token in doc][:max_words]
    return tokens

In [78]:
# Prepare tokenized data.
# The truncation limits reuse QuestionMaxWords / AnswerMaxWords computed in the
# earlier cells (18 and 706 for the current data) instead of repeating those
# numbers, so the limits follow the data if the CSV changes.
WikiQATrainingDF['TokenizedQuestion'] = WikiQATrainingDF['Question'].apply(
    lambda text: tokenize_text(text, max_words=int(QuestionMaxWords))
)
WikiQATrainingDF['TokenizedAnswer'] = WikiQATrainingDF['Answer'].apply(
    lambda text: tokenize_text(text, max_words=int(AnswerMaxWords))
)

In [79]:
# Inspect the frame after adding the tokenized columns.
# NOTE(review): the saved output of this cell also lists 'tokenized_question' and
# 'tokenized_answer' columns that no visible cell creates — presumably left over from
# an earlier run of a since-deleted cell; confirm with Restart Kernel -> Run All.
WikiQATrainingDF

Unnamed: 0,Question,Answer,tokenized_question,tokenized_answer,TokenizedQuestion,TokenizedAnswer
0,how many brothels were there the united states...,the regulation prostitution the united states ...,"[how, many, brothels, were, there, the, united...","[the, regulation, prostitution, the, united, s...","[how, many, brothels, were, there, the, united...","[the, regulation, prostitution, the, united, s..."
1,how many stripes are the american flag,the national flag the united states america of...,"[how, many, stripes, are, the, american, flag]","[the, national, flag, the, united, states, ame...","[how, many, stripes, are, the, american, flag]","[the, national, flag, the, united, states, ame..."
2,how did paul bragg die,paul bragg february 1895 december 1976 nutriti...,"[how, did, paul, bragg, die]","[paul, bragg, february, 1895, december, 1976, ...","[how, did, paul, bragg, die]","[paul, bragg, february, 1895, december, 1976, ..."
3,how you find the mean the squares the first co...,mathematics square number perfect square integ...,"[how, you, find, the, mean, the, squares, the,...","[mathematics, square, number, perfect, square,...","[how, you, find, the, mean, the, squares, the,...","[mathematics, square, number, perfect, square,..."
4,how you get hepatitis,hepatitis infectious disease affecting primari...,"[how, you, get, hepatitis]","[hepatitis, infectious, disease, affecting, pr...","[how, you, get, hepatitis]","[hepatitis, infectious, disease, affecting, pr..."
...,...,...,...,...,...,...
2112,who wrote the song cocaine,cocaine song written and recorded cale 1976 bu...,"[who, wrote, the, song, cocaine]","[cocaine, song, written, and, recorded, cale, ...","[who, wrote, the, song, cocaine]","[cocaine, song, written, and, recorded, cale, ..."
2113,who wrote the song feelin alright,feelin alright also known feeling alright song...,"[who, wrote, the, song, feelin, alright]","[feelin, alright, also, known, feeling, alrigh...","[who, wrote, the, song, feelin, alright]","[feelin, alright, also, known, feeling, alrigh..."
2114,who wrote the song the mood,the mood big band era hit recorded american ba...,"[who, wrote, the, song, the, mood]","[the, mood, big, band, era, hit, recorded, ame...","[who, wrote, the, song, the, mood]","[the, mood, big, band, era, hit, recorded, ame..."
2115,who wrote what name rihanna,what name song recorded barbadian recording ar...,"[who, wrote, what, name, rihanna]","[what, name, song, recorded, barbadian, record...","[who, wrote, what, name, rihanna]","[what, name, song, recorded, barbadian, record..."


In [82]:
# Inspect the frame again.
# NOTE(review): the saved output here no longer shows the lowercase 'tokenized_*'
# columns, yet no visible cell drops them (the execution count jumps from 79 to 82) —
# presumably the cells that removed them were deleted; confirm on a fresh kernel.
WikiQATrainingDF

Unnamed: 0,Question,Answer,TokenizedQuestion,TokenizedAnswer
0,how many brothels were there the united states...,the regulation prostitution the united states ...,"[how, many, brothels, were, there, the, united...","[the, regulation, prostitution, the, united, s..."
1,how many stripes are the american flag,the national flag the united states america of...,"[how, many, stripes, are, the, american, flag]","[the, national, flag, the, united, states, ame..."
2,how did paul bragg die,paul bragg february 1895 december 1976 nutriti...,"[how, did, paul, bragg, die]","[paul, bragg, february, 1895, december, 1976, ..."
3,how you find the mean the squares the first co...,mathematics square number perfect square integ...,"[how, you, find, the, mean, the, squares, the,...","[mathematics, square, number, perfect, square,..."
4,how you get hepatitis,hepatitis infectious disease affecting primari...,"[how, you, get, hepatitis]","[hepatitis, infectious, disease, affecting, pr..."
...,...,...,...,...
2112,who wrote the song cocaine,cocaine song written and recorded cale 1976 bu...,"[who, wrote, the, song, cocaine]","[cocaine, song, written, and, recorded, cale, ..."
2113,who wrote the song feelin alright,feelin alright also known feeling alright song...,"[who, wrote, the, song, feelin, alright]","[feelin, alright, also, known, feeling, alrigh..."
2114,who wrote the song the mood,the mood big band era hit recorded american ba...,"[who, wrote, the, song, the, mood]","[the, mood, big, band, era, hit, recorded, ame..."
2115,who wrote what name rihanna,what name song recorded barbadian recording ar...,"[who, wrote, what, name, rihanna]","[what, name, song, recorded, barbadian, record..."


In [118]:
# Build vocabulary from TokenizedQuestion

# Flatten every tokenized question into one long list of words
question_tokens = [
    token
    for tokenized_question in WikiQATrainingDF['TokenizedQuestion']
    for token in tokenized_question
]

# Count the words; most_common() lists them from most to least frequent
question_fdist = FreqDist(question_tokens)
question_words_by_frequency = [entry[0] for entry in question_fdist.most_common()]

# Indices 0-3 are reserved for the four special tokens, so real words are
# numbered from 4 upwards in order of decreasing frequency.
question_vocab = dict(
    zip(question_words_by_frequency, range(4, 4 + len(question_words_by_frequency)))
)
question_vocab['<pad>'] = 0
question_vocab['<unk>'] = 1
question_vocab['<sos>'] = 2
question_vocab['<eos>'] = 3

In [119]:

# Build vocabulary from TokenizedAnswer (same approach as for the questions)

# Flatten every tokenized answer into one long list of words
answer_tokens = [
    token
    for tokenized_answer in WikiQATrainingDF['TokenizedAnswer']
    for token in tokenized_answer
]

# Count the words; most_common() lists them from most to least frequent
answer_fdist = FreqDist(answer_tokens)
answer_words_by_frequency = [entry[0] for entry in answer_fdist.most_common()]

# Indices 0-3 are reserved for the four special tokens, so real words are
# numbered from 4 upwards in order of decreasing frequency.
answer_vocab = dict(
    zip(answer_words_by_frequency, range(4, 4 + len(answer_words_by_frequency)))
)
answer_vocab['<pad>'] = 0
answer_vocab['<unk>'] = 1
answer_vocab['<sos>'] = 2
answer_vocab['<eos>'] = 3

In [129]:
# Hyperparameters for the sequence-to-sequence model and its training run
EMBEDDING_DIM, HIDDEN_DIM = 256, 512   # HIDDEN_DIM is the LSTM hidden size; EMBEDDING_DIM is not referenced by the visible cells
LEARNING_RATE = 1e-3                   # learning rate passed to the Adam optimizer
BATCH_SIZE, NUM_EPOCHS = 64, 10        # DataLoader batch size / number of passes over the data

# Prepare input and target tensors
def prepare_tensors(WikiQATrainingDF, question_vocab, answer_vocab,
                    max_question_len=18, max_answer_len=706):
    """Encode every question/answer pair as fixed-length rows of vocabulary indices.

    Sequences are looked up word by word, never character by character: the
    'TokenizedQuestion' / 'TokenizedAnswer' columns are used when the frame has
    them; otherwise the 'Question' / 'Answer' strings are split on whitespace.
    Words missing from a vocabulary map to its '<unk>' index, sequences longer
    than the maximum are truncated, and shorter ones are right-padded with the
    '<pad>' index.

    Args:
        WikiQATrainingDF: frame with 'Question' and 'Answer' columns and,
            optionally, 'TokenizedQuestion' and 'TokenizedAnswer'.
        question_vocab: dict mapping word -> index; must contain '<unk>' and '<pad>'.
        answer_vocab: dict mapping word -> index; must contain '<unk>' and '<pad>'.
        max_question_len: length of every encoded question row (default 18).
        max_answer_len: length of every encoded answer row (default 706).

    Returns:
        (X, y): two torch.long tensors of shape (rows, max_question_len) and
        (rows, max_answer_len).
    """
    def _sequences(tokenized_column, text_column):
        # Prefer the pre-tokenized column; fall back to the raw text column
        if tokenized_column in WikiQATrainingDF:
            return WikiQATrainingDF[tokenized_column]
        return WikiQATrainingDF[text_column]

    def _encode(sequences, vocab, max_len):
        # Turn each sequence into exactly max_len vocabulary indices
        unk_idx, pad_idx = vocab['<unk>'], vocab['<pad>']
        rows = []
        for tokens in sequences:
            if isinstance(tokens, str):
                # Iterating a string yields characters, so split it into words first
                tokens = tokens.split()
            indices = [vocab.get(word, unk_idx) for word in tokens[:max_len]]
            indices += [pad_idx] * (max_len - len(indices))
            rows.append(indices)
        # The explicit shape keeps an empty frame two-dimensional: (0, max_len)
        return torch.tensor(rows, dtype=torch.long).reshape(len(rows), max_len)

    # Convert to PyTorch tensors
    X = _encode(_sequences('TokenizedQuestion', 'Question'), question_vocab, max_question_len)
    y = _encode(_sequences('TokenizedAnswer', 'Answer'), answer_vocab, max_answer_len)

    return X, y

# Encode the whole training frame as index tensors (X: questions, y: answers)
X, y = prepare_tensors(WikiQATrainingDF, question_vocab, answer_vocab)

# Pair each question row with its answer row and serve them in shuffled mini-batches
dataset = torch.utils.data.TensorDataset(X, y)
loader_options = dict(batch_size=BATCH_SIZE, shuffle=True)
dataloader = torch.utils.data.DataLoader(dataset, **loader_options)

# Both LSTMs consume one-hot vectors, so their input size is the vocabulary size
question_vocab_size = len(question_vocab)
answer_vocab_size = len(answer_vocab)

# Settings shared by the encoder and the decoder
lstm_options = dict(
    hidden_size=HIDDEN_DIM,
    num_layers=2,
    batch_first=True,
    dropout=0.3,
)

# Encoder: reads the one-hot encoded question
encoder = nn.LSTM(input_size=question_vocab_size, **lstm_options)

# Decoder: reads one-hot encoded answer tokens
decoder = nn.LSTM(input_size=answer_vocab_size, **lstm_options)

# Output layer: projects a decoder hidden state to one score per answer word
output_layer = nn.Linear(HIDDEN_DIM, answer_vocab_size)

# Loss (targets equal to the '<pad>' index are ignored) and optimizer over all three modules
criterion = nn.CrossEntropyLoss(ignore_index=answer_vocab['<pad>'])
optimizer = optim.Adam(
    [
        *encoder.parameters(),
        *decoder.parameters(),
        *output_layer.parameters(),
    ],
    lr=LEARNING_RATE,
)

# One-hot embedding
def one_hot_embedding(X, vocab):
    """One-hot encode the index tensor `X` over `len(vocab)` classes.

    Args:
        X: integer (long) tensor of vocabulary indices.
        vocab: sized collection whose length is the number of classes.

    Returns:
        A float32 tensor with one extra trailing dimension of size len(vocab).
    """
    vocab_size = len(vocab)
    encoded = nn.functional.one_hot(X, num_classes=vocab_size)
    return encoded.to(torch.float32)

# Training loop
#
# For every mini-batch: encode the one-hot question, then decode the answer one
# token at a time with teacher forcing, starting from '<sos>' and the encoder's
# final (hidden, cell) state. The loss compares the prediction made at step t
# with the answer token at position t; '<pad>' targets are ignored by `criterion`.

# All trainable parameters, gathered once for gradient clipping
trainable_params = (
    list(encoder.parameters()) +
    list(decoder.parameters()) +
    list(output_layer.parameters())
)
pad_idx = answer_vocab['<pad>']

for epoch in range(NUM_EPOCHS):
    total_loss = 0

    for batch_x, batch_y in dataloader:
        # Zero gradients
        optimizer.zero_grad()

        # Columns after the last non-pad target of this batch contain only
        # '<pad>', which the loss ignores, so decoding them would cost time and
        # memory without changing the loss or the gradients.
        real_columns = (batch_y != pad_idx).any(dim=0).nonzero()
        num_steps = int(real_columns.max()) + 1 if real_columns.numel() else 1
        targets = batch_y[:, :num_steps]

        # One-hot encode the questions. The answers are one-hot encoded one time
        # step at a time below; encoding the whole (batch, 706, vocab) answer
        # tensor up front was never used and allocated gigabytes.
        batch_x_embedded = one_hot_embedding(batch_x, question_vocab)

        # Encoder forward pass; only its final state is handed to the decoder
        _, (hidden, cell) = encoder(batch_x_embedded)

        # Prepare decoder input (first token is <sos>)
        decoder_input = torch.full(
            (batch_x.size(0), 1),
            answer_vocab['<sos>'],
            dtype=torch.long,
            device=batch_y.device
        )

        # Decoder forward pass with teacher forcing
        decoder_outputs = []
        for t in range(num_steps):
            decoder_input_embedded = one_hot_embedding(decoder_input, answer_vocab)
            decoder_output, (hidden, cell) = decoder(decoder_input_embedded, (hidden, cell))

            # Project output: scores for the answer token at position t
            decoder_outputs.append(output_layer(decoder_output.squeeze(1)))

            # Teacher forcing: the next step receives the ground-truth token the
            # model was just asked to predict (position t). Feeding position
            # t + 1 would show the model the very token it must predict next.
            decoder_input = targets[:, t].unsqueeze(1)

        # Stack decoder outputs into (batch, num_steps, vocab)
        decoder_outputs = torch.stack(decoder_outputs, dim=1)

        # Compute loss
        loss = criterion(
            decoder_outputs.reshape(-1, len(answer_vocab)),
            targets.reshape(-1)
        )

        # Backpropagate
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(trainable_params, max_norm=1)

        # Optimizer step
        optimizer.step()

        total_loss += loss.item()

    # Print average loss per epoch
    print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {total_loss/len(dataloader):.4f}')



ModuleNotFoundError: No module named 'torch._C._dynamo.guards'; 'torch._C._dynamo' is not a package

In [None]:
# Optional: Save model
# Collect the weights of the three trained modules in one checkpoint dictionary
checkpoint = {
    'encoder_state': encoder.state_dict(),
    'decoder_state': decoder.state_dict(),
    'output_layer_state': output_layer.state_dict(),
}
torch.save(checkpoint, 'seq2seq_model.pth')

In [128]:
# Show the installed PyTorch version
print(torch.__version__)

2.2.2+cu121
