In [18]:
from torchdata import datapipes as dp
from torchtext import datasets
import spacy
from torchtext.vocab import build_vocab_from_iterator
import warnings
warnings.filterwarnings("ignore")

# Load spaCy's medium English pipeline; the cells visible here only use its tokenizer.
eng = spacy.load("en_core_web_md")
# Shorthand for the tokenizer.
# NOTE(review): the cells shown here call `eng.tokenizer` directly and never
# reference this alias — confirm whether it is still needed.
tokeniser = eng.tokenizer

In [2]:
from torchtext import transforms as T 
from torchtext.datasets import SQuAD2

In [5]:
# SQuAD 2.0 dev split as an iterable pipe. Each row is a 4-tuple:
# (context paragraph, question, list of answers, list of answer positions).
sq_pipe = SQuAD2(split='dev')
# Read only the first row; building a list of the whole split just to index
# element 0 walks every sample for no benefit.
test_data = next(iter(sq_pipe))
test_data

('The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'In what country is Normandy located?',
 ['France', 'France', 'France', 'France'],
 [159, 159, 159, 159])

In [None]:
# We have to tokenise every part of the datapoint, and then separate the parts for training.
[x.text for x in eng.tokenizer(test_data[0])]

In [26]:
def tokenize(text, ind):
    """Tokenise one field of a datapoint.

    Args:
        text: Indexable datapoint (for example a SQuAD row tuple); the
            element at ``text[ind]`` is handed to the spaCy tokenizer.
        ind: Position of the field to tokenise.

    Returns:
        list[str]: The ``.text`` of every token produced by ``eng.tokenizer``.
    """
    words = []
    for piece in eng.tokenizer(text[ind]):
        words.append(piece.text)
    return words


# Field 1 of the sample row is the question.
tokenize(test_data, 1)

['In', 'what', 'country', 'is', 'Normandy', 'located', '?']

In [42]:
def tokenize_sent(text):
    """Split a raw string into a list of token strings with the spaCy tokenizer.

    Args:
        text: The string to tokenise.

    Returns:
        list[str]: The ``.text`` of each token, in order.
    """
    doc = eng.tokenizer(text)
    return [piece.text for piece in doc]

In [None]:
def tokenize_row(row):
    """Tokenise the paragraph and question of one row; pass other fields through.

    Args:
        row: A dataset row whose first two fields are the paragraph and the
            question strings.

    Returns:
        tuple: (paragraph tokens, question tokens, *remaining fields).
    """
    return (tokenize_sent(row[0]), tokenize_sent(row[1]), *row[2:])


# `.map` calls its function with a single row, but `tokenize` also requires a
# field index, so mapping it directly fails with a TypeError on the first row.
# The result is bound to a new name so that `sq_pipe` keeps yielding raw
# strings: the later cells (`getToken`, `do_transform`) tokenise the raw text
# themselves.
tokenized_pipe = sq_pipe.map(tokenize_row)

In [14]:
def getToken(pipe, place):
    """Yield the token list of one text field for every row in ``pipe``.

    Args:
        pipe: Iterable of 4-field rows (paragraph, question, answer, location).
        place: ``0`` to tokenise the paragraph, ``1`` to tokenise the question.
            For any other value the rows are still consumed but nothing is
            yielded.

    Yields:
        list[str]: Token strings of the selected field, one list per row.
    """
    for para, quest, answer, loc in pipe:
        # `tokenize_sent` takes a plain string; `tokenize` would also need a
        # field index and raises TypeError when called with one argument.
        if place == 0:
            yield tokenize_sent(para)
        elif place == 1:
            yield tokenize_sent(quest)

In [16]:
# Peek at the tokens of the first paragraph only; print nothing if the pipe is empty.
first_story_tokens = next(getToken(sq_pipe, 0), None)
if first_story_tokens is not None:
    print(first_story_tokens)

['The', 'Normans', '(', 'Norman', ':', 'Nourmands', ';', 'French', ':', 'Normands', ';', 'Latin', ':', 'Normanni', ')', 'were', 'the', 'people', 'who', 'in', 'the', '10th', 'and', '11th', 'centuries', 'gave', 'their', 'name', 'to', 'Normandy', ',', 'a', 'region', 'in', 'France', '.', 'They', 'were', 'descended', 'from', 'Norse', '(', '"', 'Norman', '"', 'comes', 'from', '"', 'Norseman', '"', ')', 'raiders', 'and', 'pirates', 'from', 'Denmark', ',', 'Iceland', 'and', 'Norway', 'who', ',', 'under', 'their', 'leader', 'Rollo', ',', 'agreed', 'to', 'swear', 'fealty', 'to', 'King', 'Charles', 'III', 'of', 'West', 'Francia', '.', 'Through', 'generations', 'of', 'assimilation', 'and', 'mixing', 'with', 'the', 'native', 'Frankish', 'and', 'Roman', '-', 'Gaulish', 'populations', ',', 'their', 'descendants', 'would', 'gradually', 'merge', 'with', 'the', 'Carolingian', '-', 'based', 'cultures', 'of', 'West', 'Francia', '.', 'The', 'distinct', 'cultural', 'and', 'ethnic', 'identity', 'of', 'the', 

In [19]:
# Peek at the tokens of the first question only; print nothing if the pipe is empty.
first_quest_tokens = next(getToken(sq_pipe, 1), None)
if first_quest_tokens is not None:
    print(first_quest_tokens)

['In', 'what', 'country', 'is', 'Normandy', 'located', '?']


In [21]:
# Vocabulary over the paragraph tokens (row field 0). The four special symbols
# are placed at the front, and a token must occur at least `min_freq` times to
# be included.
storyvocab = build_vocab_from_iterator(
    iterator=getToken(sq_pipe, 0),
    specials=['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
    min_freq=2,
)
# Tokens missing from the vocabulary resolve to the '<unk>' index.
storyvocab.set_default_index(storyvocab['<unk>'])

In [22]:
# Vocabulary over the question tokens (row field 1), built with the same
# settings as the paragraph vocabulary: specials first, minimum frequency 2.
questvocab = build_vocab_from_iterator(
    iterator=getToken(sq_pipe, 1),
    specials=['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
    min_freq=2,
)
# Tokens missing from the vocabulary resolve to the '<unk>' index.
questvocab.set_default_index(questvocab['<unk>'])

In [84]:
# Paragraph pipeline: tokens -> vocabulary ids, then wrap with <sos> ... <eos>.
# The marker ids are looked up from the vocabulary instead of being hard-coded
# as 1 and 2, so the transform stays correct if the specials list is reordered.
story_transform = T.Sequential(
    T.VocabTransform(storyvocab),
    T.AddToken(storyvocab['<sos>'], begin=True),
    T.AddToken(storyvocab['<eos>'], begin=False),
)

In [81]:
# Question pipeline: tokens -> vocabulary ids, then wrap with <sos> ... <eos>.
# The marker ids are looked up from the vocabulary instead of being hard-coded
# as 1 and 2, so the transform stays correct if the specials list is reordered.
quest_transform = T.Sequential(
    T.VocabTransform(questvocab),
    T.AddToken(questvocab['<sos>'], begin=True),
    T.AddToken(questvocab['<eos>'], begin=False),
)

In [82]:
def do_transform(row):
    """Convert the paragraph and question of one row into vocabulary-id lists.

    Args:
        row: Indexable row; ``row[0]`` is the paragraph string and ``row[1]``
            the question string.

    Returns:
        list: ``[paragraph ids, question ids]``, each produced by tokenising
        the text and passing it through the matching transform.
    """
    story_ids = story_transform(tokenize_sent(row[0]))
    quest_ids = quest_transform(tokenize_sent(row[1]))
    return [story_ids, quest_ids]

In [None]:
# Smoke-test the transform on the single sample row.
do_transform(test_data)

In [None]:
# Apply the id transform to every row of the pipe.
morph_pipe = sq_pipe.map(do_transform)
# Materialise all transformed rows so they can be indexed by position.
morplist = list(morph_pipe)
morplist[0]

In [86]:
# Index-to-token lookup lists: position i holds the token whose vocabulary id is i.
story_idex_list = storyvocab.get_itos()
quest_idex_list = questvocab.get_itos()

def show_sentence_back(tokens):
    """Placeholder with no implementation yet; it ignores ``tokens`` and returns None.

    NOTE(review): presumably meant to turn a list of ids back into text, as the
    decoding loop after it does by hand — confirm the intent before implementing.
    """
    pass

# Sanity check: decode the first two transformed rows back into text.
for pair in morplist[:2]:
    story_words = [story_idex_list[idx] for idx in pair[0]]
    quest_words = [quest_idex_list[idx] for idx in pair[1]]
    print(' '.join(story_words))
    print(' '.join(quest_words))

<sos> The Normans ( Norman : Nourmands ; French : Normands ; Latin : Normanni ) were the people who in the 10th and 11th centuries gave their name to Normandy , a region in France . They were descended from Norse ( " Norman " comes from " Norseman " ) raiders and pirates from Denmark , Iceland and Norway who , under their leader Rollo , agreed to swear fealty to King Charles III of West Francia . Through generations of assimilation and mixing with the native Frankish and Roman - Gaulish populations , their descendants would gradually merge with the Carolingian - based cultures of West Francia . The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century , and it continued to evolve over the succeeding centuries . <eos>
<sos> In what country is Normandy located ? <eos>
<sos> The Normans ( Norman : Nourmands ; French : Normands ; Latin : Normanni ) were the people who in the 10th and 11th centuries gave their name to Normandy , a regio

In [87]:
# Only the last expression of a cell is displayed, so the bare name on the next
# line produces no output; the cell shows the vocabulary size.
quest_idex_list  # contains the words that are present in questions
len(quest_idex_list)

7038

In [88]:
# After the tokens are successfully extracted, we can start embedding.
# Vocabulary sizes, taken from the lengths of the index-to-token lists.
n_story = len(story_idex_list)
n_quest = len(quest_idex_list)
# Width of each embedding vector.
emb_dim = 64

import torch
from torch import nn

# One trainable lookup table per vocabulary, each with `emb_dim`-wide rows.
story_embed = nn.Embedding(num_embeddings=n_story, embedding_dim=emb_dim)
quest_embed = nn.Embedding(num_embeddings=n_quest, embedding_dim=emb_dim)

In [71]:
# Embed the paragraph ids of row 58 (an arbitrarily chosen sample); the result
# has one emb_dim-wide vector per id.
emb_story_1 = story_embed(torch.tensor(morplist[58][0]))
emb_story_1.shape

torch.Size([148, 64])

In [75]:
# Show both embedding modules, one per line.
print(story_embed, quest_embed, sep='\n')

Embedding(19154, 64)
Embedding(7038, 64)


In [77]:
# Question ids of the same sample row (58), including the begin/end markers.
morplist[58][1]

[1, 3785, 246, 10168, 659, 4, 7152, 1509, 2]

In [89]:
# Embed the question ids of row 58; the result has one emb_dim-wide vector per id.
# NOTE(review): the recorded output for morplist[58][1] contains ids such as
# 10168 and 7152, which exceed the recorded question-vocabulary size of 7038 and
# would be out of range for a 7038-row embedding. The saved outputs look like
# they came from different kernel states — restart and run all to confirm.
quest_1 = quest_embed(torch.tensor(morplist[58][1]))
quest_1.shape

torch.Size([9, 64])