In [32]:
import torch
import os
import time
from torch import optim
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torchdata import datapipes as dp
from torchtext.datasets import (
    IMDB,
    DBpedia,
    CC100,
    PennTreebank,
    AG_NEWS,
    YahooAnswers,
    SQuAD2,
    SST2
)
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import tarfile
from functools import partial
from torchtext.prototype.generate import GenerationUtils

In [34]:
sst2_train, sst2_dev, sst2_valid = SST2(split=('train','dev','test'),)

In [37]:
print(list(sst2_dev)[9])
print(list(sst2_train)[9])
print(list(sst2_valid)[9])

("in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank from quirky to jerky to utter turkey .", 0)
("are more deeply thought through than in most ` right-thinking ' films", 1)
('this is junk food cinema at its greasiest .',)


In [17]:
train_sq2, test_sq2 = SQuAD2(split=('train', 'dev'))
task = 'summarize'

In [19]:
list(test_sq2)[0]

('The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'In what country is Normandy located?',
 ['France', 'France', 'France', 'France'],
 [159, 159, 159, 159])

In [21]:
def apply_prefix(task, x):
    """The function removes 2 columns and returns a processed tuple""" 
    return f"{task}: " + x[0], x[1]

In [22]:
train_sq2 = train_sq2.map(partial(apply_prefix, task))  # A partial is created with apply_prefix, and for 
# the next variable, data is taken from pipe and applied
test_sq2 = test_sq2.map(partial(apply_prefix, task))

In [23]:
x = list(train_sq2)[2]
x

('summarize: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 "When did Beyonce leave Destiny's Child and become a solo singer?")

In [24]:
train_sq2_batch = train_sq2.batch(8)
test_sq2_batch = test_sq2.batch(8)

In [26]:
x = list(train_sq2_batch)
x[0]

[('summarize: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
  'When did Beyonce start becoming popular?'),
 ('summarize: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancin

In [29]:
test_sq2_rws = test_sq2_batch.rows2columnar(["explanation", "question"])
train_sq2_rws = train_sq2_batch.rows2columnar(["explanation", "question"])

In [30]:
list(train_sq2_rws)[0]

defaultdict(list,
            {'explanation': ['summarize: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
              'summarize: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing 

In [31]:
test_sq2_dataloader = DataLoader(test_sq2_batch, shuffle=True, batch_size=None)
train_sq2_dataloader = DataLoader(train_sq2_batch, shuffle=True, batch_size=None)

In [16]:
train_sq2_iter = iter(train_sq2_dataloader)
next(train_sq2_iter)  # Converts into the iterator that can be used for training

[["summarize: summarize: Situated on one of the world's largest natural harbors, New York City consists of five boroughs, each of which is a separate county of New York State. The five boroughs – Brooklyn, Queens, Manhattan, the Bronx, and Staten Island – were consolidated into a single city in 1898. With a census-estimated 2014 population of 8,491,079 distributed over a land area of just 305 square miles (790 km2), New York is the most densely populated major city in the United States. As many as 800 languages are spoken in New York, making it the most linguistically diverse city in the world. By 2014 census estimates, the New York City metropolitan region remains by a significant margin the most populous in the United States, as defined by both the Metropolitan Statistical Area (20.1 million residents) and the Combined Statistical Area (23.6 million residents). In 2013, the MSA produced a gross metropolitan product (GMP) of nearly US$1.39 trillion, while in 2012, the CSA generated a 