Work on Multiple datasets, from manipulation, modeling and finally prediction. 

- IMDB

- DBPedia 

- YahooAnswers

In [9]:
import torch
from torch import nn
from torchtext.datasets import IMDB, DBpedia, YahooAnswers, AG_NEWS
from torch.utils.data import DataLoader
import os
import tarfile
from torchdata import datapipes as dp
from torch import optim
import time

In [None]:
torch.load()

In [2]:
import logging
tlog = logging.getLogger('tlog')
tlog.setLevel(logging.INFO)
thndl = logging.StreamHandler()
tform = logging.Formatter('%(levelname)s|%(message)s')
thndl.setFormatter(tform)
tlog.addHandler(thndl)
tlog.info("This is awesome")
tlog.debug("this is nice")

INFO|This is awesome


In [78]:
train_ag, test_ag = AG_NEWS(split=('train', 'test'))

In [79]:
for i, x in enumerate(train_ag):
    print(x)
    if i == 4:
        break

(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")
(3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.')
(3, "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.")
(3, 'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.')
(3, 'Oil prices soar to all-time record, posing new menace t

In [84]:
ag_labels = [lab for lab, _ in train_ag]
set_ag = set(ag_labels)
set_ag

{1, 2, 3, 4}

In [3]:
def get_labels(iter_data):
    labels = [x[0] for x in iter_data]
    set_labels = set(labels)
    return set_labels

In [5]:
# get the datasets first. (IMDB)

train_imdb, test_imdb = IMDB(split=('train','test'))
train_list = list(train_imdb)
print(train_list[0])

(1, 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between

In [6]:
set_imdb_labels = get_labels(test_imdb)
set_imdb_labels

{1, 2}

In [9]:
# get the datasets first. (DBPedia failed, moving to YahooAnswers which also failed)
# Proceeding to work on extracting the files and then creating the datapipes 

In [10]:
# extract the data from the tgz file
# extract the dbpedia folder to get the train and test csv
with tarfile.open('../data/dbpedia_csv.tar.gz',mode='r:gz') as db:
    print(db.members[0].name)
    db.extractall("../data/db_pedia/")  # this will create a folder and place data in it

dbpedia_csv


In [26]:
# extract the yahoo answers folder, to get train and test csv
with tarfile.open('../data/yahoo_answers_csv.tar.gz', mode='r:gz') as yh:
    print(yh.members[0].name)
    yh.extractall("../data/")

yahoo_answers_csv


In [10]:
# extract the yahoo answers folder, to get train and test csv
with tarfile.open('../data/cnn_stories.tgz', mode='r:gz') as yh:
    print(yh.members[0].name)
    yh.extractall("../data/")

./cnn/stories


### A detour to create datapipes from csv file

- List all CSV files in a directory

- Load CSV files

- Parse CSV file and yield rows

- Split our dataset into training and validation sets

In [7]:
# Starting by reading the folder
dbpedia = "../data/db_pedia/dbpedia_csv/"
filepipe = dp.iter.FileLister(dbpedia).filter(lambda file: file.endswith(".csv"))
print(list(filepipe))

['../data/db_pedia/dbpedia_csv/test.csv', '../data/db_pedia/dbpedia_csv/train.csv']


In [11]:
def create_pipe_from_folder(folder_path: str, train_split: float):
    """Returns train and test pipe from the folders containing csv files"""
    filepipe = dp.iter.FileLister(folder_path).filter(lambda file: file.endswith('.csv'))
    tlog.info(list(filepipe))
    readpipe = dp.iter.FileOpener(filepipe, mode='rt', encoding='utf-8')
    csvpipe = readpipe.parse_csv(delimiter=',')
    tot_length = len(list(csvpipe))
    tlog.info(tot_length)
    test_split = 1 - train_split  # get the train split
    train_pipe, test_pipe = csvpipe.random_split(total_length=tot_length,
                                                 weights={
                                                     "train":train_split,
                                                     "test":test_split
                                                 }, seed=1234)
    tlog.info("Returning pipes...")                                                 
    return train_pipe, test_pipe

In [8]:
readpipe = dp.iter.FileOpener(filepipe, mode='rt',encoding='utf-8')  # mode rt is read in text mode
csvpipe = readpipe.parse_csv(delimiter=',')
csv_list = list(csvpipe)
len(csv_list)
db_train_pipe, db_test_pipe = csvpipe.random_split(total_length=630000,  # ensure the correct value is provided
                                                    weights={"train": 0.75,
                                                             "test": 0.25},
                                                    seed=1234)
db_train_list = list(db_train_pipe)
len(db_train_list)

472500

In [47]:
for x in db_train_pipe:
    print(x)
    break

# the data to be tokenised is present in the 2nd idx. 

['1', 'Odd Lot Entertainment', " OddLot Entertainment founded in 2001 by longtime producers Gigi Pritzker and Deborah Del Prete (The Wedding Planner) is a film production and financing company based in Culver City California.OddLot produced the film version of Orson Scott Card's sci-fi novel Ender's Game. A film version of this novel had been in the works in one form or another for more than a decade by the time of its release."]


In [9]:
set_db_labels = get_labels(db_train_pipe)
set_db_labels

{'1', '10', '11', '12', '13', '14', '2', '3', '4', '5', '6', '7', '8', '9'}

In [12]:
yh_train_pipe, yh_test_pipe = create_pipe_from_folder(folder_path='../data/yahoo_answers_csv/',
                                                      train_split=0.75)

INFO|['../data/yahoo_answers_csv/test.csv', '../data/yahoo_answers_csv/train.csv']
INFO|1460000
INFO|Returning pipes...


In [13]:
for x in yh_train_pipe:
    print(x)
    break

# the data to be tokenised is present in the 2nd idx. 

['2', 'Why does Zebras have stripes?', 'What is the purpose or those stripes? Who do they serve the Zebras in the wild life?', 'this provides camouflage - predator vision is such that it is usually difficult for them to see complex patterns']


In [14]:
set_yh_label = get_labels(yh_train_pipe)
set_yh_label

{'1', '10', '2', '3', '4', '5', '6', '7', '8', '9'}

In [15]:
# Starting the tokenisation process with IMDB first

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")
# Other tokenizers available are en, spacy, revtok, toktok, spacy, moses, subword

In [41]:
spacy_tokeniser = get_tokenizer("spacy")



In [42]:
spacy_tokeniser("This is spacy example")

['This', 'is', 'spacy', 'example']

In [17]:
def yield_tokens(data_iter):
    # there is class in the 1st element, which is discarded
    for _, text in data_iter:
        yield tokenizer(text)

def yield_tokens_other(data_iter):
    # there is class and labels in 0 and 1st idx
    for _, _, text in data_iter:
        yield tokenizer(text)

def yield_tokens_yahoo(data_iter):
    # there is class and labels in 0 and 1st idx
    for _, _, _, text in data_iter:
        yield tokenizer(text)

In [18]:
for x in yield_tokens_other(db_train_pipe):
    print(x)
    break

['oddlot', 'entertainment', 'founded', 'in', '2001', 'by', 'longtime', 'producers', 'gigi', 'pritzker', 'and', 'deborah', 'del', 'prete', '(', 'the', 'wedding', 'planner', ')', 'is', 'a', 'film', 'production', 'and', 'financing', 'company', 'based', 'in', 'culver', 'city', 'california', '.', 'oddlot', 'produced', 'the', 'film', 'version', 'of', 'orson', 'scott', 'card', "'", 's', 'sci-fi', 'novel', 'ender', "'", 's', 'game', '.', 'a', 'film', 'version', 'of', 'this', 'novel', 'had', 'been', 'in', 'the', 'works', 'in', 'one', 'form', 'or', 'another', 'for', 'more', 'than', 'a', 'decade', 'by', 'the', 'time', 'of', 'its', 'release', '.']


In [22]:
imdb_vocab = build_vocab_from_iterator(yield_tokens(train_imdb), specials=['<unk>'])
imdb_vocab.set_default_index(imdb_vocab['<unk>'])

In [23]:
imdb_vocab(tokenizer("there is a lot"))

[42, 9, 4, 192]

In [19]:
# build vocab for dbpedia
db_vocab = build_vocab_from_iterator(yield_tokens_other(db_train_pipe), specials=['<unk>'])
db_vocab.set_default_index(db_vocab['<unk>'])

In [20]:
yh_vocab = build_vocab_from_iterator(yield_tokens_yahoo(yh_train_pipe), special_first=['<unk>'])
yh_vocab.set_default_index(db_vocab['<unk>'])

In [21]:
# imdb text pipeline to convert the sentence into indices of the vocabulary
imdb_text_pipeline = lambda x: imdb_vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1
# db text pipeline to convert the sentence into indices of the vocabulary
db_text_pipeline = lambda x: db_vocab(tokenizer(x))
# yahoo text pipeline to convert the sentence into indices of the vocabulary
yh_text_pipeline = lambda x: yh_vocab(tokenizer(x))

In [51]:
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

def collate_imdb(batch):
    "batches of lables and text returned as label and text lists"
    label_list, text_list, offsets = [], [], [0]
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        process_text = torch.tensor(imdb_text_pipeline(_text), dtype=torch.int64)
        text_list.append(process_text)
        offsets.append(process_text.size(0))

    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

train_imdb_loader = DataLoader(
    train_imdb, batch_size=8, shuffle=False, collate_fn=collate_imdb
)

test_imdb_loader = DataLoader(
    test_imdb, batch_size=8, shuffle=False, collate_fn=collate_imdb
)


In [42]:
num_class_imdb = len(set_imdb_labels)
num_class_imdb

2

In [None]:
imdb_vocab_len = len(imdb_vocab)
imdb_vocab_len

In [36]:
from torch import nn


class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)  # intialize embedding bag obj
        self.fc = nn.Linear(embed_dim, num_class)  # create Linear model
        self.init_weights()  # init_weights method is defined below

    def init_weights(self):
        # assigning weights uniformly
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        # model embeds the text
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [None]:
model_imdb = TextClassificationModel(imdb_vocab_len, 64, num_class_imdb)
model_imdb = model_imdb.to(device=device)

In [None]:
EPOCHS = 10
LR = 5
BATCH = 64

criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model_imdb.parameters(), lr=LR)
scheduler = optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

In [49]:
import time

def train(model, dataloader):
    tlog.info('Entering into Train')
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500  # logging interval
    start_time = time.time()
    # starting the for loop for training
    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)  # based on forward pass
        loss = criterion(predicted_label, label)  # getting the loss
        loss.backward()  # back propagation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        # torch.nn.utils.clip_grad_norm_ performs gradient clipping. It is used to mitigate the problem of exploding gradients, which is of particular concern for recurrent networks (which LSTMs are a type of).
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "epoch {:3d}"
                "| accuracy {:8.3f}".format(
                    epoch, total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()

In [48]:
def evaluate(model, dataloader):
    tlog.info('Entering into Evaluate')
    model_imdb.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)  # predict without calculating grad
            loss = criterion(predicted_label, label)  # check the loss
            total_acc += (predicted_label.argmax(1) == label).sum().item()  # get accuracy
            total_count += label.size(0)  # count the total
    return total_acc / total_count

In [40]:
for epoch in range(1, EPOCHS + 1):
    epoch_st = time.time()
    train(model_imdb, train_imdb_loader)
    accu_val = evaluate(model_imdb, test_imdb_loader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print("-" * 59)
    print(f"End of Epoch:{epoch:.3f} | time: {time.time()} | accuracy: {accu_val}")

INFO|Entering into Train


epoch   1| accuracy    0.872
epoch   1| accuracy    0.885


INFO|Entering into Evaluate


epoch   1| accuracy    0.868


INFO|Entering into Train


-----------------------------------------------------------
End of Epoch:1.000 | time: 1706177418.1865578 | accuracy: 0.501
epoch   2| accuracy    0.872
epoch   2| accuracy    0.885


INFO|Entering into Evaluate


epoch   2| accuracy    0.868


INFO|Entering into Train


-----------------------------------------------------------
End of Epoch:2.000 | time: 1706177422.0654423 | accuracy: 0.501
epoch   3| accuracy    0.872
epoch   3| accuracy    0.885


INFO|Entering into Evaluate


epoch   3| accuracy    0.868


INFO|Entering into Train


-----------------------------------------------------------
End of Epoch:3.000 | time: 1706177425.919627 | accuracy: 0.501
epoch   4| accuracy    0.872
epoch   4| accuracy    0.885


INFO|Entering into Evaluate


epoch   4| accuracy    0.868


INFO|Entering into Train


-----------------------------------------------------------
End of Epoch:4.000 | time: 1706177429.8609402 | accuracy: 0.501
epoch   5| accuracy    0.872
epoch   5| accuracy    0.885


INFO|Entering into Evaluate


epoch   5| accuracy    0.868


INFO|Entering into Train


-----------------------------------------------------------
End of Epoch:5.000 | time: 1706177433.8397303 | accuracy: 0.501
epoch   6| accuracy    0.872
epoch   6| accuracy    0.885


INFO|Entering into Evaluate


epoch   6| accuracy    0.868


INFO|Entering into Train


-----------------------------------------------------------
End of Epoch:6.000 | time: 1706177437.803828 | accuracy: 0.501
epoch   7| accuracy    0.872
epoch   7| accuracy    0.885


INFO|Entering into Evaluate


epoch   7| accuracy    0.868


INFO|Entering into Train


-----------------------------------------------------------
End of Epoch:7.000 | time: 1706177441.7510226 | accuracy: 0.501
epoch   8| accuracy    0.872
epoch   8| accuracy    0.885


INFO|Entering into Evaluate


epoch   8| accuracy    0.868


INFO|Entering into Train


-----------------------------------------------------------
End of Epoch:8.000 | time: 1706177445.7017293 | accuracy: 0.501
epoch   9| accuracy    0.872
epoch   9| accuracy    0.885


INFO|Entering into Evaluate


epoch   9| accuracy    0.868


INFO|Entering into Train


-----------------------------------------------------------
End of Epoch:9.000 | time: 1706177449.663182 | accuracy: 0.501
epoch  10| accuracy    0.872
epoch  10| accuracy    0.885


INFO|Entering into Evaluate


epoch  10| accuracy    0.868
-----------------------------------------------------------
End of Epoch:10.000 | time: 1706177453.6257198 | accuracy: 0.501


In [44]:
save_model = "/home/kamal/gitfolders/pytorch_hardway/data/model_archive/imbd_model.pt"
imdb_model_cp = {
    "epoch": EPOCHS,
    "model": model_imdb.state_dict(),
    "optimizer": optimizer.state_dict()
}
torch.save(imdb_model_cp, save_model)

In [50]:
# infering classification from the sentence, using the given model

trial = list(test_imdb)[10][1]

In [58]:
# We have to tokenize, vectorize and then send to the model
trial_token = imdb_text_pipeline(trial)
trial_token = torch.tensor(trial_token, dtype=torch.int64, device=device)
offset = torch.tensor([0], device=device)
with torch.no_grad():
    pred = model_imdb(trial_token, offset)

print(pred)

tensor([[0.1476, 0.0676]], device='cuda:0')


In [22]:
def classification_inference(sentence, tokeniser_pipeline, model):
    tokens = tokeniser_pipeline(sentence)
    tensor_tokens = torch.tensor(tokens,
                                 dtype=torch.int64,
                                 device=device)
    offset = torch.tensor([0], device=device)
    with torch.no_grad():
        pred = model(tokens, offset)

    return pred

In [30]:
from typing import List, Callable

def collate_db(batch: List):
    """Batches of lables and text returned as label and text lists"""
    label_list, text_list, offsets = [], [], [0]
    for point in batch:
        label_list.append(label_pipeline(point[0]))  # labels are in the first element
        process_text = torch.tensor(db_text_pipeline(point[-1]), dtype=torch.int64)
        # text are in the last element in the datasets
        text_list.append(process_text)
        offsets.append(process_text.size(0))

    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [31]:
db_test_loader = DataLoader(
    dataset=db_test_pipe, batch_size=8, shuffle=False, collate_fn=collate_db
)

db_train_loader = DataLoader(
    dataset=db_train_pipe, batch_size=8, shuffle=False, collate_fn=collate_db
)

In [55]:
def collate_yh(batch: List):
    """Batches of lables and text returned as label and text lists
    and takes care the model recieves standard data input"""
    label_list, text_list, offsets = [], [], [0]
    for point in batch:
        label_list.append(label_pipeline(point[0]))  # labels are in the first element
        process_text = torch.tensor(yh_text_pipeline(point[-1]), dtype=torch.int64)
        # text are in the last element in the datasets
        text_list.append(process_text)
        offsets.append(process_text.size(0))

    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [33]:
yh_test_loader = DataLoader(
    dataset=yh_test_pipe, batch_size=8, shuffle=False, collate_fn=collate_yh
)

yh_train_loader = DataLoader(
    dataset=yh_train_pipe, batch_size=8, shuffle=False, collate_fn=collate_yh
)

In [37]:
db_vocab_size = len(db_vocab)
db_num_class = len(set_db_labels)

model_db = TextClassificationModel(db_vocab_size, 64, db_num_class)
model_db = model_db.to(device=device)

In [None]:
EPOCHS = 10
LR = 5
BATCH = 64
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model_db.parameters(), lr=LR)
scheduler = optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None 
    
for epoch in range(1, EPOCHS + 1):
    epoch_st = time.time()
    train(model_db, db_train_loader)
    accu_val = evaluate(model_db, db_test_loader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print("-" * 59)
    print(f"End of Epoch:{epoch:.3f} | time: {time.time()} | accuracy: {accu_val}")

In [38]:
yh_vocab_size = len(yh_vocab)
yh_num_classes = len(set_yh_label)
model_yh = TextClassificationModel(yh_vocab_size, 64, yh_num_classes)
model_yh = model_yh.to(device=device)

In [None]:
EPOCHS = 10
LR = 5
BATCH = 64
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model_yh.parameters(), lr=LR)
scheduler = optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None 
    
for epoch in range(1, EPOCHS + 1):
    epoch_st = time.time()
    train(model_yh, yh_train_loader)
    accu_val = evaluate(model_yh, yh_test_loader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print("-" * 59)
    print(f"End of Epoch:{epoch:.3f} | time: {time.time()} | accuracy: {accu_val}")

In [1]:
from torchtext.datasets import CNNDM, SQuAD2

squad_train, squad_test = SQuAD2(split=('train', 'dev'))

In [6]:
len(list(squad_test))

11873

In [3]:
# CNN data_source
cnndm_train, cnndm_val, cnndm_test = CNNDM(split=('train', 'val', 'test'))