In [73]:
from transformers import AutoTokenizer
from datasets import load_dataset, concatenate_datasets, ReadInstruction
import numpy as np
import torch

In [4]:
from config import config

In [None]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")

## 1. Amazon Review

In [5]:
domains = ['Kitchen_v1_00', 'Electronics_v1_00', 'Books_v1_01', 'Video_DVD_v1_00']

## 2. IMDB-SST2 Dataset

In [96]:

def imdb_sst2_loaders(config, tokenizer):

    """
        We have to ensure that sample size as well label distribution remains uniform across the distribution and set. 
    """

    sst2 = load_dataset("toriving/sst2") # sst2 has train, valid and test. We're mering test and valid set into test set
    imdb = load_dataset("imdb")

    sst2_train = sst2['train'].shuffle()
    sst2_test = concatenate_datasets([sst2['validation'], sst2['test']]).shuffle()

    imdb_train = imdb['train'].shuffle()
    imdb_test = imdb['test'].shuffle()


    train_label_values = []
    test_label_values = []

    labels = np.unique(sst2_train['label']).tolist()

    for label in labels: # assuming that there's no label shift 

        # min number of samples of label in  both dataset  in train dataset
        train_min = min(len(sst2_train.filter(lambda example: example['label'] == int(label))), len(imdb_train.filter(lambda example: example['label'] == int(label))))
        
        train_label_values.append(train_min)

        # min number of samples of label in  both dataset  in test dataset
        test_min = min(len(sst2_test.filter(lambda example: example['label'] == int(label))), len(imdb_test.filter(lambda example: example['label'] == int(label))))
        test_label_values.append(test_min)
    

    train_label_dist = min(train_label_values)
    
    test_label_dist = min(test_label_values)


    ## 
    dsets = {

        "sst2":{
            "train":[],
            "test":[]
        },

        "imdb":{
            "train":[],
            "test":[]
        },
    }


    for label in labels:

        sst2_train_label = sst2_train.shuffle().filter(lambda example: example['label']==int(label)).select(range(train_label_dist))

        sst2_test_label = sst2_test.shuffle().filter(lambda example: example['label']==int(label)).select(range(test_label_dist))

        imdb_train_label = imdb_train.shuffle().filter(lambda example: example['label']==int(label)).select(range(train_label_dist))

        imdb_test_label = imdb_test.shuffle().filter(lambda example: example['label']==int(label)).select(range(test_label_dist))


        dsets['sst2']['train'].append(sst2_train_label)
        dsets['sst2']['test'].append(sst2_test_label)

        dsets['imdb']['train'].append(imdb_train_label)
        dsets['imdb']['test'].append(imdb_test_label)
        

    ## split the data based on sample distribution as well as label distribution
    sst2_train = concatenate_datasets(dsets=dsets['sst2']['train']).shuffle()
    sst2_test = concatenate_datasets(dsets=dsets['sst2']['test']).shuffle()

    imdb_train = concatenate_datasets(dsets=dsets['imdb']['train']).shuffle()
    imdb_test = concatenate_datasets(dsets=dsets['imdb']['test']).shuffle()


    # tokenize the dataset

    # this can be done with loop but who cares 

    # sst2
    # train
    sst2_train_tokenized = sst2_train.map(lambda x: tokenizer(x['text'], padding='max_length', truncation=True, max_length=config['max_seq_length']), batched=True)
    sst2_train_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    # test
    sst2_test_tokenized = sst2_test.map(lambda x: tokenizer(x['text'], padding='max_length', truncation=True, max_length=config['max_seq_length']), batched=True)
    sst2_test_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


    # imdb
    # train
    imdb_train_tokenized = imdb_train.map(lambda x: tokenizer(x['text'], padding='max_length', truncation=True, max_length=config['max_seq_length']), batched=True)
    imdb_train_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # test
    imdb_test_tokenized = imdb_test.map(lambda x: tokenizer(x['text'], padding='max_length', truncation=True, max_length=config['max_seq_length']), batched=True)
    imdb_test_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


    # print(sst2_train_tokenized, sst2_test_tokenized, imdb_train_tokenized, imdb_test_tokenized)

    sst2_trainloader = torch.utils.data.DataLoader(dataset = sst2_train_tokenized, batch_size=config["batch_size"], shuffle=True, num_workers=4)

    sst2_testloader = torch.utils.data.DataLoader(dataset = sst2_test_tokenized, batch_size=config["batch_size"], shuffle=False, num_workers=4)

    imdb_trainloader = torch.utils.data.DataLoader(dataset = imdb_train_tokenized, batch_size=config["batch_size"], shuffle=True, num_workers=4)

    imdb_testloader = torch.utils.data.DataLoader(dataset = imdb_test_tokenized, batch_size=config["batch_size"], shuffle=False, num_workers=4)

    return {
        "sst2":{
            "train":sst2_trainloader,
            "valid":sst2_testloader,
            "test":sst2_testloader,
        },
        "imdb":{
            "train":imdb_trainloader,
            "valid":imdb_testloader,
            "test":imdb_testloader
        }
    }


In [97]:
loaders = imdb_sst2_loaders(config=config['tasks']['imdb_sst2_sa'], tokenizer=tokenizer)

Using custom data configuration default
Reusing dataset ss_t2 (/home/macab/.cache/huggingface/datasets/ss_t2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa)
Reusing dataset imdb (/home/macab/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)
100%|██████████| 7/7 [00:00<00:00, 35.05ba/s]
100%|██████████| 25/25 [00:00<00:00, 32.82ba/s]
100%|██████████| 3/3 [00:00<00:00, 58.81ba/s]
100%|██████████| 25/25 [00:00<00:00, 31.54ba/s]
100%|██████████| 7/7 [00:00<00:00, 34.59ba/s]
100%|██████████| 25/25 [00:00<00:00, 32.49ba/s]
100%|██████████| 3/3 [00:00<00:00, 59.12ba/s]
100%|██████████| 25/25 [00:00<00:00, 32.61ba/s]
100%|██████████| 7/7 [00:00<00:00, 35.17ba/s]
100%|██████████| 3/3 [00:00<00:00, 37.41ba/s]
100%|██████████| 25/25 [00:00<00:00, 32.67ba/s]
100%|██████████| 25/25 [00:00<00:00, 32.24ba/s]
100%|██████████| 7/7 [00:00<00:00, 34.56ba/s]
100%|██████████| 3/3 [00:00<00:00, 36.28ba/s]
100%|

In [102]:
for domain in loaders:
    print(domain, len(loaders[domain]['train']), len(loaders[domain]['test']), len(loaders[domain]['valid']))

sst2 828 335 335
imdb 828 335 335


## 3. MNLI

In [7]:
from config import config
from datasets import load_dataset, concatenate_datasets
import numpy as np
import torch
from transformers import AutoTokenizer

In [2]:
task = 'mnli'

In [3]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")

In [8]:
def mnli_loaders(config, tokenizer):

    domains = config['domains']

    train = 'train'
    test = 'validation_matched'

    # load the dataset
    dataset = load_dataset("multi_nli")
    dataset = dataset.remove_columns(['pairID', 'promptID', 'premise_binary_parse', 'premise_parse', 'hypothesis_binary_parse', 'hypothesis_parse']) # remove the unrelated fields

    labels = set(dataset['train']['label'])

    # which label has least number of samples in train data as well as (valid)validation data in all domains
    train_label_dist = 25000 # manually checked 
    test_label_dist = 600 # manually checked 
 

    domain_dsets = {}
    for domain in domains:

        domain_dsets[domain] = {
            "train":[]
        }
        domain_dsets[domain].update({"test":[]})

    
    for label in labels:

        for domain in domains:

            train = dataset['train'].filter(lambda example:example['genre']==domain).filter(lambda example:example['label']==label).select(range(train_label_dist))
            test = dataset['validation_matched'].filter(lambda example:example['genre']==domain).filter(lambda example:example['label']==label).select(range(test_label_dist))

            domain_dsets[domain]['train'].append(train)
            domain_dsets[domain]['test'].append(test)
    

    # concatenate the dataset and shuffle them
    # before it would be list of datasets and after it will be single dataset
    for domain in domains:

        # concate class label datasets
        domain_dsets[domain]['train'] = concatenate_datasets(dsets=domain_dsets[domain]['train']).shuffle()
        domain_dsets[domain]['test'] = concatenate_datasets(dsets=domain_dsets[domain]['test']).shuffle()


        # # tokenize 
        domain_dsets[domain]['train'] = domain_dsets[domain]['train'].map(lambda x: tokenizer(x['premise'], x['hypothesis'], padding='max_length', truncation=True, max_length=config['max_seq_length']), batched=True)
        domain_dsets[domain]['test'] = domain_dsets[domain]['test'].map(lambda x: tokenizer(x['premise'], x['hypothesis'], padding='max_length', truncation=True, max_length=config['max_seq_length']), batched=True)
        


        # change the dtype 
        domain_dsets[domain]['train'].set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
        domain_dsets[domain]['test'].set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

        # create dataloaders
        domain_dsets[domain]['train'] = torch.utils.data.DataLoader(dataset = domain_dsets[domain]['train'], batch_size=config["batch_size"], shuffle=True, num_workers=4)
        domain_dsets[domain]['test'] = torch.utils.data.DataLoader(dataset = domain_dsets[domain]['test'], batch_size=config["batch_size"], shuffle=True, num_workers=4)
        domain_dsets[domain]['valid'] = domain_dsets[domain]['test'] # validation and test will be same
        
        


    # why the hell I am doing this?
    loaders = domain_dsets
    

    return loaders

In [9]:
loaders = mnli_loaders(config=config['tasks'][task], tokenizer=tokenizer)

Using custom data configuration default
Reusing dataset multi_nli (/home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-5fac4257da3c539c.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-0c1db9bb01906573.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-9166034fb95d402f.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-898cee5d2d382de5.arrow
Loading cached processed da

In [123]:
data = dsets['slate']['train'].map(lambda x: tokenizer(x['premise'], x['hypothesis'], padding='max_length', truncation=True, max_length=config['max_seq_length']), batched=True)

100%|██████████| 75/75 [00:45<00:00,  1.66ba/s]


In [10]:
for domain in loaders:
    print(domain, len(loaders[domain]['train']), len(loaders[domain]['test']), len(loaders[domain]['valid']))

government 2344 57 57
telephone 2344 57 57
fiction 2344 57 57
travel 2344 57 57
slate 2344 57 57


## 4. Paraphrase Dataset

In [2]:
from config import config
from datasets import load_dataset, concatenate_datasets
import numpy as np
import torch
from transformers import AutoTokenizer
from tqdm.notebook import tqdm

In [3]:
task="paraphrase"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")

In [41]:
def paraphrase_loaders(config, tokenizer, max_len=256):

    domains = config['domains']

    # which label has least number of samples in train data as well as (test)validation data in all domains
    train_label_dist = 21829 # manually checked 
    test_label_dist = 7075 # manually checked 


    # we are  not going to take words greater than 256
    paws = load_dataset("paws", 'labeled_final')
    qqp = load_dataset("glue", 'qqp')


    # # If you want to filter the data based on length | no filtering in actul experiment
    for _, (paws_set, qqp_set) in enumerate(zip(paws.keys(), qqp.keys())):

        # # applying filter
        # paws[paws_set] = paws[paws_set].filter(lambda example : (len(example['sentence1'])+len(example['sentence2']))<=max_len)
        # qqp[qqp_set] = qqp[qqp_set].filter(lambda example : (len(example['question1'])+len(example['question2']))<=max_len)

        # both paws and qqp has difference names for 2 input sentences 
        # paws = (sentence1, sentence2) and qqp = (question1, question2) update qqp col to match paws

        qqp[qqp_set] = qqp[qqp_set].rename_column('question1', 'sentence1') 
        qqp[qqp_set] = qqp[qqp_set].rename_column('question2', 'sentence2')


    # # merge the validation and test of both datasets
    paws['test'] = concatenate_datasets(dsets=[paws['test'], paws['validation']])
    qqp['test'] = concatenate_datasets(dsets=[qqp['test'], qqp['validation']])

    datasets = {
        "paws":paws,
        "qqp":qqp
    }

    labels = list(set(qqp['train']['label']))

    domain_dsets = {}
    for domain in domains:

        domain_dsets[domain] = {
            "train":[]
        }
        domain_dsets[domain].update({"test":[]})

    # take equal numbe of samples for each domain for each label for each set
    for label in labels:
        
        for domain in domain_dsets:

            train = datasets[domain]['train'].filter(lambda example:example['label']==label).shuffle().select(range(train_label_dist))
            test = datasets[domain]['test'].filter(lambda example:example['label']==label).shuffle().select(range(test_label_dist))

            domain_dsets[domain]['train'].append(train)
            domain_dsets[domain]['test'].append(test)





    # concatenate the dataset and shuffle them
    # before it would be list of datasets and after it will be single dataset
    for domain in domains:

        # concate class label datasets
        domain_dsets[domain]['train'] = concatenate_datasets(dsets=domain_dsets[domain]['train']).shuffle()
        domain_dsets[domain]['test'] = concatenate_datasets(dsets=domain_dsets[domain]['test']).shuffle()


        # # tokenize 
        domain_dsets[domain]['train'] = domain_dsets[domain]['train'].map(lambda x: tokenizer(x['sentence1'], x['sentence2'], padding='max_length', truncation=True, max_length=config['max_seq_length']), batched=True)
        domain_dsets[domain]['test'] = domain_dsets[domain]['test'].map(lambda x: tokenizer(x['sentence1'], x['sentence2'], padding='max_length', truncation=True, max_length=config['max_seq_length']), batched=True)
        


        # change the dtype 
        domain_dsets[domain]['train'].set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
        domain_dsets[domain]['test'].set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

        # create dataloaders
        domain_dsets[domain]['train'] = torch.utils.data.DataLoader(dataset = domain_dsets[domain]['train'], batch_size=config["batch_size"], shuffle=True, num_workers=4)
        domain_dsets[domain]['test'] = torch.utils.data.DataLoader(dataset = domain_dsets[domain]['test'], batch_size=config["batch_size"], shuffle=True, num_workers=4)
        domain_dsets[domain]['valid'] = domain_dsets[domain]['test'] # validation and test will be same
        
        


    # why the hell I am doing this?
    loaders = domain_dsets
    

    return loaders



In [42]:
loaders = paraphrase_loaders(config=config['tasks'][task], tokenizer=tokenizer)

Reusing dataset paws (/home/macab/.cache/huggingface/datasets/paws/labeled_final/1.1.0/09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34)
Reusing dataset glue (/home/macab/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 364/364 [00:08<00:00, 43.55ba/s]
100%|██████████| 432/432 [00:08<00:00, 49.26ba/s]
100%|██████████| 50/50 [00:01<00:00, 48.48ba/s]
100%|██████████| 16/16 [00:00<00:00, 41.63ba/s]
100%|██████████| 364/364 [00:07<00:00, 47.04ba/s]
100%|██████████| 432/432 [00:08<00:00, 49.18ba/s]
100%|██████████| 50/50 [00:01<00:00, 46.91ba/s]
100%|██████████| 16/16 [00:00<00:00, 48.24ba/s]
100%|██████████| 44/44 [00:21<00:00,  2.04ba/s]
100%|██████████| 15/15 [00:06<00:00,  2.19ba/s]
100%|██████████| 44/44 [00:35<00:00,  1.23ba/s]
100%|██████████| 15/15 [00:11<00:00,  1.28ba/s]


In [43]:
for domain in loaders:
    print(domain, len(loaders[domain]['train']), len(loaders[domain]['test']), len(loaders[domain]['valid']))

qqp 1365 443 443
paws 1365 443 443


In [26]:
paws, qqp = paraphrase_loaders(config=config['tasks'][task], tokenizer=tokenizer)

Reusing dataset paws (/home/macab/.cache/huggingface/datasets/paws/labeled_final/1.1.0/09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34)
Reusing dataset glue (/home/macab/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [45]:
# 

In [31]:
# qqp

In [35]:
labels = list(set(qqp['train']['label']))

In [36]:
labels

[0, 1]

In [29]:
domain_dsets`

{'qqp': {'train': [], 'test': []}, 'paws': {'train': [], 'test': []}}

In [11]:
domain_dsets = paraphrase_loaders(config=config['tasks'][task], tokenizer=tokenizer)

In [12]:
domain_dsets

{'qqp': {'train': [], 'test': []}, 'paws': {'train': [], 'test': []}}

In [19]:
np.unique(paws['train']['label'], return_counts=True)

(array([0, 1]), array([27572, 21829]))

In [20]:
np.unique(concatenate_datasets(dsets=[paws['test'], paws['validation']])['label'], return_counts=True)

(array([0, 1]), array([8925, 7075]))

In [21]:
concatenate_datasets?

[0;31mSignature:[0m
[0mconcatenate_datasets[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdsets[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mdatasets[0m[0;34m.[0m[0marrow_dataset[0m[0;34m.[0m[0mDataset[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minfo[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mAny[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msplit[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mAny[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Converts a list of :class:`Dataset` with the same schema into a single :class:`Dataset`.

Args:
    dsets (:obj:`List[datasets.Dataset]`): List of Datasets to concatenate.
    info (:class:`DatasetInfo`, optional): Dataset information,

In [16]:
qqp = load_dataset("glue", 'qqp')
paws = load_dataset("paws", 'labeled_final')

Reusing dataset glue (/home/macab/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset paws (/home/macab/.cache/huggingface/datasets/paws/labeled_final/1.1.0/09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34)


In [18]:
paws

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
})

In [139]:
qqp

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 363846
    })
    validation: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 40430
    })
    test: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 390965
    })
})

In [140]:
paws

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
})

In [141]:
for _, (paws_set, qqp_set) in enumerate(zip(paws.keys(), qqp.keys())):
    print(paws_set, qqp_set)

train train
test validation
validation test


In [138]:
print(len(paws['train'].filter(lambda example : (len(example['sentence1'])+len(example['sentence2']))<=256)), len(paws['test'].filter(lambda example : (len(example['sentence1'])+len(example['sentence2']))<=256)), len(paws['validation'].filter(lambda example : (len(example['sentence1'])+len(example['sentence2']))<=256)))

100%|██████████| 50/50 [00:01<00:00, 45.38ba/s]
100%|██████████| 8/8 [00:00<00:00, 45.87ba/s]
100%|██████████| 8/8 [00:00<00:00, 47.58ba/s]32676 5278 5418



In [137]:
print(len(qqp['train'].filter(lambda example : (len(example['question1'])+len(example['question2']))<=256)), len(qqp['test'].filter(lambda example : (len(example['question1'])+len(example['question2']))<=256)), len(qqp['validation'].filter(lambda example : (len(example['question1'])+len(example['question2']))<=256)))

100%|██████████| 364/364 [00:07<00:00, 46.61ba/s]
100%|██████████| 391/391 [00:08<00:00, 45.49ba/s]
100%|██████████| 41/41 [00:00<00:00, 46.21ba/s]243752 257088 26991



In [46]:
dataset['train']['questions'][0]['text'][0]

'What is the step by step guide to invest in share market in india?'

In [85]:
paws = load_dataset("paws", 'labeled_final')

Reusing dataset paws (/home/macab/.cache/huggingface/datasets/paws/labeled_final/1.1.0/09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34)


In [57]:
for set in paws.keys():
    print(set, np.unique(paws[set]['label'], return_counts=True))

train (array([0, 1]), array([27572, 21829]))
test (array([0, 1]), array([4464, 3536]))
validation (array([0, 1]), array([4461, 3539]))


dict_keys(['train', 'test', 'validation'])

In [58]:
qqp = load_dataset("glue", 'qqp')

Downloading and preparing dataset glue/qqp (download: 39.76 MiB, generated: 106.55 MiB, post-processed: Unknown size, total: 146.32 MiB) to /home/macab/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...
Downloading: 100%|██████████| 41.7M/41.7M [00:06<00:00, 6.46MB/s]
Dataset glue downloaded and prepared to /home/macab/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


In [66]:
qqp = load_dataset("glue", 'qqp')

Reusing dataset glue (/home/macab/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [98]:
qqp['train'] = qqp['train'].filter(lambda example : (len(example['question1'])+len(example['question2']))<=128)

100%|██████████| 364/364 [00:08<00:00, 44.60ba/s]


In [99]:
qqp

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 243752
    })
    validation: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 40430
    })
    test: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 390965
    })
})

In [100]:
ls_1 = []
ls_2 = []


for _, (q1, q2) in tqdm(enumerate(zip(qqp['train']['question1'], qqp['train']['question2']))):

    ls_1.append(len(q2))
    ls_2.append(len(q2))
    # print(q1, q2)
    # break

0it [00:00, ?it/s]

In [101]:
print(len(ls_1), len(ls_2))

243752 243752


In [102]:
print(min(ls_1), min(ls_2))

1 1


In [103]:
print(max(ls_1), max(ls_2))

122 122


In [104]:
print(sum(ls_1)/len(ls_1), sum(ls_2)/len(ls_2))

44.41006022514687 44.41006022514687


In [114]:
paws = load_dataset("paws", 'labeled_final')

Reusing dataset paws (/home/macab/.cache/huggingface/datasets/paws/labeled_final/1.1.0/09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34)


In [115]:
paws['train'] = paws['train'].filter(lambda example : (len(example['sentence1'])+len(example['sentence2']))<=250)
paws['test'] = paws['test'].filter(lambda example : (len(example['sentence1'])+len(example['sentence2']))<=250)
paws['validation'] = paws['validation'].filter(lambda example : (len(example['sentence1'])+len(example['sentence2']))<=250)

100%|██████████| 50/50 [00:01<00:00, 43.78ba/s]
100%|██████████| 8/8 [00:00<00:00, 43.21ba/s]
100%|██████████| 8/8 [00:00<00:00, 45.03ba/s]


In [116]:
paws

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 31025
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 5024
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 5212
    })
})

In [88]:
ls_1 = []
ls_2 = []



for _, (q1, q2) in tqdm(enumerate(zip(paws['train']['sentence1'], paws['train']['sentence2']))):

    ls_1.append(len(q2))
    ls_2.append(len(q2))
    # print(q1, q2)
    # break

0it [00:00, ?it/s]

In [89]:
print(len(ls_1), len(ls_2))

49401 49401


In [90]:
print(min(ls_1), min(ls_2))

6 6


In [91]:
print(max(ls_1), max(ls_2))

225 225


In [92]:
print(sum(ls_1)/len(ls_1), sum(ls_2)/len(ls_2))

113.75154349102245 113.75154349102245


DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 363846
    })
    validation: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 40430
    })
    test: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 390965
    })
})

In [118]:
mnli = load_dataset("glue", 'mnli')

Reusing dataset glue (/home/macab/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [119]:
mnli

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [124]:
mnli['train'] = mnli['train'].filter(lambda example : (len(example['premise'])+len(example['hypothesis']))<=128)

100%|██████████| 393/393 [00:08<00:00, 45.52ba/s]


In [125]:
ls_1 = []
ls_2 = []



for _, (q1, q2) in tqdm(enumerate(zip(mnli['train']['premise'], mnli['train']['hypothesis']))):

    ls_1.append(len(q2))
    ls_2.append(len(q2))
    # print(q1, q2)
    # break

0it [00:00, ?it/s]

In [126]:
print(len(ls_1), len(ls_2))

143489 143489


In [127]:
print(max(ls_1), max(ls_2))

110 110


In [128]:
print(sum(ls_1)/len(ls_1), sum(ls_2)/len(ls_2))

38.63926154618124 38.63926154618124


In [46]:
mrpc = load_dataset("glue", "mrpc")

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /home/macab/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...
Downloading: 6.22kB [00:00, 3.87MB/s]
Downloading: 1.05MB [00:01, 670kB/s]
Downloading: 441kB [00:00, 457kB/s]
                                Dataset glue downloaded and prepared to /home/macab/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


In [47]:
mrpc

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})