In [73]:
from transformers import AutoTokenizer
from datasets import load_dataset, concatenate_datasets, ReadInstruction
import numpy as np
import torch

In [4]:
from config import config

In [None]:
# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

## 1. Amazon Review

In [5]:
# Domain identifiers for the Amazon Review section.
domains = [
    'Kitchen_v1_00',
    'Electronics_v1_00',
    'Books_v1_01',
    'Video_DVD_v1_00',
]

## 2. IMDB-SST2 Dataset

In [96]:

def imdb_sst2_loaders(config, tokenizer):
    """
    Build class-balanced, equally sized DataLoaders for the SST-2 and IMDB datasets.

    Both datasets are down-sampled so that every label contributes the same
    number of examples to the train split of *both* datasets, and likewise for
    the test split. Sample size and label distribution are therefore identical
    across the two domains.

    Args:
        config: mapping that provides ``max_seq_length`` (padding / truncation
            length passed to the tokenizer) and ``batch_size`` (DataLoader
            batch size).
        tokenizer: callable invoked as ``tokenizer(texts, padding='max_length',
            truncation=True, max_length=config['max_seq_length'])``.

    Returns:
        dict: ``{"sst2": {...}, "imdb": {...}}``; each inner dict maps
        ``"train"``, ``"valid"`` and ``"test"`` to a
        ``torch.utils.data.DataLoader``. ``"valid"`` and ``"test"`` refer to the
        same loader object.
    """

    # sst2 has train, validation and test; validation and test are merged into one test set.
    sst2 = load_dataset("toriving/sst2")
    imdb = load_dataset("imdb")

    splits_by_domain = {
        "sst2": {
            "train": sst2['train'],
            "test": concatenate_datasets([sst2['validation'], sst2['test']]),
        },
        "imdb": {
            "train": imdb['train'],
            "test": imdb['test'],
        },
    }

    # Assumes there is no label shift: the label set of the SST-2 train split is used everywhere.
    labels = np.unique(splits_by_domain["sst2"]["train"]['label']).tolist()

    def count_labels(split):
        """Return ``{label: number of rows}`` from a single read of the label column."""
        values, counts = np.unique(split['label'], return_counts=True)
        return dict(zip(values.tolist(), counts.tolist()))

    # Per split, the number of rows to keep for each label is the smallest
    # per-label count observed in either dataset.
    rows_per_label = {}
    for split_name in ("train", "test"):
        counts_per_domain = [
            count_labels(domain_splits[split_name])
            for domain_splits in splits_by_domain.values()
        ]
        rows_per_label[split_name] = min(
            counts.get(label, 0)
            for counts in counts_per_domain
            for label in labels
        )

    def balance(split, n_per_label):
        """Randomly keep ``n_per_label`` rows of every label, then shuffle the result."""
        shuffled = split.shuffle()  # shuffle first so that select() takes a random subset
        per_label = []
        for label in labels:
            label_rows = shuffled.filter(lambda example: example['label'] == label)
            per_label.append(label_rows.select(range(n_per_label)))
        return concatenate_datasets(per_label).shuffle()

    def encode(batch):
        """Tokenize a batch of texts to fixed-length sequences."""
        return tokenizer(
            batch['text'],
            padding='max_length',
            truncation=True,
            max_length=config['max_seq_length'],
        )

    def make_loader(split, shuffle):
        """Tokenize ``split``, expose the model inputs as torch tensors and wrap it in a DataLoader."""
        tokenized = split.map(encode, batched=True)
        tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
        return torch.utils.data.DataLoader(
            dataset=tokenized,
            batch_size=config["batch_size"],
            shuffle=shuffle,
            num_workers=4,
        )

    loaders = {}
    for domain_name, domain_splits in splits_by_domain.items():
        train_loader = make_loader(
            balance(domain_splits["train"], rows_per_label["train"]), shuffle=True
        )
        test_loader = make_loader(
            balance(domain_splits["test"], rows_per_label["test"]), shuffle=False
        )
        loaders[domain_name] = {
            "train": train_loader,
            "valid": test_loader,  # the test split doubles as the validation split
            "test": test_loader,
        }

    return loaders


In [97]:
# Build the SST-2 / IMDB loaders from the 'imdb_sst2_sa' task configuration.
loaders = imdb_sst2_loaders(
    config=config['tasks']['imdb_sst2_sa'],
    tokenizer=tokenizer,
)

Using custom data configuration default
Reusing dataset ss_t2 (/home/macab/.cache/huggingface/datasets/ss_t2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa)
Reusing dataset imdb (/home/macab/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)
100%|██████████| 7/7 [00:00<00:00, 35.05ba/s]
100%|██████████| 25/25 [00:00<00:00, 32.82ba/s]
100%|██████████| 3/3 [00:00<00:00, 58.81ba/s]
100%|██████████| 25/25 [00:00<00:00, 31.54ba/s]
100%|██████████| 7/7 [00:00<00:00, 34.59ba/s]
100%|██████████| 25/25 [00:00<00:00, 32.49ba/s]
100%|██████████| 3/3 [00:00<00:00, 59.12ba/s]
100%|██████████| 25/25 [00:00<00:00, 32.61ba/s]
100%|██████████| 7/7 [00:00<00:00, 35.17ba/s]
100%|██████████| 3/3 [00:00<00:00, 37.41ba/s]
100%|██████████| 25/25 [00:00<00:00, 32.67ba/s]
100%|██████████| 25/25 [00:00<00:00, 32.24ba/s]
100%|██████████| 7/7 [00:00<00:00, 34.56ba/s]
100%|██████████| 3/3 [00:00<00:00, 36.28ba/s]
100%|

In [102]:
# Number of batches per domain, printed in the order: train, test, valid.
for domain in loaders:
    print(domain, *(len(loaders[domain][split]) for split in ('train', 'test', 'valid')))

sst2 828 335 335
imdb 828 335 335


## 3. MNLI

In [7]:
from config import config
from datasets import load_dataset, concatenate_datasets
import numpy as np
import torch
from transformers import AutoTokenizer

In [2]:
# Key used to look up this task's settings under config['tasks'].
task = "mnli"

In [3]:
# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [8]:
def mnli_loaders(config, tokenizer):
    """
    Build class-balanced DataLoaders for every MultiNLI genre listed in the config.

    For each domain (genre), a fixed number of rows per label is taken from the
    ``train`` split and from the ``validation_matched`` split, so all domains
    end up with the same size and the same label distribution.

    Args:
        config: mapping that provides ``domains`` (iterable of genre names
            matched against the ``genre`` column), ``max_seq_length`` and
            ``batch_size``.
        tokenizer: callable invoked on (premise, hypothesis) pairs with
            ``padding='max_length'``, ``truncation=True`` and
            ``max_length=config['max_seq_length']``.

    Returns:
        dict: maps every domain to ``{"train": DataLoader, "test": DataLoader,
        "valid": DataLoader}``; ``"valid"`` is the same loader object as
        ``"test"``.
    """

    domains = config['domains']

    # load the dataset and drop the fields that are not needed
    dataset = load_dataset("multi_nli")
    dataset = dataset.remove_columns([
        'pairID',
        'promptID',
        'premise_binary_parse',
        'premise_parse',
        'hypothesis_binary_parse',
        'hypothesis_parse',
    ])

    # sorted() gives a deterministic label order
    labels = sorted(set(dataset['train']['label']))

    # Rows kept per label and per domain. The original author noted these as
    # "manually checked" against the least-populated label across all domains;
    # select() below fails if a (domain, label) subset has fewer rows than this.
    train_label_dist = 25000
    test_label_dist = 600

    def balance(split, n_per_label):
        """Keep the first ``n_per_label`` rows of every label, then shuffle the result."""
        per_label = []
        for label in labels:
            label_rows = split.filter(lambda example: example['label'] == label)
            per_label.append(label_rows.select(range(n_per_label)))
        return concatenate_datasets(per_label).shuffle()

    def encode(batch):
        """Tokenize a batch of premise / hypothesis pairs to fixed-length sequences."""
        return tokenizer(
            batch['premise'],
            batch['hypothesis'],
            padding='max_length',
            truncation=True,
            max_length=config['max_seq_length'],
        )

    def make_loader(split, shuffle):
        """Tokenize ``split``, expose the model inputs as torch tensors and wrap it in a DataLoader."""
        tokenized = split.map(encode, batched=True)
        tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
        return torch.utils.data.DataLoader(
            dataset=tokenized,
            batch_size=config["batch_size"],
            shuffle=shuffle,
            num_workers=4,
        )

    loaders = {}
    for domain in domains:
        # Filter by genre once per domain instead of once per (label, domain) pair.
        domain_train = dataset['train'].filter(lambda example: example['genre'] == domain)
        domain_test = dataset['validation_matched'].filter(lambda example: example['genre'] == domain)

        train_loader = make_loader(balance(domain_train, train_label_dist), shuffle=True)
        # Evaluation batches are served in a fixed order, matching imdb_sst2_loaders;
        # the underlying rows were already shuffled once by balance().
        test_loader = make_loader(balance(domain_test, test_label_dist), shuffle=False)

        loaders[domain] = {
            "train": train_loader,
            "test": test_loader,
            "valid": test_loader,  # validation and test will be same
        }

    return loaders

In [9]:
# Build the per-domain MNLI loaders from the selected task configuration.
loaders = mnli_loaders(
    config=config['tasks'][task],
    tokenizer=tokenizer,
)

Using custom data configuration default
Reusing dataset multi_nli (/home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-5fac4257da3c539c.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-0c1db9bb01906573.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-9166034fb95d402f.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-898cee5d2d382de5.arrow
Loading cached processed da

In [123]:
# Tokenize the 'slate' train split on its own (premise / hypothesis pairs, fixed length).
# NOTE(review): scratch cell. The only `dsets` built in this notebook holds an empty
# list per split (lists have no `.map`), and `max_seq_length` is read here from the
# top-level `config`, whereas the loader functions read it from a task sub-config.
# Presumably this ran against state from an earlier kernel session; confirm or remove.
data = dsets['slate']['train'].map(lambda x: tokenizer(x['premise'], x['hypothesis'], padding='max_length', truncation=True, max_length=config['max_seq_length']), batched=True)

100%|██████████| 75/75 [00:45<00:00,  1.66ba/s]


In [10]:
# Number of batches per domain, printed in the order: train, test, valid.
for domain in loaders:
    print(domain, *(len(loaders[domain][split]) for split in ('train', 'test', 'valid')))

government 2344 57 57
telephone 2344 57 57
fiction 2344 57 57
travel 2344 57 57
slate 2344 57 57


In [64]:
# Training rows of the first domain that carry label 0.
(
    dataset['train']
    .filter(lambda example: example['genre'] == domains[0])
    .filter(lambda example: example['label'] == 0)
)

Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-64e13bf7a635c074.arrow
100%|██████████| 78/78 [00:01<00:00, 45.74ba/s]


Dataset({
    features: ['premise', 'hypothesis', 'genre', 'label'],
    num_rows: 25783
})

In [11]:
# Load MultiNLI and drop the identifier / parse columns.
dataset = load_dataset("multi_nli")
dataset = dataset.remove_columns([
    'pairID',
    'promptID',
    'premise_binary_parse',
    'premise_parse',
    'hypothesis_binary_parse',
    'hypothesis_parse',
])


Using custom data configuration default
Reusing dataset multi_nli (/home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


In [27]:
# Domains configured for the MNLI task; the cells below compare them against the dataset's 'genre' column.
domains = config['tasks']['mnli']['domains']

In [34]:
# Per-domain label histogram of the 'validation_matched' split:
# prints the domain, the distinct labels, their counts and the total row count.
for domain in domains:
    lbs, count = np.unique(
        dataset['validation_matched'].filter(lambda example: example['genre'] == domain)['label'],
        return_counts=True,
    )
    print(domain, lbs, count, count.sum())


Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-56053c705d9f0a3d.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-0811af6c0bdbe470.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-e18ac1254bdc5157.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-eb6fe35e67d25fbd.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-d05f2139d738c19a.arrow
government [0 1 2] [710 600 63

In [36]:
# Per-domain label histogram of the 'train' split:
# prints the domain, the distinct labels, their counts and the total row count.
for domain in domains:
    lbs, count = np.unique(
        dataset['train'].filter(lambda example: example['genre'] == domain)['label'],
        return_counts=True,
    )
    print(domain, lbs, count, count.sum())


Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-2b4b12d7e08d89e0.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-ffbccc4cc301c667.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-66cee0b143350377.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-7aed209c06bd7807.arrow
Loading cached processed dataset at /home/macab/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-bc42cb155eb5cb84.arrow
government [0 1 2] [25783 2578

In [41]:
# One entry per domain, each starting with empty 'train' and 'test' lists.
dsets = {}
for domain in domains:
    dsets[domain] = {"train": [], "test": []}

In [42]:
# Display the freshly initialised per-domain structure.
dsets

{'government': {'train': [], 'test': []},
 'telephone': {'train': [], 'test': []},
 'fiction': {'train': [], 'test': []},
 'travel': {'train': [], 'test': []},
 'slate': {'train': [], 'test': []}}

In [45]:
# Distinct label values present in the train split.
labels = set(dataset['train']['label'])

In [48]:
# Show every label next to its Python type.
for each in labels:
    print(each, type(each))

0 <class 'int'>
1 <class 'int'>
2 <class 'int'>


In [85]:
# Load the "toriving/imdb" dataset.
# NOTE(review): `dataset_tor` is not used anywhere else in the visible notebook,
# and imdb_sst2_loaders loads "imdb" instead. Confirm whether this cell is still needed.
dataset_tor = load_dataset("toriving/imdb")

Using custom data configuration default
Reusing dataset imdb (/home/macab/.cache/huggingface/datasets/imdb/default/0.0.0/959c9f4e491d324dc25551fd2818d074406dee0809641149ea7fcd1c54ecb798)
