In [1]:
import torch
import numpy as np 
 
from torch.utils.data import DataLoader
import torch

from transformers import BertConfig, AutoTokenizer, AutoModel
import json
import random
# =====================

def set_random_seed(seed: int):
    """Seed Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: Value handed to every seeding function below.
    """
    # Same order as before: stdlib, NumPy, torch CPU, then all CUDA devices.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    
# Seed every RNG before any data or model object is created.
set_random_seed(7)

# Load the training split. The context manager closes the file handle as soon
# as parsing is done (a bare json.load(open(...)) leaves it open).
with open('data/iemocap.train.flatten.json') as train_file:
    train_data = json.load(train_file)

# The label of each sample is read from index 1.
all_labels = [sample[1] for sample in train_data]

# Number of distinct labels present in the training split.
num_labels = len(set(all_labels))

# init model
# Load config and tokenizer from the pretrained name or path.
pre_trained_model_name = 'roberta-base'
# NOTE(review): BertConfig is being instantiated from a RoBERTa checkpoint, which
# makes transformers warn "You are using a model of type roberta to instantiate a
# model of type bert". AutoConfig would pick the matching class; left unchanged
# here so the type of `config` seen by later cells does not change - confirm and
# switch if nothing downstream relies on BertConfig.
config = BertConfig.from_pretrained(pre_trained_model_name)
bert_tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_name)

class BatchPreprocessor(object):
    """Collate callable that tokenizes a batch of (sentence, label, ...) samples.

    Args:
        tokenizer: Callable used to encode the sentences. When left as ``None``
            the module-level ``bert_tokenizer`` is looked up at call time, which
            is what ``BatchPreprocessor()`` has always done.
        max_length: Length every sequence is padded / truncated to (default 512).
    """

    def __init__(self, tokenizer=None, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        """Convert a list of samples into model-ready inputs.

        Args:
            batch: Sequence of samples; ``sample[0]`` is the raw sentence and
                ``sample[1]`` is a label convertible with ``int()``.

        Returns:
            A tuple ``(encoded, labels, raw_sentences)`` where ``encoded`` is
            whatever the tokenizer returns, ``labels`` is a float32 tensor with
            one entry per sample, and ``raw_sentences`` is the list of input
            sentences in batch order.
        """
        raw_sentences = [sample[0] for sample in batch]
        labels = [int(sample[1]) for sample in batch]

        # Resolve the fallback lazily so a tokenizer defined after this class
        # (or re-assigned later) is still picked up.
        tokenizer = self.tokenizer if self.tokenizer is not None else bert_tokenizer

        # Every sequence is padded/truncated to exactly `max_length` tokens.
        encoded = tokenizer(
            raw_sentences,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )

        # NOTE(review): labels are emitted as float32, as before; a loss that
        # expects class indices would presumably need them cast to long - confirm
        # against the training loop.
        return (encoded, torch.tensor(labels, dtype=torch.float32), raw_sentences)




You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


In [2]:

batch_size = 32
data_folder = "data/"


def _build_loader(split, shuffle=True):
    """Create a DataLoader over ``{data_folder}/iemocap.{split}.flatten.json``.

    Args:
        split: Split name inserted into the file name ("train", "valid", "test").
        shuffle: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` over the parsed JSON content, batched with
        ``batch_size`` and collated by ``BatchPreprocessor``.
    """
    # The file is closed as soon as its content has been parsed.
    with open(f"{data_folder}/iemocap.{split}.flatten.json") as split_file:
        samples = json.load(split_file)
    # The parsed JSON object is handed to DataLoader directly as the dataset.
    return DataLoader(samples, batch_size=batch_size, collate_fn=BatchPreprocessor(), shuffle=shuffle)


# NOTE(review): the test and validation loaders are shuffled as well; kept to
# preserve existing behaviour, but evaluation normally does not need it.
test_loader = _build_loader("test")
train_loader = _build_loader("train")
valid_loader = _build_loader("valid")

# Peek at one batch. BatchPreprocessor returns
# (tokenizer output, label tensor, raw sentences), so e[2] holds the raw
# sentences rather than a padding mask.
for e in test_loader:
    print('First batch data:')
    print('input data\n', e[0])
    print('label data\n', e[1])
    print('raw sentences\n', e[2])
    print(e[0]['input_ids'].device)
    break

First epoch data:
input data
 {'input_ids': tensor([[    0,  1185,   214,  ...,     1,     1,     1],
        [    0,  3084,  7252,  ...,     1,     1,     1],
        [    0,  9904,     9,  ...,     1,     1,     1],
        ...,
        [    0, 33082,     4,  ...,     1,     1,     1],
        [    0,  3684,   235,  ...,     1,     1,     1],
        [    0, 10127,  5219,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
label data
 tensor([3., 5., 1., 2., 1., 1., 2., 5., 2., 5., 3., 2., 3., 5., 4., 4., 2., 1.,
        5., 2., 4., 2., 4., 5., 2., 5., 5., 1., 1., 2., 3., 3.])
padding mask data
 ["You're an autono- autonomous human being.  You could- You could go into the com-", "No um, I don't want to argue with him, but it's time we realized that nobody else believes tha

In [3]:
# len() of a DataLoader is the number of batches it yields, not the sample count.
for caption, loader in (('train size', train_loader), ('test size', test_loader)):
    print(caption, len(loader))


train size 162
test size 51
