In [1]:
import os
import pandas as pd
# torch
import torch
from torch import Generator
from torch.utils.data import DataLoader, ConcatDataset, Subset, random_split, RandomSampler
# native
from NlpAnalytics import *



In [2]:
### Reproducibility: a single seeded torch Generator, reused by the random
### operations further down (random_split and RandomSampler).
generator = Generator()
generator.manual_seed(42)
### Tokenizer: taken from the project's BertLoader helper.
tokenizer = BertLoader(load_tokenizer=True).tokenizer

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Example 1: DataFrame [text | label]

In [3]:
# Toy input: one text column and one integer label column.
table_1 = [['aaa', 0], ['bbb', 1], ['aa', 0]]
df_1 = pd.DataFrame(data=table_1, columns=['text', 'label'])
dataset_1 = DatasetNLP(
    input_df=df_1,
    tokenizer=tokenizer,
    cols_label=['label'],
)
# Optional: uncomment the next line to re-generate everything
# dataset_1.run_all(ret_token_type_ids=True)

# __getitem__: show the first sample
display(dataset_1[0])
# Export the tensors together with their descriptions;
# the descriptions are shown as the cell output.
all_tensors, desc = dataset_1.export_tensors_with_desc()
desc

(tensor([  101, 13360,   102,     0]), tensor([1, 1, 1, 0]), tensor(0))

['text_input_ids', 'text_attention_mask', 'label']

### Example 2: DataFrame [text 1 | text 2]

In [4]:
# Toy input: two text columns and no label column.
table_2 = [['aaa', 'bbb'], ['bbb', 'ccc'], ['ccc', 'aaa']]
df_2 = pd.DataFrame(data=table_2, columns=['text 1', 'text 2'])
dataset_2 = DatasetNLP(
    input_df=df_2,
    tokenizer=tokenizer,
)

# __getitem__: show the first sample
display(dataset_2[0])
# Export the tensors together with their descriptions;
# the descriptions are shown as the cell output.
all_tensors, desc = dataset_2.export_tensors_with_desc()
desc

(tensor([  101, 13360,   102,     0]),
 tensor([1, 1, 1, 0]),
 tensor([  101, 22861,  2497,   102]),
 tensor([1, 1, 1, 1]))

['text 1_input_ids',
 'text 1_attention_mask',
 'text 2_input_ids',
 'text 2_attention_mask']

### Example 3: DataFrame [text 1 | text 2 | text 3 | label 1 | label 2]

In [5]:
### Two label columns at once, useful for hierarchical labels.
### Only 'text 1' and 'text 2' are passed as columns to tokenize; 'text 3' is not.
table_3 = [
    ['aaa', 'bbb', 'dd', 2, 0],
    ['bbb', 'ccc', '33', 3, 1],
    ['ccc', 'aaa', '44', 4, 0],
]
df_3 = pd.DataFrame(
    data=table_3,
    columns=['text 1', 'text 2', 'text 3', 'label 1', 'label 2'],
)
dataset_3 = DatasetNLP(
    input_df=df_3,
    tokenizer=tokenizer,
    cols_to_tokenize=['text 1', 'text 2'],
    cols_label=['label 1', 'label 2'],
)

# __getitem__: show the first sample
display(dataset_3[0])
# Export the tensors together with their descriptions;
# the descriptions are shown as the cell output.
all_tensors, desc = dataset_3.export_tensors_with_desc()
desc

(tensor([  101, 13360,   102,     0]),
 tensor([1, 1, 1, 0]),
 tensor([  101, 22861,  2497,   102]),
 tensor([1, 1, 1, 1]),
 tensor(2),
 tensor(0))

['text 1_input_ids',
 'text 1_attention_mask',
 'text 2_input_ids',
 'text 2_attention_mask',
 'label 1',
 'label 2']

### Make TensorDataset out of DatasetNLP

In [6]:
### Useful when tokenization takes a long time and you only want to do it once.
### Once you are happy with the result, export it; afterwards it can be loaded
### back as a standard TensorDataset without tokenizing again.
root_path = "./NlpAnalytics/data/dummy_data/"
# Make sure the target folder exists so the export also works on a fresh
# checkout (no-op when the folder is already there).
os.makedirs(root_path, exist_ok=True)
# The return value is the exported TensorDataset; it is bound to `_` because
# it is not needed here.
_ = dataset_3.export_as_tensordataset("dummy_tensordata", root_path)

In [7]:
# Read the exported file back as a TensorDataset.
# The file holds a pickled dataset object rather than a plain state dict, so
# full unpickling is requested explicitly: recent PyTorch releases default to
# weights_only=True, which rejects such objects.
# NOTE(review): weights_only=False executes arbitrary pickle code -- only load
# files you produced yourself (as done in the export cell of this notebook).
dataset_3_tsdata = torch.load(
    os.path.join(root_path, "dummy_tensordata.pt"),
    weights_only=False,
)

### ConcatDataset

In [8]:
### If you have multiple datasets with the same layout, you can merge them.
table_4 = [
    ['aaa', 'bbb', 'dd', 2, 0],
    ['bbb', 'ccc', '33', 3, 1],
    ['ccc', 'aaa', '44', 4, 0],
]
df_4 = pd.DataFrame(table_4, columns=['text 1', 'text 2', 'text 3', 'label 1', 'label 2'])
# dataset_4 is built from its own frame (df_4), so this cell does not silently
# depend on the frame of Example 3.
dataset_4 = DatasetNLP(
    input_df=df_4,
    tokenizer=tokenizer,
    cols_to_tokenize=['text 1', 'text 2'],
    cols_label=['label 1', 'label 2'],
)
table_5 = [
    ['a', 'vvvv', 'd', 62, 0],
    ['bbb', 'c', '33', 13, 1],
    ['111', 'aaa', '22', 5, 0],
]
df_5 = pd.DataFrame(table_5, columns=['text 1', 'text 2', 'text 3', 'label 1', 'label 2'])
dataset_5 = DatasetNLP(
    input_df=df_5,
    tokenizer=tokenizer,
    cols_to_tokenize=['text 1', 'text 2'],
    cols_label=['label 1', 'label 2'],
)
# merge (method 1): explicit ConcatDataset
dataset_6_1 = ConcatDataset([dataset_4, dataset_5])
# merge (method 2): the + operator
dataset_6_2 = dataset_4 + dataset_5
# Sanity check: index 3 of the merged dataset should be the first item of
# dataset_5, because dataset_4 contributes indices 0-2.
display(dataset_6_2[3])
display(dataset_5[0])
# Remark: the __getitem__ of ConcatDataset delegates to the constituent dataset
# class, which is DatasetNLP. So as long as DatasetNLP implements __getitem__
# the way we want, nothing extra is needed for the merged dataset.
# The same applies to __len__, which is basically
# dataset_4.__len__() + dataset_5.__len__().
len(dataset_6_2)

(tensor([ 101, 1037,  102,    0,    0,    0]),
 tensor([1, 1, 1, 0, 0, 0]),
 tensor([ 101, 1058, 2615, 2615, 2615,  102]),
 tensor([1, 1, 1, 1, 1, 1]),
 tensor(62),
 tensor(0))

(tensor([ 101, 1037,  102,    0,    0,    0]),
 tensor([1, 1, 1, 0, 0, 0]),
 tensor([ 101, 1058, 2615, 2615, 2615,  102]),
 tensor([1, 1, 1, 1, 1, 1]),
 tensor(62),
 tensor(0))

6

### Subset or random_split

In [9]:
### Subset: keep only the items at positions 1, 3 and 5 of an existing dataset
dataset_7 = Subset(dataset=dataset_6_1, indices=[1, 3, 5])
# First item of the subset, shown as the cell output
dataset_7[0]

(tensor([  101, 22861,  2497,   102]),
 tensor([1, 1, 1, 1]),
 tensor([  101, 10507,  2278,   102]),
 tensor([1, 1, 1, 1]),
 tensor(3),
 tensor(1))

In [10]:
### Random split of a dataset => Subset objects.
### Both calls draw from the same seeded generator, so their order matters.
# Split by absolute sizes (2 + 4 = all 6 items)
dataset_7_2_1, dataset_7_2_2 = random_split(
    dataset_6_2,
    lengths=[2, 4],
    generator=generator,
)
# Split by fractions of the dataset
dataset_7_3_1, dataset_7_3_2 = random_split(
    dataset_6_2,
    lengths=[0.3, 0.7],
    generator=generator,
)

### DataLoader

Data loader. Combines a dataset and a sampler, and provides an iterable over the given dataset.

In [11]:
# Batches of two samples from dataset_3, drawn in random order by a
# RandomSampler that uses the seeded generator.
sample_dataloader = DataLoader(
    dataset=dataset_3,
    batch_size=2,
    sampler=RandomSampler(data_source=dataset_3, generator=generator),
)