In [2]:
import os

import datasets
import pandas as pd
from datasets import Dataset, load_dataset

In [3]:
# Load the "cola" configuration of the "glue" dataset; the result holds all splits.
cola_dataset = load_dataset(path="glue", name="cola")
# Bare last expression so the notebook renders the split / feature summary.
cola_dataset

Generating train split: 100%|██████████| 8551/8551 [00:00<00:00, 957536.67 examples/s]
Generating validation split: 100%|██████████| 1043/1043 [00:00<00:00, 450763.43 examples/s]
Generating test split: 100%|██████████| 1063/1063 [00:00<00:00, 421293.13 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

The dataset above is CoLA (Corpus of Linguistic Acceptability). The task is binary classification: given a sentence, classify it into one of two classes.

❌ Unacceptable: Grammatically not correct (Label 0)

✅ Acceptable: Grammatically correct (Label 1)

### Save the dataset for local use

In [9]:
# Persist every split as CSV so later runs can work from local files.
DATASET_DIR = "../datasets"
# The target directory may not exist on a fresh checkout; create it up front
# so the writes below do not fail on a missing path.
os.makedirs(DATASET_DIR, exist_ok=True)

# One loop instead of three copy-pasted calls; file names stay cola_<split>.csv.
for split_name in ("train", "validation", "test"):
    cola_dataset[split_name].to_csv(f"{DATASET_DIR}/cola_{split_name}.csv")

Creating CSV from Arrow format: 100%|██████████| 9/9 [00:00<00:00, 478.60ba/s]
Creating CSV from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 70.73ba/s]
Creating CSV from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 351.31ba/s]


52273

In [25]:
# Bind each split to its own name for the exploration cells that follow.
train_dataset, val_dataset, test_dataset = (
    cola_dataset[split_name] for split_name in ("train", "validation", "test")
)

### Dataset exploration

In [26]:
# Integer indexing returns one example as a dict with 'sentence', 'label' and 'idx'.
train_dataset[0]

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}

### Filtering the dataset to see unacceptable sentences

In [23]:
def _is_unacceptable(example):
    """Return True when the example carries label 0 (the 'unacceptable' class)."""
    return example['label'] == 0


# Keep only the training rows flagged as unacceptable, then peek at one of them.
unacceptable_sentences = cola_dataset['train'].filter(_is_unacceptable)
unacceptable_sentences[2]

{'sentence': 'We yelled ourselves.', 'label': 0, 'idx': 22}

In [28]:
# Column schema of the split; 'label' is a ClassLabel whose names map
# index 0 -> 'unacceptable' and index 1 -> 'acceptable'.
train_dataset.features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['unacceptable', 'acceptable'], id=None),
 'idx': Value(dtype='int32', id=None)}

### Tokenizing

In [35]:
from transformers import AutoTokenizer

In [37]:
# Checkpoint identifier kept in a constant so it can be changed in one place.
TOKENIZER_CHECKPOINT = "google/bert_uncased_L-2_H-128_A-2"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [39]:
# Exploratory: list every attribute name on the tokenizer object
# (includes dunder and private members, so the output is long).
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_call_one',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_pad',
 '_pad_token_type_id',
 '_processor_class',
 '_save_pretrained',
 '_set_model_specific_special_to

In [43]:
# Split the text into token strings only; in the output below the text is
# lowercased, punctuation is split off, and no [CLS]/[SEP] tokens appear.
tokenizer.tokenize("Hello, how are you?")

['hello', ',', 'how', 'are', 'you', '?']

In [44]:
# Calling the tokenizer directly produces the full encoding for the sentence.
inputs = tokenizer("Hello, how are you?")

In [45]:
# Display the encoding: 'input_ids', 'token_type_ids' and 'attention_mask'.
# The ids start with 101 and end with 102, the [CLS] and [SEP] special tokens.
inputs

{'input_ids': [101, 7592, 1010, 2129, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [46]:
# encode() returns just the list of token ids; here it matches the
# 'input_ids' field of the full encoding shown earlier.
tokenizer.encode("Hello, how are you?")

[101, 7592, 1010, 2129, 2024, 2017, 1029, 102]

In [47]:
# Display the tokenizer's repr: class, vocabulary size, padding/truncation
# sides and the special-token table.
tokenizer

BertTokenizerFast(name_or_path='google/bert_uncased_L-2_H-128_A-2', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)