In [2]:
# !pip install datasets

## Hugging Face Datasets for NLP
The Hugging Face `datasets` library makes it easy to access and share datasets for many different tasks, including NLP.

Text needs to be tokenized into individual tokens by a `tokenizer`.

### Load dataset

In [3]:
from datasets import load_dataset

In [19]:
# Fetch only the training split of the "mrpc" configuration of the "glue" dataset.
dataset_nlp = load_dataset(path="glue", name="mrpc", split="train")

In [18]:
# MRPC asks: are sentence1 and sentence2 semantically equivalent?
dataset_nlp

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

### Check the dataset

In [8]:
# Check which container class load_dataset returned for the requested split.
type(dataset_nlp)

In [9]:
# Column schema: two string sentences, a ClassLabel target
# (0 = not_equivalent, 1 = equivalent) and an int32 example index.
dataset_nlp.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [12]:
# Pull the three columns of interest out of the dataset in one statement.
sentence_1_, sentence_2_, label_ = (
    dataset_nlp[column_name]
    for column_name in ("sentence1", "sentence2", "label")
)

# Every column should hold exactly one entry per example.
tuple(len(column) for column in (sentence_1_, sentence_2_, label_))

(3668, 3668, 3668)

In [20]:
# Show every 50th example between rows 2000 and 2499: both sentences, the
# label, then a blank separator line.
for row in range(2000, 2500, 50):
  print(sentence_1_[row], sentence_2_[row], label_[row], "", sep="\n")

Law enforcement sources who spoke on condition of anonymity confirmed to The Associated Press that Limbaugh was being investigated by the Palm Beach County state attorney 's office .
Law enforcement officials confirmed that Limbaugh was being investigated by the Palm Beach County , Fla . , state attorney 's office .
1

Sylvan Shalom , the Israeli Foreign Minister , said there was a possibility that Mr Bush “ will come to this area ” .
Shalom said there was also a possibility that " the president will come to this area . "
1

Microsoft said Friday that it is halting development of future Macintosh versions of its Internet Explorer browser , citing competition from Apple Computer 's Safari browser .
Microsoft will stop developing versions of its Internet Explorer browser software for Macintosh computers , saying that Apple 's Safari is now all that Apple needs .
1

A call to Rev. Christopher Coyne , the spokesman for the archdiocese , was not immediately returned .
The Rev. Christopher J

### Tokenize dataset according to the desired model

In [5]:
# pretrained model and its tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# One name for the checkpoint so the model and tokenizer cannot drift apart.
MODEL_CHECKPOINT = "bert-base-uncased"

model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [21]:
# Passing two strings encodes them as one sentence pair: token_type_ids is 0
# for the first segment and 1 for the second (see the output below).
tokenizer("Hello I Love NLP", "Hello I love CV")

{'input_ids': [101, 7592, 1045, 2293, 17953, 2361, 102, 7592, 1045, 2293, 26226, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [27]:
# Minimal pair: id 101 opens the sequence and id 102 closes each segment
# (BERT's [CLS] and [SEP] special tokens).
tokenizer("love", "you")

{'input_ids': [101, 2293, 102, 2017, 102], 'token_type_ids': [0, 0, 0, 1, 1], 'attention_mask': [1, 1, 1, 1, 1]}

In [28]:
def encode(examples):
  """Tokenize sentence pairs with the notebook's `tokenizer`.

  Args:
    examples: Mapping with "sentence1" and "sentence2" entries. With
      `Dataset.map(..., batched=True)` each entry is a list of strings;
      a single pair of strings works as well.

  Returns:
    The tokenizer output for the pair(s); the outputs shown in this
    notebook contain `input_ids`, `token_type_ids` and `attention_mask`.
  """
  # truncation=True caps each encoded pair at the maximum length the
  # tokenizer's model accepts, so an unusually long pair cannot produce an
  # input the model would reject. Shorter pairs are encoded exactly as before.
  return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

In [30]:
# batched=True hands encode a batch of rows at a time; the keys it returns
# (input_ids, token_type_ids, attention_mask) become new dataset columns.
dataset_nlp = dataset_nlp.map(encode, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

In [31]:
# The tokenizer outputs should now be listed alongside the original columns.
dataset_nlp

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [34]:
# Look at one fully encoded example: raw text, label, and the token-level fields.
print(dataset_nlp[0])

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0, 'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


### Add a new feature to the dataset

In [35]:
def _label_to_labels(batch):
  """Expose the batch's "label" values under the additional key "labels"."""
  return {"labels": batch["label"]}

# Keeps the original "label" column and adds a "labels" copy next to it.
dataset_nlp = dataset_nlp.map(_label_to_labels, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

In [39]:
# Spot-check the first ten values of the new column.
dataset_nlp["labels"][:10]

[1, 0, 1, 0, 1, 1, 0, 1, 0, 0]

### Set the dataset format for the framework we want to use

In [43]:
import torch
from transformers import DataCollatorWithPadding

# Return only the model inputs, as torch tensors, when rows are accessed.
dataset_nlp.set_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "labels"]
)

# The examples were tokenized without padding, so their lengths differ.
# The default collate function stacks tensors and raises on sequences of
# unequal length; pad each batch to its longest member instead.
dataloader = torch.utils.data.DataLoader(
    dataset_nlp,
    batch_size=32,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
)

In [44]:
# set_format changes how rows are returned, not which columns are stored:
# the full feature list is still reported here.
dataset_nlp

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3668
})

In [45]:
# Only shows the DataLoader's repr; no batch is drawn from it here.
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7bc9ba05d1e0>

### Prepare the dataset
Use `prepare_tf_dataset` from the Hugging Face Transformers library to make the dataset compatible with TensorFlow.

In [47]:
# Grab the token-id column; after set_format its entries come back as torch tensors.
dataset_input_ids = dataset_nlp["input_ids"]

In [70]:
# Number of examples, the first example's token ids, and that entry's type.
len(dataset_input_ids), dataset_input_ids[0], type(dataset_input_ids[0])

(3668,
 tensor([  101,  2572,  3217,  5831,  5496,  2010,  2567,  1010,  3183,  2002,
          2170,  1000,  1996,  7409,  1000,  1010,  1997,  9969,  4487, 23809,
          3436,  2010,  3350,  1012,   102,  7727,  2000,  2032,  2004,  2069,
          1000,  1996,  7409,  1000,  1010,  2572,  3217,  5831,  5496,  2010,
          2567,  1997,  9969,  4487, 23809,  3436,  2010,  3350,  1012,   102]),
 torch.Tensor)

In [63]:
# Token count of every encoded example (special tokens included).
seq_length_dataset = list(map(len, dataset_input_ids))

In [69]:
# Ten longest encoded sequences, longest first.
sorted(seq_length_dataset, reverse=True)[:10]

[103, 100, 100, 98, 97, 96, 96, 91, 90, 89]

In [71]:
# Mean number of tokens per encoded sentence pair.
avg_seq_length = sum(seq_length_dataset) / len(seq_length_dataset)

In [72]:
# Report the mean sequence length (label spelling corrected from "Agerage").
print(f"Average: {avg_seq_length}")

Agerage: 53.24154852780807
