# Datasets

In [64]:
from datasets import *

## Load online data

`load_dataset` can be used to download any dataset hosted on the Hugging Face Hub: https://huggingface.co/datasets.


In [65]:
# To load a dataset, pass its location on the Hub, which is composed of
# the name of the repository and the name of the dataset.
# There is no need to provide the full URL.

# By default, the returned data is a DatasetDict.
# Depending on the structure of the downloaded data, the content of this dict
# will change.
# The following example has only one element in the dict, since only one
# split is available.
# Otherwise, the dict may contain several elements such as train, valid, test.


data =  load_dataset("yezhengli9/opus_books_demo")
data

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 127085
    })
})

In [66]:
# If we are only interested in one part of the dataset (e.g. train), we can use
# the 'split' argument.
# The result is then a Dataset, no longer a DatasetDict.

data =  load_dataset("yezhengli9/opus_books_demo", split="train")
data

Dataset({
    features: ['id', 'translation'],
    num_rows: 127085
})

In [67]:
# One can also slice the dataset to be loaded.
# The slicing can be done by providing:
#  * the start and the end indices.
#  * a percentage of the data size

data =  load_dataset("yezhengli9/opus_books_demo", split="train[:10]")
# data =  load_dataset("yezhengli9/opus_books_demo", split="train[:10%]")
# data =  load_dataset("yezhengli9/opus_books_demo", split=["train[:10%]", "train[90%:]"]) # return a list
data

Dataset({
    features: ['id', 'translation'],
    num_rows: 10
})

In [68]:
# Sometimes a dataset contains several subsets.
# We can download a single subset by providing its name.
# The following example loads the subset named 'en-fr'.

data_sub = load_dataset("Helsinki-NLP/opus_books", "en-fr")
data_sub

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 127085
    })
})

## Inspect dataset

Inspection is for viewing and checking the data.
The returned results are plain Python lists, not `Dataset` objects.
So they can't be used directly in subsequent operations such as building a dataloader.

In [69]:
# If the data is a DatasetDict, we get its content by specifying the key (e.g. train);
# otherwise, the data can be accessed directly.
data_sub["train"]["translation"][:2]

# 

[{'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'},
 {'en': 'Alain-Fournier', 'fr': 'Alain-Fournier'}]

In [70]:
# we can also get other attributes

# all columns
print("colums names: ", data.column_names)
print("features: ", data.features)

colums names:  ['id', 'translation']
features:  {'id': Value(dtype='string', id=None), 'translation': Translation(languages=['en', 'fr'], id=None)}


## Splitting dataset

In [71]:
# splitting the data into different groups (e.g. train, test)
# this returns a DatasetDict

data.train_test_split(test_size=0.2)

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 8
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 2
    })
})

## Filter & Manipulation

These functions process the data for later use, which is different from the inspection shown in the section above.
The results of these functions are still datasets.

In [72]:
# Select rows by index. This returns a Dataset, unlike the inspection in the
# previous section, which returns a list of dicts.

data.select([0,1])

Dataset({
    features: ['id', 'translation'],
    num_rows: 2
})

In [73]:
# For more complex filtering, one can use the filter method with a function that encodes the filter criteria.
# This returns a Dataset.
# The operation is not in place.

data_bad = data.filter(lambda translates : "bad " in translates["translation"]["en"])
data_bad["translation"][:5]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

[]

In [74]:
# To apply processings to data, one can use map method.

def replace(translate):
    """Swap every "bad " for "good " in the English sentence of one example.

    The nested ``translation`` dict of the example is updated in place and
    the very same example object is returned, so the function can be handed
    to ``data.map``.
    """
    pair = translate["translation"]
    english = pair['en']
    pair['en'] = english.replace("bad ", "good ")
    return translate

data_good = data.map(replace)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [75]:
# here we show that there are no more sentences containing "bad "

data_good_b = data_good.filter(lambda translates : "bad " in translates["translation"]["en"])
data_good_b["translation"][:5]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

[]

In [76]:
# a more complex example using tokenizer

from transformers import AutoTokenizer

# load tokenizer

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")

In [77]:
# function to do the preprocessing
# https://huggingface.co/docs/transformers/tasks/translation

# Prefix the input with a prompt so T5 knows this is a translation task. 
# Some models capable of multiple NLP tasks require prompting for specific tasks.

source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "

def preprocess(example):
    """Tokenize a single translation example.

    The task ``prefix`` is prepended to the sentence stored under
    ``source_lang``; the sentence stored under ``target_lang`` is handed to
    the tokenizer as ``text_target``. Both are truncated to at most 128
    tokens. Relies on the notebook-level ``prefix``, ``source_lang``,
    ``target_lang`` and ``tokenizer``.

    Returns whatever the tokenizer returns for this pair.
    """
    pair = example["translation"]
    return tokenizer(
        prefix + pair[source_lang],
        text_target=pair[target_lang],
        max_length=128,
        truncation=True,
    )

In [78]:
preprocessed_data = data.map(preprocess)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [79]:
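# To speed up the mapping, we can specify the number of processes to use (num_proc).
# More processes usually help on large datasets; on a tiny dataset like this one,
# the cost of starting the processes can outweigh the gain.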

preprocessed_data = data.map(preprocess, num_proc=4)
preprocessed_data

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10
})

In [80]:
# batched processing 

# If we use the function above for batched processing, we get an indexing error.
# The error is caused by the fact that the function above doesn't support batched data.
# E.g. without batching, each example is data[i], so data[i]["translation"]['en']
# returns the i-th English sentence.
# However, data[i:i+batch_size]["translation"] is a list of dicts, so indexing it with 'en' raises an error.

def preprocess_batched(examples):
    """Tokenize a batch of translation examples in one tokenizer call.

    ``examples["translation"]`` is iterated as a sequence of per-row dicts;
    the source sentences get the task ``prefix`` prepended and the target
    sentences are passed as ``text_target``. Everything is truncated to at
    most 128 tokens. Relies on the notebook-level ``prefix``,
    ``source_lang``, ``target_lang`` and ``tokenizer``.
    """
    inputs, targets = [], []
    for pair in examples["translation"]:
        inputs.append(prefix + pair[source_lang])
        targets.append(pair[target_lang])
    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

preprocessed_data = data.map(preprocess_batched, batched=True)
preprocessed_data

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10
})

In [81]:
preprocessed_data = data.map(preprocess_batched, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]

In [82]:
# we can remove the columns we don't need
# here we see that the columns ['id', 'translation'] are removed

preprocessed_data = data.map(preprocess_batched, batched=True, remove_columns=data.column_names)
preprocessed_data

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10
})

## Save & Load

In [83]:
# save 
preprocessed_data.save_to_disk("./processed_data")

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

In [84]:
# load 
preprocessed_data = load_from_disk("./processed_data")

## Load local data

In [85]:
# we can load one or several files
# If there are several files, put all file names into a list and pass the list to data_files arg.

data = load_dataset("csv", data_files="./mydata.csv", split="train")

FileNotFoundError: Unable to find '/home/Qingyi/./mydata.csv'

In [None]:
# there is a dedicated function to load a csv file

data = Dataset.from_csv("./mydata.csv")

In [None]:
# load all files in a folder

data = load_dataset("csv", data_dir="./myfilespath", split="train")

In [None]:
# load from a pandas DataFrame (data_pd must already exist as a DataFrame)

data = Dataset.from_pandas(data_pd)

In [None]:
# If the data is a plain list of strings, we can't load it directly as a dataset.
# Trying to do so raises: AttributeError: 'str' object has no attribute 'get'

data_list = ["sentence1", "sentence2"]

# instead, we should wrap each item of the list in a dict, so that every row has a named column:

data_list = [{"text": line} for line in data_list]
print(data_list)

Dataset.from_list(data_list)

[{'text': 'sentence1'}, {'text': 'sentence2'}]


Dataset({
    features: ['text'],
    num_rows: 2
})

# Process Data For Training

Let's put it all together and construct some useful data structures for training.
There is no single way to prepare data for training. The choice depends on the data, its quality, format, etc.


## using the torch way

In [103]:
# step1. read data into pairs

from torch.utils.data import Dataset
from datasets import load_dataset

class TranslationDataset(Dataset):
    """Map-style torch dataset that yields (source, target) sentence pairs.

    The rows are loaded with ``load_dataset``; each row is expected to hold a
    ``translation`` dict keyed by language code. Called without arguments,
    it loads the English-French ``train`` split of the OPUS books corpus.

    Args:
        path: Hub identifier of the dataset (same repository id as the one
            used earlier in this notebook).
        name: name of the subset (language pair) to load.
        split: split to load.
        source_lang: key of the source sentence in each ``translation`` dict.
        target_lang: key of the target sentence in each ``translation`` dict.
    """

    def __init__(self, path="Helsinki-NLP/opus_books", name="en-fr", split="train",
                 source_lang="en", target_lang="fr"):
        super().__init__()
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.data = load_dataset(path, name, split=split)

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(source sentence, target sentence)`` tuple at ``index``."""
        # Look the row up once instead of once per language: every
        # ``self.data[index]`` access fetches the row again.
        pair = self.data[index]["translation"]
        return pair[self.source_lang], pair[self.target_lang]
    
dataset = TranslationDataset()

In [104]:
# step2. split data

from torch.utils.data import random_split

train_set, valid_set = random_split(dataset, lengths=[0.8, 0.2])

In [105]:
# step3. preprocessing

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")

def preprocess(batch):
    """Collate a list of (english, french) pairs into one tokenized batch.

    Used as the ``collate_fn`` of the data loaders: element 0 of every pair
    gets the T5 translation prefix, element 1 is passed as ``text_target``.
    The tokenizer is asked to pad/truncate to 500 tokens and to return
    PyTorch tensors. Relies on the notebook-level ``tokenizer``.

    NOTE(review): the labels appear to be padded with the tokenizer's pad
    token rather than -100 -- confirm that the training loop masks padding
    in the loss.
    """
    task_prefix = "translate English to French: "
    sources, references = [], []
    for pair in batch:
        sources.append(task_prefix + pair[0])
        references.append(pair[1])
    return tokenizer(
        sources,
        text_target=references,
        padding="max_length",
        max_length=500,
        truncation=True,
        return_tensors="pt",
    )

In [106]:
# step4. construct dataloader

from torch.utils.data import DataLoader

# Both loaders tokenize and pad each batch on the fly through `preprocess` (the collate_fn).
train_loader = DataLoader(train_set, batch_size=16, shuffle=False, collate_fn=preprocess)
# The validation loader must read the held-out split returned by random_split, not the training split.
valid_loader = DataLoader(valid_set, batch_size=1, shuffle=False, collate_fn=preprocess)

In [107]:
next(iter(train_loader))

{'input_ids': tensor([[13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0],
        ...,
        [13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ 8786,     3,    85,  ...,     0,     0,     0],
        [  325,     3,    40,  ...,     0,     0,     0],
        [  312,  1072,     9,  ...,     0,     0,     0],
        ...,
        [ 1022, 11857,     9,  ...,     0,     0,     0],
        [  312, 14879,     6,  ...,     0,     0,     0],
        [ 1636,  3307,    71,  ...,     0,     0,     0]])}

## Hugging face way

In [None]:
# step1. load data

data = load_dataset("opus_books", 'en-fr', split='train')


In [119]:
# step2. split data

data_set =  data.train_test_split(test_size=0.2)

In [140]:
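# step3. preprocess data

# Batched processing should be used here.
# NOTE: this step relies on `tokenizer`, `prefix`, `source_lang` and `target_lang` defined in earlier cells.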

def preprocess_batched(examples):
    """Tokenize a whole batch of translation rows at once.

    Each entry of ``examples["translation"]`` is read as a dict keyed by
    language code. Source sentences receive the task ``prefix``; target
    sentences are passed as ``text_target``. Sequences are truncated to at
    most 128 tokens. Relies on the notebook-level ``prefix``,
    ``source_lang``, ``target_lang`` and ``tokenizer``.
    """
    pairs = examples["translation"]
    sources = [prefix + pair[source_lang] for pair in pairs]
    references = [pair[target_lang] for pair in pairs]
    return tokenizer(sources, text_target=references, max_length=128, truncation=True)

preprocessed_data = data_set.map(preprocess_batched, batched=True, remove_columns=data_set["train"].column_names)

In [109]:
# step4. define tokenizer

from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [113]:
# step5. define model

# The model is passed to the collator so that the collator knows whether
# and how to build the decoder inputs, i.e. the labels shifted by one extra
# token.
# This is vitally important for translation tasks, but not necessary for
# other tasks such as classification.

from transformers import T5ForConditionalGeneration, AutoConfig

model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

In [143]:
# step6. get collator

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [144]:
# step7. construct dataloader

from torch.utils.data import DataLoader

train_loader_col = DataLoader(preprocessed_data["train"], batch_size=10, shuffle=False, collate_fn=data_collator)
valid_loader_col = DataLoader(preprocessed_data["test"], batch_size=1, shuffle=False, collate_fn=data_collator)

In [145]:
# The data are not the same as in the torch way, because the split of the data is random.
next(iter(train_loader_col))

{'input_ids': tensor([[13959,  1566,    12,  2379,    10,   101,   646,    24,   294,    13,
             8,   684,  2111, 17310,   203,   977,    11,  1522,  1852,   470,
           281,   223,    12,    34,     5,     1,     0,     0],
        [13959,  1566,    12,  2379,    10,  1853, 10842, 10327,  3316,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [13959,  1566,    12,  2379,    10,   901,     9,    77,    18,   371,
          1211,  8632,     1,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [13959,  1566,    12,  2379,    10,   101,   130,   840,    16,     8,
           740,    13,     8, 16808, 18867, 23081,    44,  2788,    15,    18,
           188,  5497,    88,    31,     7,  1121,     5,     1],
        [13959,  1566,    12,  2379,    10,   216,  4363,    44,    69,   234,
           