# Datasets

In [1]:
from datasets import *

## Load online data

`load_dataset` can be used to download any dataset hosted on the Hugging Face Hub: https://huggingface.co/datasets.


In [6]:
# To load the data, pass the location of the dataset, which
# is composed of the name of the repository and the dataset name.
# There is no need to provide the full URL.

# By default, the returned data is a DatasetDict.
# Depending on the structure of the data to be downloaded, the content of this dict
# will change.
# The following example has only one element in the dict since only one
# split is available.
# Otherwise, the dict may contain several elements such as train, valid, test.


data =  load_dataset("yezhengli9/opus_books_demo")
data

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 127085
    })
})

In [28]:
# If we are only interested in one part of the dataset (e.g. train), we can use
# the 'split' argument.
# The data then becomes a Dataset and is no longer a dict.

data =  load_dataset("yezhengli9/opus_books_demo", split="train")
data

Dataset({
    features: ['id', 'translation'],
    num_rows: 127085
})

In [15]:
# One can also slice the dataset to be loaded.
# The slicing can be done by providing:
#  * the start and the end indices.
#  * a percentage of the data size

data =  load_dataset("yezhengli9/opus_books_demo", split="train[:10]")
# data =  load_dataset("yezhengli9/opus_books_demo", split="train[:10%]")
# data =  load_dataset("yezhengli9/opus_books_demo", split=["train[:10%]", "train[90%:]"]) # return a list
data

Dataset({
    features: ['id', 'translation'],
    num_rows: 10
})

In [9]:
# Sometimes, there are several subsets in a dataset.
# We can download a subset by providing the name of this subset

data_sub = load_dataset("Helsinki-NLP/opus_books", "en-fr")
data_sub

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 127085
    })
})

## Inspect dataset

The inspection is to see and check data.
The returned results are lists.

In [25]:
# If the data is a dict, specifying the key (e.g. train) gives access to its content;
# otherwise, the data can be accessed directly.
data_sub["train"]["translation"][:2]

# 

[{'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'},
 {'en': 'Alain-Fournier', 'fr': 'Alain-Fournier'}]

In [27]:
# We can get other attributes.

# All column names. Printed explicitly, because a notebook cell only displays
# the value of its last expression.
print(data.column_names)

# The features (column name -> type) are displayed as the cell output.
data.features

{'id': Value(dtype='string', id=None),
 'translation': Translation(languages=['en', 'fr'], id=None)}

## Splitting dataset

In [29]:
# Split the data into different groups (e.g. train, test).
# This returns a DatasetDict.
# A fixed seed is passed so that re-running the notebook gives the same split.

data.train_test_split(test_size=0.2, seed=42)

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 101668
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 25417
    })
})

## Filter & Manipulation

These functions process the data for later use, which is different from the inspection in the section above.
The results of these functions are still datasets.

In [32]:
# Select rows by index. This returns a Dataset, which is different from the inspection
# in the previous section, which returns a list of dicts.

data.select([0,1])

Dataset({
    features: ['id', 'translation'],
    num_rows: 2
})

In [38]:
# For more complex filtering, one can use the filter method with a filter function expressing the filter
# criterion; this returns a Dataset.
# The operation is not in place.

data_bad = data.filter(lambda translates : "bad " in translates["translation"]["en"])
data_bad["translation"][:5]

Filter:   0%|          | 0/127085 [00:00<?, ? examples/s]

[{'en': 'All day Millie had waited for the station omnibus to bring her a hat for the bad weather.',
  'fr': 'Toute la journée, Millie avait attendu une voiture de La Gare qui devait lui apporter un chapeau pour la mauvaise saison.'},
 {'en': 'And thus they had it out together, without the least bad feeling.',
  'fr': 'Et elles continuaient ainsi à se tenir tête sans la moindre humeur.'},
 {'en': 'At first Augustin, in a bad temper, watched from the classroom step as this play started.',
  'fr': 'Augustin, debout sur le seuil de la classe, regardait d’abord avec mauvaise humeur s’organiser ces jeux.'},
 {'en': "The bad lads of the countryside thought it a lark to smoke cigarettes, to put sugar and water on their hair to make it curl, to kiss girls from the Continuation School in the street, and to call out from behind a hedge, 'Pokebonnet,' to rag a passing nun.",
  'fr': 'Fumer la cigarette, se mettre de l’eau sucrée sur les cheveux pour qu’ils frisent, embrasser les filles du Cours C

In [58]:
# To apply an operation to data, one can use map method.

def replace(translate) :
    """Return a copy of one example with "bad " replaced by "good " in its English text.

    Args:
        translate: a single example (mapping) whose "translation" entry is a
            mapping holding an 'en' string.

    Returns:
        A new dict with the same keys as ``translate``. Its "translation" entry
        is a new dict in which the 'en' text has every "bad " replaced by
        "good ". The input example is left unmodified.
    """
    # Build new containers instead of assigning into the argument, so the
    # caller's example and its nested "translation" dict are not mutated.
    translation = dict(translate["translation"])
    translation['en'] = translation['en'].replace("bad ", "good ")

    updated = dict(translate)
    updated["translation"] = translation
    return updated

data_good = data.map(replace)

Map:   0%|          | 0/127085 [00:00<?, ? examples/s]

In [59]:
# Check that there are no more sentences containing "bad "

data_good_b = data_good.filter(lambda translates : "bad " in translates["translation"]["en"])
data_good_b["translation"][:5]

Filter:   0%|          | 0/127085 [00:00<?, ? examples/s]

[]

In [66]:
# a more complex example using tokenizer

from transformers import AutoTokenizer

# load tokenizer

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")

In [82]:
# function to do the preprocessing
# https://huggingface.co/docs/transformers/tasks/translation

# Prefix the input with a prompt so T5 knows this is a translation task. 
# Some models capable of multiple NLP tasks require prompting for specific tasks.

source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "

def preprocess(example) :
    """Tokenize a single translation example.

    The source sentence is prepended with the module-level ``prefix`` and
    passed to the module-level ``tokenizer``; the target sentence is passed as
    ``text_target``. Both are truncated to at most 128 tokens.

    Args:
        example: one example whose "translation" entry is indexable by
            ``source_lang`` and ``target_lang``.

    Returns:
        Whatever ``tokenizer`` returns for this source/target pair.
    """
    translation = example["translation"]
    return tokenizer(
        prefix + translation[source_lang],
        text_target=translation[target_lang],
        max_length=128,
        truncation=True,
    )

In [78]:
preprocessed_data = data.map(preprocess)

Map:   0%|          | 0/127085 [00:00<?, ? examples/s]

In [95]:
# To speed up the processing, we can specify the number of processes.

preprocessed_data = data.map(preprocess, num_proc=4)
preprocessed_data

Map (num_proc=4):   0%|          | 0/127085 [00:00<?, ? examples/s]

In [91]:
# batched processing 

# If we use the function above for batched processing, we get an indexing error.
# The error is caused by the fact that the function above doesn't support batched data.
# E.g. without batching, each element is data[i], and data[i]["translation"]['en'] returns the
# i-th sentence in English.
# However, data[i:i+batch_size]["translation"] is a list, so indexing it with ['en'] raises an error.

def preprocess_batched(examples):
    """Tokenize a batch of translation examples.

    Args:
        examples: a batch whose "translation" entry is an iterable of items,
            each indexable by ``source_lang`` and ``target_lang``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the prefixed source
        sentences, with the target sentences passed as ``text_target`` and
        everything truncated to at most 128 tokens.
    """
    pairs = examples["translation"]
    # Every source sentence gets the task prefix; targets are used as-is.
    sources = [prefix + pair[source_lang] for pair in pairs]
    references = [pair[target_lang] for pair in pairs]
    return tokenizer(
        sources,
        text_target=references,
        max_length=128,
        truncation=True,
    )

preprocessed_data = data.map(preprocess_batched, batched=True)
preprocessed_data

Map:   0%|          | 0/127085 [00:00<?, ? examples/s]

In [94]:
preprocessed_data = data.map(preprocess_batched, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/127085 [00:00<?, ? examples/s]

In [119]:
# we can remove the information we don't want

preprocessed_data = data.map(preprocess_batched, batched=True, remove_columns=data.column_names)
preprocessed_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 127085
})

## Save & Load

In [105]:
# save 
preprocessed_data.save_to_disk("./processed_data")

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 127085
})

In [None]:
# load 
preprocessed_data = load_from_disk("./processed_data")

## Load local data

In [None]:
# we can load one or several files
# If there are several files, put all file names into a list and pass the list to data_files arg.

data = load_dataset("csv", data_files="./mydata.csv", split="train")

In [None]:
# There is also a dedicated function to load a CSV file.

data = Dataset.from_csv("./mydata.csv")

In [None]:
# load all files in a folder

data = load_dataset("csv", data_dir="./myfilespath", split="train")

In [None]:
# load from a pandas DataFrame (data_pd is assumed to be an existing pandas DataFrame)

data = Dataset.from_pandas(data_pd)

In [110]:
# If the data is a list of plain strings, we can't load it directly as a dataset.
# If we try, we get an error: AttributeError: 'str' object has no attribute 'get'

data_list = ["sentence1", "sentence2"]

# Instead, we should wrap each item of the list in a dict:

data_list = [{"text": line} for line in data_list]
print(data_list)

Dataset.from_list(data_list)

[{'text': 'sentence1'}, {'text': 'sentence2'}]


Dataset({
    features: ['text'],
    num_rows: 2
})