In [7]:
# Kindly review the Generators, List, Dictionary warmups

# Preprocessing involves visualising how the data transforms, 
# followed by remembering and choosing correct methods & then implementing.

# Practice writing functions that take different types of args, and
# managing the processing inside the function.

In [1]:
from datasets import (
    load_dataset,
    list_datasets,
    load_dataset_builder,
    get_dataset_split_names,
    get_dataset_infos,
    get_dataset_config_info,
    get_dataset_config_names,
    load_from_disk,
    list_metrics,
    load_metric
)
import warnings
warnings.filterwarnings("ignore")
from rich import print

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
ds_list = list_datasets()

# Choose 2 different datasets and do the below warmups for 2 weeks

In [None]:
metrics_list = list_metrics()

In [None]:
len(ds_list)

In [None]:
# Preview the first five dataset names (positions 0 through 4).
for position, dataset_name in enumerate(ds_list):
    print(dataset_name)
    if position >= 4:
        break

In [None]:
len(metrics_list)

In [None]:
# Preview the first five metric names (positions 0 through 4).
for position, metric_name in enumerate(metrics_list):
    print(metric_name)
    if position >= 4:
        break

In [None]:
# we might want the info on a dataset before pulling it locally
ds_info = get_dataset_infos('rotten_tomatoes')
print(ds_info)

In [None]:
# get the split names
ds_splits = get_dataset_split_names('rotten_tomatoes')
ds_splits

In [None]:
glue_config_names = get_dataset_config_names("glue")
glue_config_names

# Try the above with "PolyAI/minds14"
# Try to locate more datasets with such configs
# https://huggingface.co/datasets/PolyAI/minds14
# https://huggingface.co/datasets/nyu-mll/glue/viewer/
# https://gluebenchmark.com/

In [None]:
glue = load_dataset_builder("glue", 'cola')
# Please pick one among the available configs: ['ax', 'cola', 'mnli', 'mnli_matched', 'mnli_mismatched', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli']

In [None]:
print(glue.builder_configs)  # speaks about the parquet / arrow file at disk level details

In [None]:
print(glue.info)  # discusses the parts of the Dataset at datalevel

In [None]:
glue.cache_dir

In [None]:
tomatoes_ds = load_dataset("rotten_tomatoes")

In [None]:
# working with repo with remote code
c4_config_names = get_dataset_config_names("c4", trust_remote_code=True)
c4_config_names

In [None]:
c4_splits = get_dataset_split_names("c4", "realnewslike", trust_remote_code=True)
c4_splits

In [None]:
# Diving into the data part of the dataset, working with rotten_tomatoes

tomatoes_ds

In [None]:
# observe that tomatoes_ds is a DatasetDict, so we need to access the Dataset object inside it via its split name
toma_train_ds = tomatoes_ds['train']
toma_train_ds

In [None]:
# try exploring the above Dataset, DatasetDict, DatasetInfo, DatasetConfig objects with dir function
toma_train_ds[0]

In [None]:
toma_train_ds['text'][0]

In [None]:
# Timing operations: row-first vs column-first access. What's the difference?

import time

# perf_counter() is the clock intended for measuring short intervals;
# time.time() is a wall clock and can be too coarse for sub-millisecond work.
start_time = time.perf_counter()
d = toma_train_ds[0]['text']  # accessing the rows is faster
print(time.perf_counter() - start_time)

start_time = time.perf_counter()
d = toma_train_ds['text'][0]   # accessing the columns will take time
print(time.perf_counter() - start_time)

In [None]:
# creating an iterator from a dataset
toma_train_iter = load_dataset("rotten_tomatoes", split='train', streaming=True)
# streaming is getting data when required
for ex in toma_train_iter:
    print(ex)
    break
# The streaming dataset still has data after this loop... check the cells below.
# This is different from a plain iterator object that we create
# with the iter() function.

In [None]:
type(toma_train_iter) 

In [None]:
next(iter(toma_train_iter))

In [None]:
list(toma_train_iter.take(3))

In [None]:
toma_train_ds[:3]

In [None]:
billsum = load_dataset(path="billsum",
                       split="train",
                       keep_in_memory=True)

In [None]:
from pathlib import Path

# Build the cache location from the current user's home directory instead of
# a hard-coded, machine-specific absolute path, so the cell runs anywhere.
hf_cache_dir = Path.home() / ".cache" / "huggingface"

glue = load_dataset("glue", "mrpc",
                    cache_dir=str(hf_cache_dir),
                    download_mode='reuse_dataset_if_exists')

In [None]:
twitter_list = ["emoji", "emotion", "hate", "irony", 
"offensive", "sentiment", "stance_abortion", "stance_atheism", 
"stance_climate", "stance_feminist", "stance_hillary"]
# Think how else can we get the above list!!!

# Keep every loaded config, keyed by task name. Rebinding a single variable
# on each iteration would leave only the last config reachable.
twitter_by_task = {}
for task in twitter_list:
    twitter = load_dataset("tweet_eval", task)
    twitter_by_task[task] = twitter

In [None]:
# [Advanced] Write a wrapper class that takes a dataset name, provides access to the list of configs, splits, and dataset info,
# and contains methods to load the dataset into memory and process it as required.  << Expect to spend at least 60 to 90 mins
ought_list = ['ade_corpus_v2', 'banking_77', 'terms_of_service', 
              'tai_safety_research', 'neurips_impact_statement_risks',
              'overruling', 'systematic_review_inclusion', 'one_stop_english',
              'tweet_eval_hate', 'twitter_complaints', 'semiconductor_org_types']

# Keep every loaded config, keyed by task name. Rebinding `dataset` on each
# iteration would leave only the last config reachable.
raft_by_task = {}
for task in ought_list:
    dataset = load_dataset("ought/raft", task)
    raft_by_task[task] = dataset

In [None]:
# Measuring the memory footprint (not the time) of loading a dataset:
# compare the process's resident set size (RSS), in MB, before and after.
import os; import psutil; import timeit

current_process = psutil.Process(os.getpid())  # one handle, reused for both readings

mem_before = current_process.memory_info().rss / (1024 * 1024)
ade_data = load_dataset("ade_corpus_v2", 'Ade_corpus_v2_classification', split="train")
mem_after = current_process.memory_info().rss / (1024 * 1024)

mem_after - mem_before

In [None]:
from datasets import load_dataset, Audio

dataset = load_dataset("PolyAI/minds14", "en-US", split="train")

In [None]:
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor

model = AutoModelForAudioClassification.from_pretrained("facebook/wav2vec2-base")
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

In [None]:
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset[0]["audio"]

In [None]:
def preprocess_function(examples):
    """Turn a batch of audio examples into feature-extractor inputs.

    Collects the ``"array"`` entry of every item in ``examples["audio"]`` and
    passes the collected arrays to the module-level ``feature_extractor`` with
    a sampling rate of 16000, padding enabled, and truncation at
    ``max_length=100000``. Returns whatever the extractor returns.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    extractor_options = dict(
        sampling_rate=16000,
        padding=True,
        max_length=100000,
        truncation=True,
    )
    return feature_extractor(waveforms, **extractor_options)


In [None]:
dataset_10 = dataset.select(range(10)).map(preprocess_function, batched=True)

In [None]:
dataset_10

In [None]:
dataset_10[0]

In [None]:
dataset_10 = dataset_10.rename_column("intent_class", "labels")

In [None]:
import librosa
import IPython.display

# Load the audio file
audio_data, sample_rate = librosa.load('example_audio.wav')

In [None]:
# Play the audio file
IPython.display.Audio(audio_data, rate=sample_rate)

In [None]:
# work on text processing
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
print(tokenizer(toma_train_ds[0]["text"])) # explore with different tokenizers

In [None]:
def tokenization(example):
    """Run the module-level ``tokenizer`` over the ``"text"`` field and return its output."""
    encoded = tokenizer(example["text"])
    return encoded


# Shuffle with seed 57, keep the first five rows, and tokenize them as one batch.
toma_5_ds = (
    toma_train_ds
    .shuffle(seed=57)
    .select(range(5))
    .map(tokenization, batched=True)
)
toma_5_ds  # explore the output using known Python commands << 5 mins

In [None]:
# feature extraction and augmenting images
image_tokenizer = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

In [None]:
beans_ds = load_dataset("beans", split="train")

In [None]:
# Index the row first, then the column, so only the first image is decoded.
# Column-first access (beans_ds['image'][0]) goes through the whole 'image'
# column just to return one item.
beans_ds[0]['image']

In [None]:
# !pip install torchvision torchaudio

In [None]:
from torchvision.transforms import RandomRotation

rotate = RandomRotation(degrees=(0, 90))
def transforms(examples):
    """Add a ``"pixel_values"`` entry built from the batch's images.

    Every item of ``examples["image"]`` is converted to RGB and passed through
    the module-level ``rotate`` transform. The batch is updated in place and
    returned.
    """
    rotated_images = []
    for picture in examples["image"]:
        rgb_picture = picture.convert("RGB")
        rotated_images.append(rotate(rgb_picture))
    examples["pixel_values"] = rotated_images
    return examples



In [None]:
beans_ds.set_transform(transforms)
beans_ds[0]["pixel_values"]

In [None]:
# Diving into processing part. 

c4_subset = load_dataset("allenai/c4", data_files=["en/c4-train.00000-of-01024.json.gz",])

In [None]:
c4_subset

In [None]:
csv_dataset = load_dataset("csv", data_files="winequality.csv")

In [None]:
json_dataset = load_dataset("json", data_files="ordvJNeMjPIcomments.json")

In [None]:
json_dataset

In [None]:
base_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"

dataset = load_dataset("json", data_files={"train": base_url + "train-v1.1.json", "validation": base_url + "dev-v1.1.json"}, field="data")

In [None]:
base_url = "https://storage.googleapis.com/huggingface-nlp/cache/datasets/wikipedia/20200501.en/1.0.0/"
data_files = {"train": base_url + "wikipedia-train.parquet"}
wiki = load_dataset("parquet", data_files=data_files, split="train")
# Downloading data:   5%|▌         | 977M/18.3G [04:36<1:21:58, 3.53MB/s]  

In [None]:
base_url = "https://storage.googleapis.com/huggingface-nlp/cache/datasets/wikipedia/20200501.en/1.0.0/"
data_files = {"train": base_url + "wikipedia-train.arrow"}
wiki = load_dataset("arrow", data_files=data_files, split="train")
# Downloading data:   0%|          | 3.49M/18.3G [00:02<3:36:14, 1.41MB/s] 

In [None]:
from datasets import Dataset
my_dict = {"a": [1, 2, 3]}  # from dict
dataset = Dataset.from_dict(my_dict)

In [None]:
my_list = [{"a": 1}, {"a": 2}, {"a": 3}]  # list of dicts
dataset = Dataset.from_list(my_list)

In [None]:
def my_gen():
    """Yield three single-key records: ``{"a": 1}``, ``{"a": 2}``, ``{"a": 3}``."""
    value = 1
    while value < 4:
        yield {"a": value}
        value += 1
dataset = Dataset.from_generator(my_gen)  # from generators

In [None]:
import pandas as pd
df = pd.DataFrame({"a": [1, 2, 3]})
dataset = Dataset.from_pandas(df)  # from dataframe

#### Entering into Processing of Datasets

- Reorder rows and split the dataset.

- Rename and remove columns, and other common column operations.

- Apply processing functions to each example in a dataset.

- Concatenate datasets.

- Apply a custom formatting transform.

- Save and export processed datasets.

    > Methods to master:

    Sort, shuffle, select, split, and shard

In [2]:

dataset = load_dataset("glue", "mrpc", split="train")

In [None]:
dataset['label'][:10]

In [None]:
sorted_ds = dataset.sort("label")
sorted_ds['label'][:10]

In [None]:
# selecting specific indices

small_dataset = dataset.select([0, 10, 20, 30, 40, 50])

small_dataset['label']

In [None]:
start_with_ar = dataset.filter(lambda example: example["sentence1"].startswith("Ar"))
start_with_ar['sentence1'][:2]

In [None]:
split_ds = dataset.train_test_split(test_size=0.1)
split_ds

In [None]:
shard_ds = dataset.shard(num_shards=4, index=0)

In [None]:
shard_ds

In [None]:
sorted_ds.shuffle(5).select(range(10))['label']

In [3]:
# Remove, Rename
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [4]:
dataset = dataset.rename_column('sentence1', 'sent1')
dataset

Dataset({
    features: ['sent1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [6]:
dataset = dataset.remove_columns(['idx'])
dataset

Dataset({
    features: ['sent1', 'sentence2', 'label'],
    num_rows: 3668
})

In [8]:
dataset.features

{'sent1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None)}

In [10]:
from datasets import ClassLabel, Value

practice_cast = dataset.features.copy()

In [11]:
practice_cast['label'] = ClassLabel(names=['Super', 'Dooper'])
dataset = dataset.cast(practice_cast)
dataset

Casting the dataset: 100%|██████████| 3668/3668 [00:00<00:00, 11877.82 examples/s]


Dataset({
    features: ['sent1', 'sentence2', 'label'],
    num_rows: 3668
})

In [14]:
dataset.features

{'sent1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['Super', 'Dooper'], id=None)}

In [28]:
# flatten : nested Dicts can be flattened

squad_ds = load_dataset("squad", split="train")
squad_ds.features

{'id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'context': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}

In [29]:
flat_squad = squad_ds.flatten()
flat_squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
    num_rows: 87599
})

In [30]:
def add_prefix(example):
    """Replace ``example["answers.text"]`` with its first entry, prefixed by ``'My Answer: '``.

    The example is modified in place and the same object is returned.
    """
    first_answer = example["answers.text"][0]
    example["answers.text"] = "My Answer: " + first_answer
    return example

In [25]:
updated_dataset = flat_squad.map(add_prefix)
updated_dataset["answers.text"][:5]

Map:   0%|          | 253/87599 [00:00<00:37, 2317.97 examples/s]

Map: 100%|██████████| 87599/87599 [00:19<00:00, 4580.93 examples/s]


['My Answer: Saint Bernadette Soubirous',
 'My Answer: a copper statue of Christ',
 'My Answer: the Main Building',
 'My Answer: a Marian place of prayer and reflection',
 'My Answer: a golden statue of the Virgin Mary']

In [27]:
# single process
updated_dataset = flat_squad.map(lambda example: {"updated_answer": example["answers.text"]}, remove_columns=["answers.text"])
updated_dataset.column_names

Map:   0%|          | 233/87599 [00:00<00:39, 2223.23 examples/s]

Map: 100%|██████████| 87599/87599 [00:20<00:00, 4352.45 examples/s]


['id',
 'title',
 'context',
 'question',
 'answers.answer_start',
 'updated_answer']

In [31]:
# multiprocess
updated_dataset = flat_squad.map(lambda example: {"updated_answer": example["answers.text"]},
                                 remove_columns=["answers.text"],
                                 num_proc=4)
updated_dataset.column_names

Map (num_proc=4): 100%|██████████| 87599/87599 [00:23<00:00, 3696.08 examples/s] 


['id',
 'title',
 'context',
 'question',
 'answers.answer_start',
 'updated_answer']

In [55]:
flat_squad[0]['question']

'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'

In [49]:
def chunk_examples(examples, chunk_size=10):
    """Split every question in a batch into consecutive fixed-width character chunks.

    Args:
        examples: Batch mapping whose ``"question"`` entry is an iterable of
            strings.
        chunk_size: Maximum number of characters per chunk (default 10). The
            last chunk of a question may be shorter.

    Returns:
        ``{"chunked_questions": [...]}``: one flat list holding the chunks of
        all questions, in order. It can contain more items than the input
        batch has questions; an empty question contributes no chunks.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    chunks = []
    for sentence in examples["question"]:
        # No per-sentence print here: mapping over a large dataset would
        # otherwise emit one output line per question.
        chunks.extend(
            sentence[start:start + chunk_size]
            for start in range(0, len(sentence), chunk_size)
        )
    return {"chunked_questions": chunks}

In [60]:
def compute_question_length(examples):
    """Count the whitespace-separated words in every question of a batch.

    Args:
        examples: Batch mapping whose ``"question"`` entry is an iterable of
            strings.

    Returns:
        ``{"question_length": [...]}`` with one word count per question, in
        input order.
    """
    # Iterate the question column. Iterating the batch mapping itself yields
    # its keys (strings), which cannot be indexed as example[0]["question"].
    return {"question_length": [len(question.split()) for question in examples["question"]]}

In [47]:
# Manual walk-through: cut one sentence into 10-character pieces,
# printing each piece as it is collected.
sentence = "What is in front of the Notre Dame Main Building?"
chunb = []
start = 0
while start < len(sentence):
    piece = sentence[start: start + 10]
    print(piece)
    chunb.append(piece)
    start += 10

In [48]:
chunb

['What is in', ' front of ', 'the Notre ', 'Dame Main ', 'Building?']

In [None]:
# chunk_examples returns more rows than the batch it receives, so the original
# columns must be dropped: their lengths would no longer match the new
# "chunked_questions" column.
chunkd_batches = squad_ds.map(chunk_examples, batched=True, batch_size=5,
                              remove_columns=squad_ds.column_names)

In [64]:
from random import randint
from transformers import pipeline

# Fill-mask pipeline used by the augmentation step, and the mask token of its tokenizer.
fillmask = pipeline("fill-mask", model="roberta-base")

mask_token = fillmask.tokenizer.mask_token

# Keep the first 100 rows (or all of them, if there are fewer). select() picks
# rows by index directly instead of running a Python predicate over every row.
smaller_dataset = dataset.select(range(min(100, len(dataset))))

In [None]:
def augment_data(examples):
    """Augment each sentence of a batch with fill-mask rewrites.

    For every sentence, one space-separated word is replaced by the
    module-level ``mask_token``, the module-level ``fillmask`` pipeline is run
    on the masked sentence, and the original sentence followed by the
    ``"sequence"`` of the first three predictions is emitted.

    Args:
        examples: Batch mapping holding the sentences under ``"sentence1"``,
            or under ``"sent1"`` (the name the column carries after the rename
            earlier in this notebook).

    Returns:
        ``{"data": [...]}``: a flat list with each original sentence followed
        by its augmented variants, so it is longer than the input batch.
    """
    # An earlier cell renames 'sentence1' to 'sent1'; accept either name so the
    # function works both on the freshly loaded and on the renamed dataset.
    if "sentence1" not in examples and "sent1" in examples:
        text_column = "sent1"
    else:
        text_column = "sentence1"

    outputs = []
    for sentence in examples[text_column]:
        words = sentence.split(' ')
        # randint(1, 0) raises ValueError for a one-word sentence; in that case
        # mask the only word. Otherwise the first word is never masked.
        K = randint(1, len(words) - 1) if len(words) > 1 else 0
        masked_sentence = " ".join(words[:K] + [mask_token] + words[K+1:])
        predictions = fillmask(masked_sentence)
        augmented_sequences = [prediction["sequence"] for prediction in predictions[:3]]
        outputs += [sentence] + augmented_sequences
    return {"data": outputs}

In [None]:
augmented_dataset = smaller_dataset.map(augment_data,
                                        batched=True,
                                        remove_columns=dataset.column_names,
                                        batch_size=8)
augmented_dataset[:9]["data"]

In [None]:
from datasets import concatenate_datasets, load_dataset

bookcorpus = load_dataset("bookcorpus", split="train")
wiki = load_dataset("wikipedia", "20220301.en", split="train")

wiki = wiki.remove_columns([col for col in wiki.column_names if col != "text"])  # only keep the 'text' column

assert bookcorpus.features.type == wiki.features.type

bert_dataset = concatenate_datasets([bookcorpus, wiki])

In [None]:
from datasets import interleave_datasets

seed = 42
probabilities = [0.3, 0.5, 0.2]
d1 = Dataset.from_dict({"a": [0, 1, 2]})
d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
d3 = Dataset.from_dict({"a": [20, 21, 22]})
dataset = interleave_datasets([d1, d2, d3], probabilities=probabilities, seed=seed)
dataset["a"]

In [None]:
import torch

# `dataset` was rebound by the interleave cell to a dataset whose only column
# is "a", so requesting tokenizer columns from it fails. Format the tokenized
# rotten_tomatoes sample instead.
# NOTE(review): assumes toma_5_ds carries these four columns (label plus the
# bert-base-uncased tokenizer outputs) - confirm by checking toma_5_ds.column_names.
# with_format returns a new object, so toma_5_ds itself keeps its format and
# the cell can be re-run safely.
toma_5_torch = toma_5_ds.with_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "label"],
)
toma_5_torch[0]