In [None]:
import json
from typing import Dict, Iterator

from utils import get_object, put_file, get_object_list, put_text, get_text

#### Introduction


In [None]:
from datasets import load_dataset
# Load the 'emotion' dataset and show the container type, the type of its
# 'train' split, and the dataset summary, one item per line.
emotions = load_dataset('emotion')
print(type(emotions), type(emotions['train']), emotions, sep='\n')

In [None]:
# Write every split to a local CSV file named 'reviews-{split}.csv'.
# The upload cell and the later load_dataset('csv', ...) cells look for exactly
# these file names, so the export must be CSV (not Parquet) under this name.
for split, dataset in emotions.items():
    dataset.to_csv(f'reviews-{split}.csv')

In [None]:
# Bucket that receives the per-split review files.
bucket_name = 'review-data'

# Send each split's CSV file to the bucket. The same string is passed as both
# the second and third argument of put_file (presumably object name and local
# file path -- confirm against utils.put_file).
for split in ('train', 'validation', 'test'):
    csv_name = f'reviews-{split}.csv'
    put_file(bucket_name, csv_name, csv_name)

#### Features and Benefits

In [None]:
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import BatchEncoding

# Name of the pretrained checkpoint; the tokenizer is loaded from it and is
# used by the tokenize() function defined in this cell.
model_check_point = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_check_point)

def tokenize(batch) -> BatchEncoding:
  """Run the module-level tokenizer over the 'text' entry of a batch.

  Padding and truncation are switched on and the result is requested as
  PyTorch tensors (return_tensors='pt').
  """
  encode_options = dict(padding=True, truncation=True, return_tensors='pt')
  return tokenizer(batch['text'], **encode_options)

# Apply tokenize() to every split in batched mode. Per the datasets docs,
# batch_size=None hands each split to the function as a single batch.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
# Show which columns the first encoded training row carries.
emotions_encoded['train'][0].keys()

#### Load Existing Object into a Dataset

In [None]:
from utils import get_object
# Map each split name to the CSV file that holds it.
data_files = {split: f'reviews-{split}.csv' for split in ('train', 'validation', 'test')}

# Request every split's object from the bucket and report its name and
# last-modified time. The file name is passed twice to get_object (presumably
# as object name and local destination -- confirm against utils.get_object).
for split in data_files:
    object_info = get_object(bucket_name, data_files[split], data_files[split])
    print(object_info.object_name, object_info.last_modified)

# Load the CSV files as one dataset with a split per data_files entry, then
# show the container type, the 'train' split type, and the summary.
reviews = load_dataset('csv', data_files=data_files)
print(type(reviews), type(reviews['train']), reviews, sep='\n')

#### Hugging Face’s S3 Interface

In [None]:
from s3fs import S3FileSystem

# Map each split name to its CSV file; the same name is used for the object
# inside the bucket and for the local download target.
data_files = {split: f'reviews-{split}.csv' for split in ('train', 'validation', 'test')}

bucket_name = 'review-data'
url = 'http://localhost:9000'

# Load the credentials and connection information.
with open('credentials.json') as f:
    credentials = json.load(f)

# Connect to the S3-compatible endpoint over plain HTTP.
s3 = S3FileSystem(
    key=credentials['accessKey'],
    secret=credentials['secretKey'],
    endpoint_url=url,
    use_ssl=False,
)
objects = s3.ls(bucket_name)

# Download '<bucket>/<file>' for every split into the working directory.
for split in data_files:
    object_name = f'{bucket_name}/{data_files[split]}'
    s3.download(object_name, data_files[split])

# Load the downloaded CSV files and show the container type, the 'train'
# split type, and the summary.
reviews = load_dataset('csv', data_files=data_files)
print(type(reviews), type(reviews['train']), reviews, sep='\n')

#### Break a Large Document into Paragraphs

In [None]:
bucket_name = 'philosophy-corpus'
file_name = 'gutenberg.org_cache_epub_1232_pg1232.txt'
author = 'Niccolo Machiavelli'  # This should be 'Niccolò Machiavelli' but MinIO only supports US ASCII.
title = 'The Prince'

# Read the entire file. The context manager closes the handle even if the read
# fails, and the explicit encoding keeps decoding independent of the platform's
# default locale.
with open(file_name, 'r', encoding='utf-8') as file_handle:
    file_text = file_handle.read()

# Upload every non-empty paragraph (blank-line separated) as its own object.
# `count` numbers only the paragraphs that are actually uploaded, so object
# names are consecutive: '<author>_<title>_000001.txt', '..._000002.txt', ...
count = 0
for paragraph in file_text.split("\n\n"):
    # Skip empty paragraphs.
    if not paragraph.strip():
        continue
    count += 1
    object_name = f'{author}_{title}_{count:06d}.txt'
    metadata = {'author': author, 'title': title}
    put_text(bucket_name, object_name, paragraph, metadata=metadata)

# Display the number of paragraphs uploaded.
count

#### Use a Python Generator for Large Datasets

In [None]:
from multiprocessing import cpu_count
from datasets.arrow_dataset import Dataset
from utils import get_object_list, get_text

# Bucket handed to document_generator through gen_kwargs further down this cell.
bucket_name = 'philosophy-corpus'
# Parallel generation is left disabled; num_proc is not defined.
#num_proc = cpu_count()

def document_generator(bucket_name: str) -> Iterator[Dict]:
    """Yield one record per object stored in a bucket.

    The object list is fetched once, when iteration starts; each object's
    text is then fetched one at a time as records are consumed.

    Args:
        bucket_name: Bucket to list and read from. The caller's value is
            honoured (it is no longer replaced by a hard-coded name).

    Yields:
        A dict with a single 'text' key whose value is whatever get_text
        returns for that object.
    """
    print('Retrieving document list.')
    for object_name in get_object_list(bucket_name):
        yield {'text': get_text(bucket_name, object_name)}

# Build the dataset from the generator, passing the bucket name through
# gen_kwargs and pointing the cache at a local directory.
ds = Dataset.from_generator(
    document_generator,
    cache_dir='./.cache/huggingface/datasets',
    gen_kwargs={'bucket_name': bucket_name},
    keep_in_memory=False,
)

# Show the dataset type first, then one sample record (index 6).
print(type(ds))
print(ds[6])

In [None]:
from datasets.dataset_dict import DatasetDict

# Wrap the dataset built from the generator in a DatasetDict under the
# 'train' split, then display it as the cell output.
dd = DatasetDict({'train': ds})
dd