In [7]:
from datasets import Dataset, DatasetDict, load_dataset
from millify import millify
from tabulate import tabulate

# Corpora to summarise, fetched from the Hugging Face Hub.
# load_dataset() is called without `split=`, so each entry is a DatasetDict
# (split name -> Dataset) rather than a single Dataset.
dataset_list: list[DatasetDict] = [
    load_dataset("HuggingFaceH4/no_robots"),
    load_dataset("rahular/simple-wikipedia"),
    load_dataset("roneneldan/TinyStories"),
    load_dataset("nampdn-ai/tiny-textbooks"),
]

In [17]:
# Print one table row per (dataset, split): row count, byte size, and each
# split's share of the combined totals.
header = ["name", "split", "rows", "bytes", "rows %", "bytes %"]

# Collect raw numbers first; they can only be formatted once the totals are known.
split_stats = []
for dataset in dataset_list:
    # Each entry maps split name -> Dataset, so walk the splits directly.
    for split_name, split_ds in dataset.items():  # type: ignore
        info = split_ds.info
        split_info = info.splits[split_name]
        split_stats.append(
            [info.dataset_name, split_name, split_info.num_examples, split_info.num_bytes]
        )

total_rows = sum(rows for _, _, rows, _ in split_stats)
total_bytes = sum(size for _, _, _, size in split_stats)


def _percent(part, whole):
    """Return `part` as a whole-number percentage of `whole`, or 0 when `whole` is 0."""
    # Guard the division: splits that report zero rows/bytes would otherwise
    # raise ZeroDivisionError.
    return round(part * 100 / whole) if whole else 0


# Human-readable rows: abbreviated counts plus rounded percentage shares.
data = [
    [
        name,
        split,
        millify(rows),
        millify(size),
        _percent(rows, total_rows),
        _percent(size, total_bytes),
    ]
    for name, split, rows, size in split_stats
]

print(tabulate(data, header, tablefmt="grid"))
print(f"\n\nTotal bytes: {millify(total_bytes)}, Total rows: {millify(total_rows)}")

+------------------+------------+--------+---------+----------+-----------+
| name             | split      | rows   | bytes   |   rows % |   bytes % |
+==================+============+========+=========+==========+===========+
| no_robots        | train      | 10k    | 16M     |        0 |         0 |
+------------------+------------+--------+---------+----------+-----------+
| no_robots        | test       | 500    | 887k    |        0 |         0 |
+------------------+------------+--------+---------+----------+-----------+
| simple-wikipedia | train      | 770k   | 145M    |       23 |         4 |
+------------------+------------+--------+---------+----------+-----------+
| tiny_stories     | train      | 2M     | 2B      |       63 |        50 |
+------------------+------------+--------+---------+----------+-----------+
| tiny_stories     | validation | 22k    | 19M     |        1 |         1 |
+------------------+------------+--------+---------+----------+-----------+
| tiny-textbooks   | train      | 399k   | 2B      |       12 |        43 |
+-----------