In [1]:
import tiktoken
import numpy
import sys
import os
from tqdm import tqdm 
import numpy as np
# A notebook has no __file__ to anchor on, so the parent of the current working
# directory is appended to sys.path instead. The EDA import on the next line
# presumably resolves through this entry, so the notebook has to be launched
# from a folder that sits next to the EDA package -- TODO confirm layout.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../')))
from EDA import EDA_utils as EDA
import pickle


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load TinyStories through the project helper; later cells index the result by
# split name ("train", "validation") and read each record's "text" field.
dataset = EDA.load_hf_dataset("roneneldan/TinyStories")
tokenizer = tiktoken.get_encoding("gpt2") # tiktoken's "gpt2" BPE encoding (GPT-2's tokenizer, not the cl100k_base encoding used by GPT-3.5/GPT-4)

🔹 RAM used for loading dataset: 154.86 MB


In [4]:
# Tokenization {train}
# Every story is encoded to a 1-D array of token ids. The dtype is pinned to
# int32 on purpose: without it NumPy falls back to its platform default integer
# (usually int64, i.e. 8 bytes per token), which makes the real footprint twice
# the estimate printed below, and an empty token list would silently become a
# float64 array. int32 keeps the arrays in line with bytes_per_token.
tokenized_train_samples = []
for item in tqdm(dataset["train"], desc="Tokenizing Train Set"):
    input_ids = tokenizer.encode(item["text"])
    tokenized_train_samples.append(np.array(input_ids, dtype=np.int32))

# Total token count and estimated memory (GB)
total_train_tokens = sum(len(toks) for toks in tokenized_train_samples)
# Must match the dtype used above; the validation cell reuses this value.
bytes_per_token = 4  # int32:4 | uint16:2
total_size_gb = total_train_tokens * bytes_per_token / (1024**3)

print(f"\n🔹 Total tokens in train dataset: {total_train_tokens:,}")
print(f"🔹 Estimated train token memory: {total_size_gb:.2f} GB")

Tokenizing Train Set: 100%|██████████| 2119719/2119719 [02:29<00:00, 14159.12it/s]



🔹 Total tokens in train dataset: 471,872,517
🔹 Estimated train token memory: 1.76 GB


In [5]:
# Tokenization {validation}
# Same procedure as for the train split. The dtype is pinned to int32 because
# NumPy's default integer is platform dependent (usually int64, 8 bytes per
# token), which would make the real footprint larger than the estimate below,
# and an empty token list would otherwise become a float64 array.
tokenized_valid_samples = []
for item in tqdm(dataset["validation"], desc="Tokenizing Validation Set"):
    input_ids = tokenizer.encode(item["text"])
    tokenized_valid_samples.append(np.array(input_ids, dtype=np.int32))

# Total token count and estimated memory (MB)
# bytes_per_token comes from the train tokenization cell (4 bytes = int32),
# so that cell has to run before this one.
total_valid_tokens = sum(len(toks) for toks in tokenized_valid_samples)
total_size_mb = total_valid_tokens * bytes_per_token / (1024**2)

print(f"\n🔹 Total tokens in validation dataset: {total_valid_tokens:,}")
print(f"🔹 Estimated validation token memory: {total_size_mb:.2f} MB")


Tokenizing Validation Set: 100%|██████████| 21990/21990 [00:01<00:00, 14535.63it/s]


🔹 Total tokens in validation dataset: 4,743,928
🔹 Estimated validation token memory: 18.10 MB





In [6]:
# Persist both tokenized splits to disk, one pickle file per split.
# Each file holds the full Python list of per-story token arrays.
for pkl_path, samples in (
    ('tokenized_train_samples.pkl', tokenized_train_samples),
    ('tokenized_valid_samples.pkl', tokenized_valid_samples),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(samples, f)

In [None]:
import psutil

# Measure how much this process's resident set size (RSS, in bytes) grows while
# the pickled token lists are read back in.
# NOTE(review): if the tokenization cells already ran in this kernel, the two
# names below are already bound to the same data, so the delta likely also
# reflects releasing those copies -- run in a fresh kernel for a clean number.
process = psutil.Process(os.getpid())
ram_before = process.memory_info().rss

# Read the train split first, then the validation split.
with open('tokenized_train_samples.pkl', 'rb') as train_fh:
    tokenized_train_samples = pickle.load(train_fh)
with open('tokenized_valid_samples.pkl', 'rb') as valid_fh:
    tokenized_valid_samples = pickle.load(valid_fh)

# Second snapshot; the difference between the two is reported in GB.
ram_after = process.memory_info().rss
ram_used = ram_after - ram_before
print(f"🔹 RAM used for loading full dataset into memory: {ram_used / (1024**3):.2f} GB")