In [2]:
from datasets import load_dataset
import tiktoken

In [4]:
# Tokenizer that tiktoken associates with the embedding model used throughout this notebook.
enc = tiktoken.encoding_for_model("text-embedding-3-small")


def count_tokens_for_field(field="title", streaming=True):
    """Count the tokens held in one text field of the CPC dataset.

    Iterates the ``train`` split of ``mhurhangee/cpc-classifications``,
    encodes the value of ``field`` for every record with the module-level
    ``enc`` tokenizer, and prints a one-line summary.

    Args:
        field: Name of the record field to tokenize.
        streaming: Forwarded to ``load_dataset``.

    Returns:
        Total number of tokens over all records whose ``field`` is non-empty.
    """
    records = load_dataset(
        "mhurhangee/cpc-classifications",
        split="train",
        streaming=streaming,
    )

    n_rows = 0
    n_tokens = 0
    for record in records:
        value = record.get(field)
        if value:
            # Records with a missing or empty field count neither as a row nor as tokens.
            n_tokens += len(enc.encode(value))
            n_rows += 1

    print(f"{field}: {n_rows:,} rows, {n_tokens:,} tokens")
    return n_tokens


In [5]:
# Token total for the "title" field; the helper also prints the row/token summary.
tokens_title = count_tokens_for_field(field="title")

title: 261,962 rows, 5,707,234 tokens


In [6]:
# Token total for the "fullTitle" field; the helper also prints the row/token summary.
tokens_fulltitle = count_tokens_for_field(field="fullTitle")


fullTitle: 261,962 rows, 44,329,356 tokens


In [11]:
# Price applied per one million tokens.
# NOTE(review): assumed to be the USD Batch API rate for the embedding model — confirm against current pricing.
BATCH_COST_MILLION_TOKENS = 0.01

# Estimated cost of embedding each candidate field, printed in the order title, fullTitle.
for field_tokens in (tokens_title, tokens_fulltitle):
    estimated_cost = BATCH_COST_MILLION_TOKENS * (field_tokens / 1_000_000)
    print(f"${estimated_cost:.2f}")

$0.06
$0.44


In [12]:
from datasets import load_dataset
import json
import os

# --- Settings for splitting the dataset into Batch API input files ---
HF_DATASET = "mhurhangee/cpc-classifications"
TEXT_FIELD = "fullTitle"          # only fullTitle is embedded
BATCH_LIMIT = 50_000              # maximum number of embedding inputs written to one file
SIZE_LIMIT = 200 * 1024 * 1024    # maximum size of one file: 200MB, expressed in bytes
OUTPUT_PREFIX = "cpc_batch"       # files are named <prefix>_<index>.jsonl

# Records are streamed rather than loaded into memory up front.
dataset = load_dataset(HF_DATASET, split="train", streaming=True)

# Mutable writer state, shared with new_batch_file() through module globals.
batch_index = 1
count_in_batch = bytes_in_batch = 0
current_file = open(f"{OUTPUT_PREFIX}_{batch_index}.jsonl", "w")

def new_batch_file():
    """Roll over to the next batch file.

    Closes the currently open file (if any), increments the global
    ``batch_index``, zeroes the per-batch record and byte counters, and opens
    ``<OUTPUT_PREFIX>_<batch_index>.jsonl`` for writing as the new
    ``current_file``. All state lives in module-level globals.
    """
    global batch_index, count_in_batch, bytes_in_batch, current_file

    if current_file:
        current_file.close()

    batch_index = batch_index + 1
    count_in_batch = bytes_in_batch = 0

    next_path = f"{OUTPUT_PREFIX}_{batch_index}.jsonl"
    current_file = open(next_path, "w")

# Stream every record into the current batch file, rolling over to a new file
# whenever the per-file record count or byte size limit would be exceeded.
# The try/finally guarantees the open handle is closed (and its buffer flushed)
# even if the dataset stream or a write raises part-way through.
try:
    for rec in dataset:
        text = rec.get(TEXT_FIELD)
        if not text:
            # Records without text in TEXT_FIELD produce no request line.
            continue

        # Prepare line in Batch API format
        obj = {
            "custom_id": rec["key"],
            "method": "POST",
            "url": "/v1/embeddings",
            "body": {
                "model": "text-embedding-3-small",
                "input": text
            }
        }
        line = json.dumps(obj) + "\n"
        # Size is measured in encoded bytes, since SIZE_LIMIT is a byte limit.
        line_size = len(line.encode("utf-8"))

        # Check limits before writing, so the line that would overflow goes to the next file
        if (count_in_batch >= BATCH_LIMIT) or (bytes_in_batch + line_size > SIZE_LIMIT):
            print(f"[INFO] Finalized batch {batch_index} with {count_in_batch} records, {bytes_in_batch/1024/1024:.2f} MB")
            new_batch_file()

        current_file.write(line)
        count_in_batch += 1
        bytes_in_batch += line_size
finally:
    # Close last file
    current_file.close()

print(f"[INFO] Finalized batch {batch_index} with {count_in_batch} records, {bytes_in_batch/1024/1024:.2f} MB")


[INFO] Finalized batch 1 with 50000 records, 42.76 MB
[INFO] Finalized batch 2 with 50000 records, 42.59 MB
[INFO] Finalized batch 3 with 50000 records, 43.10 MB
[INFO] Finalized batch 4 with 50000 records, 42.98 MB
[INFO] Finalized batch 5 with 50000 records, 42.93 MB
[INFO] Finalized batch 6 with 11962 records, 10.24 MB


In [8]:
# Hard-coded IDs of the uploaded batch input files, one per line for readability.
batch_ids = [
    "file-6PcKcuySJpJLHsY42tVCCP",
    "file-1RsVmLgzmt6AJoK7QJR5GR",
    "file-LUbsM1yDcQcFH5njenE2jQ",
    "file-4M21GRYfpxzFhBX8419sdK",
    "file-D2RhZVAEd65JzqW8Jtjbm9",
    "file-SpSmzE8SDyVHfQgSovoHNR",
]

In [7]:
from openai import OpenAI
client = OpenAI()

import glob

# Upload every local batch input file and remember the ID returned for each.
# Note: despite its name, `batch_ids` holds the IDs of the uploaded *files*
# (the objects returned by client.files.create), in sorted filename order.
batch_ids = []
for file in sorted(glob.glob("cpc_batch_*.jsonl")):
    # Use a context manager so each file handle is closed as soon as its upload
    # returns, instead of leaking one open handle per uploaded file.
    with open(file, "rb") as fh:
        batch = client.files.create(
            file=fh,
            purpose="batch"
        )
    batch_ids.append(batch.id)
    print(f"Created batch {batch.id} for file {file}")


In [9]:
# Start one embeddings batch job per uploaded input file and collect the job IDs.
# `upload_ids` ends up holding the IDs returned by client.batches.create.
upload_ids = []

for input_file_id in batch_ids:
    job = client.batches.create(
        input_file_id=input_file_id,
        endpoint="/v1/embeddings",
        completion_window="24h",
    )
    upload_ids.append(job.id)
    print(f"Created upload {job.id} for batch {input_file_id}")



Created upload batch_689cf8e155f48190a395a4c4d3455712 for batch file-6PcKcuySJpJLHsY42tVCCP
Created upload batch_689cf8e2181c81909eb28b1ed55f11d6 for batch file-1RsVmLgzmt6AJoK7QJR5GR
Created upload batch_689cf8e2cb848190a0f73198007d08b4 for batch file-LUbsM1yDcQcFH5njenE2jQ
Created upload batch_689cf8e34a8c8190b41cfc7d7476a35b for batch file-4M21GRYfpxzFhBX8419sdK
Created upload batch_689cf8e3a0208190abfb0b7357c09ab1 for batch file-D2RhZVAEd65JzqW8Jtjbm9
Created upload batch_689cf8e3fad88190b5d78e883ab1b279 for batch file-SpSmzE8SDyVHfQgSovoHNR


In [5]:
from time import sleep
from IPython.display import clear_output

# Statuses after which a batch job will not change any more. Waiting only for
# "completed" would spin forever if a single job failed, expired or was cancelled.
TERMINAL_STATUSES = {"completed", "failed", "expired", "cancelled"}
POLL_INTERVAL_SECONDS = 60

# Output file IDs of the batches that completed (consumed by the download step).
completed_ids = []
# Batch job IDs that have reached any terminal status.
finished_batch_ids = set()

while True:
    for upload_id in upload_ids:
        batch = client.batches.retrieve(upload_id)
        print(f"{batch.id}: {batch.status}")

        if batch.status not in TERMINAL_STATUSES:
            continue
        finished_batch_ids.add(upload_id)

        if batch.status != "completed":
            print(f"[WARN] {batch.id} ended as {batch.status}; it has no output to download")
        elif not batch.output_file_id:
            # Guard against recording an empty ID for a job that reports no output file.
            print(f"[WARN] {batch.id} completed without an output file")
        elif batch.output_file_id not in completed_ids:
            completed_ids.append(batch.output_file_id)

    # Stop as soon as every job is finished: no trailing sleep, and the final
    # statuses stay visible because the output is not cleared on the last pass.
    if len(finished_batch_ids) >= len(set(upload_ids)):
        break

    sleep(POLL_INTERVAL_SECONDS)
    # Clear the previous round of status lines before printing the next one
    clear_output(wait=True)


In [12]:
import os
import json

import glob

# Download the output file of every completed batch to "<file_id>.jsonl".
# The loop must run over the OpenAI output file IDs collected while polling
# (`completed_ids`): iterating over local "*.jsonl" paths made the target path
# identical to the source path, so every file "already existed" and nothing
# could ever be downloaded.
for file_id in completed_ids:
    if not file_id:
        # NOTE(review): completed_ids may contain an empty entry if a batch
        # reported no output file — nothing to fetch in that case.
        continue

    out_filename = f"{file_id}.jsonl"
    if os.path.exists(out_filename):
        # Keeps re-runs cheap: files fetched earlier are not downloaded again.
        print(f"[SKIP] {out_filename} already exists")
        continue

    # Retrieve the file content from OpenAI
    resp = client.files.content(file_id)

    # Save locally
    with open(out_filename, "wb") as f:
        f.write(resp.read())  # resp exposes the downloaded bytes via read()

    print(f"[SAVED] Saved {out_filename}")


[SKIP] file-3EQSZoW9CWQL9CD2h325t3.jsonl already exists
[SKIP] file-Cwq2kuWDg6jDy1rjs5u9va.jsonl already exists
[SKIP] file-V6fas7QoGYVpnSMHSo8XZm.jsonl already exists
[SKIP] file-D7T2jFJTMweWFsgLdU7Zrm.jsonl already exists
[SKIP] file-5YcMZ6iWq63RCkjT3fA4Xv.jsonl already exists
[SKIP] file-7WimV3F2DLUqx7SpaaGDxn.jsonl already exists


In [3]:
from datasets import load_dataset, Dataset
import json
import glob
import os

# Source dataset and the Hub repository that receives the merged result.
HF_DATASET = "mhurhangee/cpc-classifications"
HF_DATASET_EMBEDDINGS = "mhurhangee/cpc-classifications-embeddings"

# Local JSONL file the merged rows are appended to.
merged_file = "cpc_with_embeddings.jsonl"

# The merge appends, so start from a clean slate when a previous run left a file behind.
stale_merge_exists = os.path.exists(merged_file)
if stale_merge_exists:
    os.remove(merged_file)


In [4]:

# Loop over each OpenAI output file.
# The merged output itself also matches "*.jsonl", so it is excluded explicitly;
# otherwise a re-run without the cleanup step would try to parse it as input.
output_files = sorted(
    path for path in glob.glob("*.jsonl") if path != merged_file
)

for out_file in output_files:
    print(f"[INFO] Processing {out_file} ...")

    # Load embeddings from this output file into a dict keyed by custom_id.
    # Only one file's embeddings are held in memory at a time.
    embedding_map = {}
    skipped_rows = 0
    with open(out_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            row = json.loads(line)
            try:
                cid = row["custom_id"]
                emb = row["response"]["body"]["data"][0]["embedding"]
            except (KeyError, IndexError, TypeError):
                # Rows without a usable embedding (e.g. a failed request, or a
                # batch *input* file that also matches the glob) are counted
                # and skipped instead of aborting the whole merge.
                skipped_rows += 1
                continue
            embedding_map[cid] = emb

    if skipped_rows:
        print(f"[WARN] {out_file}: skipped {skipped_rows} rows without an embedding")
    if not embedding_map:
        # Nothing to merge from this file, so avoid a full pass over the dataset.
        continue

    # Stream through HF dataset and write only matching rows.
    # The merged file is opened once per output file rather than once per record.
    with open(merged_file, "a", encoding="utf-8") as out_f:
        for rec in load_dataset(HF_DATASET, split="train", streaming=True):
            key = rec["key"]
            if key in embedding_map:
                rec["fullTitleEmbedding"] = embedding_map[key]
                out_f.write(json.dumps(rec) + "\n")

    # Clear memory for this batch
    del embedding_map

print(f"[OK] Merged dataset written to {merged_file}")

[INFO] Processing file-3EQSZoW9CWQL9CD2h325t3.jsonl ...
[INFO] Processing file-5YcMZ6iWq63RCkjT3fA4Xv.jsonl ...
[INFO] Processing file-7WimV3F2DLUqx7SpaaGDxn.jsonl ...
[INFO] Processing file-Cwq2kuWDg6jDy1rjs5u9va.jsonl ...
[INFO] Processing file-D7T2jFJTMweWFsgLdU7Zrm.jsonl ...
[INFO] Processing file-V6fas7QoGYVpnSMHSo8XZm.jsonl ...
[OK] Merged dataset written to cpc_with_embeddings.jsonl


In [5]:
from huggingface_hub import login
from utils.env_loader import load_project_env
import os

# NOTE(review): load_project_env presumably populates the environment (HF_TOKEN is read next) — verify in utils.env_loader.
load_project_env()

# Authenticate against the Hugging Face Hub with the token taken from the environment.
login(token=os.environ.get("HF_TOKEN"))

# Re-read the merged JSONL as a dataset, then publish it to the Hub.
merged_ds = load_dataset(
    "json",
    data_files=merged_file,
    split="train",
)
merged_ds.push_to_hub(HF_DATASET_EMBEDDINGS)
print(f"[DONE] Pushed to {HF_DATASET_EMBEDDINGS}")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
Generating train split: 261960 examples [00:26, 9791.05 examples/s] 
Creating parquet from Arrow format: 100%|██████████| 33/33 [00:02<00:00, 12.55ba/s]
Processing Files (0 / 0)                : |          |  0.00B /  0.00B            
[A
Processing Files (0 / 1)                :   0%|          | 1.16MB /  431MB,  725kB/s  
Processing Files (0 / 1)                :   0%|          | 1.49MB /  431MB,  828kB/s  
Processing Files (0 / 1)                :   1%|          | 3.51MB /  431MB, 1.76MB/s  
[A
Processing Files (0 / 1)                :   1%|▏         | 5.77MB /  431MB, 2.40MB/s  
Processing Files (0 / 1)                :   3%|▎         | 10.9MB /  431MB, 4.20MB/s  
Processing Files (0 / 1)                :   4%|▍         | 18.7MB /  431MB, 6.67MB/s  
Processing Files (0 / 1)                :   7%|▋         | 29.3MB /  431MB, 9.77MB/s  
Processing Files (

[DONE] Pushed to mhurhangee/cpc-classifications-embeddings
