We are going to process song lyrics using the OpenAI Batch API to reduce cost.

## Creación del batch file

Rate limits
Batch API rate limits are separate from existing per-model rate limits. The Batch API has two new types of rate limits:

Per-batch limits: A single batch may include up to 50,000 requests, and a batch input file can be up to 200 MB in size. Note that /v1/embeddings batches are also restricted to a maximum of 50,000 embedding inputs across all requests in the batch.
Enqueued prompt tokens per model: Each model has a maximum number of enqueued prompt tokens allowed for batch processing. You can find these limits on the Platform Settings page.

## 1) Creación del archivo JSONL para la Batch API

In [13]:

# Model	Cost	Batch cost
# text-embedding-3-small	$0.01	$0.0001
# text-embedding-3-large	$0.065	$0.00013
# text-embedding-ada-002	$0.05	$0.0004
# https://platform.openai.com/docs/pricing

In [2]:
import pandas as pd 
import numpy as np
import os
import json 
import tiktoken
import json


In [None]:

# Embeddings endpoint path; the same literal is hard-coded in the request
# objects written by crear_jsonl_embeddings.
url = "/v1/embeddings"
# Data directory, expressed relative to the current working directory.
path_data = "../../../../data"
# CSV whose 'text' column is iterated song by song further down (the file name
# suggests a de-duplicated Spotify dataset — TODO confirm).
path_df =os.path.join(path_data, "spotify_dataset_sin_duplicados_4.csv")
# Load the whole CSV into a DataFrame.
df = pd.read_csv(path_df)


In [5]:

# Token-counting helper
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens that ``string`` encodes to under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``.

    Returns:
        The length of the token sequence produced by the encoding.

    A progress message is printed before the encoding lookup, after it, and
    once the text has been encoded.
    """
    print("Comezando en la función")
    tokenizer = tiktoken.get_encoding(encoding_name)
    print("Encoding obtenido")
    token_ids = tokenizer.encode(string)
    print("Conteo listo")
    return len(token_ids)

# Quick smoke call; the returned count is not stored.
num_tokens_from_string("tiktoken is great!", "cl100k_base")


# Step 1: create the batch input file.
# Builds the JSONL file for the texts numbered from `start` onwards.
def crear_jsonl_embeddings(texts, start, end, dir="batch_files", startfilename="batch_embeddings", model="text-embedding-3-small"):
    """Write one embeddings request per text to a JSONL file.

    Each line is a JSON object with ``custom_id`` (``request-<n>``, where
    ``n`` counts up from ``start``), ``method`` ``"POST"``, ``url``
    ``"/v1/embeddings"`` and a ``body`` holding ``model`` and the text as
    ``input``. Non-ASCII characters are written as-is (UTF-8).

    Args:
        texts: Iterable of input texts, one request each.
        start: Number used for the first ``custom_id`` and in the file name.
        end: Only used in the file name; it is not checked against the
            number of texts.
        dir: Output directory, created if it does not exist.
        startfilename: Prefix of the generated file name.
        model: Value written to each request's ``body.model``.

    Returns:
        The bare file name ``<startfilename>_<start>_<end>.jsonl``. The
        directory is NOT included, so join it with ``dir`` to open the file.
    """
    os.makedirs(dir, exist_ok=True)
    filename = f"{startfilename}_{start}_{end}.jsonl"
    target_path = os.path.join(dir, filename)

    with open(target_path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(
                {
                    "custom_id": f"request-{request_number}",
                    "method": "POST",
                    "url": "/v1/embeddings",
                    "body": {"model": model, "input": lyric},
                },
                ensure_ascii=False,
            )
            + "\n"
            for request_number, lyric in enumerate(texts, start=start)
        )

    return filename

# Step 2: upload the batch input file.
def upload_batch_file_to_openai(jsonl_path, api_key=None):
    """Upload a JSONL file to OpenAI with purpose ``"batch"``.

    Args:
        jsonl_path: Path of the file to upload; it is opened in binary mode.
        api_key: Key passed to the ``OpenAI`` client. When falsy, the client
            is built with no arguments.

    Returns:
        The object returned by ``client.files.create`` (also printed).
    """
    from openai import OpenAI

    client = OpenAI(api_key=api_key) if api_key else OpenAI()

    with open(jsonl_path, "rb") as jsonl_stream:
        uploaded = client.files.create(file=jsonl_stream, purpose="batch")

    print(uploaded)
    return uploaded

# Step 3: create the batch job.
def create_openai_batch(batch_input_file, endpoint="/v1/embeddings", completion_window="24h", metadata=None, api_key=None):
    """Create a batch job for a previously uploaded input file.

    Args:
        batch_input_file: Object exposing the uploaded file's ``id``.
        endpoint: Endpoint forwarded to ``client.batches.create``.
        completion_window: Completion window forwarded to the API.
        metadata: Metadata for the batch. When falsy (``None`` or empty),
            ``{"description": "nightly eval job"}`` is sent instead.
            NOTE(review): that default description looks copied from an
            example and does not describe an embeddings job — confirm.
        api_key: Key passed to the ``OpenAI`` client. When falsy, the client
            is built with no arguments.

    Returns:
        The object returned by ``client.batches.create`` (also printed).
    """
    from openai import OpenAI

    client = OpenAI(api_key=api_key) if api_key else OpenAI()

    request_args = dict(
        input_file_id=batch_input_file.id,
        endpoint=endpoint,
        completion_window=completion_window,
        metadata=metadata or {"description": "nightly eval job"},
    )
    batch_job = client.batches.create(**request_args)

    print(batch_job)
    return batch_job



# Step 4: check the batch status.
def check_openai_batch(batch_id, api_key=None):
    """Retrieve a batch by id and print it.

    Args:
        batch_id: Identifier passed to ``client.batches.retrieve``.
        api_key: Key passed to the ``OpenAI`` client. When falsy, the client
            is built with no arguments.

    Returns:
        The object returned by ``client.batches.retrieve``.
    """
    from openai import OpenAI

    client = OpenAI(api_key=api_key) if api_key else OpenAI()

    current_state = client.batches.retrieve(batch_id)
    print(current_state)
    return current_state


# Step 5: retrieve or download the results (no code for this step in this cell).
# Usage example
texts = [
    "El sol brilla sobre las montañas.",
    "La inteligencia artificial está transformando el mundo.",
    "Los datos son el nuevo petróleo."
]

# With the default arguments this writes batch_files/batch_embeddings_0_2.jsonl
# containing the requests request-0 .. request-2; only the file name is returned.
archivo = crear_jsonl_embeddings(texts, 0, 2)
print(f"Archivo generado: {archivo}")


Comezando en la función
Encoding obtenido
Conteo listo
Archivo generado: batch_embeddings_0_2.jsonl


## 2) Pruebas con textos largos

In [16]:
from dotenv import load_dotenv
from openai import OpenAI

embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8191  # the maximum for text-embedding-3-small is 8191

# Load variables from a .env file (if any) before reading the API key.
load_dotenv()
API_KEY = os.environ["OPENAI_API_KEY"]  # raises KeyError if the variable is unset
client = OpenAI(api_key=API_KEY)

# Longest songs: two sample lyrics stored as text files (judging by the file
# names and labels below, the longest by word count and the longest by
# character count — TODO confirm).
with open("cancion_large.txt", "r", encoding="utf-8") as f:
    cancion_mas_larga = f.read()

with open("cancion_large_in_chars.txt", "r", encoding="utf-8") as f2:
    cancion_large_in_chars = f2.read()

# split(" ") yields space-separated words, so these lengths are word counts.
# The labels previously said "caracteres" and were identical for both files.
words = cancion_mas_larga.split(" ")
print(f"Longitud de la canción más larga en words: {len(words)} palabras")

words2 = cancion_large_in_chars.split(" ")
print(f"Longitud de la canción más larga en chars: {len(words2)} palabras")

# Token counts for both texts (default cl100k_base encoding).
print("La cancion mas larga en words: ",num_tokens_from_string(cancion_mas_larga) )
print("La cancion mas larga en chars: ",num_tokens_from_string(cancion_large_in_chars))



Longitud de la canción más larga: 10124 caracteres
Longitud de la canción más larga: 10008 caracteres
Comezando en la función
Encoding obtenido
Conteo listo
La cancion mas larga en words:  13970
Comezando en la función
Encoding obtenido
Conteo listo
La cancion mas larga en chars:  12965


We are going to test whether the standard API (without batches) works with this song.

In [17]:
from openai import OpenAI
client = OpenAI(api_key=API_KEY)

# Request one embedding for the full, untruncated lyrics. The output recorded
# for this cell is a 400 BadRequestError: the text is 13970 tokens, above the
# model's 8192-token maximum context length.
response = client.embeddings.create(
    model=embedding_model,   
    input=cancion_mas_larga,
    encoding_format="float"
)

print(response)

# If you only want to look at the embedding itself:
print("Dimensión del embedding:", len(response.data[0].embedding))
print("Primeros 10 valores:", response.data[0].embedding[:10])

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 13970 tokens (13970 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

In [None]:

# Work-in-progress driver: walks the lyrics CSV chunk by chunk. The per-song
# work (building and uploading the batch file) is still a placeholder, so
# `embeddings` stays empty and nothing is saved yet.
start = 0
end = 0

# Rate limits noted from the official page for text-embedding-3-small
# (the original note says all embedding models share the same limits):
# - Tokens per minute (TPM): 40,000
# - Requests per minute (RPM): 100
# - Requests per day (RPD): 2,000
#
# API usage is subject to rate limits applied on tokens per minute (TPM),
# requests per minute or day (RPM/RPD), and other model-specific limits.

# Iterate in chunks so the whole CSV is never held in memory at once.
chunksize = 100
chunk_id = 0

# Early-stop flag checked after each chunk. It was read below without ever
# being assigned in this notebook, which raised NameError on the first chunk.
# Set it to True inside the loop to stop after the current chunk.
END = False

path_embbedings = "../../../../data/gpt/embeddings"
path_errors = "../../../../data/gpt/embeddings_errors"

for chunk in pd.read_csv(path_df, chunksize=chunksize):
    embeddings = []
    # Stays at start - 1 if the chunk turns out to be empty.
    end = start - 1
    texts = chunk['text'].fillna("")
    # idx is the global row number: chunks are numbered from 0 and each full
    # chunk holds `chunksize` rows. `song` is not used yet (placeholder body).
    for idx, song in enumerate(texts, start=chunk_id*chunksize):
        print("Procesando la canción: ", idx)
        end = idx
        try:
            print("tuki")
            # TODO: 1) create the JSONL file covering rows start..end
            # TODO: 2) upload the batch file
        except Exception as e:
            print(f"Error en la canción {idx} (reintento): {e}")

    if END:
        break
    chunk_id += 1

    if embeddings:
        # Chunks containing a failed (None) embedding go to the errors folder.
        if any(e is None for e in embeddings):
            save_path = os.path.join(path_errors, f"embeddings_{start}-{end}.npy")
            print(f"Contiene None, guardando en carpeta de errores: {save_path}")
        else:
            save_path = os.path.join(path_embbedings, f"embeddings_{start}-{end}.npy")
            print(f" Guardando en carpeta normal: {save_path}")
        # np.save does not create missing directories.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        np.save(save_path, np.array(embeddings, dtype=object))
    start = end + 1


# NOTE(review): after a normally completed iteration `start` has already
# advanced to end + 1, so this name reads df_<end+1>_to_<end> — confirm the
# intended range.
file_name = os.path.join(path_data, f"df_{start}_to_{end}.jsonl")


In [None]:
from openai import OpenAI
from dotenv import load_dotenv




In [None]:


# Client built with no explicit api_key (presumably the SDK picks the key up
# from its environment/default configuration — TODO confirm).
client = OpenAI()

# Minimal embeddings request with a fixed sample string.
response = client.embeddings.create(
    input="Your text string goes here",
    model="text-embedding-3-small"
)

# Print the embedding vector of the first (only) input.
print(response.data[0].embedding)