<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [8]</a>'.</span>

We process the song lyrics through the OpenAI Batch API, instead of the synchronous embeddings endpoint, to reduce the cost.

## Creación del batch file

### Rate limits

Batch API rate limits are separate from existing per-model rate limits. The Batch API has two new types of rate limits:

- **Per-batch limits:** A single batch may include up to 50,000 requests, and a batch input file can be up to 200 MB in size. Note that /v1/embeddings batches are also restricted to a maximum of 50,000 embedding inputs across all requests in the batch.
- **Enqueued prompt tokens per model:** Each model has a maximum number of enqueued prompt tokens allowed for batch processing. You can find these limits on the Platform Settings page.

## 1) Creación del archivo JSONL para la Batch API

In [1]:

# Model	Cost	Batch cost
# text-embedding-3-small	$0.01	$0.0001
# text-embedding-3-large	$0.065	$0.00013
# text-embedding-ada-002	$0.05	$0.0004
# https://platform.openai.com/docs/pricing

In [2]:
import pandas as pd 
import numpy as np
import os
import json 
import tiktoken
import json


In [3]:

# Embeddings endpoint path; the request builder defined below hard-codes the same value.
url = "/v1/embeddings"
# Data directory, given relative to the notebook's working directory.
path_data = "../../../../data"
# CSV holding the songs; the processing loop reads its 'text' column.
path_df =os.path.join(path_data, "spotify_dataset_sin_duplicados_4.csv")
# Full in-memory load. The batch-building loop re-reads the same file in chunks.
df = pd.read_csv(path_df)


In [4]:
import time
# Token counting helper
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens `string` produces under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        Length of the token sequence obtained by encoding `string`.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    token_ids = tokenizer.encode(string)
    return len(token_ids)


# Report how much a file weighs on disk
def get_file_size(filepath):
    """Print and return the on-disk size of `filepath`.

    Args:
        filepath: Path of the file to measure.

    Returns:
        The size in megabytes (bytes / 1024 / 1024) as a float.
    """
    n_bytes = os.stat(filepath).st_size
    megabytes = n_bytes / (1024 * 1024)
    print(f"File size: {n_bytes} bytes ({megabytes:.2f} MB)")
    return megabytes


# In-memory size estimate
def get_jsons_size(json_list):
    """Estimate the size, in MB, that `json_list` would occupy as a JSONL file.

    Each item is serialised the same way the batch file writer does
    (``ensure_ascii=False`` plus a trailing newline) and measured in UTF-8 bytes.

    Args:
        json_list: Iterable of JSON-serialisable objects.

    Returns:
        Total size in megabytes as a float (0.0 for an empty iterable).
    """
    line_lengths = (
        len((json.dumps(item, ensure_ascii=False) + "\n").encode("utf-8"))
        for item in json_list
    )
    return sum(line_lengths) / (1024 * 1024)
# 0) Build a single request object (one line of the batch file)
def crear_single_json(text, idx, model="text-embedding-3-small"):
    """Build the batch request dictionary for one text.

    Args:
        text: Input text to embed.
        idx: Identifier used to build the ``custom_id`` ("request-<idx>").
        model: Embedding model name placed in the request body.

    Returns:
        A dict with the keys ``custom_id``, ``method``, ``url`` and ``body``.
    """
    body = {"model": model, "input": text}
    return {
        "custom_id": f"request-{idx}",
        "method": "POST",
        "url": "/v1/embeddings",
        "body": body,
    }
# 1) Write a range of requests to a .jsonl batch file
def guardar_jsons(json_list, start, end, output_dir="./"):
    """Write `json_list` as a JSONL file named after the index range.

    Args:
        json_list: Sequence of JSON-serialisable request objects.
        start: First index of the range (used only in the file name).
        end: Last index of the range (used only in the file name).
        output_dir: Directory in which the file is created.

    Returns:
        The path of the written file, "<output_dir>/embeddings_<start>_<end>.jsonl".
    """
    # The output name carries the index range
    filename = f"{output_dir}/embeddings_{start}_{end}.jsonl"

    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(request, ensure_ascii=False) + "\n" for request in json_list
        )

    print(f"Archivo guardado: {filename} con {len(json_list)} requests")
    return filename

# 2) Upload the batch input file
def upload_batch_file_to_openai(jsonl_path, api_key=None):
    """Upload a JSONL batch input file to OpenAI with purpose "batch".

    Args:
        jsonl_path: Path of the .jsonl file to upload.
        api_key: API key for the client. When falsy, the client is built
            without an explicit key.

    Returns:
        The object returned by ``client.files.create``.

    Raises:
        FileNotFoundError: If `jsonl_path` does not exist.
        ValueError: If `jsonl_path` exists but is not a regular file.
    """
    print("Subiendo el archivo de la ruta:", jsonl_path)

    # Validate the path before doing any remote work
    if not os.path.exists(jsonl_path):
        raise FileNotFoundError(f"El archivo no existe: {jsonl_path}")
    if not os.path.isfile(jsonl_path):
        raise ValueError(f"La ruta no es un archivo válido: {jsonl_path}")

    # Imported locally, like the other OpenAI helpers in this notebook, so the
    # function does not depend on a later cell having imported `OpenAI`.
    from openai import OpenAI

    client = OpenAI(api_key=api_key) if api_key else OpenAI()

    # The handle only needs to stay open for the duration of the upload
    with open(jsonl_path, "rb") as f:
        batch_input_file = client.files.create(
            file=f,
            purpose="batch"
        )

    return batch_input_file
# 3) Create the batch job
def create_openai_batch(batch_input_file, endpoint="/v1/embeddings", completion_window="24h", metadata=None, api_key=None):
    """Create an OpenAI batch from an already uploaded input file.

    Args:
        batch_input_file: Object exposing the uploaded file's ``id`` attribute.
        endpoint: API endpoint the batch requests target.
        completion_window: Completion window passed to the API.
        metadata: Metadata dict for the batch. When falsy, a default
            description is used.
        api_key: API key for the client. When falsy, the client is built
            without an explicit key.

    Returns:
        The object returned by ``client.batches.create``.
    """
    from openai import OpenAI

    client = OpenAI(api_key=api_key) if api_key else OpenAI()
    job_metadata = metadata if metadata else {"description": "nightly eval job"}

    return client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint=endpoint,
        completion_window=completion_window,
        metadata=job_metadata,
    )


# 4) Check the status
def check_openai_batch(batch_id, api_key=None):
    """Retrieve the current state of a batch.

    Args:
        batch_id: Identifier of the batch to look up.
        api_key: API key for the client. When falsy, the client is built
            without an explicit key.

    Returns:
        The object returned by ``client.batches.retrieve``.
    """
    from openai import OpenAI

    client = OpenAI(api_key=api_key) if api_key else OpenAI()
    return client.batches.retrieve(batch_id)


# 5) Retrieve (download) the results
def download_results(batch_id, api_key=None):
    """Download the text content of a file stored in OpenAI.

    Args:
        batch_id: Despite its name, this value is passed to
            ``client.files.content`` and therefore must be a *file* ID. The
            callers in this notebook pass the batch's ``output_file_id``.
            The parameter name is kept for backward compatibility.
        api_key: API key handed to the client (may be None).

    Returns:
        The downloaded file content as text.
    """
    # Imported locally, like the other OpenAI helpers in this notebook, so the
    # function does not depend on a later cell having imported `OpenAI`.
    from openai import OpenAI

    client = OpenAI(api_key=api_key)
    payload = client.files.content(batch_id).text
    # Report only the size: printing the payload itself would dump every
    # embedding vector into the notebook output.
    print(f"Resultados descargados: {len(payload)} caracteres")
    return payload


# 6) Save the raw results
def saveResult(file_response, start, end, output_path="../../../../data/gpt_responses"):
    """Write the raw batch response text to "<output_path>/<start>_<end>.jsonl".

    Args:
        file_response: Text to write.
        start: First index of the range (used only in the file name).
        end: Last index of the range (used only in the file name).
        output_path: Target directory; created if it does not exist.
    """
    os.makedirs(output_path, exist_ok=True)
    target = f"{output_path}/{start}_{end}.jsonl"
    with open(target, "w", encoding="utf-8") as out:
        out.write(file_response)

    print(f"Resultados guardados en: {output_path}")


def save_embeddings_only(file_response_text,  start=0, end=0, output_path="../../../../data/gpt_responses"):
    """Extract (custom_id, embedding) pairs from a batch output and save them as JSONL.

    Each non-blank line of `file_response_text` is parsed as JSON. The embedding
    is read from ``response.body.data[0].embedding``. Lines that are not valid
    JSON or do not have that shape (for example failed requests) are reported
    and skipped. I/O errors while writing are NOT swallowed.

    Args:
        file_response_text: Raw text of the batch output file.
        start: First index of the range (used only in the file name).
        end: Last index of the range (used only in the file name).
        output_path: Target directory; created if it does not exist.

    Returns:
        Path of the written file,
        "<output_path>/<start>_<end>_embeddings_only.jsonl".
    """
    os.makedirs(output_path, exist_ok=True)
    output_file = f"{output_path}/{start}_{end}_embeddings_only.jsonl"
    skipped = 0

    with open(output_file, "w", encoding="utf-8") as f:
        for line in file_response_text.splitlines():
            if not line.strip():
                continue
            try:
                j = json.loads(line)
                record = {
                    "custom_id": j["custom_id"],
                    "embedding": j["response"]["body"]["data"][0]["embedding"],
                }
            except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
                # Only malformed / unexpected lines are skipped; anything else
                # (e.g. a failing write) propagates to the caller.
                skipped += 1
                print(f"Error procesando línea: {e}")
                continue
            # Store one JSON object per line
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"Embeddings guardados solo con custom_id en: {output_file}")
    if skipped:
        print(f"Líneas omitidas por error: {skipped}")
    return output_file

def load_embeddings_and_check_dim(file_path):
    """Load the embeddings stored in a JSONL file and print a short summary.

    Every non-blank line must be a JSON object with an "embedding" key.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        List with the "embedding" value of each line, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        embeddings = [
            json.loads(raw_line)["embedding"]
            for raw_line in source
            if raw_line.strip()
        ]

    if not embeddings:
        print("No se encontraron embeddings en el archivo.")
    else:
        print(f"Número de embeddings: {len(embeddings)}")
        print(f"Dimensión del primer embedding: {len(embeddings[0])}")

    return embeddings
# Example usage: sample Spanish sentences. None of the functions defined in
# this cell reference this list.
texts = [
    "El sol brilla sobre las montañas.",
    "La inteligencia artificial está transformando el mundo.",
    "Los datos son el nuevo petróleo."
]

def save_batch_metadata(start, end, batch_file_id, batch_id, output_dir="../../../../data/gpt_to_check"):
    """Persist the identifiers needed to fetch a batch's results later.

    Writes "<output_dir>/batch_metadata_<start>_<end>.json". Failures while
    writing are reported on stdout and not raised.

    Args:
        start: First index covered by the batch.
        end: Last index covered by the batch.
        batch_file_id: ID of the uploaded input file.
        batch_id: ID of the created batch.
        output_dir: Target directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Metadata file name carries the index range
    filename = f"{output_dir}/batch_metadata_{start}_{end}.json"
    record = dict(start=start, end=end, batch_file_id=batch_file_id, batch_id=batch_id)

    # Best-effort write to disk
    try:
        with open(filename, "w", encoding="utf-8") as out:
            json.dump(record, out, indent=4)
        print(f"Metadatos del batch guardados en {filename}")
    except Exception as err:
        print(f"Error guardando metadatos del batch: {err}")



import re
# Strip section annotations from lyrics

# Compiled once. NOTE: under re.VERBOSE literal whitespace in the pattern is
# ignored, so a space between words must be written as \s (e.g. "Pre Hook"
# has to be spelled Pre[-\s]?Hook to match the text "Pre Hook").
_PATRON_ANOTACIONES = re.compile(
    r"""
        \[                                   # opening bracket
        (?:                                  # known section keywords
            Intro\s*\d*
          | Chorus\s*\d*
          | Verse\s*\d*
          | Vers\s*\d*
          | Hook\s*\d*
          | Part\s*\d*
          | Interlude\s*\d*
          | Skit\s*\d*
          | Produced\s*\d*
          | Track\s*\d*
          | Refrain\s*\d*
          | Pre[-\s]?Chorus\s*\d*
          | Company\s*\d*
          | Backing\s+vocals
          | Sample\s*\d*
          | Segue\s+from
          | Instrumental
          | Pre[-\s]?Hook
          | Lyrical
          | Beat
          | Interview\s*\d*
        )[^\]]*                              # anything up to the closing bracket
        \]                                   # closing bracket
    """,
    flags=re.IGNORECASE | re.VERBOSE,
)


def limpiar_letras(texto: str) -> str:
    """Remove bracketed section annotations from a lyrics string.

    A bracketed tag is removed when its content *starts with* one of the known
    keywords (case-insensitive): Intro, Chorus, Verse/Vers, Hook, Part,
    Interlude, Skit, Produced, Track, Refrain, Pre-Chorus, Company,
    Backing vocals, Sample, Segue from, Instrumental, Pre-Hook, Lyrical, Beat,
    Interview. "Pre-Chorus" and "Pre-Hook" are accepted with a hyphen, a space
    or nothing between the two words. Everything up to the closing bracket is
    removed with the keyword, e.g. "[Verse 1: Artist]". Other bracketed text is
    left untouched.

    Args:
        texto: Lyrics text to clean.

    Returns:
        `texto` with the matching annotations deleted. Surrounding whitespace
        and newlines are kept as they are.
    """
    return _PATRON_ANOTACIONES.sub("", texto)


## 1.1) Conteo de canciones que pasan el limite de tokens 

In [5]:

# df["n_tokens"] = df["text"].apply(lambda x: num_tokens_from_string(x))

# # Cuenta cuántos superan el límite
# exceden = (df["n_tokens"] > 8190).sum()

# print(f"Cantidad de textos que superan 8190 tokens: {exceden}")

## 2) Pruebas con textos largos

In [6]:
# embedding_model = "text-embedding-3-small"
# embedding_encoding = "cl100k_base"
# max_tokens = 8191  # the maximum for text-embedding-3-small is 8191
# from dotenv import load_dotenv

# load_dotenv() 
# API_KEY = os.environ["OPENAI_API_KEY"]
# from openai import OpenAI
# client = OpenAI(api_key=API_KEY)

# # Cnaciones más largas

# with open("cancion_large.txt", "r", encoding="utf-8") as f:
#     cancion_mas_larga = f.read()

# with open("cancion_large_in_chars.txt", "r", encoding="utf-8") as f2:
#     cancion_large_in_chars = f2.read()

# words = cancion_mas_larga.split(" ")
# print(f"Longitud de la canción más larga: {len(words)} caracteres")

# words2 = cancion_large_in_chars.split(" ")
# print(f"Longitud de la canción más larga: {len(words2)} caracteres")

# print("La cancion mas larga en words: ",num_tokens_from_string(cancion_mas_larga) )
# print("La cancion mas larga en chars: ",num_tokens_from_string(cancion_large_in_chars))



Only 13 songs exceed the token limit, so we skip those and process the rest.

In [7]:
# from openai import OpenAI
# client = OpenAI(api_key=API_KEY)

# response = client.embeddings.create(
#     model=embedding_model,   
#     input=cancion_mas_larga,
#     encoding_format="float"
# )

# print(response)

# # Si solo quieres ver el embedding:
# print("Dimensión del embedding:", len(response.data[0].embedding))
# print("Primeros 10 valores:", response.data[0].embedding[:10])

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [8]:

# start = 0
# end = 0

# Rate limits taken from the official documentation for text-embedding-3-small
# (the author notes that all embedding models share them — TODO confirm):
# - Tokens per minute (TPM): 40,000
# - Requests per minute (RPM): 100
# - Requests per day (RPD): 2,000

# Model settings. `max_tokens` is the per-song cut-off: the processing loop
# skips any song whose token count exceeds it.
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8191 

from openai import OpenAI
from dotenv import load_dotenv
import os
# Load environment variables (presumably from a local .env file) before reading the key.
load_dotenv() 
# Raises KeyError if OPENAI_API_KEY is not set in the environment.
API_KEY = os.environ["OPENAI_API_KEY"]



# When True, the batch size is reduced below and the loop stops after two batches.
TESTING = False


# CONFIGURATIONS
# https://platform.openai.com/docs/guides/batch#rate-limits
# TPM / RPM / RPD are recorded for reference; the visible code does not read them.
TPM = 40000
RPM = 100
RPD = 2000
# Flush threshold for the pending requests, measured as serialized JSONL size.
limit_size = 190 # In MB
# limit_request_in_file_batch = 50000
# Maximum requests per batch file; the author's proportional estimate was
# roughly 900 requests per 3 MB.
limit_request_in_file_batch = 15000
# Rows read per CSV chunk; limit_request_in_file_batch is a multiple of it.
chunksize = 100
# Index of the chunk being processed; the loop uses it to derive global row indices.
chunk_id = 0


if TESTING:
    limit_request_in_file_batch = 900
    # limit_size = 0.5
    # NOTE(review): `second` is not read anywhere in the visible code.
    second = False
# Set to True by the processing loop to stop early in TESTING mode.
END = False

# Directory where the generated batch input files (.jsonl) are written.
path_embbedings = "../../../../data/gpt/embeddings"
# path_errors = "../../../../data/gpt/embeddings_errors"

os.makedirs(path_embbedings, exist_ok=True)
# os.makedirs(path_errors, exist_ok=True)

BYTES_PER_MB = 1024 * 1024


def _request_size_bytes(request):
    """Return the UTF-8 size in bytes of `request` written as one JSONL line."""
    return len((json.dumps(request, ensure_ascii=False) + "\n").encode("utf-8"))


def _submit_batch(requests, first_idx, last_idx):
    """Save `requests` to disk, upload the file and create the OpenAI batch.

    If the batch is already completed its embeddings are downloaded and saved.
    Otherwise the IDs needed to fetch the results later are persisted.
    Any exception is left to the caller.
    """
    saved_in = guardar_jsons(requests, first_idx, last_idx, output_dir=path_embbedings)
    print("Tamaño real al guardar:", get_file_size(saved_in))
    print(f"Archivo guardado en {saved_in}")

    # 1) Upload the input file
    batch_input_file = upload_batch_file_to_openai(saved_in, API_KEY)
    print("Archivo subido con éxito:", batch_input_file.id)  # file ID

    # 2) Create the batch
    response = create_openai_batch(
        batch_input_file,
        endpoint="/v1/embeddings",
        completion_window="24h",
        metadata=None,
        api_key=API_KEY,
    )
    print(response)
    print(f"Batch creado con ID: {response.id}")  # batch ID

    # 3) Check the status once (a single request instead of two)
    batch_status = check_openai_batch(response.id, API_KEY)
    if batch_status.status == "completed":
        print(f"Estado del batch completed")
        pta = download_results(batch_status.output_file_id, API_KEY)
        saved_file = save_embeddings_only(pta, first_idx, last_idx)
        print(f"File guardado en: {saved_file}")
    else:
        # Keep start/end together with the file and batch IDs for a later download
        print("Trankilo guardaremos lo necesario para obtener los embbedings")
        save_batch_metadata(first_idx, last_idx, batch_input_file.id, response.id)


it_n = 0                 # number of batches submitted in this run
start = 46805            # first row index of the batch being filled (resume point)
end = None               # last row index covered by the batch being filled
songs_RAM = []           # pending requests of the current batch
songs_RAM_bytes = 0      # running JSONL size of songs_RAM (avoids re-serialising the list per song)
songs_passed_limit = []  # row indices skipped because they exceed max_tokens
flush_failed = False     # set when a batch could not be submitted

for chunk in pd.read_csv(path_df, chunksize=chunksize):
    texts = chunk['text'].fillna("")
    for idx, song in enumerate(texts, start=chunk_id * chunksize):
        if idx < start:
            continue
        if idx % 1800 == 0:
            print("Procesando la canción: ", idx)
        song = limpiar_letras(song)
        end = idx

        n_tokens = num_tokens_from_string(song)
        if n_tokens > max_tokens:
            print(f"La canción {idx} excede el límite de tokens ({n_tokens} tokens). Se omitirá.")
            songs_passed_limit.append(idx)
            continue

        pending_mb = songs_RAM_bytes / BYTES_PER_MB
        if pending_mb > limit_size or len(songs_RAM) >= limit_request_in_file_batch:
            # The current song does not fit: the batch ends at the previous row
            end = idx - 1
            it_n += 1
            print("Size en RAM: ", pending_mb)
            if pending_mb > limit_size:
                print("Song in RAM esta lleno")

            try:
                _submit_batch(songs_RAM, start, end)
            except Exception as e:
                # Stop instead of retrying on every following song (which would
                # drop the current song each time). `start` still points at the
                # first row that has not been submitted.
                print(f"Error al enviar el batch {start}-{end}: {e}")
                print(f"Proceso detenido; para reanudar usar start = {start}")
                flush_failed = True
                break

            print("Sleeping 1 minute")
            time.sleep(60)  # wait one minute between batches, just in case
            songs_RAM = []
            songs_RAM_bytes = 0
            start = end + 1
            end = idx  # the current song opens the next batch

            # For testing: stop after two submitted batches
            if TESTING and it_n == 2:
                END = True
                break

        # Add the current song to the pending batch
        json_request = crear_single_json(song, idx)
        songs_RAM.append(json_request)
        songs_RAM_bytes += _request_size_bytes(json_request)

    chunk_id += 1

    if END:
        print("Termino el testing")
        break
    if flush_failed:
        break

# Submit whatever is left: the last batch is normally smaller than the limits
# and would otherwise never be written or uploaded.
if songs_RAM and not END and not flush_failed:
    it_n += 1
    print("Size en RAM: ", songs_RAM_bytes / BYTES_PER_MB)
    try:
        _submit_batch(songs_RAM, start, end)
    except Exception as e:
        print(f"Error al enviar el batch {start}-{end}: {e}")
        print(f"Proceso detenido; para reanudar usar start = {start}")
    else:
        songs_RAM = []
        songs_RAM_bytes = 0
        start = end + 1


ConnectionError: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/cl100k_base.tiktoken (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7feb6b4152b0>: Failed to resolve 'openaipublic.blob.core.windows.net' ([Errno -2] Name or service not known)"))

In [None]:
# Poll a previously created batch and expose its output file ID.
status = check_openai_batch("batch_68aea8e4452481909a37f98cefa35431", API_KEY)
print(status.status)

# Read the ID from the response itself, so this cell does not depend on a
# variable left behind by another cell. It is None until the batch has produced output.
output_file_id = status.output_file_id
print("Output file ID:", output_file_id)

if status.status == "completed":
    print("Completed")
    print("Output file ID:", output_file_id)
    # print("Descargando resultados...")
    # pta = download_results(output_file_id, API_KEY)
    # # saveResult(pta,0,2)
    # save_embeddings_only(pta,0,4)

In [None]:
# Download and store the embeddings of the batch checked in the previous cell.
# The results are fetched here so this cell does not rely on a `pta` variable
# left over from another cell.
if status.status == "completed":
    pta = download_results(status.output_file_id, API_KEY)
    saved_file = save_embeddings_only(pta, 0, 4)
    print(f"File guardado en: {saved_file}")
else:
    print(f"Batch status is '{status.status}'; nothing to download yet.")


In [None]:
# file_path = "../../../../data/gpt_responses/0_4_embeddings_only.jsonl"
# embeddings = load_embeddings_and_check_dim(file_path)

# for embb in embeddings:
#     print(embb)