We are going to process song lyrics using the OpenAI Batch API to reduce cost.

## Creación del batch file

Rate limits
Batch API rate limits are separate from existing per-model rate limits. The Batch API has two new types of rate limits:

Per-batch limits: A single batch may include up to 50,000 requests, and a batch input file can be up to 200 MB in size. Note that /v1/embeddings batches are also restricted to a maximum of 50,000 embedding inputs across all requests in the batch.
Enqueued prompt tokens per model: Each model has a maximum number of enqueued prompt tokens allowed for batch processing. You can find these limits on the Platform Settings page.

## 1) Creación del archivo JSONL para la Batch API

In [1]:

# Model	Cost	Batch cost
# text-embedding-3-small	$0.01	$0.0001
# text-embedding-3-large	$0.065	$0.00013
# text-embedding-ada-002	$0.05	$0.0004
# https://platform.openai.com/docs/pricing

In [16]:
import pandas as pd 
import numpy as np
import os
import json 
import tiktoken
import json
from openai import OpenAI


In [17]:

# Endpoint path for embedding requests.
# NOTE(review): `url` is not referenced again in the visible code; the helpers
# below hard-code the same string.
url = "/v1/embeddings"
# Data root and dataset CSV, both given as relative paths.
path_data = "../../../../../../data/spanish"
dataset_path = "dataset/oficialDatasetEAIM2026.csv"

# Full path to the CSV; it is read again later in chunks via `path_df`.
path_df =os.path.join(path_data, dataset_path)
df = pd.read_csv(path_df)

# Preview the first two rows (`display` is provided by the notebook environment).
display(df.head(2))




Unnamed: 0,_id,artist,genre,lyrics,composer,lyrics_word_count,title_songs_new,spotify_id,popularity,explicit_content,duration_ms,release_date,external_urls.spotify,letras_path,id_yt
0,689a8a83a437eda121bd38ce,Laura Pausini,pop,Ya no responde ni al teléfono\nPende de un hil...,Compuesta por: Federico Cavalli / Angelo Valsi...,445,Se Fue,5oQadhkuEdEhtdVn0QceyZ,71,False,240179,2024-11-15,https://open.spotify.com/track/5oQadhkuEdEhtdV...,Laura Pausini/30278,g-GBiuujmL8
1,689a8a83a437eda121bd389f,Shakira,pop,Loca\n(Loca)\nNo te ponga' bruto\n\nQue te la ...,Compuesta por: Pitbull / El Cata / Shakira / C...,360,Loca (part. El Cata),42k1KeBehAd83lrGt1okiC,76,False,183693,2010-10-19,https://open.spotify.com/track/42k1KeBehAd83lr...,Shakira/1735339,XAhTt60W7qo


In [18]:
import time
# Function to count the number of tokens
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens that `string` produces under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence obtained by encoding `string`.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))


# Function that reports how much the file weighs (its size on disk)
def get_file_size(filepath):
    """Print and return the on-disk size of `filepath`.

    Args:
        filepath: Path of an existing file.

    Returns:
        The size in megabytes (bytes divided by 1024 * 1024) as a float.
    """
    n_bytes = os.stat(filepath).st_size
    megabytes = n_bytes / (1024 * 1024)
    print(f"File size: {n_bytes} bytes ({megabytes:.2f} MB)")
    return megabytes


# In ram
def get_jsons_size(json_list):
    """Return the size, in MB, that `json_list` would occupy as a JSONL file.

    Each object is serialised exactly as it is written to disk (one
    ``json.dumps(..., ensure_ascii=False)`` line plus a newline, UTF-8
    encoded) and the byte lengths are added up.

    Args:
        json_list: Iterable of JSON-serialisable objects.

    Returns:
        Total bytes divided by 1024 * 1024, as a float.
    """
    encoded_lengths = (
        len(f"{json.dumps(item, ensure_ascii=False)}\n".encode("utf-8"))
        for item in json_list
    )
    return sum(encoded_lengths) / (1024 * 1024)
# 0) Create a single json file 

def crear_single_json(text, idx, model="text-embedding-3-small"):
    """Build one request object for the embeddings batch file.

    Args:
        text: Input text to embed.
        idx: Identifier used to build the ``custom_id`` ("request-<idx>").
        model: Embedding model name placed in the request body.

    Returns:
        A dict with the keys ``custom_id``, ``method``, ``url`` and ``body``.
    """
    payload = {"model": model, "input": text}
    return {
        "custom_id": f"request-{idx}",
        "method": "POST",
        "url": "/v1/embeddings",
        "body": payload,
    }
# Function that takes a start and an end index to name and write the JSONL file
# 1) Create the batch file
def guardar_jsons(json_list, start, end, output_dir="./"):
    """Write `json_list` as a JSONL file named after the [start, end] range.

    Args:
        json_list: Sequence of JSON-serialisable request objects.
        start: First index covered; used only in the file name.
        end: Last index covered; used only in the file name.
        output_dir: Directory for the output file (it is not created here).

    Returns:
        The path of the written file: "<output_dir>/embeddings_<start>_<end>.jsonl".
    """
    # The range is encoded in the output file name.
    filename = f"{output_dir}/embeddings_{start}_{end}.jsonl"

    with open(filename, "w", encoding="utf-8") as sink:
        for request in json_list:
            line = json.dumps(request, ensure_ascii=False)
            sink.write(f"{line}\n")

    total = len(json_list)
    print(f"Archivo guardado: {filename} con {total} requests")
    return filename

#2) upload
def upload_batch_file_to_openai(jsonl_path, api_key=None):
    """Upload a JSONL file to OpenAI with purpose ``"batch"``.

    Args:
        jsonl_path: Path of the JSONL file to upload.
        api_key: Optional API key; when falsy the client is created without
            an explicit key.

    Returns:
        The object returned by ``client.files.create``.

    Raises:
        FileNotFoundError: If `jsonl_path` does not exist.
        ValueError: If `jsonl_path` exists but is not a regular file.
    """
    print("Subiendo el archivo de la ruta:", jsonl_path)

    # Validate the path before creating any client.
    if not os.path.isfile(jsonl_path):
        if os.path.exists(jsonl_path):
            raise ValueError(f"La ruta no es un archivo válido: {jsonl_path}")
        raise FileNotFoundError(f"El archivo no existe: {jsonl_path}")

    client = OpenAI(api_key=api_key) if api_key else OpenAI()

    # Upload the file in binary mode.
    with open(jsonl_path, "rb") as handle:
        return client.files.create(file=handle, purpose="batch")
#3 Create the batch  
def create_openai_batch(batch_input_file, endpoint="/v1/embeddings", completion_window="24h", metadata=None, api_key=None):
    """Create a batch job from a previously uploaded input file.

    Args:
        batch_input_file: Uploaded-file object; only its ``id`` attribute is read.
        endpoint: Endpoint the batch requests target.
        completion_window: Completion window forwarded to the API.
        metadata: Optional metadata dict; when falsy a default description
            is sent instead.
        api_key: Optional API key; when falsy the client is created without
            an explicit key.

    Returns:
        The object returned by ``client.batches.create``.
    """
    client = OpenAI(api_key=api_key) if api_key else OpenAI()

    if not metadata:
        metadata = {"description": "nightly eval job"}

    return client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint=endpoint,
        completion_window=completion_window,
        metadata=metadata,
    )


# 4) Check the status
def check_openai_batch(batch_id, api_key=None):
    """Retrieve the current state of a batch job.

    Args:
        batch_id: Identifier of the batch to look up.
        api_key: Optional API key; when falsy the client is created without
            an explicit key.

    Returns:
        The object returned by ``client.batches.retrieve``.
    """
    from openai import OpenAI

    client = OpenAI(api_key=api_key) if api_key else OpenAI()
    return client.batches.retrieve(batch_id)


# 5) Retrieve or download the results
def download_results(batch_id, api_key=None):
    """Download the text content of a file stored in OpenAI.

    Despite its name, `batch_id` is passed to ``client.files.content`` and
    therefore must be a *file* id; the callers in this file pass a batch's
    ``output_file_id``. The parameter name is kept for backward compatibility.

    Args:
        batch_id: Id of the file whose content is downloaded.
        api_key: Optional API key; when falsy the client is created without
            an explicit key, like the other helpers in this file.

    Returns:
        The ``text`` attribute of the download response.
    """
    client = OpenAI(api_key=api_key) if api_key else OpenAI()
    text = client.files.content(batch_id).text
    # Report only the size: the payload holds every embedding vector, and
    # echoing it in full floods the notebook output.
    print(f"Resultados descargados: {len(text)} caracteres")
    return text


# 6 save the results
def saveResult(file_response, start, end, output_path="../../../../data/gpt_responses"):
    """Write the raw batch output text to "<output_path>/<start>_<end>.jsonl".

    Args:
        file_response: Text to write verbatim.
        start: First index covered; used only in the file name.
        end: Last index covered; used only in the file name.
        output_path: Destination directory; created if it does not exist.
    """
    os.makedirs(output_path, exist_ok=True)

    destination = f"{output_path}/{start}_{end}.jsonl"
    with open(destination, "w", encoding="utf-8") as sink:
        sink.write(file_response)

    print(f"Resultados guardados en: {output_path}")


def save_embeddings_only(file_response_text,  start=0, end=0, output_path="../../../../data/gpt_responses"):
    """Reduce batch output to one ``{"custom_id", "embedding"}`` object per line.

    Each non-blank line of `file_response_text` is parsed as JSON and the first
    embedding under ``response.body.data`` is kept. Lines that cannot be
    processed are reported on stdout and skipped.

    Args:
        file_response_text: Batch output as JSONL text.
        start: First index covered; used only in the file name.
        end: Last index covered; used only in the file name.
        output_path: Destination directory; created if it does not exist.

    Returns:
        Path of the written file:
        "<output_path>/<start>_<end>_embeddings_only.jsonl".
    """
    os.makedirs(output_path, exist_ok=True)
    output_file = f"{output_path}/{start}_{end}_embeddings_only.jsonl"

    with open(output_file, "w", encoding="utf-8") as sink:
        non_blank = (raw for raw in file_response_text.splitlines() if raw.strip())
        for raw in non_blank:
            try:
                record = json.loads(raw)
                slim = {
                    "custom_id": record["custom_id"],
                    "embedding": record["response"]["body"]["data"][0]["embedding"],
                }
                # Store as JSON on a single line.
                sink.write(json.dumps(slim, ensure_ascii=False) + "\n")
            except Exception as e:
                print(f"Error procesando línea: {e}")

    print(f"Embeddings guardados solo con custom_id en: {output_file}")
    return output_file

def load_embeddings_and_check_dim(file_path):
    """Load the ``"embedding"`` field of every non-blank JSONL line.

    Also prints how many embeddings were read and the length of the first one,
    or a notice when the file contains none.

    Args:
        file_path: Path of a JSONL file whose objects have an "embedding" key.

    Returns:
        List with one embedding per non-blank line, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        embeddings = [json.loads(row)["embedding"] for row in source if row.strip()]

    if not embeddings:
        print("No se encontraron embeddings en el archivo.")
        return embeddings

    print(f"Número de embeddings: {len(embeddings)}")
    print(f"Dimensión del primer embedding: {len(embeddings[0])}")
    return embeddings
# Example usage: sample Spanish sentences.
# NOTE(review): `texts` is not referenced elsewhere in the visible code.
texts = [
    "El sol brilla sobre las montañas.",
    "La inteligencia artificial está transformando el mundo.",
    "Los datos son el nuevo petróleo."
]

def save_batch_metadata(start, end, batch_file_id, batch_id, output_dir="../../../../data/gpt_to_check"):
    """Persist the ids needed to look up a pending batch later.

    Writes "<output_dir>/batch_metadata_<start>_<end>.json" containing
    ``start``, ``end``, ``batch_file_id`` and ``batch_id``. Errors while
    writing are reported on stdout instead of being raised.

    Args:
        start: First index covered by the batch.
        end: Last index covered by the batch.
        batch_file_id: Id of the uploaded input file.
        batch_id: Id of the created batch.
        output_dir: Destination directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Metadata file name, derived from the covered range.
    filename = f"{output_dir}/batch_metadata_{start}_{end}.json"
    record = dict(
        start=start,
        end=end,
        batch_file_id=batch_file_id,
        batch_id=batch_id,
    )

    # Write to disk; a failure here is reported but not propagated.
    try:
        with open(filename, "w", encoding="utf-8") as sink:
            json.dump(record, sink, indent=4)
        print(f"Metadatos del batch guardados en {filename}")
    except Exception as e:
        print(f"Error guardando metadatos del batch: {e}")



import re
# Function that clean text

def limpiar_letras(texto: str) -> str:
    """
    Remove bracketed section annotations from song lyrics.

    A ``[...]`` span is deleted, up to and including its closing bracket,
    when its content starts (case-insensitively) with one of these labels:
    Intro, Chorus, Verse, Vers, Hook, Part, Interlude, Skit, Produced, Track,
    Refrain, Pre-Chorus / PreChorus, Company, Backing vocals, Sample,
    Segue from, Instrumental, Pre-Hook / Pre Hook / PreHook, Lyrical, Beat,
    Interview. Any other bracketed text is left untouched.

    Args:
        texto: Lyrics text.

    Returns:
        `texto` with the matching annotations removed.
    """
    # The pattern is compiled with re.VERBOSE, which discards literal
    # whitespace. A space that must be matched therefore has to be spelled
    # as \s; a plain "Pre Hook" alternative would only match "PreHook".
    patron = r"""
        \[                                   # opening bracket
        (?:                                  # known labels
            Intro\s*\d*
          | Chorus\s*\d*
          | Verse\s*\d*
          | Vers\s*\d*
          | Hook\s*\d*
          | Part\s*\d*
          | Interlude\s*\d*
          | Skit\s*\d*
          | Produced\s*\d*
          | Track\s*\d*
          | Refrain\s*\d*
          | Pre-?Chorus\s*\d*
          | Company\s*\d*
          | Backing\s+vocals
          | Sample\s*\d*
          | Segue\s+from
          | Instrumental
          | Pre[-\s]?Hook                    # hyphen, whitespace or nothing
          | Lyrical
          | Beat
          | Interview\s*\d*
        )[^\]]*                              # anything up to the closing bracket
        \]                                   # closing bracket
    """
    return re.sub(patron, "", texto, flags=re.IGNORECASE | re.VERBOSE)


## Clase que se encarga de la limpieza de datos, igual para todos los tipos de embeddings

In [20]:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize

import re

class TextPreprocessor:
    """
    Preprocessing pipeline for lyrics text: cleaning, tokenisation,
    optional stopword removal and optional stemming.
    """

    # (pattern, replacement, flags) substitutions applied in order by clean_text.
    _SUBSTITUTIONS = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'[^\w\s]', ' ', 0),  # anything that is neither word char nor whitespace
        (r'\d+', '', 0),       # digits
        (r'\s+', ' ', 0),      # collapse whitespace runs
    )

    def __init__(self, language: str = 'spanish'):
        """
        Set up the stemmer and stopword set for `language`.

        Args:
            language: Language name used for both the Snowball stemmer and
                the stopword list (default: spanish)
        """
        self.language = language
        self.stemmer = SnowballStemmer(language)
        self.stop_words = set(stopwords.words(language))

    def clean_text(self, text: str) -> str:
        """
        Normalise raw text.

        Lower-cases the text, removes URLs, replaces non-word characters
        with spaces, removes digits and collapses whitespace.

        Args:
            text: Input text; missing values (per ``pd.isna``) and the empty
                string yield ''

        Returns:
            Cleaned text
        """
        if pd.isna(text) or text == '':
            return ''

        cleaned = str(text).lower()
        for pattern, replacement, flags in self._SUBSTITUTIONS:
            cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
        return cleaned.strip()

    def tokenize_and_process(self, text, remove_stopwords=True,
                             apply_stemming=True):
        """
        Split text into tokens and filter/normalise them.

        Args:
            text: Input text (normally already cleaned)
            remove_stopwords: Drop tokens found in the stopword set
            apply_stemming: Replace each remaining token by its stem

        Returns:
            List of processed tokens longer than two characters
        """
        if not text:
            return []

        processed = []
        for token in word_tokenize(text, language=self.language):
            # Stopwords are checked on the raw token, before stemming.
            if remove_stopwords and token in self.stop_words:
                continue
            if apply_stemming:
                token = self.stemmer.stem(token)
            # The length filter applies to the final (possibly stemmed) form.
            if len(token) > 2:
                processed.append(token)
        return processed

    def preprocess(self, text: str, remove_stopwords: bool = True,
                   apply_stemming: bool = True) -> str:
        """
        Run the full pipeline: clean, tokenise/process, re-join.

        Args:
            text: Input text
            remove_stopwords: Whether to remove stopwords
            apply_stemming: Whether to apply stemming

        Returns:
            The processed tokens joined by single spaces
        """
        tokens = self.tokenize_and_process(
            self.clean_text(text), remove_stopwords, apply_stemming
        )
        return ' '.join(tokens)



In [21]:

# start = 0
# end = 0

# Rate limits noted from the official page for text-embedding-3-small
# (the original note states that all embedding models share them):
# - Tokens per minute (TPM): 40,000
# - Requests per minute (RPM): 100
# - Requests per day (RPD): 2,000

# NOTE(review): embedding_model and embedding_encoding are not referenced in
# the visible code; the helper functions use their own defaults, which
# currently hold the same values.
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
# Songs with more tokens than this are skipped by the processing loop.
max_tokens = 8191 

from openai import OpenAI
from dotenv import load_dotenv
import os
# Load variables from a .env file, then read the key from the environment
# (raises KeyError when OPENAI_API_KEY is not set).
load_dotenv() 
API_KEY = os.environ["OPENAI_API_KEY"]

# When True, a smaller batch size is used and the loop stops after two flushes.
TESTING = False

# CONFIGURATION
# https://platform.openai.com/docs/guides/batch#rate-limits
# NOTE(review): TPM, RPM and RPD are not referenced in the visible code.
TPM = 40000
RPM = 100
RPD = 2000
limit_size = 190 # In MB
# limit_request_in_file_batch = 50000
limit_request_in_file_batch = 15000 # By my simple rule of three [900-3MB]
chunksize = 100 # For simplicity, a divisor of limit_request_in_file_batch
# Index of the CSV chunk being read; used to derive each row's global index.
chunk_id = 0


if TESTING:
    limit_request_in_file_batch = 900
    # limit_size = 0.5
    # NOTE(review): `second` is not referenced in the visible code.
    second = False
# Set to True by the processing loop to stop early in TESTING mode.
END = False

# Output locations, all under path_data.
path_embbedings = f"{path_data}/gpt_T/json_files"
path_msg = f"{path_data}/gpt_T"
output_batch_metadata = f"{path_data}/gpt_T/metadata_butches_files"
# path_errors = "../../../../data/gpt/embeddings_errors"

os.makedirs(path_embbedings, exist_ok=True)
os.makedirs(output_batch_metadata, exist_ok=True)
os.makedirs(path_msg, exist_ok=True)
# os.makedirs(path_errors, exist_ok=True)

# Loop state:
#   it_n                - number of flushes performed so far
#   start / end         - index range covered by the batch being assembled
#   songs_RAM           - request objects buffered for the next batch file
#   songs_passed_limit  - indices skipped for exceeding max_tokens
#   failed_songs        - indices whose processing raised an exception
it_n = 0
start = 0
end = None
songs_RAM  = []  
songs_passed_limit = []
failed_songs = []

preprocessor = TextPreprocessor()

# Candidate column lists; COL_GPT selects the one whose columns are joined
# into the text to embed (currently T).
# NOTE(review): F is not referenced in the visible code.
F = ['emotion', 'Key', 'Time signature', 'Artist(s)', 'song', 'Genre', 'Album', 'Similar Artist 1', 'Similar Song 1', 'Similar Artist 2', 'Similar Song 2', 'Similar Artist 3', 'Similar Song 3', 'song_normalized', 'artist_normalized']
T = ['lyrics']
COL_GPT = T
# NOTE(review): presumably meant "stemming"; not referenced in the visible code.
steaming= True

def _jsonl_nbytes(request):
    """Return the UTF-8 size in bytes of `request` written as one JSONL line."""
    return len((json.dumps(request, ensure_ascii=False) + "\n").encode("utf-8"))


# Running size, in bytes, of the JSONL that songs_RAM would serialise to.
# It is updated on every append/reset so the size limit can be tested without
# re-serialising the whole buffer for each song (which is quadratic).
ram_bytes = sum(_jsonl_nbytes(req) for req in songs_RAM)

for chunk in pd.read_csv(path_df, chunksize=chunksize):
    # Join the selected columns into the text to embed (missing values -> '').
    chunk['combined_text'] = chunk[COL_GPT].fillna('').agg(' '.join, axis=1)
    # idx is the row's position within the whole CSV, not within the chunk.
    for idx, song in enumerate(chunk['combined_text'], start=chunk_id * chunksize):
        if idx < start:
            continue
        if idx % 1800 == 0:
            print("Procesando la canción: ", idx)
        # song = limpiar_letras(song)
        # Same cleaning for every embedding type.
        song = preprocessor.preprocess(song)
        # Last row looked at; lowered to idx - 1 below when a flush happens
        # before this song is queued.
        end = idx
        n_tokens = num_tokens_from_string(song)
        if n_tokens > max_tokens:
            print(f"La canción {idx} excede el límite de tokens ({n_tokens} tokens). Se omitirá.")
            songs_passed_limit.append(idx)
            continue

        try:
            ram_mb = ram_bytes / (1024 * 1024)
            if ram_mb > limit_size or len(songs_RAM) >= limit_request_in_file_batch:
                # The buffer is full: flush it before queuing the current song.
                end = idx - 1
                it_n += 1
                print("Size en RAM: ", ram_mb)
                if ram_mb > limit_size:
                    print("Song in RAM esta lleno")

                # Write the batch file.
                saved_in = guardar_jsons(songs_RAM, start, end, output_dir=path_embbedings)
                print("Tamaño real al guardar:", get_file_size(saved_in))
                print(f"Archivo guardado en {saved_in}")

                # --- Start of the OpenAI steps ---
                # 1) Upload the batch input file.
                batch_input_file = upload_batch_file_to_openai(saved_in, API_KEY)
                print("Archivo subido con éxito:", batch_input_file.id)  # file ID
                # 2) Create the batch.
                response = create_openai_batch(batch_input_file, endpoint="/v1/embeddings", completion_window="24h", metadata=None, api_key=API_KEY)
                print(response)
                print(f"Batch creado con ID: {response.id}")  # batch ID

                # 3) Retrieve the status once and reuse it.
                batch_status = check_openai_batch(response.id, API_KEY)
                if batch_status.status == "completed":
                    print("Estado del batch completed")
                    output_file_id = batch_status.output_file_id
                    pta = download_results(output_file_id, API_KEY)
                    # No output_path is given, so the helper's default
                    # directory is used.
                    saved_file = save_embeddings_only(pta, start, end)
                    print(f"File guardado en: {saved_file}")
                else:
                    # Not finished yet: store start/end with the file and
                    # batch ids so the results can be fetched later.
                    print("Yo nervioso (nose si me quiere n) tu trankilo, guardaremos la data para hacer la descargation")
                    save_batch_metadata(start, end, batch_input_file.id, response.id, output_dir=output_batch_metadata)
                # --- End of the OpenAI steps ---

                print("Sleeping 1 minute")
                time.sleep(60)  # Wait one minute, just in case.
                songs_RAM = []
                ram_bytes = 0

                start = end + 1

                # The current song opens the next batch.
                json_request = crear_single_json(song, idx)
                songs_RAM.append(json_request)
                ram_bytes += _jsonl_nbytes(json_request)

                # For testing: stop after the second flush.
                if TESTING and it_n == 2:
                    END = True
                    break
            else:
                # Queue the song for the batch being assembled.
                json_request = crear_single_json(song, idx)
                songs_RAM.append(json_request)
                ram_bytes += _jsonl_nbytes(json_request)

        except Exception as e:
            # NOTE(review): when a flush fails, the current song is only
            # recorded in failed_songs (it is not queued) and the buffer is
            # kept, so the flush is attempted again on the next song.
            print(f"Error en la canción {idx}: {e}")
            failed_songs.append(idx)

    chunk_id += 1

    if END:
        print("Termino el testing")
        break

# Flush whatever is still buffered after the last CSV chunk has been read.
if songs_RAM:  
    print("Procesando el último bloque en RAM...")
    # `end` already holds the last index looked at by the loop; the fallback
    # only applies if it was never assigned.
    end = end if end is not None else start + len(songs_RAM) - 1
    saved_in = guardar_jsons(songs_RAM, start, end, output_dir=path_embbedings)
    print("Tamaño real al guardar:", get_file_size(saved_in))
    print(f"Archivo guardado en {saved_in}")
    # --- Start of the OpenAI steps for this last batch ---
    try:
        batch_input_file = upload_batch_file_to_openai(saved_in, API_KEY)
        print("Archivo subido con éxito:", batch_input_file.id) # file ID
        response = create_openai_batch(batch_input_file, endpoint="/v1/embeddings",
                                       completion_window="24h", metadata=None, api_key=API_KEY)
        print(response)
        print(f"Batch creado con ID: {response.id}")

        # Status is retrieved once, right after creation.
        status = check_openai_batch(response.id, API_KEY)
        if status.status == "completed":
            print("Estado del batch: completed")
            output_file_id = status.output_file_id
            pta = download_results(output_file_id, API_KEY)
            # No output_path is given, so save_embeddings_only writes to its
            # default directory.
            saved_file = save_embeddings_only(pta, start, end)
            print(f"File guardado en: {saved_file}")
        else:
            # Not finished yet: persist the ids so the results can be
            # downloaded later.
            print("Guardando metadata para continuar luego...")
            save_batch_metadata(start, end, batch_input_file.id, response.id, output_dir=output_batch_metadata)

    except Exception as e:
        # Upload/creation errors are reported, not raised; the JSONL file
        # written above stays on disk.
        print(f"Error procesando el último bloque: {e}")

Procesando la canción:  0
Procesando la canción:  1800
Procesando la canción:  3600
Procesando la canción:  5400
Procesando la canción:  7200
Procesando el último bloque en RAM...
Archivo guardado: ../../../../../../data/spanish/gpt_T/json_files/embeddings_0_7318.jsonl con 7319 requests
File size: 6323355 bytes (6.03 MB)
Tamaño real al guardar: 6.030421257019043
Archivo guardado en ../../../../../../data/spanish/gpt_T/json_files/embeddings_0_7318.jsonl
Subiendo el archivo de la ruta: ../../../../../../data/spanish/gpt_T/json_files/embeddings_0_7318.jsonl
Archivo subido con éxito: file-QPu2mMgDH7LVW5eqXxN14x
Batch(id='batch_6901d39233908190958fd386168769d9', completion_window='24h', created_at=1761727378, endpoint='/v1/embeddings', input_file_id='file-QPu2mMgDH7LVW5eqXxN14x', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1761813778, failed_at=None, finalizing_at=None, in_progres

In [22]:
# Persist the indices of songs skipped for exceeding the token limit.
# The list is written as JSON (e.g. "[1, 2]") despite the .txt extension.
if songs_passed_limit:
    path_passed = os.path.join(path_msg, "a_songs_passed_limit_F.txt")
    with open(path_passed, "w") as f:
        json.dump(songs_passed_limit, f)
    print(f"Se guardaron {len(songs_passed_limit)} índices en {path_passed}")


# Persist the indices of songs whose processing raised an exception.
# NOTE(review): this file uses a different format from the one above
# (comma-separated values, no brackets).
if failed_songs:  
    print("Terrible y ahora...")
    path_failed = os.path.join(path_msg, "failed_songs_F.txt")
    with open(path_failed, "w") as f:
        f.write(", ".join(map(str, failed_songs))) 
    print(f"Se guardaron {len(failed_songs)} índices en {path_failed}")
