In [1]:
import pandas as pd 
import numpy as np
import os
import re
import json

In [2]:

def get_first_number(filename):
    """Return the first run of digits found in ``filename`` as an int.

    When the name contains no digits at all, ``float('inf')`` is returned
    instead, which places such names after every numbered one if this
    function is used as a sort key.
    """
    found = re.search(r"(\d+)", filename)
    if found is None:
        return float('inf')
    return int(found.group(1))
def get_second_number(filename: str) -> int:
    """Return the second run of digits found in ``filename`` as an int.

    If the name holds fewer than two digit runs, the sentinel
    ``float('inf')`` is returned (a float, despite the ``int`` annotation).
    """
    digit_runs = re.findall(r"(\d+)", filename)
    # Index 1 is the second number; anything shorter yields the sentinel.
    return int(digit_runs[1]) if len(digit_runs) >= 2 else float('inf')


def load_embeddings_and_check_dim(file_path):
    """Read a JSON-lines file and collect ``[id, embedding]`` pairs.

    Every non-blank line is parsed as JSON. The id is the first number
    found in the record's ``"custom_id"`` value and the vector is taken
    from its ``"embedding"`` key. A one-line summary is printed.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list of ``[id, embedding]`` lists in file order (possibly empty).
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            # Blank / whitespace-only lines carry no record.
            if raw_line.strip():
                entry = json.loads(raw_line)
                records.append(
                    [get_first_number(entry["custom_id"]), entry["embedding"]]
                )

    if not records:
        print("No se encontraron embeddings en el archivo.")
    else:
        print(f"Número de embeddings: {len(records)}")

    return records

def sort_according_id(embeddings):
    """Return a new list of ``embeddings`` ordered by the value at index 0.

    The input is left untouched; items with equal ids keep their relative
    order.
    """
    ordered = list(embeddings)
    ordered.sort(key=lambda pair: pair[0])
    return ordered

def get_faltantes(start, end, embeddings):
    """Return the ids in ``[start, end]`` for which no embedding exists.

    The previous implementation walked ``embeddings`` with a single cursor
    and therefore got stuck (reporting every remaining id as missing) as
    soon as it met a duplicate id, an id below ``start`` or unsorted input.
    Membership is now checked against the set of ids that are present, so
    the result no longer depends on order or duplicates.

    Args:
        start: First id of the expected range (inclusive).
        end: Last id of the expected range (inclusive).
        embeddings: Iterable of items whose element at index 0 is the id.

    Returns:
        The missing ids in ascending order (empty if ``end < start``).
    """
    present_ids = {item[0] for item in embeddings}
    return [i for i in range(start, end + 1) if i not in present_ids]

def save_embb(embeddings, output_path):
    """Store the vectors of ``embeddings`` at ``output_path`` via ``np.save``.

    Only the element at index 1 of every item is kept (the id at index 0 is
    dropped); the vectors are stacked in input order into one array.
    A confirmation line is printed once the file is written.
    """
    vectors = [pair[1] for pair in embeddings]
    np.save(output_path, np.array(vectors))
    print(f"Embeddings saved to {output_path}")


In [3]:
# Convert one JSONL embeddings batch into a .npy file and record which ids
# of the range encoded in its file name are missing.
data_dir = "../../../../data"
# Derived from data_dir so the two locations cannot drift apart.
path_embb = os.path.join(data_dir, "embeddings_only")

file_embb = os.listdir(path_embb)
files_sorted = sorted(file_embb, key=get_first_number)
faltantes = []

# Only the batch whose file name starts with this number is processed.
index = 96814

# The output folder is the same for every batch: build and create it once
# instead of on every loop iteration.
directory = os.path.join(data_dir, "gpt_embd")
os.makedirs(directory, exist_ok=True)

for relative_path in files_sorted:
    start = get_first_number(relative_path)

    if start != index:
        continue
    path_file_embb = os.path.join(path_embb, relative_path)
    embbdings = load_embeddings_and_check_dim(path_file_embb)
    embbdings_sorted = sort_according_id(embbdings)
    # NOTE(review): start/end come from the file NAME, not from the ids
    # stored inside the file — confirm they describe the same range before
    # trusting the missing-id report.
    end = get_second_number(relative_path)
    faltantes.append(get_faltantes(start, end, embbdings_sorted))
    save_embb(
        embbdings_sorted,
        os.path.join(directory, f"embeddings_{start}_{end}.npy"),
    )



Número de embeddings: 1301
Embeddings saved to ../../../../data/gpt_embd/embeddings_96814_1068124.npy


In [25]:
def get_array(path):
    """Load and return the JSON document stored at ``path``.

    The file is opened explicitly as UTF-8 (the encoding used by the other
    reader in this notebook) instead of the platform default, so non-ASCII
    content is decoded the same way on every OS.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON value (whatever ``json.load`` yields for the file).
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

In [28]:
# Load the JSON content of faltantes_according_token.json and print how many
# entries it holds (presumably ids that could not be embedded — TODO confirm).
irrecuperable = get_array("faltantes_according_token.json")
print(len(irrecuperable))


13


Vamos a verificar qué índices tenemos disponibles en la recuperación.

In [35]:
# Compare the id ranges of two JSONL batches: load each file, sort it by id
# and print the smallest and largest id it contains.
path_embb = "../../../../data/embeddings_only"

# Predecessor batch: the data that comes before the recovered data

path_antecesora_data = "91812_106812_embeddings_only.jsonl"  
path_file_antecesora = os.path.join(path_embb, path_antecesora_data)
embbdings_antecesora = load_embeddings_and_check_dim(path_file_antecesora)
embbdings_antecesora_sorted = sort_according_id(embbdings_antecesora)
# After sorting, the first/last items hold the lowest/highest id.
print("antecesora start:", embbdings_antecesora_sorted[0][0])
print("antecesora end:", embbdings_antecesora_sorted[-1][0])

# Recovered batch
path_recuperate_data = "96814_1068124_embeddings_only.jsonl"
path_file_embb = os.path.join(path_embb, path_recuperate_data)
embbdings = load_embeddings_and_check_dim(path_file_embb)
embbdings_sorted = sort_according_id(embbdings)
print("start:", embbdings_sorted[0][0])
print("end:", embbdings_sorted[-1][0])


Número de embeddings: 13699
antecesora start: 91812
antecesora end: 105511
Número de embeddings: 1301
start: 105512
end: 106812


Los índices no presentan problema con la antecesora porque ya están ordenados, entonces vamos a poder fusionar sin problemas.

Fusionar los embeddings

In [38]:
# Fuse every per-batch .npy file of gpt_embd into one array and save it.
# (The recovered embeddings start at id 105512.)
data_dir = "../../../../data"
directory = os.path.join(data_dir, "gpt_embd")

files_ = os.listdir(directory)
# Keep only regular .npy files: the listing also contains the output
# sub-folder once this cell has run before, and np.load would fail on it.
sorted_files = sorted(
    (
        name
        for name in files_
        if name.endswith(".npy") and os.path.isfile(os.path.join(directory, name))
    ),
    key=get_first_number,
)

all_embeddings = []
dir_fussioned = os.path.join(directory, "gpt_fussioned")
os.makedirs(dir_fussioned, exist_ok=True)
for relative_npy_name in sorted_files:
    # Dedicated name so the input-folder variable `path_embb` used by other
    # cells is not overwritten here.
    path_npy = os.path.join(directory, relative_npy_name)
    print("Cargando:", path_npy)
    arr = np.load(path_npy)
    all_embeddings.append(arr)

# np.concatenate raises an opaque error on an empty list; fail clearly.
if not all_embeddings:
    raise FileNotFoundError(f"No .npy files found in {directory}")

# Stack all batches along the first axis, in ascending first-id order.
fused_embeddings = np.concatenate(all_embeddings, axis=0)

path_fused = os.path.join(dir_fussioned, "embeddings_fused.npy")
np.save(path_fused, fused_embeddings)

print("Embeddings fusionados guardados en:", path_fused)
print("Shape final:", fused_embeddings.shape)

# Size of the fused file on disk
size_bytes = os.path.getsize(path_fused)
size_mb = size_bytes / (1024**2)
size_gb = size_bytes / (1024**3)

print(f"Tamaño del archivo: {size_bytes:,} bytes")
print(f"Tamaño aproximado: {size_mb:.2f} MB ({size_gb:.2f} GB)")

Cargando: ../../../../data/gpt_embd/embeddings_0_899.npy
../../../../data/gpt_embd/embeddings_0_899.npy
Cargando: ../../../../data/gpt_embd/embeddings_900_1799.npy
../../../../data/gpt_embd/embeddings_900_1799.npy
Cargando: ../../../../data/gpt_embd/embeddings_1800_16801.npy
../../../../data/gpt_embd/embeddings_1800_16801.npy
Cargando: ../../../../data/gpt_embd/embeddings_16802_31802.npy
../../../../data/gpt_embd/embeddings_16802_31802.npy
Cargando: ../../../../data/gpt_embd/embeddings_31803_46804.npy
../../../../data/gpt_embd/embeddings_31803_46804.npy
Cargando: ../../../../data/gpt_embd/embeddings_46805_61809.npy
../../../../data/gpt_embd/embeddings_46805_61809.npy
Cargando: ../../../../data/gpt_embd/embeddings_61810_76811.npy
../../../../data/gpt_embd/embeddings_61810_76811.npy
Cargando: ../../../../data/gpt_embd/embeddings_76812_91811.npy
../../../../data/gpt_embd/embeddings_76812_91811.npy
Cargando: ../../../../data/gpt_embd/embeddings_91812_106812.npy
../../../../data/gpt_embd/em