In [1]:
import pandas as pd 
import numpy as np
import os
import re
import json

In [2]:

def get_first_number(filename):
    """Return the first run of digits found in ``filename`` as an int.

    Args:
        filename: Text to scan (a file name or any other string).

    Returns:
        The first digit run converted to ``int``, or ``float('inf')`` when
        the text contains no digits, so such names sort last when this
        function is used as a sort key.
    """
    found = re.search(r"(\d+)", filename)
    if found is None:
        return float('inf')
    return int(found.group(1))
def get_second_number(filename: str) -> int:
    """Return the second run of digits found in ``filename`` as an int.

    Args:
        filename: Text to scan (a file name or any other string).

    Returns:
        The second digit run converted to ``int``. When fewer than two digit
        runs are present, ``float('inf')`` is returned instead (note that this
        fallback is a float despite the ``int`` annotation).
    """
    digit_runs = re.findall(r"(\d+)", filename)
    if len(digit_runs) < 2:
        return float('inf')  # sentinel: there is no second number
    return int(digit_runs[1])  # index 1 is the second number


def load_embeddings_and_check_dim(file_path):
    """Read a JSON-lines file and collect ``[id, embedding]`` pairs.

    Every non-blank line is parsed as a JSON object. The id of each pair is
    the first number found in the object's ``"custom_id"`` value (extracted
    with ``get_first_number``) and the payload is its ``"embedding"`` value.
    A one-line summary is printed before returning. Despite the function
    name, no dimension check is performed.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        list: ``[id, embedding]`` pairs in file order; empty when the file
        holds no records.
    """
    embeddings = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            # Blank (or whitespace-only) lines carry no record.
            if raw_line.strip():
                record = json.loads(raw_line)
                pair = [get_first_number(record["custom_id"]), record["embedding"]]
                embeddings.append(pair)

    if not embeddings:
        print("No se encontraron embeddings en el archivo.")
    else:
        print(f"Número de embeddings: {len(embeddings)}")

    return embeddings

def sort_according_id(embeddings):
    """Return a new list of ``[id, embedding]`` pairs ordered by ascending id.

    The input is left untouched; pairs sharing the same id keep their
    relative order because the sort is stable.
    """
    ordered = list(embeddings)
    ordered.sort(key=lambda pair: pair[0])
    return ordered

def get_faltantes(start, end, embeddings):
    """Return the ids in ``[start, end]`` that have no embedding ("faltantes").

    Args:
        start: First expected id (inclusive).
        end: Last expected id (inclusive).
        embeddings: Iterable of ``[id, embedding]`` pairs. Order does not
            matter, and duplicated ids or ids outside ``[start, end]`` are
            tolerated.

    Returns:
        list: The missing ids in ascending order (empty when ``end < start``
        or when every id is present).
    """
    # Membership is checked against a set rather than by walking the list
    # with a single cursor: a duplicated id, or an id below ``start``, would
    # stall such a cursor and flag every later id as missing.
    present_ids = {pair[0] for pair in embeddings}
    return [i for i in range(start, end + 1) if i not in present_ids]

def save_embb(embeddings, output_path):
    """Write only the embedding payloads to ``output_path`` with ``np.save``.

    Args:
        embeddings: Sequence of ``[id, embedding]`` pairs; the id (position 0)
            is dropped and the payload (position 1) is kept, in input order.
        output_path: Destination passed to ``np.save``.
    """
    vectors = np.array([pair[1] for pair in embeddings])
    np.save(output_path, vectors)
    print(f"Embeddings saved to {output_path}")


In [3]:
# Convert every "embeddings with id" file into an id-sorted .npy file that
# holds only the embedding payloads, while recording which ids are missing.
data_dir = "../../../../data"
# Previous input location, kept for reference:
# path_embb = "../../../../data/embeddings_only"
path_embb = "../../../../data/gpt_F_cat/embeddings_with_id"
path_final_embb = "../../../../data/gpt_F_cat/only_embb"
os.makedirs(path_final_embb, exist_ok=True)

# Process the input files in ascending order of the first number in their name.
file_embb = os.listdir(path_embb)
files_sorted = sorted(file_embb, key=get_first_number)
# One list of missing ids per processed file (read by a later cell).
faltantes = []

# Debug aid: set `index` and re-enable the `continue` guard inside the loop to
# process a single file only.
# index = 96814
total = 0
for relative_path in files_sorted:
    # The first number in the file name is used as the first expected id.
    # NOTE(review): presumably file names look like <prefix>_<start>_<end> — confirm.
    start = get_first_number(relative_path)

    # if start != index:
    #     continue
    path_file_embb = os.path.join(path_embb, relative_path)
    embbdings = load_embeddings_and_check_dim(path_file_embb)
    total+=len(embbdings)
    embbdings_sorted = sort_according_id(embbdings)
    # The second number in the file name is used as the last expected id.
    # NOTE(review): get_second_number returns float('inf') when the name has
    # fewer than two numbers, which range() inside get_faltantes cannot accept.
    end = get_second_number(relative_path)
    faltantes.append(get_faltantes(start, end, embbdings_sorted))
    # directory = os.path.join(data_dir, "gpt_embd")
    save_embb(embbdings_sorted, f"{path_final_embb}/embeddings_{start}_{end}.npy")

print(f"Se tienen un total de {total} embbedings")


Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_F_cat/only_embb/embeddings_0_14999.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_F_cat/only_embb/embeddings_15000_29999.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_F_cat/only_embb/embeddings_30000_44999.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_F_cat/only_embb/embeddings_45000_59999.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_F_cat/only_embb/embeddings_60000_74999.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_F_cat/only_embb/embeddings_75000_89999.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_F_cat/only_embb/embeddings_90000_104999.npy
Número de embeddings: 3138
Embeddings saved to ../../../../data/gpt_F_cat/only_embb/embeddings_105000_108137.npy
Se tienen un total de 108138 embbedings


In [6]:
# Show the missing ids grouped per input file.
print(faltantes)
# Collapse the per-file lists into one flat list of missing ids.
faltantes_flaten = [
    missing_id
    for per_file_missing in faltantes
    for missing_id in per_file_missing
]
print(faltantes_flaten)
# Persist the flat list as JSON in the current working directory.
with open("T_faltantes.json", "w") as f:
    json.dump(faltantes_flaten, f)

[[], [], [], [], [], [], [], []]
[]


In [7]:
def get_array(path):
    """Load the JSON document stored at ``path`` and return the parsed value.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(path, "r") as handle:
        return json.load(handle)

Fusionar los embeddings

In [8]:
# Merge every per-range .npy file into one array, in ascending id order, and
# save the result as a single file.
# Original note: the embeddings are available starting from 105512.
# NOTE(review): the files merged here start at id 0 — confirm whether that
# note still applies.
data_dir = "../../../../data"
directory = "../../../../data/gpt_F_cat/only_embb"
dir_fussioned = "../../../../data/gpt_F_cat/gpt_fussioned"

files_ = os.listdir(directory)
# Only .npy files are merge inputs; any other entry in the folder (e.g. a
# hidden OS file) would make np.load fail.
sorted_files = sorted(
    (name for name in files_ if name.endswith(".npy")),
    key=get_first_number,
)

all_embeddings = []
os.makedirs(dir_fussioned, exist_ok=True)
for relatvie_embb_path in sorted_files:
    path_embb = os.path.join(directory, relatvie_embb_path)
    print("Cargando:", path_embb)
    arr = np.load(path_embb)
    all_embeddings.append(arr)

# Stack all chunks along the first axis (one row per embedding).
fused_embeddings = np.concatenate(all_embeddings, axis=0)
path_fused = os.path.join(dir_fussioned, "gpt_F.npy")
np.save(path_fused, fused_embeddings)

print("Embeddings fusionados guardados en:", path_fused)
print("Shape final:", fused_embeddings.shape)

# Size of the merged file on disk
size_bytes = os.path.getsize(path_fused)
size_mb = size_bytes / (1024**2)
size_gb = size_bytes / (1024**3)

print(f"Tamaño del archivo: {size_bytes:,} bytes")
print(f"Tamaño aproximado: {size_mb:.2f} MB ({size_gb:.2f} GB)")

Cargando: ../../../../data/gpt_F_cat/only_embb/embeddings_0_14999.npy
../../../../data/gpt_F_cat/only_embb/embeddings_0_14999.npy
Cargando: ../../../../data/gpt_F_cat/only_embb/embeddings_15000_29999.npy
../../../../data/gpt_F_cat/only_embb/embeddings_15000_29999.npy
Cargando: ../../../../data/gpt_F_cat/only_embb/embeddings_30000_44999.npy
../../../../data/gpt_F_cat/only_embb/embeddings_30000_44999.npy
Cargando: ../../../../data/gpt_F_cat/only_embb/embeddings_45000_59999.npy
../../../../data/gpt_F_cat/only_embb/embeddings_45000_59999.npy
Cargando: ../../../../data/gpt_F_cat/only_embb/embeddings_60000_74999.npy
../../../../data/gpt_F_cat/only_embb/embeddings_60000_74999.npy
Cargando: ../../../../data/gpt_F_cat/only_embb/embeddings_75000_89999.npy
../../../../data/gpt_F_cat/only_embb/embeddings_75000_89999.npy
Cargando: ../../../../data/gpt_F_cat/only_embb/embeddings_90000_104999.npy
../../../../data/gpt_F_cat/only_embb/embeddings_90000_104999.npy
Cargando: ../../../../data/gpt_F_cat/onl