In [16]:
import pandas as pd 
import numpy as np
import os
import re
import json

In [50]:

def get_first_number(filename):
    """Return the first run of digits found in ``filename`` as an int.

    Args:
        filename: String to scan (a file name or any other identifier).

    Returns:
        The first digit run converted with ``int``; ``float('inf')`` when the
        string contains no digits at all.
    """
    found = re.search(r"(\d+)", filename)
    if found is None:
        # No digits anywhere: report infinity instead of a number.
        return float('inf')
    return int(found.group(1))
def get_second_number(filename: str) -> int:
    """Return the second run of digits found in ``filename`` as an int.

    Args:
        filename: String to scan.

    Returns:
        The second digit run converted with ``int``. When the string holds
        fewer than two digit runs, ``float('inf')`` is returned instead (so,
        despite the annotation, the result is not always an ``int``).
    """
    numbers = re.findall(r"(\d+)", filename)
    if len(numbers) < 2:
        # Second number not found.
        return float('inf')
    return int(numbers[1])  # the second number


def load_embeddings_and_check_dim(file_path):
    """Read a JSON-lines file and collect ``[id, embedding]`` pairs.

    Every non-blank line is parsed as a JSON object. The id is the first number
    found in its ``"custom_id"`` value and the embedding is its ``"embedding"``
    value. A one-line summary is printed before returning. Despite the name,
    no dimension check is performed here.

    Args:
        file_path: Path of the UTF-8 text file to read, one JSON object per line.

    Returns:
        A list of ``[id, embedding]`` lists in file order; empty when the file
        has no non-blank lines.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        # The inner generator is lazy, so each line is parsed and unpacked
        # before the next one is read.
        embeddings = [
            [get_first_number(record["custom_id"]), record["embedding"]]
            for record in (json.loads(raw) for raw in handle if raw.strip())
        ]

    if not embeddings:
        print("No se encontraron embeddings en el archivo.")
    else:
        print(f"Número de embeddings: {len(embeddings)}")

    return embeddings

def sort_according_id(embeddings):
    """Return a new list of the entries ordered by ascending id.

    Args:
        embeddings: Iterable of entries whose first element is the id.

    Returns:
        A new list; the input is not modified, and entries with equal ids keep
        their original relative order.
    """
    ordered = list(embeddings)
    ordered.sort(key=lambda entry: entry[0])
    return ordered

def get_faltantes(start, end, embeddings):
    """Return the ids in ``start..end`` (inclusive) that have no embedding.

    The ids present are gathered into a set first, so the result does not
    depend on the order of ``embeddings`` and is unaffected by duplicate ids
    or by ids that fall outside the requested range.

    Args:
        start: First id of the expected range.
        end: Last id of the expected range (inclusive).
        embeddings: Iterable of entries whose first element is the id.

    Returns:
        The missing ids in ascending order; an empty list when every id in
        the range is present.
    """
    present = {entry[0] for entry in embeddings}
    return [i for i in range(start, end + 1) if i not in present]

def save_embb(embeddings, output_path):
    """Store the embedding vectors, without their ids, using ``np.save``.

    Args:
        embeddings: Sequence of entries; element ``[1]`` of each entry is the
            vector that gets stored.
        output_path: Destination handed to ``np.save``.
    """
    vectors = np.array([entry[1] for entry in embeddings])
    np.save(output_path, vectors)
    print(f"Embeddings saved to {output_path}")


In [51]:
data_dir = "../../../../data"
# Built from data_dir so the input and output locations cannot drift apart.
path_embb = os.path.join(data_dir, "embeddings_only")

file_embb = os.listdir(path_embb)
# Only regular files whose name carries both a start and an end id can be
# processed. Anything else that os.listdir returns (sub-directories, stray
# files) would make open() or range() fail further down, so it is left out.
files_sorted = sorted(
    (
        name
        for name in file_embb
        if os.path.isfile(os.path.join(path_embb, name))
        and len(re.findall(r"(\d+)", name)) >= 2
    ),
    key=get_first_number,
)
# One list of missing ids per processed file, in the order of files_sorted.
faltantes = []

# The output directory is the same for every file, so create it once up front.
directory = os.path.join(data_dir, "gpt_embd")
os.makedirs(directory, exist_ok=True)

for relative_path in files_sorted:
    path_file_embb = os.path.join(path_embb, relative_path)
    embbdings = load_embeddings_and_check_dim(path_file_embb)
    embbdings_sorted = sort_according_id(embbdings)
    # The file name encodes the id range the file is expected to cover.
    start = get_first_number(relative_path)
    end = get_second_number(relative_path)
    faltantes.append(get_faltantes(start, end, embbdings_sorted))
    save_embb(embbdings_sorted, f"{directory}/embeddings_{start}_{end}.npy")



Número de embeddings: 900
Embeddings saved to ../../../../data/gpt_embd/embeddings_0_899.npy
Número de embeddings: 900
Embeddings saved to ../../../../data/gpt_embd/embeddings_900_1799.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_embd/embeddings_1800_16801.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_embd/embeddings_16802_31802.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_embd/embeddings_31803_46804.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_embd/embeddings_46805_61809.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_embd/embeddings_61810_76811.npy
Número de embeddings: 15000
Embeddings saved to ../../../../data/gpt_embd/embeddings_76812_91811.npy
Número de embeddings: 13699
Embeddings saved to ../../../../data/gpt_embd/embeddings_91812_106812.npy
Número de embeddings: 1325
Embeddings saved to ../../../../data/gpt_embd/embeddings_106813_108137.npy


In [52]:
# Show the missing ids gathered in `faltantes`: one inner list per processed file.
print(faltantes)

[[], [], [9681, 13382], [27808], [38450, 46753], [53015, 53018, 53102, 53137, 57281], [70666, 75102], [], [98849, 105512, 105513, 105514, 105515, 105516, 105517, 105518, 105519, 105520, 105521, 105522, 105523, 105524, 105525, 105526, 105527, 105528, 105529, 105530, 105531, 105532, 105533, 105534, 105535, 105536, 105537, 105538, 105539, 105540, 105541, 105542, 105543, 105544, 105545, 105546, 105547, 105548, 105549, 105550, 105551, 105552, 105553, 105554, 105555, 105556, 105557, 105558, 105559, 105560, 105561, 105562, 105563, 105564, 105565, 105566, 105567, 105568, 105569, 105570, 105571, 105572, 105573, 105574, 105575, 105576, 105577, 105578, 105579, 105580, 105581, 105582, 105583, 105584, 105585, 105586, 105587, 105588, 105589, 105590, 105591, 105592, 105593, 105594, 105595, 105596, 105597, 105598, 105599, 105600, 105601, 105602, 105603, 105604, 105605, 105606, 105607, 105608, 105609, 105610, 105611, 105612, 105613, 105614, 105615, 105616, 105617, 105618, 105619, 105620, 105621, 105622