In [1]:
import ast  # for converting embeddings saved as strings back to arrays
import openai # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
import os # for getting API token from env variable OPENAI_API_KEY
from scipy import spatial  # for calculating vector similarities for search

In [2]:
# OpenAI configuration.
# The API key is read from the OPENAI_API_KEY environment variable so the secret
# is never stored in the notebook; a missing variable fails fast with a KeyError.
# NOTE(review): a key was previously hardcoded in this cell — treat it as
# compromised and rotate it.
openai.api_key = os.environ["OPENAI_API_KEY"]
EMBEDDING_MODEL = "text-embedding-ada-002"  # default model for embeddings and token counting
GPT_MODEL = "gpt-3.5-turbo"  # not referenced in the cells visible here — presumably used later; TODO confirm
MAX_TOKENS = 4096  # not referenced in the cells visible here — presumably a token budget; TODO confirm

In [3]:
def num_tokens(text: str, model: str = EMBEDDING_MODEL) -> int:
    """Count the tokens in ``text`` using the tiktoken encoding for ``model``.

    Args:
        text: The string to tokenize.
        model: Model name passed to ``tiktoken.encoding_for_model`` to pick
            the encoding. Defaults to ``EMBEDDING_MODEL``.

    Returns:
        The length of the token sequence produced by encoding ``text``.
    """
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

In [4]:
def create_embeddings_from_txt_files(directory: str, model: str = EMBEDDING_MODEL, max_tokens: int = 4096) -> pd.DataFrame:
    """Embed every ``.txt`` file in ``directory`` and collect the results.

    Each file is read as UTF-8. If its token count (per the tiktoken encoding
    for ``model``) exceeds ``max_tokens``, a warning is printed and the text is
    truncated to the first ``max_tokens`` tokens before being embedded. Empty
    files are skipped with a warning, since there is nothing to embed.

    Args:
        directory: Path of the directory to scan (non-recursive).
        model: Embedding model name, used both to select the tokenizer and in
            the ``openai.Embedding.create`` call.
        max_tokens: Maximum number of tokens of each file that is embedded.

    Returns:
        A DataFrame with one row per embedded file and the columns ``text``
        (the text that was actually embedded, i.e. after any truncation),
        ``embedding`` and ``filename``. Rows are ordered by filename.

    Raises:
        Whatever ``os.listdir``, ``open``, ``tiktoken`` or the OpenAI client
        raise; no exception is caught here.
    """
    # Resolve the tokenizer once instead of once per file.
    encoding = tiktoken.encoding_for_model(model)
    records = []

    # Sorted so the row order does not depend on the filesystem's listing order.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue

        # Read the whole file and close it before making the network call.
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
            text = file.read()

        if not text:
            print(f"Warning: File {filename} is empty and will be skipped.")
            continue

        # Truncate by tokens, not characters: the limit is expressed in tokens,
        # so slicing the string would cut far more text than necessary.
        tokens = encoding.encode(text)
        if len(tokens) > max_tokens:
            print(f"Warning: File {filename} exceeds max token limit and will be truncated.")
            text = encoding.decode(tokens[:max_tokens])

        response = openai.Embedding.create(model=model, input=text)
        records.append(
            {
                "text": text,
                "embedding": response['data'][0]['embedding'],
                "filename": filename,
            }
        )

    # Explicit columns keep the schema stable even when no file was embedded.
    return pd.DataFrame(records, columns=["text", "embedding", "filename"])

In [5]:
# Usage: build the embeddings table from the .txt files in "data" and save it.
directory = "data"
df = create_embeddings_from_txt_files(directory)
# Write the same table in two formats; index=False leaves the row index out of the files.
df.to_excel("vectores.xlsx", index=False)
df.to_csv("vectores.csv", index=False)
# Last expression of the cell, so the DataFrame is shown as the cell output.
df



Unnamed: 0,text,embedding,filename
0,"El comando ""Enviar mensaje al conductor"" permi...","[-0.02398996241390705, 0.00521461758762598, -0...",Conductores.txt
1,\nEl dashboard en la plataforma de seguimiento...,"[-0.023207632824778557, -0.0013814468402415514...",Dashboard.txt
2,"""Manual de Usuario Detallado: Cómo Usar EVA - ...","[-0.021633613854646683, -0.0037275038193911314...",EVA.txt
3,\nCreación y Gestión de Geocercas en el Sistem...,"[-0.013801012188196182, 0.02510000951588154, 0...",Geocercas.txt
4,\nLa plataforma de seguimiento ofrece diversas...,"[-0.02603936940431595, 0.01353693287819624, -0...",Herramientas.txt
5,"En la pestaña de informes, se pueden crear inf...","[-0.00831671804189682, 0.004125538282096386, 0...",Informes.txt
6,Introducción a la plataforma:\n\nExplicación d...,"[0.004100274294614792, 0.005372195970267057, -...",Introducción a la plataforma.txt
7,\nGestión de Notificaciones en el Sistema de S...,"[-0.019962914288043976, 0.013457619585096836, ...",Notificaciones.txt
8,La pestaña de recorridos en la plataforma de s...,"[-0.012566705234348774, -0.012425732798874378,...",Recorridos.txt
9,"Además de la pestaña Remolques, se pueden ver ...","[-0.01726648584008217, 0.0003124531067442149, ...",Remolques.txt
