In [None]:
import sys
sys.path.append("../")

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain_core.documents import Document

from pathlib import Path
from dotenv import load_dotenv, find_dotenv
import pandas as pd
import os

from langchain.schema import Document
from typing import List

In [None]:
# Load variables from the .env file located by find_dotenv() into the process
# environment; the return value is discarded by binding it to `_`.
# This must run before the embedding client below is constructed.
_ = load_dotenv(find_dotenv())
# Embedding client shared by the rest of the notebook; vector_store_gen reads
# this global when it builds the FAISS index.
# NOTE(review): presumably the OpenAI credentials are expected to come from the
# .env file loaded above — confirm.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
# NOTE(review): not referenced by any other visible cell — confirm whether it is
# still needed.
cluster_number = 1


In [None]:
def create_documents_from_csv(path_to_file: str) -> List[Document]:
    """Build one ``Document`` per row of a CSV file, for loading into a vector store.

    The ``fixed`` column becomes the document text (``page_content``); the
    remaining columns are carried along as metadata.

    Args:
        path_to_file: Path of the CSV file to read. The file must contain the
            columns ``fixed``, ``line_id_tp_dataset``, ``buggy`` and
            ``baseline_ast_gumtree``.

    Returns:
        The documents in file order, one per CSV row. Each document's metadata
        has the keys ``source`` (``path_to_file`` exactly as given),
        ``line_number``, ``buggy_train_version`` and ``gumtree_diff``.

    Raises:
        KeyError: If any of the required columns is absent from the CSV.
    """
    required_columns = ["fixed", "line_id_tp_dataset", "buggy", "baseline_ast_gumtree"]

    df = pd.read_csv(path_to_file)

    # Fail up front with the file name and the missing columns instead of an
    # opaque attribute error on the first row.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"{path_to_file} is missing required column(s): {missing_columns}"
        )

    # itertuples preserves each column's dtype and avoids building a Series per
    # row, which iterrows does.
    # NOTE(review): values are passed through unchanged, so a row with no
    # ``fixed`` value is not filtered out here — confirm the input CSVs never
    # contain such rows.
    rows = df[required_columns].itertuples(index=False, name=None)
    return [
        Document(
            page_content=fixed,
            metadata={
                "source": path_to_file,
                "line_number": line_id,
                "buggy_train_version": buggy,
                "gumtree_diff": gumtree_diff,
            },
        )
        for fixed, line_id, buggy, gumtree_diff in rows
    ]


def vector_store_gen(loaded_docs, embedding=None):
    """Create a FAISS vector store from a list of documents.

    Args:
        loaded_docs: Documents to embed and index.
        embedding: Embedding model used to vectorise the documents. When
            omitted, the notebook-level ``embedding_model`` is used, so existing
            single-argument calls behave exactly as before.

    Returns:
        The vector store returned by ``FAISS.from_documents``.
    """
    if embedding is None:
        # Looked up at call time so that re-running the configuration cell
        # (which rebinds ``embedding_model``) takes effect here.
        embedding = embedding_model
    return FAISS.from_documents(documents=loaded_docs, embedding=embedding)

In [None]:
# Clusters to index, the single path template they all share, and the output
# directory. Previously the first cluster's path was a hand-copied literal that
# had to be kept in sync with the f-string used for the other clusters.
CLUSTER_IDS = range(1, 6)
CLUSTER_CSV_TEMPLATE = "../dataset_train/train_ast_cluster/C{cluster}/clean_train_dataset_C{cluster}.csv"
VECTOR_STORE_DIR = "CleanTrainDataset"

first_cluster, *other_clusters = CLUSTER_IDS

# The first cluster creates the store; it has to exist before add_documents
# can be called on it.
initial_cluster_path = CLUSTER_CSV_TEMPLATE.format(cluster=first_cluster)
docs = create_documents_from_csv(initial_cluster_path)
complete_vector_store = vector_store_gen(docs)

# The remaining clusters are appended to the same store, in cluster order.
for cluster in other_clusters:
    cluster_path = CLUSTER_CSV_TEMPLATE.format(cluster=cluster)
    docs_each_cluster = create_documents_from_csv(cluster_path)
    complete_vector_store.add_documents(docs_each_cluster)

# Persist the combined index to a local directory.
complete_vector_store.save_local(VECTOR_STORE_DIR)