In [4]:
import faiss
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore

from uuid import uuid4

from langchain_core.documents import Document

In [3]:
# Load your train data
train_df = pd.read_csv("../datasets/core_data/final_train.csv")

# Initialize OpenAI embeddings
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

index = faiss.IndexFlatL2(len(embedding_model.embed_query("hello world")))


# Create FAISS vector store with metadata
vector_store = FAISS(
    embedding_function=embedding_model,
    # metadatas=metadatas,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [None]:
all_documents = [
    Document(
        page_content=row["sentence"],
        metadata=row.to_dict(),
    )
    for _, row in train_df.iterrows()
]
uuids = [str(uuid4()) for _ in range(len(all_documents))]
vector_store.add_documents(documents=all_documents, ids=uuids)

['e743ddcc-2ea2-4dfa-8097-e1d1a82dd1b6',
 'a8ab0629-849d-4e22-a8d9-c3f15c168c50',
 'a54553bc-d239-46d6-859d-5a019198fad7',
 '81eabe05-d860-48f0-82ea-d0d008ca16d6',
 'f6cae2b0-60dd-4010-aeb1-c70b569e046b',
 '9b198ffe-be8c-4e7f-9be9-2a5d0768da93',
 'a1881fe9-9131-406a-b24f-875c756bab8c',
 '9748fd6d-265c-4faf-ab3d-65a86976a78f',
 '3056fc5a-359f-4e35-acba-1486085f99ab',
 'b5442f5f-e533-4889-84fe-f43fbaee5ca8',
 'a9a119d2-f78c-44d9-92cb-c45e3ad64388',
 '98587ade-18e6-4d6a-9e24-07476835aa50',
 '837d027d-f1ef-43d5-a1f9-aa76a0302d23',
 'f4674438-b8de-4795-b394-b86514cd058b',
 '75e3ca4f-6357-4bc8-9371-b2e491dbb228',
 'af462c83-8bec-4128-ac08-a9c16791ff98',
 '830cd1ba-d1be-4cbf-b94b-d12d94336bfb',
 '95cbab96-050a-4432-8696-784053626b97',
 'acf7f3d4-5ccd-4dd9-8c6b-31c1a1dc541c',
 'd4309c2f-3622-48a9-8291-65b5ebb61269',
 '29b4fa23-c555-4e23-a5fd-dab83e3e9cc1',
 'a43b9633-9c8f-4110-8c3c-7d9d2ded4f1c',
 'dae459dc-0138-412d-a5ce-e211c9cb03ce',
 '94acf8a8-bbba-46e4-92ec-8441f9273e00',
 'd9d69701-fae0-

In [10]:
# Retrieval returns Document objects with .metadata containing the row dict
def retrieve_similar_rows(vector_store, query, k=5):
    docs_and_scores = vector_store.similarity_search_with_score(query, k=k)
    return [(doc.metadata, score) for doc, score in docs_and_scores]

In [None]:
# Example usage
query_example = train_df.iloc[0]['sentence']
similar_contexts = retrieve_similar_rows(vector_store,query_example)
print(f"Query: {query_example}\n")
print("Similar contexts:")
for context, score in similar_contexts:
    print(f"Score: {score:.4f}\nContext: {context}\n")

Query: Badgeville subsequently raised a $ 12M Series B Round in July 2011 , led by Norwest Venture Partners and El Dorado Ventures .

Similar contexts:
Score: 0.0000
Context: {'id': 'E8026501', 'relation': 'shareholder_of', 'entity_1': 'Norwest Venture Partners', 'entity_2': 'Badgeville', 'sentence': 'Badgeville subsequently raised a $ 12M Series B Round in July 2011 , led by Norwest Venture Partners and El Dorado Ventures .'}

Score: 0.8390
Context: {'id': 'E8029705', 'relation': 'shareholder_of', 'entity_1': 'ABS Capital Partners', 'entity_2': 'Bask Technology, Inc.', 'sentence': 'Investment In July 2014 , Bask received its Series B funding from ABS Capital Partners , in the amount of $ 18 million .'}

Score: 0.8509
Context: {'id': 'E8246514', 'relation': 'shareholder_of', 'entity_1': 'Battery Ventures', 'entity_2': 'SingleHop, LLC', 'sentence': 'In 2012 , the company raised $ 27.5 million from Boston-based Battery Ventures .'}

Score: 0.8673
Context: {'id': 'E8015601', 'relation': '

In [9]:
vector_store.save_local("./faiss_indices/core_data_index")

In [11]:
new_vector_store = FAISS.load_local(
    "./faiss_indices/core_data_index", embedding_model, allow_dangerous_deserialization=True
)

In [13]:
# Example usage
query_example = train_df.iloc[0]['sentence']
similar_contexts = retrieve_similar_rows(vector_store, query_example, k=2)
print(f"Query: {query_example}\n")
print("Similar contexts:")
for context, score in similar_contexts:
    print(f"Score: {score:.4f}\nContext: {context}\n")

Query: Badgeville subsequently raised a $ 12M Series B Round in July 2011 , led by Norwest Venture Partners and El Dorado Ventures .

Similar contexts:
Score: 0.0000
Context: {'id': 'E8026501', 'relation': 'shareholder_of', 'entity_1': 'Norwest Venture Partners', 'entity_2': 'Badgeville', 'sentence': 'Badgeville subsequently raised a $ 12M Series B Round in July 2011 , led by Norwest Venture Partners and El Dorado Ventures .'}

Score: 0.8390
Context: {'id': 'E8029705', 'relation': 'shareholder_of', 'entity_1': 'ABS Capital Partners', 'entity_2': 'Bask Technology, Inc.', 'sentence': 'Investment In July 2014 , Bask received its Series B funding from ABS Capital Partners , in the amount of $ 18 million .'}

