In [1]:
import logging
from pathlib import Path

import pandas as pd
import faiss
import numpy as np


from langchain_huggingface import HuggingFaceEmbeddings

from get_embedding import timing, get_embedding_model, wrap_embedding_query

In [2]:
# Notebook-level logger; DEBUG level so the helper functions' debug messages are shown.
embed_log = logging.getLogger(__name__)
embed_log.setLevel(logging.DEBUG)
# Attach the stream handler only once: re-running this cell would otherwise
# stack another handler on the same logger and print every message repeatedly.
if not embed_log.handlers:
    embed_log.addHandler(logging.StreamHandler())

In [3]:
def get_latest_vector_path(vector_dir: str) -> Path:
    """Return the most recently modified file directly inside ``vector_dir``.

    Args:
        vector_dir: Directory that holds the saved vector files.

    Returns:
        Path of the regular file with the newest modification time.

    Raises:
        ValueError: If no regular file is found in ``vector_dir``.
    """
    store_path = Path(vector_dir)
    # Only regular files are candidates; a sub-directory must never be
    # returned as the "latest file".
    candidates = [entry for entry in store_path.glob("*") if entry.is_file()]
    if not candidates:
        raise ValueError(f"No vector files found in {store_path}")

    # Rank by st_mtime (last content write). st_ctime is the inode
    # metadata-change time on Unix, so a chmod or rename would bump it.
    latest_file = max(candidates, key=lambda p: p.stat().st_mtime)

    embed_log.debug(f"Reading {store_path.name} for latest an got {latest_file.name}")
    return latest_file

In [4]:
# Load the embedding model through the project helper; the recorded output
# below shows this call took roughly 5 seconds.
embedding = get_embedding_model()

  from tqdm.autonotebook import tqdm, trange
func:get_embedding_model Time: 4.962897266999789


In [5]:
# Directory holding the pickled embedding frames.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
vector_dir = "/home/xoph/repos/github/nfroseth/world_graph/vectors/"

In [6]:
# Load the newest pickled frame found in vector_dir.
# NOTE: unpickling can execute arbitrary code; only read files you produced yourself.
embedded_note_chunks_frame = pd.read_pickle(get_latest_vector_path(vector_dir))
# `{expr=}` prints "expr=value"; the `=` has to sit inside the braces to act
# as the f-string debug specifier instead of a stray trailing character.
print(f"{embedded_note_chunks_frame.columns=}")
# Label on its own line so the multi-line describe() table stays aligned.
print(f"embedded_note_chunks_frame.describe()=\n{embedded_note_chunks_frame.describe()}")
embedded_note_chunks_frame

Reading vectors for latest an got Slip Box_Alibaba-NLP--gte-large-en-v1.5_2024-06-07_11:15:11.pkl


Index(['content_chunk', 'note_name', 'embedding'], dtype='object')=
                                            content_chunk  \
count                                                   1   
unique                                                  1   
top     [page_content='## All  \n```tasks\nnot done\nt...   
freq                                                    1   

                                                note_name  \
count                                                   1   
unique                                                  1   
top     [000 - Tasks_chunk_0, Chat-2024_03_28-13_03_44...   
freq                                                    1   

                                                embedding  
count                                                   1  
unique                                                  1  
top     [[-0.7515473365783691, 0.4106312096118927, -0....  
freq                                                    1  =


Unnamed: 0,content_chunk,note_name,embedding
0,[page_content='## All \n```tasks\nnot done\nt...,"[000 - Tasks_chunk_0, Chat-2024_03_28-13_03_44...","[[-0.7515473365783691, 0.4106312096118927, -0...."


In [7]:
# All chunk embeddings are stored together in the frame's first row; stack
# them into one matrix. FAISS operates on float32 data, so set the dtype
# explicitly rather than relying on whatever the pickled values default to.
vectors = np.array(embedded_note_chunks_frame.embedding.iloc[0], dtype=np.float32)
assert vectors.ndim == 2, f"expected a 2-D (n_chunks, dim) matrix, got shape {vectors.shape}"
vectors.shape

(52824, 1024)

In [8]:
# Flat L2-distance FAISS index whose dimensionality matches the embedding
# width; every chunk vector is added to it.
embedding_dim = vectors.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(vectors)

In [13]:
query = "Here is a list of the most important things that I would like to learn about Both in depth and breadth of knowledge "
# Wrap the single embedding in a list so the result is a (1, dim) batch, and
# store it as float32, the dtype FAISS expects for search input.
query_vector = np.array([wrap_embedding_query(embedding, query)], dtype=np.float32)
query_vector.shape

Batches: 100%|██████████| 1/1 [00:00<00:00, 20.85it/s]
func:wrap_embedding_query Time: 0.058296601000620285


(1, 1024)

In [19]:
# Nearest-neighbour search for the embedded query.
# (For a smoke test without the embedding model, a random float32 array of
# shape (1, 1024) can stand in for query_vector.)
top_k = 100  # number of nearest neighbours to retrieve
# FAISS returns distances (D) and the matching row indices (I), one row per query.
D, I = index.search(query_vector, k=top_k)
I

array([[51429, 51425, 51435, 47869, 51436, 49399, 47377, 49398, 47525,
        51427, 49397, 50486, 51432, 48516, 51758, 51437, 47414, 45435,
        51431, 48998, 51518, 51420, 47527, 47534, 47378, 51410, 51423,
        51417, 51784, 44612, 51128, 47745, 48425, 46470, 51422, 47532,
        45521, 51781, 48410, 51421, 49455, 51418, 50381, 45805, 46473,
        44664, 51424, 44324, 47526, 49041, 51360, 48195, 46471, 51339,
        51071, 51407, 45685, 51344, 51434, 51756, 30629, 47787, 48424,
        50389, 51426, 51763, 45404, 44085, 47538, 48547, 47524, 49454,
        44657, 44638, 44387, 50440, 42871, 49431, 45152, 50279, 45885,
        51430, 50339, 48233, 48652, 48197, 46689, 48312, 42774, 48517,
        49379, 51345, 48169, 42857, 42874, 50313, 42853, 51516, 51003,
        49096]])

In [20]:
# Resolve the per-chunk sequences once instead of on every loop iteration.
chunk_contents = embedded_note_chunks_frame.content_chunk.iloc[0]
chunk_names = embedded_note_chunks_frame.note_name.iloc[0]

print(f"Question: {query} Similar Chunks:")
for idx in I[0]:
    # FAISS fills unused result slots with -1 when fewer than k neighbours
    # exist; a negative index would silently wrap around to the last chunk.
    if idx < 0:
        continue
    retrieved_content = chunk_contents[idx]
    retrieved_chunk_name = chunk_names[idx]
    print(f"From {retrieved_chunk_name}, {retrieved_content}")

Question: Here is a list of the most important things that I would like to learn about Both in depth and breadth of knowledge  Similar Chunks:
From All the many things I'd like to learn about. Exhaustively._chunk_20, page_content='### Social' metadata={'Header 1': "All the many things I'd like to learn about. Exhaustively.", 'Header 3': 'Social'}
From All the many things I'd like to learn about. Exhaustively._chunk_16, page_content='### Career' metadata={'Header 1': "All the many things I'd like to learn about. Exhaustively.", 'Header 3': 'Career'}
From All the many things I'd like to learn about. Exhaustively._chunk_26, page_content='### Sciences\nChemistry\nBiology\nPharmacology' metadata={'Header 1': "All the many things I'd like to learn about. Exhaustively.", 'Header 3': 'Sciences'}
From Index of Projects_chunk_9, page_content="## [[All the many things I'd like to learn about. Exhaustively.]]" metadata={'Header 1': 'Projects', 'Header 2': "[[All the many things I'd like to learn a