# Langchain Exploration



In [1]:
#!pip install faiss-gpu # For CUDA 7.5+ Supported GPU's.
#!pip install faiss-cpu # For CPU Installation
#!pip3 install Langchain[FAISS]
#!pip install sentence-transformers
#!pip install pypdf

In [2]:
# Uncomment the following line if you need to initialize FAISS with no AVX2 optimization
# os.environ['FAISS_NO_AVX2'] = '1'

In [63]:
import json
import os
import time
from typing import (
    List,
    Dict,
)

import numpy as np
from scipy.spatial.distance import (
    euclidean,
    cosine
)

import pandas as pd
import pickle
import torch
import torch.nn.functional as F
from transformers import (
    AutoTokenizer,
    AutoModel
)
from sentence_transformers import SentenceTransformer

import faiss
from faiss import (
    IndexFlatL2,
    IndexFlatIP
)
from langchain_community.docstore.in_memory import (
    InMemoryDocstore
)

from langchain_core.documents import (
    Document
)
from langchain.document_loaders import (
    TextLoader,
    PyPDFLoader
)
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter
)
from langchain.embeddings import (
    SentenceTransformerEmbeddings
)
from langchain.vectorstores import (
    FAISS
)

# Constant

In [4]:
# Name of the Sentence Transformers model used by the token splitter, tokenizer and encoder cells below.
SENTENCE_TRF_MODEL: str = "all-MiniLM-L6-v2"

---- 

# Document Loader

Generate langchain ```documents:List[Document]``` from the data source.

* [Document loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/)



## PDF Loader

* [PyPDFLoader](https://api.python.langchain.com/en/stable/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html)

Uses [pypdf](https://pypi.org/project/pypdf/) to generate chunks as ```List[Document]```, one per page, and stores the page numbers in the metadata.

In [5]:
# Load the sample PDF and report the first Document's metadata and the number of Documents returned.
loader = PyPDFLoader("../data/test_sample.pdf")
pages: List[Document] = loader.load()
print(f"metadata:{pages[0].metadata} length:{len(pages)}")

metadata:{'source': '../data/test_sample.pdf', 'page': 0} length:4


## Text Loader

* [TextLoader](https://api.python.langchain.com/en/stable/document_loaders/langchain_community.document_loaders.text.TextLoader.html)

Loads a text file into a single ```Document``` instance.

In [6]:
# Load the speech text file; the printed length shows how many Documents the loader returned.
# NOTE: this rebinds `loader`, which previously held the PDF loader.
loader = TextLoader(
    file_path="../data/state_of_the_union.txt",
    encoding='utf-8'
)
documents: List[Document] = loader.load()
print(f"metadata:{documents[0].metadata} length:{len(documents)}")

metadata:{'source': '../data/state_of_the_union.txt'} length:1


In [7]:
# Keep the raw text of the first loaded Document and preview its first 15 whitespace-separated words.
text: str = documents[0].page_content
' '.join(text.split()[:15])

'Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: Our Constitution'

# Document

A Document is a piece of text and associated metadata.

* [class Document](https://api.python.langchain.com/en/stable/_modules/langchain_core/documents/base.html#Document)


In [8]:
# Print the public attributes of a Document instance (names starting with "_" are skipped).
for attr in dir(documents[0]):
    if not attr.startswith("_"):
        print(attr)

Config
construct
copy
dict
from_orm
get_lc_namespace
is_lc_serializable
json
lc_attributes
lc_id
lc_secrets
metadata
page_content
parse_file
parse_obj
parse_raw
schema
schema_json
to_json
to_json_not_implemented
type
update_forward_refs
validate


---
# Document Transformers

* [Document transformers](https://python.langchain.com/docs/modules/data_connection/document_transformers/)


## Text Split

The objective of the splitter is to package as many sentences as possible into a chunk so that **consecutive sentences stay together**.

* [Text splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/#text-splitters)

1. Split the text up into small, semantically meaningful chunks (often sentences).
2. Start combining these small chunks into a larger chunk until you reach a certain size (as measured by some function).
3. Once you reach that size, make that chunk its own piece of text and then start creating a new chunk of text with some overlap (to keep context between chunks).

That means there are two different axes along which you can customize your text splitter:

1. How the text is split
2. How the chunk size is measured



### RecursiveCharacterTextSplitter

The default recommended is [RecursiveCharacterTextSplitter](https://api.python.langchain.com/en/stable/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html) that tries to create chunks based on splitting on the character in ```["\n\n", "\n", " ", ""]```  but if any chunks are too large it then moves onto the next character.

* ```length_function```: how the length of chunks is calculated. Defaults to just counting number of characters, but it's pretty common to pass a token counter here.
* ```chunk_size```: the maximum size of your chunks (as measured by the length function).
* ```chunk_overlap```: the maximum overlap between chunks to have some overlap to maintain some continuity between chunks (e.g. do a sliding window).
* ```add_start_index```: whether to include the starting position of each chunk within the original document in the metadata.



In [9]:
# Splitter that only breaks on blank lines and newlines. Chunk length is measured
# in characters (len) and each chunk records its start offset in its metadata.
recursive_text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    is_separator_regex=False,
    keep_separator=False,
    chunk_size=15,
    chunk_overlap=0,
    length_function=len,
    add_start_index=True,
)

# Split the 1st PDF page text into chunks and dump each chunk as JSON.
for chunk in recursive_text_splitter.split_documents([pages[0]]):
    print(json.dumps(chunk.to_json(), indent=4, default=str, ensure_ascii=True))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "PDF Bookmark Sample Page 1 of 4 ",
        "metadata": {
            "source": "../data/test_sample.pdf",
            "page": 0,
            "start_index": 5
        }
    }
}
{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": " PDF B OOKMARK SAMPLE  ",
        "metadata": {
            "source": "../data/test_sample.pdf",
            "page": 0,
            "start_index": 38
        }
    }
}
{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "Sample Date: May 2001 ",
        "metadata": {
            "source": "../data/test_sample.pdf",
            "page": 

### CharacterTextSplitter

* [What does langchain CharacterTextSplitter's chunk_size param even do?](https://stackoverflow.com/a/77341919/4281353)

> CharacterTextSplitter behaves differently from what you expected. It first looks for the **first 6** characters and then splits the next chunk from the closest separator, **not from the 7th character**.
> ```
> text_splitter = CharacterTextSplitter(
>     separator="\n",
>     chunk_size=6,           # <--- Look for the first 6 characters, then start looking for the separator.
> )
> ```



In [10]:
# Split the loaded text on blank lines ("\n\n") with chunk_size=1200 and no overlap.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    is_separator_regex=False,
    chunk_size=1200,
    chunk_overlap=0
)
docs: List[Document] = text_splitter.split_documents(documents)

In [11]:
# Dump every chunk produced by the CharacterTextSplitter as indented JSON.
for doc in docs:
    print(json.dumps(doc.to_json(), indent=4, default=str, ensure_ascii=True))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:\n\nOur Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle.\n\nIt's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the fu

### SentenceTransformersTokenTextSplitter

* [Split by tokens](https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/split_by_token)

Split not to exceed the token limits required by the model. Tokenizer depends on the Sentence Transformer model to use, hence the model name must match.

* [Sentence Transformers](https://huggingface.co/sentence-transformers)
* [Sentence Transformer - all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

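```RecursiveCharacterTextSplitter().from_huggingface_tokenizer(sentence_trf_tokenizer)``` does not work as the number of tokens exceeds the limit. Note that ```from_huggingface_tokenizer``` is a class method: it builds a new splitter from its own keyword arguments, so the ```chunk_size``` and other arguments passed to the constructor below are not applied.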

```
sentence_trf_tokenizer = AutoTokenizer.from_pretrained(f'sentence-transformers/{SENTENCE_TRF_MODEL}')
sentence_trf_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    is_separator_regex=False,
    keep_separator=False,
    chunk_size = 10,
    chunk_overlap  = 0,
    # length_function = len,
    add_start_index = True,
).from_huggingface_tokenizer(sentence_trf_tokenizer)
sentence_trf_splitter.count_tokens(text=sentences[0].page_content)
----
3683   # <--- exceeds the limit
```

In [76]:
# Token-based splitter configured with the Sentence Transformer model named by SENTENCE_TRF_MODEL.
sentence_trf_splitter = SentenceTransformersTokenTextSplitter(
    model_name=SENTENCE_TRF_MODEL,
    chunk_overlap=0    # number of tokens to overlap, not characters
)

# Per-chunk token limit reported by the splitter for this model.
num_max_tokens: int = sentence_trf_splitter.maximum_tokens_per_chunk
print(f"Max tokens for the model [{SENTENCE_TRF_MODEL}] is [{num_max_tokens}].")

Max tokens for the model [all-MiniLM-L6-v2] is [256].


#### Split documents into chunks

Split the documents into chunks where each chunk has the token length that the Sentence Transformer Model can accept. There are multiple methods to split.

In [77]:
# Split the raw text into chunks returned as plain strings.
chunks: List[str] = sentence_trf_splitter.split_text(text=text)
# NOTE: this counts whitespace-separated words in the first chunk, not model tokens.
print(len(chunks[0].split()))
chunks

220


["madame speaker, vice president biden, members of congress, distinguished guests, and fellow americans : our constitution declares that from time to time, the president shall give to congress information about the state of our union. for 220 years, our leaders have fulfilled this duty. they have done so during periods of prosperity and tranquility. and they have done so in the midst of war and depression ; at moments of great strife and great struggle. it's tempting to look back on these moments and assume that our progress was inevitable, that america was always destined to succeed. but when the union was turned back at bull run and the allies first landed at omaha beach, victory was very much in doubt. when the market crashed on black tuesday and civil rights marchers were beaten on bloody sunday, the future was anything but certain. these were times that tested the courage of our convictions and the strength of our union. and despite all our divisions and disagreements, our hesitat

In [13]:
# Split the raw text into Document objects and show the first one as JSON.
texts: List[Document] = sentence_trf_splitter.create_documents(texts=[text])
print(json.dumps(texts[0].to_json(), indent=4, default=str))

2023-12-21 21:23:45.903468: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-21 21:23:45.940677: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-21 21:23:45.940707: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-21 21:23:45.940734: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-21 21:23:45.947810: I tensorflow/core/platform/cpu_feature_g

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "madame speaker, vice president biden, members of congress, distinguished guests, and fellow americans : our constitution declares that from time to time, the president shall give to congress information about the state of our union. for 220 years, our leaders have fulfilled this duty. they have done so during periods of prosperity and tranquility. and they have done so in the midst of war and depression ; at moments of great strife and great struggle. it's tempting to look back on these moments and assume that our progress was inevitable, that america was always destined to succeed. but when the union was turned back at bull run and the allies first landed at omaha beach, victory was very much in doubt. when the market crashed on black tuesday and civil rights marchers were beaten on bloody sunday, the future

In [14]:
# Split the loaded Documents (rather than a raw string) and show the first resulting chunk as JSON.
sentences: List[Document] = sentence_trf_splitter.split_documents(documents)
print(json.dumps(sentences[0].to_json(), indent=4, default=str))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "madame speaker, vice president biden, members of congress, distinguished guests, and fellow americans : our constitution declares that from time to time, the president shall give to congress information about the state of our union. for 220 years, our leaders have fulfilled this duty. they have done so during periods of prosperity and tranquility. and they have done so in the midst of war and depression ; at moments of great strife and great struggle. it's tempting to look back on these moments and assume that our progress was inevitable, that america was always destined to succeed. but when the union was turned back at bull run and the allies first landed at omaha beach, victory was very much in doubt. when the market crashed on black tuesday and civil rights marchers were beaten on bloody sunday, the future

In [15]:
# Check that create_documents() and split_documents() produced the same chunk text, pair by pair.
# zip() stops at the shorter list, so a difference in length alone is not detected here.
all(
    from_text.page_content == from_document.page_content
    for from_text, from_document in zip(texts, sentences)
)

True

In [16]:
# Record each chunk's position in its metadata under the key 'index'
# (the metadata of the Documents in `sentences` is modified in place).
_sentence: Document
for index, _sentence in enumerate(sentences):
    _sentence.metadata['index'] = index

Verify the sentence is within the maximum number of tokens that the model can accept.

In [74]:
# Re-split the first chunk and compare the size of its first split with the splitter's limit.
sentence = sentences[0].page_content
splits = sentence_trf_splitter.split_text(sentence)
first_split = splits[0]
# NOTE(review): this is a whitespace word count, not a token count, so it is only a
# rough proxy for the model's token limit — confirm with the tokenizer if exactness matters.
num_split_tokens: int = len(first_split.split())
print(
    f"first split: length:[{num_split_tokens}] "
    f"less than max tokens [{num_max_tokens}] is {num_split_tokens <= num_max_tokens}"
)
print(first_split)

first split: length:[220] less than max tokens [256] is True
["madame speaker, vice president biden, members of congress, distinguished guests, and fellow americans : our constitution declares that from time to time, the president shall give to congress information about the state of our union. for 220 years, our leaders have fulfilled this duty. they have done so during periods of prosperity and tranquility. and they have done so in the midst of war and depression ; at moments of great strife and great struggle. it's tempting to look back on these moments and assume that our progress was inevitable, that america was always destined to succeed. but when the union was turned back at bull run and the allies first landed at omaha beach, victory was very much in doubt. when the market crashed on black tuesday and civil rights marchers were beaten on bloody sunday, the future was anything but certain. these were times that tested the courage of our convictions and the strength of our union.

In [18]:
# Show the first chunk (page content and metadata) as indented JSON.
print(json.dumps(sentences[0].to_json(), indent=4, default=str))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "madame speaker, vice president biden, members of congress, distinguished guests, and fellow americans : our constitution declares that from time to time, the president shall give to congress information about the state of our union. for 220 years, our leaders have fulfilled this duty. they have done so during periods of prosperity and tranquility. and they have done so in the midst of war and depression ; at moments of great strife and great struggle. it's tempting to look back on these moments and assume that our progress was inevitable, that america was always destined to succeed. but when the union was turned back at bull run and the allies first landed at omaha beach, victory was very much in doubt. when the market crashed on black tuesday and civil rights marchers were beaten on bloody sunday, the future

In [64]:
# Load the Hugging Face tokenizer for the model and tokenize the text of the
# first chunk only (sentences[:1]), returning PyTorch tensors.
sentence_trf_tokenizer = AutoTokenizer.from_pretrained(f'sentence-transformers/{SENTENCE_TRF_MODEL}')
encoded_input = sentence_trf_tokenizer([sentence.page_content for sentence in sentences[:1]], return_tensors='pt')
encoded_input

{'input_ids': tensor([[  101, 10602,  5882,  1010,  3580,  2343,  7226,  2368,  1010,  2372,
          1997,  3519,  1010,  5182,  6368,  1010,  1998,  3507,  4841,  1024,
          2256,  4552, 18806,  2008,  2013,  2051,  2000,  2051,  1010,  1996,
          2343,  4618,  2507,  2000,  3519,  2592,  2055,  1996,  2110,  1997,
          2256,  2586,  1012,  2005, 10545,  2086,  1010,  2256,  4177,  2031,
         16829,  2023,  4611,  1012,  2027,  2031,  2589,  2061,  2076,  6993,
          1997, 14165,  1998, 25283, 26147,  3012,  1012,  1998,  2027,  2031,
          2589,  2061,  1999,  1996, 12930,  1997,  2162,  1998,  6245,  1025,
          2012,  5312,  1997,  2307, 27865,  1998,  2307,  5998,  1012,  2009,
          1005,  1055, 23421,  2000,  2298,  2067,  2006,  2122,  5312,  1998,
          7868,  2008,  2256,  5082,  2001, 13418,  1010,  2008,  2637,  2001,
          2467, 16036,  2000,  9510,  1012,  2021,  2043,  1996,  2586,  2001,
          2357,  2067,  2012,  7087,  

In [71]:
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, weighting by the attention mask.

    Args:
        model_output: Indexable model output whose first element holds the token
            embeddings (assumed shape (batch, tokens, dim) — TODO confirm).
        attention_mask: Mask tensor with one entry per token; positions with 0
            contribute nothing to the average.

    Returns:
        Tensor with the token axis (dim 1) reduced: the masked sum of the
        embeddings divided by the mask sum, which is clamped to at least 1e-9
        so a fully masked row does not divide by zero.
    """
    token_embeddings = model_output[0]
    # Give the mask a trailing axis and broadcast it to the embeddings' shape.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total


# Load the plain Hugging Face model and run it on the tokenized input without tracking gradients.
vectorizer = AutoModel.from_pretrained(f'sentence-transformers/{SENTENCE_TRF_MODEL}')
with torch.no_grad():
    # `encoded_input` is the tokenizer output created in the previous cell; the
    # undefined name `tokens` used here before raised NameError on a fresh kernel.
    model_output = vectorizer(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
vector = F.normalize(sentence_embeddings, p=2, dim=1)


vector.dtype

torch.float32

In [73]:
# Encode with the SentenceTransformer wrapper and check the dtype of the result.
# NOTE(review): this encodes the whole `text` rather than a single chunk, and rebinds
# `vector` from the previous cell — confirm this is intended.
model = SentenceTransformer(SENTENCE_TRF_MODEL)
vector = model.encode(text)
vector.dtype

dtype('float32')

---
# Vector Database FAISS

* [Facebook Research - Faiss: A library for efficient similarity search](https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/)
* [Introduction to Facebook AI Similarity Search (Faiss)](https://www.pinecone.io/learn/series/faiss/faiss-tutorial/)
* [FAISS - Github](https://github.com/facebookresearch/faiss)
* [FAISS - Readthedocs](https://faiss.ai/index.html)

## Tutorial

* [Getting started](https://github.com/facebookresearch/faiss/wiki/Getting-started)

> The code can be run by copy/pasting it or running it from the [tutorial](https://github.com/facebookresearch/faiss/tree/master/tutorial) subdirectory of the Faiss distribution.

## Langchain
* [Langchain - Vector stores](https://python.langchain.com/docs/modules/data_connection/vectorstores/)
* [Langchain - FAISS](https://python.langchain.com/docs/integrations/vectorstores/faiss)
* [Langchain API - vectorstores.faiss.FAISS](https://api.python.langchain.com/en/stable/vectorstores/langchain_community.vectorstores.faiss.FAISS.html)

## FAISS Index

An ```Index``` object stores all the vectors and supports k-nearest-neighbor vector search. There are multiple index types, as listed in [Faiss indexes](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes). The Faiss index itself is the vector database that manages the vectors and provides the add(vector) and search(vector) interfaces. It is named Index because it provides the indexing feature for fast similarity search. A FAISS Index does not offer metadata or interfaces such as a primary key to uniquely identify and retrieve the vectors; it focuses only on providing similarity search. Hence LangChain provides additional metadata management using its [InMemoryDocstore](https://api.js.langchain.com/classes/stores_doc_in_memory.InMemoryDocstore.html).

### Caution

FAISS ```IndexFlatL2``` may calculate incorrect distances. 

* [Why you should be careful using FAISS](https://medium.com/mlearning-ai/why-you-should-be-careful-using-faiss-c44996eda9ee)

> go deeper into the principles of the basic FAISS’s index — IndexFlatL2. This index is very useful when you need to make an exact search using Euclidean distance. This type of index doesn’t compress or cluster your vectors. Nevertheless, it has some features which might worsen your experience. You have to remember that **FAISS uses the formula which can cause catastrophic cancellation** when floats of different magnitudes are added. We can also avoid it by using double-precision floating-point format (float64). However, we should remember that **FAISS works only with float32** format.

* [negative distance returned in IndexFlatL2 search query #297](https://github.com/facebookresearch/faiss/issues/297)

> The problem is that if you have a query vector x and two database vectors y_1 and y_2, where ```||x|| >> ||y_1||``` and ```||x|| >> ||y_2||``` then there will be accuracy losses because computations are performed with 32-bit float precision.

### Build IndexFlatL2 Index
* [Building an index and adding the vectors to it](https://github.com/facebookresearch/faiss/wiki/Getting-started#building-an-index-and-adding-the-vectors-to-it)

> Faiss is built around the ```Index``` object. It encapsulates the set of database vectors, and optionally preprocesses them to make searching efficient. There are many types of indexes, we are going to use the simplest version that just performs brute-force L2 distance search on them: ```IndexFlatL2```.
> ```Index``` needs to know the dimensionality of the vectors it operates.
> ```
> import faiss                   # make faiss available
> index = faiss.IndexFlatL2(d)   # build the index
> ```

### Build IndexFlatIP index

* [How can we build index/search based on cosine similarity](https://github.com/facebookresearch/faiss/issues/95)

> ```
> index = faiss.IndexFlatIP(dimensions)
> ```

### Search 

```
top_k: int = 5    # 5 nearest neighbors
distances, indices = index.search(query_vector, top_k)  # for IndexFlatL2, distances are squared L2 distances
```


## Build Vector Database

### Create a Vectorizer 

Need to define the vectorizer to embed a text into a vector.

In [20]:
# Embedding function used by the vector store. Use the shared SENTENCE_TRF_MODEL constant
# (rather than repeating the model name) so the splitter and the embedder stay on the same model.
vectorizer = embedding_function = SentenceTransformerEmbeddings(model_name=SENTENCE_TRF_MODEL)

In [21]:
# Inspect the type (metaclass) of the embeddings class itself.
type(SentenceTransformerEmbeddings)

pydantic.main.ModelMetaclass

In [22]:
# Embed three short sentences and compute the Euclidean distance between the first and the third
# (`y` is embedded but not used in this cell).
x = np.array(vectorizer.embed_query("This is a pen."))
y = np.array(vectorizer.embed_query("That is a pencil."))
z = np.array(vectorizer.embed_query("I love sushi."))
euclidean(x, z)

1.4246604853027693

### Create an FAISS Index

In [23]:
# Derive the embedding dimensionality from the length of one embedded throwaway string.
dimensions: int = len(embedding_function.embed_query("dummy"))
dimensions

384

In [24]:
# L2 index sized to the embedding dimension; the inner-product variant is kept below for reference.
faiss_index: IndexFlatL2 = IndexFlatL2(dimensions)
# faiss_index: IndexFlatIP = IndexFlatIP(dimensions)

### Build an empty database

In [25]:
# Mapping from FAISS index position to docstore ID; it starts empty and is passed to the store by reference.
index_to_docstore_id: Dict[int, str] = {}
# Empty vector store wired to the FAISS index, an in-memory docstore and the mapping above.
db = FAISS(
    embedding_function=embedding_function,
    index=faiss_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id=index_to_docstore_id,
    normalize_L2=False
)

In [26]:
# List the attributes and methods exposed by the FAISS index object.
dir(faiss_index)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__swig_destroy__',
 '__weakref__',
 'add',
 'add_c',
 'add_with_ids',
 'add_with_ids_c',
 'assign',
 'assign_c',
 'code_size',
 'codes',
 'compute_distance_subset',
 'compute_residual',
 'compute_residual_n',
 'd',
 'get_distance_computer',
 'get_xb',
 'is_trained',
 'metric_arg',
 'metric_type',
 'ntotal',
 'range_search',
 'range_search_c',
 'reconstruct',
 'reconstruct_c',
 'reconstruct_n',
 'reconstruct_n_c',
 'remove_ids',
 'remove_ids_c',
 'reset',
 'sa_code_size',
 'sa_decode',
 'sa_decode_c',
 'sa_encode',
 'sa_encode_c',
 'search',
 'search_and_reconstruct',
 'search_and_reconstruct_c',
 'searc

### Add documents to the database

In [27]:
# Zero-padded, 10-character string IDs: one per chunk, in chunk order.
ids = [f"{position:010d}" for position in range(len(sentences))]
# The value returned by add_documents is not needed here, hence the throwaway `_`.
_ = db.add_documents(documents=sentences, ids=ids)

In [28]:
# Show the index-position -> docstore-ID mapping after the documents were added.
index_to_docstore_id

{0: '0000000000',
 1: '0000000001',
 2: '0000000002',
 3: '0000000003',
 4: '0000000004',
 5: '0000000005',
 6: '0000000006',
 7: '0000000007',
 8: '0000000008',
 9: '0000000009',
 10: '0000000010',
 11: '0000000011',
 12: '0000000012',
 13: '0000000013',
 14: '0000000014',
 15: '0000000015',
 16: '0000000016',
 17: '0000000017',
 18: '0000000018',
 19: '0000000019',
 20: '0000000020',
 21: '0000000021',
 22: '0000000022',
 23: '0000000023',
 24: '0000000024',
 25: '0000000025',
 26: '0000000026',
 27: '0000000027',
 28: '0000000028',
 29: '0000000029',
 30: '0000000030',
 31: '0000000031',
 32: '0000000032',
 33: '0000000033',
 34: '0000000034'}

### Build from documents

## Search Similar Documents

FAISS Index provides the search functions with a few search types (similarity, mmr).


### Methods

* [search(query: str, search_type: str, **kwargs: Any)](https://api.python.langchain.com/en/stable/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS.search)



### Search Types

* [Vector store-backed retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore)

> * search_type="similarity": 
> * search_type="mmr": By default, the vector store retriever uses similarity search. If the underlying vector store supports maximum marginal relevance search, you can specify that as the search type.



* [similarity_search_with_score(query: str, k: int = 4, filter)](https://api.python.langchain.com/en/stable/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS.similarity_search_with_score)
* [Similarity Search with score](https://python.langchain.com/docs/integrations/vectorstores/faiss#similarity-search-with-score)

> ```similarity_search_with_score``` returns ```(doc, score)``` where score is **L2 distance**. Therefore, **a lower score is better**.

### L2 Distances

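Why does ```IndexFlatL2``` give a different value from the scipy and torch L2 distance? Faiss reports the **squared** Euclidean distance (it skips the square root), so the ```1.0859``` returned by the index below is the square of the ```1.0421``` computed by scipy and torch.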

In [29]:
# Query text used for the distance comparisons and similarity searches below.
query: str = "the president shall give to congress information about the state of our union"

In [30]:
# Embed the query and the first chunk, then compute their Euclidean (L2) distance with scipy.
query_vector = embedding_function.embed_query(query)
doc_vector = embedding_function.embed_query(sentences[0].page_content)
# cosine(np.array(query_vector), np.array(doc_vector))
euclidean(np.array(query_vector), np.array(doc_vector))

1.042056251618508

In [31]:
# Cross-check the scipy result with torch: pairwise p=2 distance between the two vectors.
# torch is already imported in the notebook's import cell, so it is not re-imported here.
x1 = torch.tensor([query_vector])
x2 = torch.tensor([doc_vector])
torch.cdist(x1, x2, p=2.0, compute_mode='use_mm_for_euclid_dist_if_necessary')

tensor([[1.0421]])

In [32]:
# Search the raw FAISS index for the 3 nearest neighbours of the query vector.
# NOTE: the first returned distance (1.0858811) equals 1.0420562 ** 2 — per the Faiss
# documentation, L2 indexes report squared Euclidean distances.
db.index.search(np.array([query_vector], dtype=np.float32), k=3)

(array([[1.0858811, 1.2505221, 1.2690783]], dtype=float32),
 array([[ 0,  8, 22]]))

In [33]:
# Set Faiss's BLAS threshold to one more than the number of stored vectors and repeat the search.
# NOTE(review): presumably meant to rule out the BLAS code path as the source of the
# distance difference — the returned values are unchanged.
faiss.cvar.distance_compute_blas_threshold = len(index_to_docstore_id) + 1
db.index.search(np.array([query_vector], dtype=np.float32), k=3)

(array([[1.0858811, 1.2505221, 1.2690783]], dtype=float32),
 array([[ 0,  8, 22]]))

In [34]:
# Search through the vector store using the "mmr" (maximum marginal relevance) search type.
db.search(query="the state of our union", search_type="mmr")

[Document(page_content="madame speaker, vice president biden, members of congress, distinguished guests, and fellow americans : our constitution declares that from time to time, the president shall give to congress information about the state of our union. for 220 years, our leaders have fulfilled this duty. they have done so during periods of prosperity and tranquility. and they have done so in the midst of war and depression ; at moments of great strife and great struggle. it's tempting to look back on these moments and assume that our progress was inevitable, that america was always destined to succeed. but when the union was turned back at bull run and the allies first landed at omaha beach, victory was very much in doubt. when the market crashed on black tuesday and civil rights marchers were beaten on bloody sunday, the future was anything but certain. these were times that tested the courage of our convictions and the strength of our union. and despite all our divisions and disa

In [35]:
# Top-3 similarity search; each hit is a (Document, score) pair, printed with the Document as JSON.
for doc, score in db.similarity_search_with_score(query, 3):
    print(f"score:{score} : {json.dumps(doc.to_json(), indent=4, default=str)}")

score:1.0858811140060425 : {
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "madame speaker, vice president biden, members of congress, distinguished guests, and fellow americans : our constitution declares that from time to time, the president shall give to congress information about the state of our union. for 220 years, our leaders have fulfilled this duty. they have done so during periods of prosperity and tranquility. and they have done so in the midst of war and depression ; at moments of great strife and great struggle. it's tempting to look back on these moments and assume that our progress was inevitable, that america was always destined to succeed. but when the union was turned back at bull run and the allies first landed at omaha beach, victory was very much in doubt. when the market crashed on black tuesday and civil rights marchers were beaten o

### Filter the search

Filter the matched documents with the metadata attributes.

```
"metadata": {
    "source": "./data/state_of_the_union.txt",
    "index": 22              # <--- Directly specify the metadata attribute
}
```

In [36]:
# Metadata filter selecting documents whose `index` metadata equals 22.
# NOTE(review): the name `filter` shadows the Python builtin for the rest of the notebook.
filter=dict(index=22)
filter

{'index': 22}

In [37]:
# Similarity search restricted by the metadata filter; returns at most one hit (k=1).
# NOTE(review): fetch_k is presumably the number of candidates fetched before filtering — confirm.
query: str = "the president shall give to congress information about the state of our union"
for doc, score in db.similarity_search_with_score(query, k=1, fetch_k=5, filter=filter):
    print(f"score:{score} : {json.dumps(doc.to_json(), indent=4, default=str)}")

score:1.269078254699707 : {
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "about how washington works that have been growing for years. to close that credibility gap we must take action on both ends of pennsylvania avenue to end the outsized influence of lobbyists ; to do our work openly and to give our people the government they deserve. that's what i came to washington to do. that's why - - for the first time in history - - my administration posts our white house visitors online. and that's why we've excluded lobbyists from policymaking jobs or seats on federal boards and commissions. but we can't stop there. it's time to require lobbyists to disclose each contact they make on behalf of a client with my administration or congress. and it's time to put strict limits on the contributions that lobbyists give to candidates for federal office. last week, the s

## Add Document

[add_documents](https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.faiss.FAISS.html?highlight=faiss#langchain_community.vectorstores.faiss.FAISS.add_documents) will invoke [add_texts](https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.faiss.FAISS.html?highlight=faiss#langchain_community.vectorstores.faiss.FAISS.add_texts) method where you can set your own IDs.

* [langchain_community/vectorstores/faiss.py#add_texts](https://api.python.langchain.com/en/latest/_modules/langchain_community/vectorstores/faiss.html#FAISS.add_texts):

```
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,           # <----
    **kwargs: Any,
) -> List[str]:
```


In [38]:
# Build a single document and insert it under a caller-chosen ID.
doc_to_add: Document = Document(
    metadata={"index": 99},
    page_content="the president shall give to congress information about the state of our union",
)
# Supplying `ids` overrides the UUIDs the FAISS class would otherwise generate
# (langchain_community/vectorstores/faiss.py#__add method):
#     ids = ids or [str(uuid.uuid4()) for _ in texts]
ids: List[str] = db.add_documents(documents=[doc_to_add], ids=["DOC04123333"])
ids

['DOC04123333']

## Select a document by ID

Currently LangChain does not provide a method to retrieve a document by its internal ID.

* [How to retrieve vectors by ids for LangChain vectorstore FAISS? #8897](https://github.com/langchain-ai/langchain/issues/8897) 

> Currently, the FAISS vectorstore implementation in LangChain does not have a method to retrieve vectors by ids similar to the retrieve method in the Qdrant vectorstore. The current implementation supports deleting vectors by ids, but not retrieving them.
> However, it is possible to implement a similar method in the FAISS vectorstore. Here is a rough idea of how it could be done:
```
def retrieve(self, ids: List[str], with_vectors: bool = False) -> List[Optional[np.ndarray]]:
    """Retrieve vectors by ID. These are the IDs in the vectorstore.

    Args:
        ids: List of ids to retrieve.
        with_vectors: If True, return the vectors along with the ids.

    Returns:
        List of vectors corresponding to the ids, or None if an id does not exist.
    """
    if with_vectors:
        _reversed_index = {v: k for k, v in self.index_to_docstore_id.items()}
        index_to_retrieve = [_reversed_index.get(i) for i in ids]
        vectors = self.index.reconstruct_n(0, len(self.index_to_docstore_id))
        return [vectors[i] for i in index_to_retrieve if i is not None]
    else:
        return [None for _ in ids]
```

>  the add method in faiss stores vectors in the given index using sequentially generated indices by default. The FAISS wrapper use a index_to_docstore_id dictionary, which essentially converts these indices into UUIDs for the respective documents stored in the underlying document store. Consequently, you can leverage the underlying document store and the index_to_docstore_id dictionary to retrieve a document based on its ID generated by faiss:
```
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings.fake import FakeEmbeddings
from langchain.docstore import InMemoryDocstore
from langchain.docstore.document import Document
import faiss
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
embedding_fn = FakeEmbeddings(size=embedding_size).embed_query
docstore = InMemoryDocstore({})
vectorstore = FAISS(embedding_fn, index, docstore, {})
documents = [Document(page_content='Hello How are you doing')]
vectorstore.add_documents(documents)
index_to_docstore_id = vectorstore.index_to_docstore_id
for i in range(len(documents)):
    print(docstore._dict[index_to_docstore_id[i]])
```

In [39]:
def select(
        db,
        keys: List[str]
):
    """Retrieve documents and their stored vectors by docstore ID.

    Args:
        db: vector store exposing ``index_to_docstore_id`` (FAISS position ->
            docstore ID), ``index.reconstruct_n`` and ``docstore.search``.
        keys: primary keys (docstore IDs) identifying the records to select.

    Returns:
        Tuple ``(documents, vectors)``. Both lists have one entry per key, in
        the order of ``keys``. ``documents[i]`` is whatever
        ``db.docstore.search`` returns for ``keys[i]``; ``vectors[i]`` is the
        vector reconstructed from the index, or None if the id does not exist.
    """
    # Invert the FAISS position -> docstore ID mapping so we can look up by ID.
    _id_to_index = {
        _id: _index for _index, _id in db.index_to_docstore_id.items()
    }
    documents = []
    vectors = []
    for _id in keys:
        documents.append(db.docstore.search(_id))
        index = _id_to_index.get(_id)
        # Reconstruct only the requested row rather than the whole index, and
        # keep a None placeholder so `vectors` stays aligned with `documents`.
        vectors.append(
            None if index is None else db.index.reconstruct_n(index, 1)[0]
        )
    return documents, vectors

# Look up the document added above by its custom docstore ID.
select(db=db, keys=['DOC04123333'])

([Document(page_content='the president shall give to congress information about the state of our union', metadata={'index': 99})],
 [array([-2.28103362e-02,  6.77464530e-02,  9.50659811e-03,  5.78828976e-02,
         -3.08208242e-02,  6.06366321e-02,  3.15811373e-02, -5.21173514e-02,
         -8.54805335e-02, -1.82847157e-02, -1.27744377e-01,  3.03568784e-03,
         -1.91189686e-03, -2.76795924e-02, -6.97776005e-02,  2.95578577e-02,
          7.60543793e-02,  3.85096408e-02, -1.77105214e-03, -1.50946816e-02,
          1.09686725e-01,  6.01188689e-02, -7.03044906e-02,  5.85764041e-03,
          2.17567589e-02,  1.40187228e-02, -4.88124974e-02, -6.09397814e-02,
         -5.87077960e-02, -3.19501348e-02, -2.63713324e-03, -7.52108395e-02,
          2.12408993e-02,  6.47165701e-02, -1.89610273e-02, -1.28192827e-01,
          7.25442618e-02,  3.60509939e-02,  8.64282921e-02, -2.63611060e-02,
          1.16491783e-02, -5.09439409e-02,  8.39687791e-03,  2.07770411e-02,
         -2.92405374e-

In [40]:
def select(
        db,
        ids: List[str]
):
    """Lazily retrieve documents and their stored vectors by docstore ID.

    Args:
        db: vector store exposing ``index_to_docstore_id`` (FAISS position ->
            docstore ID), ``index.reconstruct_n`` and ``docstore.search``.
        ids: Docstore ids to identify the records to select.

    Returns: Generator to yield (document, vector) for the ids, in order.

    Raises:
        KeyError: when iteration reaches an id that has no position in
            ``db.index_to_docstore_id``.
    """
    # Map each Langchain docstore ID to its FAISS vector position.
    _id_to_index = {
        _id: _index for _index, _id in db.index_to_docstore_id.items()
    }

    for _id in ids:
        index = _id_to_index.get(_id)
        if index is None:
            # Name the offending ID instead of handing None to FAISS, which
            # would fail with an unrelated-looking error.
            raise KeyError(f"docstore id not found in the FAISS index: {_id!r}")
        document = db.docstore.search(search=_id)
        # Reconstruct just the one row stored at this position.
        vector = db.index.reconstruct_n(n0=index, ni=1)[0]
        yield document, vector


In [41]:
# Fetch the record through the generator and check that the vector stored in
# the index matches a fresh embedding of the document text.
g = select(db, ['DOC04123333'])
doc, vec = next(g)
assert np.allclose(db.embedding_function.embed_query(doc.page_content), vec)

# Only the last expression of a cell is displayed; a bare `doc` on its own
# line would be discarded, so show the document and vector together.
doc, vec

array([-2.28103362e-02,  6.77464530e-02,  9.50659811e-03,  5.78828976e-02,
       -3.08208242e-02,  6.06366321e-02,  3.15811373e-02, -5.21173514e-02,
       -8.54805335e-02, -1.82847157e-02, -1.27744377e-01,  3.03568784e-03,
       -1.91189686e-03, -2.76795924e-02, -6.97776005e-02,  2.95578577e-02,
        7.60543793e-02,  3.85096408e-02, -1.77105214e-03, -1.50946816e-02,
        1.09686725e-01,  6.01188689e-02, -7.03044906e-02,  5.85764041e-03,
        2.17567589e-02,  1.40187228e-02, -4.88124974e-02, -6.09397814e-02,
       -5.87077960e-02, -3.19501348e-02, -2.63713324e-03, -7.52108395e-02,
        2.12408993e-02,  6.47165701e-02, -1.89610273e-02, -1.28192827e-01,
        7.25442618e-02,  3.60509939e-02,  8.64282921e-02, -2.63611060e-02,
        1.16491783e-02, -5.09439409e-02,  8.39687791e-03,  2.07770411e-02,
       -2.92405374e-02,  1.01810060e-01,  4.98622190e-03,  2.33929735e-02,
       -9.77457408e-03,  7.11746067e-02, -1.25237415e-03,  2.90405843e-02,
        2.32936069e-02,  

## Delete

In [42]:
# Delete the entry stored under this custom docstore ID.
db.delete(['DOC04123333'])

True

---
# Server

## Flask

In [43]:
import json
from flask import Flask, request

app = Flask(__name__)


@app.route('/embedding', methods=['POST'])
def generate_embedding():
    """Return the top 3 similarity-search matches for a JSON ``query``.

    Expects a JSON body such as ``{"query": "some text"}`` and responds with
    ``{"results": "<JSON-encoded string>"}``. A missing or non-JSON body, or a
    ``query`` that is not a non-empty string, yields HTTP 400 with an
    ``error`` message instead of an unhandled exception.
    """
    # silent=True gives None for a missing/invalid JSON body instead of raising.
    payload = request.get_json(silent=True)
    query = payload.get('query') if isinstance(payload, dict) else None
    if not isinstance(query, str) or not query.strip():
        return {'error': "JSON body must contain a non-empty string 'query'"}, 400
    results = db.similarity_search_with_score(query, k=3)
    # NOTE: the matches are serialised into a JSON *string* (values json cannot
    # encode fall back to str()), so clients must decode this field again.
    return {'results': json.dumps(results, default=str)}

# Start Flask's built-in server on port 8001 when executed as the main module.
if __name__ == '__main__':
    app.run(port=8001)

ModuleNotFoundError: No module named 'flask'

## FAISS gRPC Server

* [Faiss gRPC Server](https://github.com/louiezzang/faiss-server)

> A library for efficient similarity search and clustering of dense vectors.

In [80]:
# NOTE(review): scratch arithmetic (32768 * 8 = 262144); its purpose is not stated.
2 ** 15 * 8

262144