# Langchain Exploration



In [1]:
#!pip install faiss-gpu # For CUDA 7.5+ Supported GPU's.
#!pip install faiss-cpu # For CPU Installation
#!pip3 install Langchain[FAISS]
#!pip install sentence-transformers
#!pip install pypdf

In [2]:
# Uncomment the following line if you need to initialize FAISS with no AVX2 optimization
# os.environ['FAISS_NO_AVX2'] = '1'

In [3]:
import json
import os
import time
from typing import (
    List,
    Dict,
)

import numpy as np
from scipy.spatial.distance import (
    euclidean,
    cosine
)

import pandas as pd
import pickle
import torch
import torch.nn.functional as F
from transformers import (
    AutoTokenizer,
    AutoModel
)
from sentence_transformers import SentenceTransformer

import faiss
from faiss import (
    IndexFlatL2,
    IndexFlatIP
)
from langchain_community.docstore.in_memory import (
    InMemoryDocstore
)

from langchain_core.documents import (
    Document
)
from langchain.document_loaders import (
    TextLoader,
    PyPDFLoader
)
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter
)
from langchain.embeddings import (
    SentenceTransformerEmbeddings
)
from langchain.vectorstores import (
    FAISS
)

# Constant

In [4]:
SENTENCE_MIN_MODEL: str = "all-MiniLM-L6-v2"
SENTENCE_TRF_MODEL: str = "gtr-t5-large"

---- 

# Document Loader

Generate langchain ```documents:List[Document]``` from the data source.

* [Document loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/)



## PDF Loader

* [PyPDFLoader](https://api.python.langchain.com/en/stable/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html)

Use [pypdf](https://pypi.org/project/pypdf/) to generate chunks as ```List[Document]``` by pages and stores page numbers in metadata.

In [5]:
loader = PyPDFLoader("../data/test_sample.pdf")
pages: List[Document] = loader.load()
print(f"metadata:{pages[0].metadata} length:{len(pages)}")

metadata:{'source': '../data/test_sample.pdf', 'page': 0} length:4


## Text Loader

* [TextLoader](https://api.python.langchain.com/en/stable/document_loaders/langchain_community.document_loaders.text.TextLoader.html)

Load from a text file into single ```Document``` instance.

In [6]:
loader = TextLoader(
    file_path="../data/state_of_the_union.txt",
    encoding='utf-8'
)
documents = loader.load()
print(f"metadata:{documents[0].metadata} length:{len(documents)}")

metadata:{'source': '../data/state_of_the_union.txt'} length:1


In [7]:
text = documents[0].page_content
' '.join(text.split())

'Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle. It\'s tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but certain. These were times that tested the courage of our convictions and the strength of our union. And despite all our divisions and disagreements, our hesitatio

# Document

A Document is a piece of text and associated metadata.

* [class Document](https://api.python.langchain.com/en/stable/_modules/langchain_core/documents/base.html#Document)


In [8]:
for attr in dir(documents[0]):
    print(attr) if not attr.startswith("_") else None

Config
construct
copy
dict
from_orm
get_lc_namespace
is_lc_serializable
json
lc_attributes
lc_id
lc_secrets
metadata
page_content
parse_file
parse_obj
parse_raw
schema
schema_json
to_json
to_json_not_implemented
type
update_forward_refs
validate


---
# Document Transformers

* [Document transformers](https://python.langchain.com/docs/modules/data_connection/document_transformers/)


## Text Split

The objective of the splitter is to package as many sentences as possible into a chunk so that **consequtive sentences stay together**.

* [Text splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/#text-splitters)

1. Split the text up into small, semantically meaningful chunks (often sentences).
2. Start combining these small chunks into a larger chunk until you reach a certain size (as measured by some function).
3. Once you reach that size, make that chunk its own piece of text and then start creating a new chunk of text with some overlap (to keep context between chunks).

That means there are two different axes along which you can customize your text splitter:

1. How the text is split
2. How the chunk size is measured



### RecursiveCharacterTextSplitter

The default recommended is [RecursiveCharacterTextSplitter](https://api.python.langchain.com/en/stable/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html) that tries to create chunks based on splitting on the character in ```["\n\n", "\n", " ", ""]```  but if any chunks are too large it then moves onto the next character.

* ```length_function```: how the length of chunks is calculated. Defaults to just counting number of characters, but it's pretty common to pass a token counter here.
* ```chunk_size```: the maximum size of your chunks (as measured by the length function).
* ```chunk_overlap```: the maximum overlap between chunks to have some overlap to maintain some continuity between chunks (e.g. do a sliding window).
* ```add_start_index```: whether to include the starting position of each chunk within the original document in the metadata.



In [9]:
recursive_text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    is_separator_regex=False,
    keep_separator=False,
    chunk_size = 15,
    chunk_overlap  = 0,
    length_function = len,
    add_start_index = True,
)

# for page in pages:
#     print(json.dumps(page.to_json(), indent=4, default=str, ensure_ascii=True))

# Split the 1st PDF page text into chunks 
for chunk in recursive_text_splitter.split_documents([pages[0]]):
    print(json.dumps(chunk.to_json(), indent=4, default=str, ensure_ascii=True))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "PDF Bookmark Sample Page 1 of 4 ",
        "metadata": {
            "source": "../data/test_sample.pdf",
            "page": 0,
            "start_index": 5
        }
    }
}
{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": " PDF B OOKMARK SAMPLE  ",
        "metadata": {
            "source": "../data/test_sample.pdf",
            "page": 0,
            "start_index": 38
        }
    }
}
{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "Sample Date: May 2001 ",
        "metadata": {
            "source": "../data/test_sample.pdf",
            "page": 

### CharacterTextSplitter

* [What does langchain CharacterTextSplitter's chunk_size param even do?](https://stackoverflow.com/a/77341919/4281353)

> CharacterTextSpliiter behaves differently from what you expected. It first looks for the **first 6** characters and then splits the next chunk from the closest separator, **not from the 7th characte**.
> ```
> text_splitter = CharacterTextSplitter(
>     separator="\n",
>     chunk_size=6,           # <--- Look for the first 6 characters, then start looking for the separator.
> )



In [10]:
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    is_separator_regex=False,
    chunk_size=1200,
    chunk_overlap=0
)
docs = text_splitter.split_documents(documents)

In [11]:
for doc in docs:
    print(json.dumps(doc.to_json(), indent=4, default=str, ensure_ascii=True))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:\n\nOur Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle.\n\nIt's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the fu

### SentenceTransformersTokenTextSplitter

SentenceTransformersTokenTextSplitter.split_text has a bug. It is using huggingface tokenizer encode_plus which may not return start_token_id e.g. tokenizer for ```"gtr-t5-large```. However, it assumes it always has ```[start_token][tokens][end_token]``` structure and use ```[1:-1]``` to truncate start_token and end_token. Hence, everytime the first word is lost.

[github SentenceTransformersTokenTextSplitter](api.python.langchain.com/en/stable/_modules/langchain/text_splitter.html#SentenceTransformersTokenTextSplitter)
```
def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
    return self._encode(text)[1:-1] <---
```

* [Split by tokens](https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/split_by_token)

Split not to exceed the token limits required by the model. Tokenizer depends on the Sentence Transformer model to use, hence the model name must match.

* [Sentence Transformers](https://huggingface.co/sentence-transformers)
* [Sentence Transformer - all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

```RecursiveCharacterTextSplitter().from_huggingface_tokenizer(sentence_trf_tokenizer)``` does not work as the number of tokens exceed the limit.

```
sentence_trf_tokenizer = AutoTokenizer.from_pretrained(f'sentence-transformers/{SENTENCE_TRF_MODEL}')
sentence_trf_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    is_separator_regex=False,
    keep_separator=False,
    chunk_size = 10,
    chunk_overlap  = 0,
    # length_function = len,
    add_start_index = True,
).from_huggingface_tokenizer(sentence_trf_tokenizer)
sentence_trf_splitter.count_tokens(text=sentences[0].page_content)
----
3683   # <--- exceeds the limit
```

In [12]:
sentence_trf_splitter = SentenceTransformersTokenTextSplitter(
    model_name=SENTENCE_TRF_MODEL,
    chunk_overlap=0,    # number of tokens to overlap, not characters,
)

num_max_tokens: int = sentence_trf_splitter.maximum_tokens_per_chunk
print(f"Max tokens for the model [{SENTENCE_TRF_MODEL}] is [{num_max_tokens}].")

Max tokens for the model [gtr-t5-large] is [512].


In [13]:
tokenizer = sentence_trf_splitter._model.tokenizer
tokens = tokenizer('a')['input_ids']
tokenizer.batch_decode(tokens)
sentence_trf_splitter._model.max_seq_length

2023-12-26 19:14:08.450379: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-26 19:14:08.473262: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-26 19:14:08.473291: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-26 19:14:08.473317: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-26 19:14:08.477620: I tensorflow/core/platform/cpu_feature_g

512

In [14]:
sentence_trf_splitter.split_text('')

[]

#### Split documents into chunks

Split the documents into chunks where each chunk has the token length that the Sentence Transformer Model can accept. There are multiple methods to split.

In [15]:
chunks: List[str] = sentence_trf_splitter.split_text(text=text)
print(len(chunks[0].split()))
chunks

402


["Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle. It's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but certain. These were times that tested the courage of our convictions and the strength of our union. And despite all our divisions and disagreements, our hesitations and 

In [16]:
text="madame speaker, vice president biden, members of congress, distinguished guests, and fellow americans."

In [17]:
sentence_trf_splitter.split_text(text)

['ame speaker, vice president biden, members of congress, distinguished guests, and fellow americans.']

In [18]:
sentence_trf_splitter.count_tokens(text=text)

23

In [19]:
model = SentenceTransformer(SENTENCE_TRF_MODEL)
model.encode("text").ndim

1

In [20]:
texts: List[Document] = sentence_trf_splitter.create_documents(texts=[text])
print(json.dumps(texts[0].to_json(), indent=4, default=str))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "ame speaker, vice president biden, members of congress, distinguished guests, and fellow americans.",
        "metadata": {}
    }
}


In [21]:
sentences: List[Document] = sentence_trf_splitter.split_documents(documents)
print(json.dumps(sentences[0].to_json(), indent=4, default=str))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle. It's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anyt

In [22]:
all([
    a.page_content == b.page_content
    for a, b in zip(texts, sentences)
])

False

In [23]:
_sentence: Document
for index, _sentence in enumerate(sentences): 
    _sentence.metadata['index'] = index

Verify the sentence is within the maximum number of tokens that the model can accept.

In [24]:
sentence = sentences[0].page_content
splits = sentence_trf_splitter.split_text(sentence)
first_split = splits[0]
num_split_tokens: int = len(first_split.split())
print(
    f"first split: length:[{num_split_tokens}] "
    f"less than max tokens [{num_max_tokens}] is {num_split_tokens <= num_max_tokens}"
)
print(first_split)

first split: length:[402] less than max tokens [512] is True
, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle. It's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but certain. These were times that tested the courage of our convictions and the strength of our union. And despite all o

In [25]:
print(json.dumps(sentences[0].to_json(), indent=4, default=str))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle. It's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anyt

--- 
# Tokenize

At encode, the tokenizer inserts special characters to mark the start and end of the tokens. However, it depends on the model if the decode will remove those special characters.

## all-MiniLM-L6-v2

```all-MiniLM-L6-v2``` model inserts ```[CLS]``` and ```[SEP]``` to mark the start and end of the tokens and decode method do not remove it.



In [26]:
sentence_all_minilm_tokenizer = AutoTokenizer.from_pretrained(f'sentence-transformers/all-MiniLM-L6-v2')
sentence_all_minilm_tokenizer

BertTokenizerFast(name_or_path='sentence-transformers/all-MiniLM-L6-v2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [27]:
text: str = ' '.join("""
Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: 
Our Constitution declares that from time to time, the president shall give to Congress information about 
the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during 
periods of prosperity and tranquility. And they have done so in the midst of war and depression; 
at moments of great strife and great struggle. It\'s tempting to look back on these moments and assume 
that our progress was inevitable, that America was always destined to succeed. But when the Union was 
turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. 
When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, 
the future was anything but certain. These were times that tested the courage of our convictions 
and the strength of our union. And despite all our divisions and disagreements, our hesitations and 
our fears, America prevailed because we chose to move forward as one nation and one people. 
Again, we are tested. And again, we must answer history\'s call. One year ago, I took office amid 
two wars, an economy rocked by severe recession, a financial system on the verge of 
""".split()) 


In [28]:
sentence_all_minilm_tokenizer.tokenize(text=text)
sentence_all_minilm_tokenizer.encode(text=text)

[101,
 10602,
 5882,
 1010,
 3580,
 2343,
 7226,
 2368,
 1010,
 2372,
 1997,
 3519,
 1010,
 5182,
 6368,
 1010,
 1998,
 3507,
 4841,
 1024,
 2256,
 4552,
 18806,
 2008,
 2013,
 2051,
 2000,
 2051,
 1010,
 1996,
 2343,
 4618,
 2507,
 2000,
 3519,
 2592,
 2055,
 1996,
 2110,
 1997,
 2256,
 2586,
 1012,
 2005,
 10545,
 2086,
 1010,
 2256,
 4177,
 2031,
 16829,
 2023,
 4611,
 1012,
 2027,
 2031,
 2589,
 2061,
 2076,
 6993,
 1997,
 14165,
 1998,
 25283,
 26147,
 3012,
 1012,
 1998,
 2027,
 2031,
 2589,
 2061,
 1999,
 1996,
 12930,
 1997,
 2162,
 1998,
 6245,
 1025,
 2012,
 5312,
 1997,
 2307,
 27865,
 1998,
 2307,
 5998,
 1012,
 2009,
 1005,
 1055,
 23421,
 2000,
 2298,
 2067,
 2006,
 2122,
 5312,
 1998,
 7868,
 2008,
 2256,
 5082,
 2001,
 13418,
 1010,
 2008,
 2637,
 2001,
 2467,
 16036,
 2000,
 9510,
 1012,
 2021,
 2043,
 1996,
 2586,
 2001,
 2357,
 2067,
 2012,
 7087,
 2448,
 1998,
 1996,
 6956,
 2034,
 5565,
 2012,
 12864,
 3509,
 1010,
 3377,
 2001,
 2200,
 2172,
 1999,
 4797,
 1012,
 

In [29]:
print(len(sentence_all_minilm_tokenizer.batch_decode(sentence_all_minilm_tokenizer(text)['input_ids'])))
sentence_all_minilm_tokenizer.batch_decode(sentence_all_minilm_tokenizer(text)['input_ids'])

256


['[CLS]',
 'madame',
 'speaker',
 ',',
 'vice',
 'president',
 'bid',
 '##en',
 ',',
 'members',
 'of',
 'congress',
 ',',
 'distinguished',
 'guests',
 ',',
 'and',
 'fellow',
 'americans',
 ':',
 'our',
 'constitution',
 'declares',
 'that',
 'from',
 'time',
 'to',
 'time',
 ',',
 'the',
 'president',
 'shall',
 'give',
 'to',
 'congress',
 'information',
 'about',
 'the',
 'state',
 'of',
 'our',
 'union',
 '.',
 'for',
 '220',
 'years',
 ',',
 'our',
 'leaders',
 'have',
 'fulfilled',
 'this',
 'duty',
 '.',
 'they',
 'have',
 'done',
 'so',
 'during',
 'periods',
 'of',
 'prosperity',
 'and',
 'tran',
 '##quil',
 '##ity',
 '.',
 'and',
 'they',
 'have',
 'done',
 'so',
 'in',
 'the',
 'midst',
 'of',
 'war',
 'and',
 'depression',
 ';',
 'at',
 'moments',
 'of',
 'great',
 'strife',
 'and',
 'great',
 'struggle',
 '.',
 'it',
 "'",
 's',
 'tempting',
 'to',
 'look',
 'back',
 'on',
 'these',
 'moments',
 'and',
 'assume',
 'that',
 'our',
 'progress',
 'was',
 'inevitable',
 ',',

### Start/End tokens

In [30]:
tokens = sentence_all_minilm_tokenizer("a")['input_ids']
tokens

[101, 1037, 102]

In [31]:
words = sentence_all_minilm_tokenizer.batch_decode(tokens)
start_word, a_word, end_word = words
start_word, a_word, end_word

('[CLS]', 'a', '[SEP]')

In [32]:
tokens = sentence_all_minilm_tokenizer.encode('I love sushi.')
decoded = sentence_all_minilm_tokenizer.decode(tokens)

In [33]:
decoded.replace(start_word, '', 1).rsplit(sep=end_word, maxsplit=1)[0].strip()

'i love sushi.'


### Character case does not matter

```all-MiniLM-L6-v2``` does not care the case as it all handles as small letters.

In [34]:
tokens_titled = sentence_all_minilm_tokenizer("Madame")['input_ids']
tokens_titled_decoded = sentence_all_minilm_tokenizer.batch_decode(tokens_titled)
print(f"Madame: tokens:{tokens_titled} decoded: {tokens_titled_decoded}")

tokens_small = sentence_all_minilm_tokenizer("madame".lower())['input_ids']
tokens_small_decoded = sentence_all_minilm_tokenizer.batch_decode(tokens_small)
print(f"madame: tokens:{tokens_small} decoded: {tokens_small_decoded}")

Madame: tokens:[101, 10602, 102] decoded: ['[CLS]', 'madame', '[SEP]']
madame: tokens:[101, 10602, 102] decoded: ['[CLS]', 'madame', '[SEP]']


In [35]:
text_with_case = "Madame speaker, Vice president, members of Congress, distinguished guests, and fellow Americans."
tokens = sentence_all_minilm_tokenizer.encode(text_with_case)
print(f"length:{len(tokens)} tokens:{tokens}")
print(f"decoded: {sentence_all_minilm_tokenizer.batch_decode(tokens)}")
sentence_all_minilm_tokenizer.batch_decode([tokens])

length:19 tokens:[101, 10602, 5882, 1010, 3580, 2343, 1010, 2372, 1997, 3519, 1010, 5182, 6368, 1010, 1998, 3507, 4841, 1012, 102]
decoded: ['[CLS]', 'madame', 'speaker', ',', 'vice', 'president', ',', 'members', 'of', 'congress', ',', 'distinguished', 'guests', ',', 'and', 'fellow', 'americans', '.', '[SEP]']


['[CLS] madame speaker, vice president, members of congress, distinguished guests, and fellow americans. [SEP]']

## gtr-t5-large

```gtr-t5-large``` tokenizesr inserts token ```3``` as an empty string to mark the start and  ```{'eos_token': '</s>'}``` to mark the end of the tokens. 



In [36]:
sentence_t5_tokenizer = AutoTokenizer.from_pretrained(f'sentence-transformers/gtr-t5-large')
sentence_t5_tokenizer

T5TokenizerFast(name_or_path='sentence-transformers/gtr-t5-large', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', 

### start/end tokens

In [37]:
# dict(list(sentence_trf_tokenizer.special_tokens_map.items())[:3])
encoded = sentence_t5_tokenizer(['a'])['input_ids'][0]
decoded = sentence_t5_tokenizer.decode(encoded)
print(f"encoded:{encoded}, decoded:[{decoded}]")

encoded:[3, 9, 1], decoded:[a</s>]


In [38]:
tokens = sentence_t5_tokenizer("a")['input_ids']
words = sentence_t5_tokenizer.batch_decode(tokens)
start_word, a_word, end_word = words
start_word, a_word, end_word

('', 'a', '</s>')

### Character case mattes

```gtr-t5-large``` cares about the cases and behave differently.

In [39]:
tokens_titled = sentence_t5_tokenizer("Madame")['input_ids']
tokens_titled_decoded = sentence_t5_tokenizer.batch_decode(tokens_titled)
print(f"Madame: tokens:{tokens_titled} decoded: {tokens_titled_decoded}")

tokens_small = sentence_t5_tokenizer("madame".lower())['input_ids']
tokens_small_decoded = sentence_t5_tokenizer.batch_decode(tokens_small)
print(f"madame: tokens:{tokens_small} decoded: {tokens_small_decoded}")

Madame: tokens:[27328, 1] decoded: ['Madame', '</s>']
madame: tokens:[11454, 265, 15, 1] decoded: ['mad', 'am', 'e', '</s>']


In [40]:
tokens_titled = sentence_all_minilm_tokenizer("Madame")['input_ids']
tokens_small = sentence_all_minilm_tokenizer("Madame".lower())['input_ids']
print(f"titled:{tokens_titled} small:{tokens_small}")

titled:[101, 10602, 102] small:[101, 10602, 102]


In [41]:
text_with_case = "Madame speaker, Vice president, members of Congress, distinguished guests, and fellow Americans."
tokens = sentence_t5_tokenizer.encode(text_with_case)
print(f"length:{len(tokens)} tokens:{tokens}")
print(f"decoded: {sentence_t5_tokenizer.batch_decode(tokens)}")
sentence_t5_tokenizer.batch_decode([tokens])

length:18 tokens:[27328, 5873, 6, 8236, 2753, 6, 724, 13, 4442, 6, 18908, 2554, 6, 11, 4999, 5452, 5, 1]
decoded: ['Madame', 'speaker', ',', 'Vice', 'president', ',', 'members', 'of', 'Congress', ',', 'distinguished', 'guests', ',', 'and', 'fellow', 'Americans', '.', '</s>']


['Madame speaker, Vice president, members of Congress, distinguished guests, and fellow Americans.</s>']

In [42]:
text_without_case = "Madame speaker, vice president Biden, members of Congress, distinguished guests, and fellow Americans.".lower()
tokens = sentence_t5_tokenizer.encode(text_without_case)
print(tokens)
sentence_t5_tokenizer.batch_decode(tokens)

[11454, 265, 15, 5873, 6, 6444, 2753, 6894, 35, 6, 724, 13, 27197, 6, 18908, 2554, 6, 11, 4999, 10211, 7, 5, 1]


['mad',
 'am',
 'e',
 'speaker',
 ',',
 'vice',
 'president',
 'bid',
 'en',
 ',',
 'members',
 'of',
 'congress',
 ',',
 'distinguished',
 'guests',
 ',',
 'and',
 'fellow',
 'american',
 's',
 '.',
 '</s>']

In [43]:
text = "Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle. It's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but certain. These were times that tested the courage of our convictions and the strength of our union. And despite all our divisions and disagreements, our hesitations and our fears, America prevailed because we chose to move forward as one nation and one people. Again, we are tested. And again, we must answer history's call. One year ago, I took office amid two wars, an economy rocked by severe recession, a financial system on the verge of collapse and a government deeply in debt. Experts from across the political spectrum warned that if we did not act, we might face a second depression. So we acted immediately and aggressively. And one year later, the worst of the storm has passed. But the devastation remains. One in 10 Americans still cannot find work. Many businesses have shuttered. Home values have declined. Small towns and rural communities have been hit especially hard. For those who had already known poverty, life has become that much harder. This recession has also compounded the burdens that America's families have been dealing with for decades -- the burden of working harder and longer for less, of being unable to save enough to retire or help kids with college. So I know the anxieties that are out there right now. They're not new. These struggles are the reason I ran for president. These struggles are what I've witnessed for years in places like Elkhart, Ind., and Galesburg, Ill. I hear about them in the letters that I read each night. The toughest to read are those written by"

In [44]:
text = "Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle. It's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but certain. These were times that tested the courage of our convictions and the strength of our union. And despite all our divisions and disagreements, our hesitations and our fears, America prevailed because we chose to move forward as one nation and one people. Again, we are tested. And again, we must answer history's call. One year ago, I took office amid two wars, an economy rocked by severe recession, a financial system on the verge of collapse and a government deeply in debt. Experts from across the political spectrum warned that if we did not act, we might face a second depression. So we acted immediately and aggressively. And one year later, the worst of the storm has passed. But the devastation remains. One in 10 Americans still cannot find work. Many businesses have shuttered. Home values have declined. Small towns and rural communities have been hit especially hard. For those who had already known poverty, life has become that much harder. This recession has also compounded the burdens that America's families have been dealing with for decades -- the burden of working harder and longer for less, of being unable to save enough to retire or help kids with college. So I know the anxieties that are out there right now. They're not new. These struggles are the reason I ran for president. These struggles are what I've witnessed for years in places like Elkhart, Ind., and Galesburg, Ill. I hear about them in the letters that I read each night. The toughest to read are those written by children asking why they have to move from their home, or when their mom"

In [45]:
t5_expected_split: str = ' '.join("""
Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: 
Our Constitution declares that from time to time, the president shall give to Congress information about 
the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during 
periods of prosperity and tranquility. And they have done so in the midst of war and depression; 
at moments of great strife and great struggle. It\'s tempting to look back on these moments and assume 
that our progress was inevitable, that America was always destined to succeed. But when the Union was 
turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. 
When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, 
the future was anything but certain. These were times that tested the courage of our convictions 
and the strength of our union. And despite all our divisions and disagreements, our hesitations and 
our fears, America prevailed because we chose to move forward as one nation and one people. 
Again, we are tested. And again, we must answer history\'s call. One year ago, I took office amid 
two wars, an economy rocked by severe recession, a financial system on the verge of collapse and a 
government deeply in debt. Experts from across the political spectrum warned that if we did not act, 
we might face a second depression. So we acted immediately and aggressively. And one year later, 
the worst of the storm has passed. But the devastation remains. One in 10 Americans still cannot 
find work. Many businesses have shuttered. Home values have declined. Small towns and rural 
communities have been hit especially hard. For those who had already known poverty, 
life has become that much harder. This recession has also compounded the burdens that America's families 
have been dealing with for decades -- the burden of working harder and longer for less, of being unable 
to save enough to retire or help kids with college. So I know the anxieties that are out there right now. 
They're not new. These struggles are the reason I ran for president. These struggles are what 
I've witnessed for years in places like Elkhart, Ind., and Galesburg, Ill. I hear about them in 
the letters that I read each night. The toughest to read are those written by
""".split())
text = t5_expected_split

In [46]:
print(len(sentence_t5_tokenizer(text)['input_ids']))
print((sentence_t5_tokenizer(text)['input_ids']))

sentence_t5_tokenizer.batch_decode(sentence_t5_tokenizer(text)['input_ids'])

511
[27328, 16778, 6, 8236, 1661, 2106, 537, 6, 724, 13, 4442, 6, 18908, 2554, 6, 11, 4999, 5452, 10, 421, 11378, 15884, 7, 24, 45, 97, 12, 97, 6, 8, 2753, 1522, 428, 12, 4442, 251, 81, 8, 538, 13, 69, 7021, 5, 242, 204, 1755, 203, 6, 69, 2440, 43, 20795, 48, 5461, 5, 328, 43, 612, 78, 383, 8811, 13, 21571, 11, 14249, 485, 5, 275, 79, 43, 612, 78, 16, 8, 3, 12342, 13, 615, 11, 7562, 117, 44, 4413, 13, 248, 5765, 99, 15, 11, 248, 4393, 5, 94, 31, 7, 24873, 12, 320, 223, 30, 175, 4413, 11, 5344, 24, 69, 2188, 47, 17508, 6, 24, 1371, 47, 373, 3, 26677, 12, 7229, 5, 299, 116, 8, 3545, 47, 2120, 223, 44, 10204, 7113, 11, 8, 432, 725, 166, 3, 16349, 44, 29518, 2979, 6, 6224, 47, 182, 231, 16, 3228, 5, 366, 8, 512, 24679, 30, 1589, 2818, 11, 3095, 2166, 10556, 277, 130, 3, 17349, 30, 12737, 63, 1771, 6, 8, 647, 47, 959, 68, 824, 5, 506, 130, 648, 24, 5285, 8, 11578, 13, 69, 13929, 7, 11, 8, 2793, 13, 69, 7021, 5, 275, 3, 3565, 66, 69, 4889, 7, 11, 28155, 7, 6, 69, 29457, 7, 11, 69, 14935, 6, 

['Madame',
 'Speaker',
 ',',
 'Vice',
 'President',
 'Bi',
 'den',
 ',',
 'members',
 'of',
 'Congress',
 ',',
 'distinguished',
 'guests',
 ',',
 'and',
 'fellow',
 'Americans',
 ':',
 'Our',
 'Constitution',
 'declare',
 's',
 'that',
 'from',
 'time',
 'to',
 'time',
 ',',
 'the',
 'president',
 'shall',
 'give',
 'to',
 'Congress',
 'information',
 'about',
 'the',
 'state',
 'of',
 'our',
 'union',
 '.',
 'For',
 '2',
 '20',
 'years',
 ',',
 'our',
 'leaders',
 'have',
 'fulfilled',
 'this',
 'duty',
 '.',
 'They',
 'have',
 'done',
 'so',
 'during',
 'periods',
 'of',
 'prosperity',
 'and',
 'tranquil',
 'ity',
 '.',
 'And',
 'they',
 'have',
 'done',
 'so',
 'in',
 'the',
 '',
 'midst',
 'of',
 'war',
 'and',
 'depression',
 ';',
 'at',
 'moments',
 'of',
 'great',
 'str',
 'if',
 'e',
 'and',
 'great',
 'struggle',
 '.',
 'It',
 "'",
 's',
 'tempting',
 'to',
 'look',
 'back',
 'on',
 'these',
 'moments',
 'and',
 'assume',
 'that',
 'our',
 'progress',
 'was',
 'inevitable',
 

In [47]:
model.tokenizer.encode(text)

[27328,
 16778,
 6,
 8236,
 1661,
 2106,
 537,
 6,
 724,
 13,
 4442,
 6,
 18908,
 2554,
 6,
 11,
 4999,
 5452,
 10,
 421,
 11378,
 15884,
 7,
 24,
 45,
 97,
 12,
 97,
 6,
 8,
 2753,
 1522,
 428,
 12,
 4442,
 251,
 81,
 8,
 538,
 13,
 69,
 7021,
 5,
 242,
 204,
 1755,
 203,
 6,
 69,
 2440,
 43,
 20795,
 48,
 5461,
 5,
 328,
 43,
 612,
 78,
 383,
 8811,
 13,
 21571,
 11,
 14249,
 485,
 5,
 275,
 79,
 43,
 612,
 78,
 16,
 8,
 3,
 12342,
 13,
 615,
 11,
 7562,
 117,
 44,
 4413,
 13,
 248,
 5765,
 99,
 15,
 11,
 248,
 4393,
 5,
 94,
 31,
 7,
 24873,
 12,
 320,
 223,
 30,
 175,
 4413,
 11,
 5344,
 24,
 69,
 2188,
 47,
 17508,
 6,
 24,
 1371,
 47,
 373,
 3,
 26677,
 12,
 7229,
 5,
 299,
 116,
 8,
 3545,
 47,
 2120,
 223,
 44,
 10204,
 7113,
 11,
 8,
 432,
 725,
 166,
 3,
 16349,
 44,
 29518,
 2979,
 6,
 6224,
 47,
 182,
 231,
 16,
 3228,
 5,
 366,
 8,
 512,
 24679,
 30,
 1589,
 2818,
 11,
 3095,
 2166,
 10556,
 277,
 130,
 3,
 17349,
 30,
 12737,
 63,
 1771,
 6,
 8,
 647,
 47,
 959,
 68,
 824

---
# Sentence Transformers

* [SentenceTransformer](https://www.sbert.net/docs/package_reference/SentenceTransformer.html)

In [48]:
sentence_trf_tokenizer = AutoTokenizer.from_pretrained(f'sentence-transformers/{SENTENCE_TRF_MODEL}')
encoded_input = sentence_trf_tokenizer([sentence.page_content for sentence in sentences[:1]], return_tensors='pt')
encoded_input

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


{'input_ids': tensor([[16778,     6,  8236,  1661,  2106,   537,     6,   724,    13,  4442,
             6, 18908,  2554,     6,    11,  4999,  5452,    10,   421, 11378,
         15884,     7,    24,    45,    97,    12,    97,     6,     8,  2753,
          1522,   428,    12,  4442,   251,    81,     8,   538,    13,    69,
          7021,     5,   242,   204,  1755,   203,     6,    69,  2440,    43,
         20795,    48,  5461,     5,   328,    43,   612,    78,   383,  8811,
            13, 21571,    11, 14249,   485,     5,   275,    79,    43,   612,
            78,    16,     8,     3, 12342,    13,   615,    11,  7562,   117,
            44,  4413,    13,   248,  5765,    99,    15,    11,   248,  4393,
             5,    94,    31,     7, 24873,    12,   320,   223,    30,   175,
          4413,    11,  5344,    24,    69,  2188,    47, 17508,     6,    24,
          1371,    47,   373,     3, 26677,    12,  7229,     5,   299,   116,
             8,  3545,    47,  2120,  

In [49]:
model = SentenceTransformer(SENTENCE_TRF_MODEL)
vector = model.encode(text)
vector.dtype
model.tokenize([text])

{'input_ids': tensor([[27328, 16778,     6,  8236,  1661,  2106,   537,     6,   724,    13,
           4442,     6, 18908,  2554,     6,    11,  4999,  5452,    10,   421,
          11378, 15884,     7,    24,    45,    97,    12,    97,     6,     8,
           2753,  1522,   428,    12,  4442,   251,    81,     8,   538,    13,
             69,  7021,     5,   242,   204,  1755,   203,     6,    69,  2440,
             43, 20795,    48,  5461,     5,   328,    43,   612,    78,   383,
           8811,    13, 21571,    11, 14249,   485,     5,   275,    79,    43,
            612,    78,    16,     8,     3, 12342,    13,   615,    11,  7562,
            117,    44,  4413,    13,   248,  5765,    99,    15,    11,   248,
           4393,     5,    94,    31,     7, 24873,    12,   320,   223,    30,
            175,  4413,    11,  5344,    24,    69,  2188,    47, 17508,     6,
             24,  1371,    47,   373,     3, 26677,    12,  7229,     5,   299,
            116,     8,  35

---
# Vector Database FAISS

* [Facebook Research - Faiss: A library for efficient similarity search](https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/)
* [Introduction to Facebook AI Similarity Search (Faiss)](https://www.pinecone.io/learn/series/faiss/faiss-tutorial/)
* [FAISS - Github](https://github.com/facebookresearch/faiss)
* [FAISS - Readthedocs](https://faiss.ai/index.html)

## Tutorial

* [Getting started](https://github.com/facebookresearch/faiss/wiki/Getting-started)

> The code can be run by copy/pasting it or running it from the [tutorial](https://github.com/facebookresearch/faiss/tree/master/tutorial) subdirectory of the Faiss distribution.

## Langchain
* [Langchain - Vector stores](https://python.langchain.com/docs/modules/data_connection/vectorstores/)
* [Langchain - FAISS](https://python.langchain.com/docs/integrations/vectorstores/faiss)
* [Langchain API - vectorstores.faiss.FAISS](https://api.python.langchain.com/en/stable/vectorstores/langchain_community.vectorstores.faiss.FAISS.html)

In [50]:
model = AutoModel.from_pretrained(f'sentence-transformers/{SENTENCE_TRF_MODEL}')
model

Some weights of T5Model were not initialized from the model checkpoint at sentence-transformers/gtr-t5-large and are newly initialized: ['decoder.block.17.layer.2.DenseReluDense.wo.weight', 'decoder.block.1.layer.1.EncDecAttention.k.weight', 'decoder.block.22.layer.2.DenseReluDense.wi.weight', 'decoder.block.14.layer.2.DenseReluDense.wo.weight', 'decoder.block.8.layer.1.EncDecAttention.k.weight', 'decoder.block.4.layer.0.SelfAttention.k.weight', 'decoder.block.7.layer.0.SelfAttention.q.weight', 'decoder.block.12.layer.0.SelfAttention.o.weight', 'decoder.block.16.layer.1.layer_norm.weight', 'decoder.block.6.layer.1.EncDecAttention.o.weight', 'decoder.block.11.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.0.SelfAttention.q.weight', 'decoder.block.9.layer.1.EncDecAttention.k.weight', 'decoder.block.9.layer.1.layer_norm.weight', 'decoder.block.11.layer.1.EncDecAttention.k.weight', 'decoder.block.6.layer.2.layer_norm.weight', 'decoder.block.3.layer.2.DenseReluDense.wo.weight', 'de

T5Model(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (dropout): Dropout(p=

## FAISS Index

```Index``` object stores all the vectors and allow vector search k-nearest-neighbors. There are multiple index types as in [Faiss indexes](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes). Faiss index itself is the vector database that manages the vectors and provide add(vector) and search(vector) interfaces. Named as Index is because it provides indexing feature for fast similarity search. FAISS Index does not offer other metadata or interfaces such as primary key to uniquely identify and retrieve the vectors, but focus only to provide similarity search. Hence LangChain provides additional metadata management using its [InMemoryDocstore](https://api.js.langchain.com/classes/stores_doc_in_memory.InMemoryDocstore.html).

### Caution

FAISS ```IndexFlatL2``` may calculate incorrect distances. 

* [Why you should be careful using FAISS](https://medium.com/mlearning-ai/why-you-should-be-careful-using-faiss-c44996eda9ee)

> go deeper into the principles of the basic FAISS’s index — IndexFlatL2. This index is very useful when you need to make an exact search using Euclidean distance. This type of index doesn’t compress or cluster your vectors. Nevertheless, it has some features which might worsen your experience. You have to remember that **FAISS uses the formula which can cause catastrophic cancellation** when floats of different magnitudes are added. We can also avoid it by using double-precision floating-point format (float64). However, we should remember that **FAISS works only with float32** format.

* [negative distance returned in IndexFlatL2 search query #297](https://github.com/facebookresearch/faiss/issues/297)

> The problem is that if you have a query vector x and two database vectors y_1 and y_2, where ```||x|| >> ||y_1||``` and ```||x|| >> ||y_2||``` then there will be accuracy losses because computations are performed with 32-bit float precision.

### Build IndexFlatL2 Index
* [Building an index and adding the vectors to it](https://github.com/facebookresearch/faiss/wiki/Getting-started#building-an-index-and-adding-the-vectors-to-it)

> Faiss is built around the ```Index``` object. It encapsulates the set of database vectors, and optionally preprocesses them to make searching efficient. There are many types of indexes, we are going to use the simplest version that just performs brute-force L2 distance search on them: ```IndexFlatL2```.
> ```Index``` needs to know the dimensionality of the vectors it operates.
> ```
> import faiss                   # make faiss available
> index = faiss.IndexFlatL2(d)   # build the index
> ```

### Build IndexFlatIP index

* [How can we build index/search based on cosine similarity](https://github.com/facebookresearch/faiss/issues/95)

> ```
> index = faiss.IndexFlatIP(dimensions)
> ```

### Search 

```
top_k: int = 5    # 5 nearest neighbors
distances, indices = index.search(query_vector, top_k)  # distances is L2 distances
```


## Build Vector Database

### Create a Vectorizer 

Need to define the vectorizer to embed a text into a vector.

In [51]:
vectorizer = embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [52]:
vectorizer.client

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [53]:
x = np.array(vectorizer.embed_query("This is a pen."))
y = np.array(vectorizer.embed_query("That is a pencil."))
z = np.array(vectorizer.embed_query("I love sushi."))
euclidean(x, z)

1.4246604853027693

### Create an FAISS Index

In [54]:
dimensions: int = len(embedding_function.embed_query("dummy"))
dimensions

384

In [55]:
faiss_index: IndexFlatL2 = IndexFlatL2(dimensions)
# faiss_index: IndexFlatIP = IndexFlatIP(dimensions)

### Build an empty database

In [56]:
index_to_docstore_id = {}
db = FAISS(
    embedding_function=embedding_function,
    index=faiss_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id=index_to_docstore_id,
    normalize_L2=False
)

In [57]:
dir(faiss_index)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__swig_destroy__',
 '__weakref__',
 'add',
 'add_c',
 'add_with_ids',
 'add_with_ids_c',
 'assign',
 'assign_c',
 'code_size',
 'codes',
 'compute_distance_subset',
 'compute_residual',
 'compute_residual_n',
 'd',
 'get_distance_computer',
 'get_xb',
 'is_trained',
 'metric_arg',
 'metric_type',
 'ntotal',
 'range_search',
 'range_search_c',
 'reconstruct',
 'reconstruct_c',
 'reconstruct_n',
 'reconstruct_n_c',
 'remove_ids',
 'remove_ids_c',
 'reset',
 'sa_code_size',
 'sa_decode',
 'sa_decode_c',
 'sa_encode',
 'sa_encode_c',
 'search',
 'search_and_reconstruct',
 'search_and_reconstruct_c',
 'searc

### Add documents to the database

In [58]:
ids = [
    str(i).zfill(10) for i in range(len(sentences))
]
_ = db.add_documents(
    documents=sentences,
    ids=ids
)

In [59]:
index_to_docstore_id

{0: '0000000000',
 1: '0000000001',
 2: '0000000002',
 3: '0000000003',
 4: '0000000004',
 5: '0000000005',
 6: '0000000006',
 7: '0000000007',
 8: '0000000008',
 9: '0000000009',
 10: '0000000010',
 11: '0000000011',
 12: '0000000012',
 13: '0000000013',
 14: '0000000014',
 15: '0000000015',
 16: '0000000016',
 17: '0000000017'}

### Build from documents

## Search Similar Documents

FAISS Index provides the search functions with a few search types (similarity, mmr).


### Methods

* [search(query: str, search_type: str, **kwargs: Any)](https://api.python.langchain.com/en/stable/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS.search)



### Search Types

* [Vector store-backed retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore)

> * search_type="similarity": 
> * search_type="mmr": By default, the vector store retriever uses similarity search. If the underlying vector store supports maximum marginal relevance search, you can specify that as the search type.



* [similarity_search_with_score(query: str, k: int = 4, filter)](https://api.python.langchain.com/en/stable/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS.similarity_search_with_score)
* [Similarity Search with score](https://python.langchain.com/docs/integrations/vectorstores/faiss#similarity-search-with-score)

> ```similarity_search_with_score``` returns ```(doc, score)``` where score is **L2 distance**. Therefore, **a lower score is better**.

### L2 Distances

Why ```IndexMFlatL2``` gives different value from scipy and numpy L2 distance value?

In [60]:
query: str = "the president shall give to congress information about the state of our union"

In [61]:
query_vector = embedding_function.embed_query(query)
doc_vector = embedding_function.embed_query(sentences[0].page_content)
# cosine(np.array(query_vector), np.array(doc_vector))
euclidean(np.array(query_vector), np.array(doc_vector))

1.0427087315533865

In [62]:
import torch
x1=torch.tensor([query_vector])
x2=torch.tensor([doc_vector])
torch.cdist(x1, x2, p=2.0, compute_mode='use_mm_for_euclid_dist_if_necessary')

tensor([[1.0427]])

In [63]:
db.index.search(np.array([query_vector], dtype=np.float32), k=3)

(array([[1.0872418, 1.2950897, 1.329037 ]], dtype=float32),
 array([[ 0,  4, 17]]))

In [64]:
faiss.cvar.distance_compute_blas_threshold = len(index_to_docstore_id) + 1
db.index.search(np.array([query_vector], dtype=np.float32), k=3)

(array([[1.0872418, 1.2950897, 1.329037 ]], dtype=float32),
 array([[ 0,  4, 17]]))

In [65]:
db.search(query="the state of our union", search_type="mmr")

[Document(page_content="Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle. It's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but certain. These were times that tested the courage of our convictions and the strength of our union. And despite all our divisions and disagreements

In [66]:
for doc, score in db.similarity_search_with_score(query, 3):
    print(f"score:{score} : {json.dumps(doc.to_json(), indent=4, default=str)}")

score:1.087241768836975 : {
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans: Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle. It's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody S

### Filter the search

Filter the matched documents with the metadata attributes.

```
"metadata": {
    "source": "./data/state_of_the_union.txt",
    "index": 22              # <--- Directory specify the metadata attribute
}
```

In [67]:
filter=dict(index=22)
filter

{'index': 22}

In [68]:
query: str = "the president shall give to congress information about the state of our union"
for doc, score in db.similarity_search_with_score(query, k=1, fetch_k=5, filter=filter):
    print(f"score:{score} : {json.dumps(doc.to_json(), indent=4, default=str)}")

## Add Document

[add_documents](https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.faiss.FAISS.html?highlight=faiss#langchain_community.vectorstores.faiss.FAISS.add_documents) will invoke [add_texts](https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.faiss.FAISS.html?highlight=faiss#langchain_community.vectorstores.faiss.FAISS.add_texts) method where you can set your own IDs.

* [langchain_community/vectorstores/faiss.py#add_texts](https://api.python.langchain.com/en/latest/_modules/langchain_community/vectorstores/faiss.html#FAISS.add_texts):

```
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,           # <----
    **kwargs: Any,
) -> List[str]:
```


In [69]:
doc_to_add: Document = Document(
    page_content="the president shall give to congress information about the state of our union", 
    metadata=dict(index=99)
)
ids: List[str] = db.add_documents(
    documents=[doc_to_add], 
    # You can provide your own unique ID, otherwise FAISS class generates UUID.
    # langchain_community/vectorstores/faiss.py#__add method:
    # ```
    # ids = ids or [str(uuid.uuid4()) for _ in texts]
    # ```
    ids=["DOC04123333"]            
)
ids

['DOC04123333']

## Select a document with 

Currently Lanchain does not provide a method to retrieve a document by its internal ID.

* [How to retrieve vectors by ids for LangChain vectorstore FAISS? #8897](https://github.com/langchain-ai/langchain/issues/8897) 

> Currently, the FAISS vectorstore implementation in LangChain does not have a method to retrieve vectors by ids similar to the retrieve method in the Qdrant vectorstore. The current implementation supports deleting vectors by ids, but not retrieving them.
> However, it is possible to implement a similar method in the FAISS vectorstore. Here is a rough idea of how it could be done:
```
def retrieve(self, ids: List[str], with_vectors: bool = False) -> List[Optional[np.ndarray]]:
    """Retrieve vectors by ID. These are the IDs in the vectorstore.

    Args:
        ids: List of ids to retrieve.
        with_vectors: If True, return the vectors along with the ids.

    Returns:
        List of vectors corresponding to the ids, or None if an id does not exist.
    """
    if with_vectors:
        _reversed_index = {v: k for k, v in self.index_to_docstore_id.items()}
        index_to_retrieve = [_reversed_index.get(i) for i in ids]
        vectors = self.index.reconstruct_n(0, len(self.index_to_docstore_id))
        return [vectors[i] for i in index_to_retrieve if i is not None]
    else:
        return [None for _ in ids]
```

>  the add method in faiss stores vectors in the given index using sequentially generated indices by default. The FAISS wrapper use a index_to_docstore_id dictionary, which essentially converts these indices into UUIDs for the respective documents stored in the underlying document store. Consequently, you can leverage the underlying document store and the index_to_docstore_id dictionary to retrieve a document based on its ID generated by faiss:
```
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings.fake import FakeEmbeddings
from langchain.docstore import InMemoryDocstore
from langchain.docstore.document import Document
import faiss
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
embedding_fn = FakeEmbeddings(size=embedding_size).embed_query
docstore = InMemoryDocstore({})
vectorstore = FAISS(embedding_fn, index, docstore, {})
documents = [Document(page_content='Hello How are you doing')]
vectorstore.add_documents(documents)
index_to_docstore_id = vectorstore.index_to_docstore_id
for i in range(len(documents)):
    print(docstore._dict[index_to_docstore_id[i]])
```

In [70]:
def select(
        db,
        keys: List[str]
):
    """Retrieve vectors by keys, which are the IDs in the vectorstore.
    Args:
        keys: primary keys to identify the records to select.

    Returns:
        List of vectors corresponding to the ids, or None if an id does not exist.
    """
    _id_to_index = {
        _id: _index for _index, _id in db.index_to_docstore_id.items()
    }
    indices_to_retrieve = [_id_to_index.get(_id) for _id in keys]
    vectors = db.index.reconstruct_n(0, len(db.index_to_docstore_id))
    return (
        [db.docstore.search(_id) for _id in keys],
        [vectors[index] for index in indices_to_retrieve if index is not None]
    )

select(db=db, keys=['DOC04123333'])

([Document(page_content='the president shall give to congress information about the state of our union', metadata={'index': 99})],
 [array([-2.28103362e-02,  6.77464530e-02,  9.50659811e-03,  5.78828976e-02,
         -3.08208242e-02,  6.06366321e-02,  3.15811373e-02, -5.21173514e-02,
         -8.54805335e-02, -1.82847157e-02, -1.27744377e-01,  3.03568784e-03,
         -1.91189686e-03, -2.76795924e-02, -6.97776005e-02,  2.95578577e-02,
          7.60543793e-02,  3.85096408e-02, -1.77105214e-03, -1.50946816e-02,
          1.09686725e-01,  6.01188689e-02, -7.03044906e-02,  5.85764041e-03,
          2.17567589e-02,  1.40187228e-02, -4.88124974e-02, -6.09397814e-02,
         -5.87077960e-02, -3.19501348e-02, -2.63713324e-03, -7.52108395e-02,
          2.12408993e-02,  6.47165701e-02, -1.89610273e-02, -1.28192827e-01,
          7.25442618e-02,  3.60509939e-02,  8.64282921e-02, -2.63611060e-02,
          1.16491783e-02, -5.09439409e-02,  8.39687791e-03,  2.07770411e-02,
         -2.92405374e-

In [71]:
def select(
        db,
        ids: List[str]
):
    """Retrieve vectors by ids, which are the IDs in the vectorstore.
    Args:
        ids: Docstore ids to identify the records to select.

    Returns: Generator to yield (document, vector) for the ids
    """
    # Langchain Docstore IDs
    docstore_ids = ids

    # FAISS vector indices
    _id_to_index = {
        _id: _index for _index, _id in db.index_to_docstore_id.items()
    }
    indices_to_retrieve = [_id_to_index.get(_id) for _id in docstore_ids]

    for index, _id in zip(indices_to_retrieve, docstore_ids):
        document: Document = db.docstore.search(search=_id)
        vector: np.ndarray = db.index.reconstruct_n(n0=index, ni=1)[0]
        yield document, vector


In [72]:
g = select(db, ['DOC04123333'])
doc, vec = next(g)
assert np.allclose(db.embedding_function.embed_query(doc.page_content), vec)

doc
vec

array([-2.28103362e-02,  6.77464530e-02,  9.50659811e-03,  5.78828976e-02,
       -3.08208242e-02,  6.06366321e-02,  3.15811373e-02, -5.21173514e-02,
       -8.54805335e-02, -1.82847157e-02, -1.27744377e-01,  3.03568784e-03,
       -1.91189686e-03, -2.76795924e-02, -6.97776005e-02,  2.95578577e-02,
        7.60543793e-02,  3.85096408e-02, -1.77105214e-03, -1.50946816e-02,
        1.09686725e-01,  6.01188689e-02, -7.03044906e-02,  5.85764041e-03,
        2.17567589e-02,  1.40187228e-02, -4.88124974e-02, -6.09397814e-02,
       -5.87077960e-02, -3.19501348e-02, -2.63713324e-03, -7.52108395e-02,
        2.12408993e-02,  6.47165701e-02, -1.89610273e-02, -1.28192827e-01,
        7.25442618e-02,  3.60509939e-02,  8.64282921e-02, -2.63611060e-02,
        1.16491783e-02, -5.09439409e-02,  8.39687791e-03,  2.07770411e-02,
       -2.92405374e-02,  1.01810060e-01,  4.98622190e-03,  2.33929735e-02,
       -9.77457408e-03,  7.11746067e-02, -1.25237415e-03,  2.90405843e-02,
        2.32936069e-02,  

## Delete

In [73]:
db.delete(['DOC04123333'])

True

---
# Server

## Flask

In [74]:
import json
from flask import Flask, request

app = Flask(__name__)


@app.route('/embedding', methods=['POST'])
def generate_embedding():
    query = request.json['query']
    results = db.similarity_search_with_score(query, 3)
    return {'results': json.dumps(results, default=str)}

if __name__ == '__main__':
    app.run(port=8001)

ModuleNotFoundError: No module named 'flask'

## FAISS gRPC Server

* [Faiss gRPC Server](https://github.com/louiezzang/faiss-server)

> A library for efficient similarity search and clustering of dense vectors.

In [None]:
pow(2, 15) * 8