# Langchain Exploration



In [12]:
#!pip install faiss-gpu # For CUDA 7.5+ Supported GPU's.
#!pip install faiss-cpu # For CPU Installation
#!pip3 install Langchain[FAISS]
#!pip install sentence-transformers
#!pip install pypdf

In [2]:
# Uncomment the following lines if you need to initialize FAISS with no AVX2 optimization.
# (`os` is only imported in a later cell, so import it here as well.)
# import os
# os.environ['FAISS_NO_AVX2'] = '1'

In [25]:
import json
import os
import time
from typing import (
    List,
    Dict,
)

import numpy as np
import pandas as pd
import pickle

from transformers import (
    AutoTokenizer,
)
from langchain_core.documents import (
    Document
)
from langchain.document_loaders import (
    TextLoader,
    PyPDFLoader
)
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter
)
from langchain.embeddings import (
    SentenceTransformerEmbeddings
)
from langchain.vectorstores import (
    FAISS
)

In [None]:
# Constant

In [24]:
# Name of the Sentence Transformers model; the token text splitter further down
# is constructed with this name so that its tokenizer matches the model.
SENTENCE_TRF_MODEL: str = "all-MiniLM-L6-v2"

---- 

# Document Loader

Generate langchain ```documents:List[Document]``` from the data source.

* [Document loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/)



## PDF Loader

* [PyPDFLoader](https://api.python.langchain.com/en/stable/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html)

Uses [pypdf](https://pypi.org/project/pypdf/) to generate chunks as ```List[Document]```, one per page, and stores the page number in each chunk's metadata.

In [5]:
# Read the sample PDF; the printed length is the number of Documents returned.
loader = PyPDFLoader(file_path="./data/test_sample.pdf")
pages: List[Document] = loader.load()

# Show the metadata attached to the first Document and the total count.
first_page = pages[0]
print(f"metadata:{first_page.metadata} length:{len(pages)}")

metadata:{'source': './data/test_sample.pdf', 'page': 0} length:4


## Text Loader

* [TextLoader](https://api.python.langchain.com/en/stable/document_loaders/langchain_community.document_loaders.text.TextLoader.html)

Load a text file into a single ```Document``` instance.

In [6]:
# Read the whole speech as UTF-8 text.
loader = TextLoader("./data/state_of_the_union.txt", encoding='utf-8')
documents = loader.load()

# Show the metadata of the first Document and how many Documents were produced.
first_document = documents[0]
print(f"metadata:{first_document.metadata} length:{len(documents)}")

metadata:{'source': './data/state_of_the_union.txt'} length:1


# Document

A Document is a piece of text and associated metadata.

* [class Document](https://api.python.langchain.com/en/stable/_modules/langchain_core/documents/base.html#Document)


In [7]:
# Print every attribute of the Document whose name does not start with "_".
for attr in dir(documents[0]):
    if attr.startswith("_"):
        continue
    print(attr)

Config
construct
copy
dict
from_orm
get_lc_namespace
is_lc_serializable
json
lc_attributes
lc_id
lc_secrets
metadata
page_content
parse_file
parse_obj
parse_raw
schema
schema_json
to_json
to_json_not_implemented
type
update_forward_refs
validate


---
# Document Transformers

* [Document transformers](https://python.langchain.com/docs/modules/data_connection/document_transformers/)


## Text Split

The objective of the splitter is to package as many sentences as possible into a chunk so that **consecutive sentences stay together**.

* [Text splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/#text-splitters)

1. Split the text up into small, semantically meaningful chunks (often sentences).
2. Start combining these small chunks into a larger chunk until you reach a certain size (as measured by some function).
3. Once you reach that size, make that chunk its own piece of text and then start creating a new chunk of text with some overlap (to keep context between chunks).

That means there are two different axes along which you can customize your text splitter:

1. How the text is split
2. How the chunk size is measured



### RecursiveCharacterTextSplitter

The recommended default is [RecursiveCharacterTextSplitter](https://api.python.langchain.com/en/stable/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html), which splits on the separators in ```["\n\n", "\n", " ", ""]``` in order: it tries the first separator and, if any resulting chunk is still too large, moves on to the next one.

* ```length_function```: how the length of chunks is calculated. Defaults to just counting number of characters, but it's pretty common to pass a token counter here.
* ```chunk_size```: the maximum size of your chunks (as measured by the length function).
* ```chunk_overlap```: the maximum overlap between consecutive chunks, used to maintain some continuity between them (e.g. a sliding window).
* ```add_start_index```: whether to include the starting position of each chunk within the original document in the metadata.



In [8]:
# Splitter that only breaks on blank lines and newlines, measures chunk length
# in characters, and records each chunk's start offset in its metadata.
recursive_text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    is_separator_regex=False,
    keep_separator=False,
    length_function=len,
    chunk_size=15,
    chunk_overlap=0,
    add_start_index=True,
)

# To dump the unsplit pages instead:
# for page in pages:
#     print(json.dumps(page.to_json(), indent=4, default=str, ensure_ascii=True))

# Split the text of the first PDF page into chunks and print each as JSON.
first_page_chunks = recursive_text_splitter.split_documents([pages[0]])
for chunk in first_page_chunks:
    chunk_json = chunk.to_json()
    print(json.dumps(chunk_json, indent=4, default=str, ensure_ascii=True))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "PDF Bookmark Sample Page 1 of 4 ",
        "metadata": {
            "source": "./data/test_sample.pdf",
            "page": 0,
            "start_index": 5
        }
    }
}
{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": " PDF B OOKMARK SAMPLE  ",
        "metadata": {
            "source": "./data/test_sample.pdf",
            "page": 0,
            "start_index": 38
        }
    }
}
{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "Sample Date: May 2001 ",
        "metadata": {
            "source": "./data/test_sample.pdf",
            "page": 0,


### CharacterTextSplitter

* [What does langchain CharacterTextSplitter's chunk_size param even do?](https://stackoverflow.com/a/77341919/4281353)

> CharacterTextSplitter behaves differently from what you expected. It first looks for the **first 6** characters and then splits the next chunk from the closest separator, **not from the 7th character**.
> ```
> text_splitter = CharacterTextSplitter(
>     separator="\n",
>     chunk_size=6,           # <--- Look for the first 6 characters, then start looking for the separator.
> )
> ```



In [17]:
# Split the speech on blank lines (literal separator, no regex), with a
# chunk_size of 1200 and no overlap between chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=0,
    separator="\n\n",
    is_separator_regex=False,
)
docs = text_splitter.split_documents(documents)

In [18]:
# Print every chunk in its JSON-serializable form.
for doc in docs:
    doc_json = doc.to_json()
    print(json.dumps(doc_json, indent=4, default=str, ensure_ascii=True))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:\n\nOur Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle.\n\nIt's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the fu

### SentenceTransformersTokenTextSplitter

* [Split by tokens](https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/split_by_token)

Split not to exceed the token limits required by the model. Tokenizer depends on the Sentence Transformer model to use, hence the model name must match.

* [Sentence Transformers](https://huggingface.co/sentence-transformers)
* [Sentence Transformer - all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

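```RecursiveCharacterTextSplitter().from_huggingface_tokenizer(sentence_trf_tokenizer)``` does not work here, as the number of tokens exceeds the limit. Note that ```from_huggingface_tokenizer``` is a classmethod that builds a new splitter, so arguments given to the constructor it is chained onto (e.g. ```chunk_size```) are not carried over; they must be passed to ```from_huggingface_tokenizer``` itself.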

```
sentence_trf_tokenizer = AutoTokenizer.from_pretrained(f'sentence-transformers/{SENTENCE_TRF_MODEL}')
sentence_trf_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    is_separator_regex=False,
    keep_separator=False,
    chunk_size = 10,
    chunk_overlap  = 0,
    # length_function = len,
    add_start_index = True,
).from_huggingface_tokenizer(sentence_trf_tokenizer)
sentence_trf_splitter.count_tokens(text=sentences[0].page_content)
----
3683   # <--- exceeds the limit
```

In [61]:
# Token-based splitter built for the model named by SENTENCE_TRF_MODEL.
sentence_trf_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=10,    # number of tokens to overlap, not characters
    model_name=SENTENCE_TRF_MODEL,
)

# Report the per-chunk token limit the splitter exposes.
max_tokens = sentence_trf_splitter.maximum_tokens_per_chunk
print(f"Max tokens for the model [{SENTENCE_TRF_MODEL}] is [{max_tokens}].")

Max tokens for the model [all-MiniLM-L6-v2] is [256].


In [66]:
# Inspect the metadata of the source Document before it is split.
# `sentences` is only created in the next cell, so referencing it here fails on
# a fresh kernel (Restart & Run All); `documents` is already loaded above.
documents[0].metadata

{'source': './data/state_of_the_union.txt'}

In [67]:
# Split the loaded Documents into chunks with the token-based splitter.
sentences: List[Document] = sentence_trf_splitter.split_documents(documents)

# Record each chunk's position in the list under the 'index' metadata key.
_sentence: Document
for index, _sentence in enumerate(sentences):
    _sentence.metadata.update({'index': index})

In [68]:
# Show the first chunk, including the metadata attached to it.
first_sentence_json = sentences[0].to_json()
print(json.dumps(first_sentence_json, indent=4, default=str))

{
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "madame speaker, vice president biden, members of congress, distinguished guests, and fellow americans : our constitution declares that from time to time, the president shall give to congress information about the state of our union. for 220 years, our leaders have fulfilled this duty. they have done so during periods of prosperity and tranquility. and they have done so in the midst of war and depression ; at moments of great strife and great struggle. it's tempting to look back on these moments and assume that our progress was inevitable, that america was always destined to succeed. but when the union was turned back at bull run and the allies first landed at omaha beach, victory was very much in doubt. when the market crashed on black tuesday and civil rights marchers were beaten on bloody sunday, the future

---
# Vector Database FAISS

* [Facebook Research - Faiss: A library for efficient similarity search](https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/)
* [Introduction to Facebook AI Similarity Search (Faiss)](https://www.pinecone.io/learn/series/faiss/faiss-tutorial/)
* [FAISS - Github](https://github.com/facebookresearch/faiss)
* [FAISS - Readthedocs](https://faiss.ai/index.html)
* [Langchain - Vector stores](https://python.langchain.com/docs/modules/data_connection/vectorstores/)
* [Langchain - FAISS](https://python.langchain.com/docs/integrations/vectorstores/faiss)
* [Langchain API - vectorstores.faiss.FAISS](https://api.python.langchain.com/en/stable/vectorstores/langchain_community.vectorstores.faiss.FAISS.html)

## Build Vector Database

In [65]:
# Use the shared model constant rather than a hard-coded name, so the embedding
# model stays the same as the one the token text splitter was built for.
embedding_function = SentenceTransformerEmbeddings(model_name=SENTENCE_TRF_MODEL)

In [71]:
# Build the FAISS vector store from the chunks, embedding each one with
# embedding_function.
db = FAISS.from_documents(sentences, embedding_function)

## Search Similar Documents

* [search(query: str, search_type: str, **kwargs: Any)](https://api.python.langchain.com/en/stable/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS.search)
* [similarity_search_with_score(query: str, k: int = 4, filter)](https://api.python.langchain.com/en/stable/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS.similarity_search_with_score)
* [Similarity Search with score](https://python.langchain.com/docs/integrations/vectorstores/faiss#similarity-search-with-score)

> ```similarity_search_with_score``` returns ```(doc, score)``` where score is **L2 distance**. Therefore, **a lower score is better**.

In [87]:
query: str = "the president shall give to congress information about the state of our union"

# Fetch the 3 closest chunks together with their scores and print each pair.
top_matches = db.similarity_search_with_score(query, k=3)
for doc, score in top_matches:
    doc_json = json.dumps(doc.to_json(), indent=4, default=str)
    print(f"score:{score} : {doc_json}")

score:1.2690781354904175 : {
    "lc": 1,
    "type": "constructor",
    "id": [
        "langchain",
        "schema",
        "document",
        "Document"
    ],
    "kwargs": {
        "page_content": "about how washington works that have been growing for years. to close that credibility gap we must take action on both ends of pennsylvania avenue to end the outsized influence of lobbyists ; to do our work openly and to give our people the government they deserve. that's what i came to washington to do. that's why - - for the first time in history - - my administration posts our white house visitors online. and that's why we've excluded lobbyists from policymaking jobs or seats on federal boards and commissions. but we can't stop there. it's time to require lobbyists to disclose each contact they make on behalf of a client with my administration or congress. and it's time to put strict limits on the contributions that lobbyists give to candidates for federal office. last week, the 

---
# Server

## Flask

In [None]:
import json
from flask import Flask, request

app = Flask(__name__)


@app.route('/embedding', methods=['POST'])
def generate_embedding():
    """Search the vector store for the chunks most similar to a posted query.

    Expects a JSON request body of the form ``{"query": "<text>"}``.

    Returns:
        On success, a dict ``{'results': <str>}`` whose value is the
        ``json.dumps`` serialization of the top-3 results of
        ``db.similarity_search_with_score``.
        If the body is not a JSON object containing a non-empty string
        ``query``, a ``({'error': <message>}, 400)`` tuple instead of letting
        the lookup raise and surface as a server error.
    """
    # silent=True yields None rather than raising when the body is not valid JSON.
    payload = request.get_json(silent=True)
    query = payload.get('query') if isinstance(payload, dict) else None
    if not isinstance(query, str) or not query.strip():
        return {'error': "JSON body must contain a non-empty string field 'query'."}, 400

    results = db.similarity_search_with_score(query, 3)
    # default=str converts any value json cannot encode natively to its str() form.
    return {'results': json.dumps(results, default=str)}

if __name__ == '__main__':
    app.run(port=8001)

## FAISS gRPC Server

* [Faiss gRPC Server](https://github.com/louiezzang/faiss-server)

> A library for efficient similarity search and clustering of dense vectors.