## Semi-structured and multimodal RAG
- We will use Unstructured to parse both text and tables from documents (PDFs).
- We will use the multi-vector retriever to store raw tables and text along with table summaries, which are better suited for retrieval.
- We will use LCEL to implement the chains used.

Notebook for reference: https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb

In [9]:
from typing import Any
import pandas as pd
import numpy as np
from groq import Groq
import os
import pinecone
import requests


from langchain_community.vectorstores import Chroma
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_pinecone import PineconeVectorStore
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Load API Keys
from unstructured.staging.base import elements_to_json, elements_from_json
from unstructured.staging.base import convert_to_dict
from unstructured.staging.base import convert_to_csv
import json
from IPython.display import display, HTML
import yaml
from groq import Groq
from dotenv import load_dotenv

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
#Need to import groq from langchain
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_pinecone import PineconeVectorStore
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Can try paddle OCR instead of tesseract


In [2]:
# Load variables from a .env file (python-dotenv) before reading them below.
load_dotenv()

# Each lookup yields None when the variable is not set.
groq_api_key = os.environ.get("GROQ_API_KEY")
hf_key = os.environ.get("HUGGINGFACE_API_KEY")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Model identifier and the Groq client built from the Groq key.
model = "llama3-8b-8192"
client = Groq(api_key=groq_api_key)

## Data Loading 
- Using `partition_pdf`, which segments a PDF document by using a layout model.
- This layout model makes it possible to extract elements, such as tables, from PDFs.
- We will also use unstructured chunking
  - Tries to identify document sections
  - builds text blocks that maintain sections while also honoring user-defined chunk sizes

In [3]:
# Code adapted from the unstructured website and Stack Overflow.
from collections import Counter

# Input PDF and the JSON file the parsed elements are exported to.
path_to_hsi = "../data/HSI1000_1to9.pdf"
element_output_file = "../data/element_entities.json"

# Partition the PDF. `infer_table_structure=True` is requested because a later
# cell reads metadata["text_as_html"] from the Table elements.
raw_pdf_elements = partition_pdf(
    path_to_hsi,  # use the variable so the path is defined in exactly one place
    strategy="hi_res",
    infer_table_structure=True,
)

# Count how many elements of each class were produced. Keys are the string
# form of the element's type, in first-seen order; kept as a plain dict so
# missing keys still raise KeyError as before.
category_counts = dict(Counter(str(type(element)) for element in raw_pdf_elements))

# unique_categories holds each distinct element type exactly once.
unique_categories = set(category_counts)

# A bare expression in the middle of a cell is never displayed, so print it.
print(category_counts)

# Keep the dict form of the elements around (future use: MongoDB, maybe)
# instead of discarding the conversion result.
element_dicts = convert_to_dict(raw_pdf_elements)

# Save output to a JSON file so later cells can reload it without re-parsing.
elements_to_json(raw_pdf_elements, filename=element_output_file)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Reload the exported elements from disk and drop the element types listed in
# `unwanted_types`, reporting the element count before and after.
with open("../data/element_entities.json", "r", encoding='utf-8') as fin:
    read_elements = json.load(fin)
print(f"length before filtering: {len(read_elements)}")

unwanted_types = ['Footer', 'Image', 'FigureCaption', 'UncategorizedText']
# Keep every element whose 'type' is not in the unwanted list.
filtered_el = [item for item in read_elements if item['type'] not in unwanted_types]
print(f"length after filtering: {len(filtered_el)}")

length before filtering: 130
length after filtering: 109


In [5]:
class Element(BaseModel):
    """A parsed document element: its type label plus its content."""

    type: str
    text: Any


def _to_element(raw):
    """Build an Element from one filtered element dict.

    Table entries carry their HTML rendering (metadata['text_as_html']);
    every other entry carries its 'text' value and keeps its own type.
    """
    if raw['type'] == 'Table':
        return Element(type='Table', text=raw['metadata']['text_as_html'])
    return Element(type=raw['type'], text=raw['text'])


# Tables first, then everything else; each list's size is printed once built.
table_elements = [_to_element(raw) for raw in filtered_el if raw['type'] == 'Table']
print(len(table_elements))
text_elements = [_to_element(raw) for raw in filtered_el if raw['type'] != 'Table']
print(len(text_elements))

3
106


In [11]:
# from typing import AsyncIterator, Iterator
# from langchain_core.document_loaders import BaseLoader
# from langchain_core.documents import Document

# # Custom class to load a string into a document from a string (which is what we have)
# class CustomDocumentLoader(BaseLoader):
#     """An example document loader that reads a file line by line."""

#     def __init__(self, file_path: str) -> None:
#         """Initialize the loader with a file path.

#         Args:
#             file_path: The path to the file to load.
#         """
#         self.file_path = file_path

#     def lazy_load(self) -> Iterator[Document]:  # <-- Does not take any arguments
#         """A lazy loader that reads a file line by line.

#         When you're implementing lazy load methods, you should use a generator
#         to yield documents one by one.
#         """
#         with open(self.file_path, encoding="utf-8") as f:
#             line_number = 0
#             for line in f:
#                 yield Document(
#                     page_content=line,
#                     metadata={"line_number": line_number, "source": self.file_path},
#                 )
#                 line_number += 1

#     # alazy_load is OPTIONAL.
#     # If you leave out the implementation, a default implementation which delegates to lazy_load will be used!
#     async def alazy_load(
#         self,
#     ) -> AsyncIterator[Document]:  # <-- Does not take any arguments
#         """An async lazy loader that reads a file line by line."""
#         # Requires aiofiles
#         # Install with `pip install aiofiles`
#         # https://github.com/Tinche/aiofiles
#         import aiofiles

#         async with aiofiles.open(self.file_path, encoding="utf-8") as f:
#             line_number = 0
#             async for line in f:
#                 yield Document(
#                     page_content=line,
#                     metadata={"line_number": line_number, "source": self.file_path},
#                 )
#                 line_number += 1

# Write the text of every non-table element to one UTF-8 file, one element
# per line, keeping the joined string in `document`.
with open("../data/hsi_notes_1to9.txt", mode="w", encoding="utf-8") as fout:
    document = "\n".join(element.text for element in text_elements)
    fout.write(document)
    
# loader = CustomDocumentLoader("../data/hsi_notes_1to9.txt")

In [23]:
# Split the exported notes into chunks with the recursive character splitter.

# The notes file was written as UTF-8 and contains non-ASCII characters
# (e.g. curly quotes), so read it back with the same encoding instead of the
# platform default.
with open("../data/hsi_notes_1to9.txt", encoding="utf-8") as fin:
    text_notes = fin.read()

# Separator list handed to the splitter, from paragraph breaks down to
# single characters.
seperator_ls = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # measured with length_function below, i.e. characters
    chunk_overlap=30,
    length_function=len,
    separators=seperator_ls,
)

text_chunks = text_splitter.create_documents([text_notes])

# Preview the first 12 chunks (the same ones the index loop with
# `if i > 10: break` printed).
for chunk in text_chunks[:12]:
    print(chunk)

page_content='Lecture 1\nHSI1000\n1 The Founding of Modern Science\nIntended Learning Outcomes for Lecture 01 You should be able to do the following after this lecture.\n(1) Describe what is science and explain the scientific method “in a nutshell”, illustrating your explanation with a straightforward example.\n(2) Describe the roles scientific observations play in the scientific method. (3) Explain what are the main concerns that should be addressed when making scientific observations. (4) Explain why anomalous phenomena are important for science, illustrating your explanation with some\nexamples from the scientific revolution.\n(5) In the context of the scientific revolution, discuss the difference between an evidence-based understanding of the natural world versus one based on authority.\n(6) Discuss the steam engine’s contribution to the Industrial Revolution and its impact on population growth in industrialized nations.'
page_content='1.1 What is Science? Hi all, welcome to the fi

In [26]:
# Display the first chunk produced by the text splitter.
text_chunks[0]

Document(page_content='Lecture 1\nHSI1000\n1 The Founding of Modern Science\nIntended Learning Outcomes for Lecture 01 You should be able to do the following after this lecture.\n(1) Describe what is science and explain the scientific method “in a nutshell”, illustrating your explanation with a straightforward example.\n(2) Describe the roles scientific observations play in the scientific method. (3) Explain what are the main concerns that should be addressed when making scientific observations. (4) Explain why anomalous phenomena are important for science, illustrating your explanation with some\nexamples from the scientific revolution.\n(5) In the context of the scientific revolution, discuss the difference between an evidence-based understanding of the natural world versus one based on authority.\n(6) Discuss the steam engine’s contribution to the Industrial Revolution and its impact on population growth in industrialized nations.')

In [33]:
# Load embeddings through the Hugging Face Inference API.
# The URL uses the .../pipeline/feature-extraction/... route rather than
# .../models/....
HF_API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-mpnet-base-v2"
headers = {"Authorization": f"Bearer {hf_key}"}


def embed_documents(payload, timeout=120):
    """POST *payload* to the embedding endpoint and return the decoded JSON.

    Args:
        payload: JSON-serialisable request body; the callers in this notebook
            pass a list of strings.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response. For the calls in this notebook
        that is a list with one embedding vector per input string.

    Raises:
        requests.HTTPError: If the endpoint replies with a 4xx/5xx status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    response = requests.post(
        HF_API_URL, headers=headers, json=payload, timeout=timeout
    )
    # Surface API failures here rather than returning an error body that
    # breaks later when it is indexed like a list of vectors.
    response.raise_for_status()
    return response.json()


payload = [doc.page_content for doc in text_chunks]
payload_embeddings = embed_documents(payload)
print(f"Dimension of embeddings: {len(payload_embeddings[0])}\nFirst 20 embeddings: {payload_embeddings[0][:20]}")

Dimension of embeddings: 768
First 20 embeddings: [-0.0317227840423584, -0.02262827567756176, -0.01190539076924324, -0.038950592279434204, -0.022500036284327507, -0.004191144369542599, 0.022412359714508057, 0.03265296667814255, 0.004366844426840544, -0.01434148009866476, 0.06965132802724838, 0.037254586815834045, -0.0011084076249971986, -0.020408283919095993, 0.05689816176891327, -0.07069188356399536, 0.004587174858897924, 0.018916362896561623, -0.04707832261919975, 0.004359079524874687]


In [43]:
# Embed a question, score it against every chunk embedding, and print the
# chunk with the highest cosine similarity.
qn = "Who was hired in 1846 at Vienna General Hospital?"
prompt_embeddings = embed_documents([qn])

# Only one question was embedded, so row 0 holds its score against each chunk.
similarity_matrix = cosine_similarity(prompt_embeddings, payload_embeddings)
similarities = similarity_matrix[0]

closest_similarity_index = np.argmax(similarities)
most_relevant_chunk = text_chunks[closest_similarity_index]
print(most_relevant_chunk.page_content)

Figure 3 Kolletschka from Wikimedia: Unknown author, Public domain, via Wikimedia Commons There is another very pertinent fact, or observation, in this case. Right after the student doctors attended the anatomical pathology lab, where they dissected badly infected corpses, they would go to Dr Semmelweis’ maternity clinic to assist in the births of expectant mothers.
Do remember that the way disease was spread, caused, and treated in those days was completely misunderstood. No one knew about germs, so there certainly were no antibiotics and there was no disinfecting of anything – no hand washing of one’s hands especially when, by just looking at them, they were quite obviously clean.
So now we have our careful observations, or facts,
(1) The mortality rate of mothers due to childbed fever in a clinic attended by doctors was, on average, five times higher than what appears to be a similar clinic with similar mothers but attended by midwives instead of doctors.


In [45]:
# Display the full value returned by embed_documents for the chunk payload.
payload_embeddings

[[-0.0317227840423584,
  -0.02262827567756176,
  -0.01190539076924324,
  -0.038950592279434204,
  -0.022500036284327507,
  -0.004191144369542599,
  0.022412359714508057,
  0.03265296667814255,
  0.004366844426840544,
  -0.01434148009866476,
  0.06965132802724838,
  0.037254586815834045,
  -0.0011084076249971986,
  -0.020408283919095993,
  0.05689816176891327,
  -0.07069188356399536,
  0.004587174858897924,
  0.018916362896561623,
  -0.04707832261919975,
  0.004359079524874687,
  -0.02215094491839409,
  -0.045850954949855804,
  -0.0040743304416537285,
  -0.0065469020046293736,
  0.020865511149168015,
  0.004147663712501526,
  0.03342483565211296,
  -0.03804897889494896,
  -0.0009463656460866332,
  -0.042541563510894775,
  0.04778869450092316,
  0.024261048063635826,
  0.032666612416505814,
  0.014270822517573833,
  2.331332325411495e-06,
  -0.026117565110325813,
  0.02759288251399994,
  0.030856067314743996,
  -0.053202446550130844,
  0.009243343956768513,
  0.08404632657766342,
  0.029

In [None]:
# Code to initialise Pinecone Db
# `ServerlessSpec` is exported by the top-level `pinecone` package and
# `PineconeVectorStore` by `langchain_pinecone`; importing either of them from
# `pinecone.grpc` raises ImportError.
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
import os
import time

pc = Pinecone(api_key=pinecone_api_key)
index_name = "hsi-rag"

# The documents to index are the chunks produced by the text splitter.
text_documents = text_chunks

# `from_documents` needs an embeddings object, not precomputed vectors.
# NOTE(review): assumes the local all-mpnet-base-v2 sentence-transformer is the
# intended embedder (same model name as the Hugging Face API URL, whose
# vectors printed as 768-dimensional) — confirm.
text_embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# I want 2 namespaces, one for table embeddings and one for text embeddings
# NOTE(review): this cell does not create the index; presumably "hsi-rag"
# already exists with a matching dimension, and PINECONE_API_KEY is available
# in the environment for langchain_pinecone — verify before running.
text_namespace = "text-embeddings"
docsearch = PineconeVectorStore.from_documents(
    documents=text_documents,
    index_name=index_name,
    embedding=text_embeddings,
    namespace=text_namespace,
)

# Brief pause kept from the original cell after the upload.
time.sleep(1)