## Semi Structured and multimodal RAG
- We will use Unstructured to parse both text and tables from documents (PDFs).
- We will use the multi-vector retriever to store raw tables, text along with table summaries better suited for retrieval.
- We will use LCEL to implement the chains used.

Notebook for reference: https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb

In [1]:
from typing import Any
import pandas as pd
import numpy as np
from groq import Groq
import os
import pinecone
import requests


from langchain_community.vectorstores import Chroma
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_pinecone import PineconeVectorStore
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Load API Keys
from unstructured.staging.base import elements_to_json, elements_from_json
from unstructured.staging.base import convert_to_dict
from unstructured.staging.base import convert_to_csv
import json
from IPython.display import display, HTML
import yaml
from groq import Groq
from dotenv import load_dotenv

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
#Need to import groq from langchain
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_pinecone import PineconeVectorStore
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Can try paddle OCR instead of tesseract


In [2]:
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
hf_key = os.getenv('HUGGINGFACE_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')
groq_client = Groq(api_key = groq_api_key)
model = "llama3-8b-8192"

## Data Loading 
- Using partitionpdf, which segments a pdf document by using a layout model.
- This layout model makes it possible to extract elements, such as tables, from PDFs.
- We will also use unstructured chunking
  - Tries to identify document sections
  - builds text blocks that maintain sections while also honoring user-defined chunk sizes

In [11]:
# Code taken from unstructured website and stack overflow 
path_to_hsi = "../data/HSI1000_1to9.pdf"
raw_pdf_elements = partition_pdf("../data/HSI1000_1to9.pdf", 
                        strategy="hi_res", 
                        hi_res_model_name="yolox",
                        infer_table_structure=True,
                        max_characters=2000,
                        new_after_n_chars=1000,
                        combine_text_under_n_chars=1000
                        )

# Create a dictionary to store counts of each type
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Unique_categories will have unique elements
unique_categories = set(category_counts.keys())
category_counts

# Save output to json file (Future use mongodb maybe)
convert_to_dict(raw_pdf_elements)

element_output_file = "../data/element_entities.json"
elements_to_json(raw_pdf_elements, filename=element_output_file)

In [12]:
with open("../data/element_entities.json", "r", encoding='utf-8') as fin:
    read_elements = json.load(fin)
print(f"length before filtering: {len(read_elements)}")

unwanted_types = ['Footer', 'Image', 'FigureCaption', 'UncategorizedText']
filtered_el = []
for el in read_elements:
    if el['type'] in unwanted_types:
        continue
    else:
        filtered_el.append(el)
print(f"length after filtering: {len(filtered_el)}")

length before filtering: 130
length after filtering: 109


In [7]:
filtered_el[0]

{'type': 'Title',
 'element_id': 'cc9971d7967ab7ce6a3ac73cc065832e',
 'metadata': {'coordinates': {'points': [[195.6, 158.1],
    [195.6, 187.6],
    [313.3, 187.6],
    [313.3, 158.1]],
   'system': 'PixelSpace',
   'layout_width': 1653,
   'layout_height': 2339},
  'filename': 'HSI1000_1to9.pdf',
  'file_directory': '../data',
  'last_modified': '2024-06-12T13:15:53',
  'filetype': 'application/pdf',
  'page_number': 1},
 'text': 'Lecture 1'}

In [21]:
table_elements =  [el for el in filtered_el if el['type'] == 'Table']
print(len(table_elements))
text_elements =  [el for el in filtered_el if el['type'] != 'Table']
print(len(text_elements))

3
106


In [14]:
text_elements[0]

{'type': 'Title',
 'element_id': 'cc9971d7967ab7ce6a3ac73cc065832e',
 'metadata': {'coordinates': {'points': [[195.6, 158.1],
    [195.6, 187.6],
    [313.3, 187.6],
    [313.3, 158.1]],
   'system': 'PixelSpace',
   'layout_width': 1653,
   'layout_height': 2339},
  'filename': 'HSI1000_1to9.pdf',
  'file_directory': '../data',
  'last_modified': '2024-06-12T13:15:53',
  'filetype': 'application/pdf',
  'page_number': 1},
 'text': 'Lecture 1'}

In [22]:
import pandas as pd

data=[]
for c in raw_pdf_elements: 
    row = {}
    row['Page Number'] = c.metadata.page_number 
    if row['Element Type'] == 'Table':
        row['text'] = c.metadata.text_as_html
    else:
        row['text'] = c.text 
    data.append(row)
  
df = pd.DataFrame(data)
df['combined'] = df.apply(lambda x: f"Page Number: {x['Page Number']}\n\n{x['text']}", axis=1)
df.head()

KeyError: 'Element Type'

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

separator_ls = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    length_function=len,
    separators=separator_ls
)

chunks = []
metadata = []

for _, row in df.iterrows():
    chunked_text = text_splitter.split_text(row['combined'])
    for chunk in chunked_text:
        chunks.append(chunk)
        metadata.append({
            'Page Number': row['Page Number']
        })

chunked_df = pd.DataFrame({
    'text': chunks,
    'metadata': metadata
})

chunked_df.head()

Unnamed: 0,text,metadata
0,Element Type: Title\nFilename: HSI1000_1to9.pd...,"{'Element Type': 'Title', 'Filename': 'HSI1000..."
1,Element Type: NarrativeText\nFilename: HSI1000...,"{'Element Type': 'NarrativeText', 'Filename': ..."
2,Element Type: Title\nFilename: HSI1000_1to9.pd...,"{'Element Type': 'Title', 'Filename': 'HSI1000..."
3,Element Type: NarrativeText\nFilename: HSI1000...,"{'Element Type': 'NarrativeText', 'Filename': ..."
4,Element Type: ListItem\nFilename: HSI1000_1to9...,"{'Element Type': 'ListItem', 'Filename': 'HSI1..."


In [19]:
chunked_df.iloc[0].text

'Element Type: Title\nFilename: HSI1000_1to9.pdf\nDate Modified: 2024-06-12T13:15:53\nFiletype: application/pdf\nPage Number: 1\n\nLecture 1'

In [23]:
with open("../data/element_entities.json", "r", encoding='utf-8') as fin:
    read_elements = json.load(fin)
print(f"length before filtering: {len(read_elements)}")

unwanted_types = ['Footer', 'Image', 'FigureCaption', 'UncategorizedText']
filtered_el = []
for el in read_elements:
    if el['type'] in unwanted_types:
        continue
    else:
        filtered_el.append(el)
print(f"length after filtering: {len(filtered_el)}")

length before filtering: 130
length after filtering: 109


In [24]:
class Element(BaseModel):
    type: str
    text: Any
    
table_elements =  [Element(type= 'Table', text=el['metadata']['text_as_html']) for el in filtered_el if el['type'] == 'Table']
print(len(table_elements))
text_elements =  [Element(type= el['type'], text=el['text']) for el in filtered_el if el['type'] != 'Table']
print(len(text_elements))

3
106


In [6]:
# from typing import AsyncIterator, Iterator
# from langchain_core.document_loaders import BaseLoader
# from langchain_core.documents import Document

# # Custom class to load a string into a document from a string (which is what we have)
# class CustomDocumentLoader(BaseLoader):
#     """An example document loader that reads a file line by line."""

#     def __init__(self, file_path: str) -> None:
#         """Initialize the loader with a file path.

#         Args:
#             file_path: The path to the file to load.
#         """
#         self.file_path = file_path

#     def lazy_load(self) -> Iterator[Document]:  # <-- Does not take any arguments
#         """A lazy loader that reads a file line by line.

#         When you're implementing lazy load methods, you should use a generator
#         to yield documents one by one.
#         """
#         with open(self.file_path, encoding="utf-8") as f:
#             line_number = 0
#             for line in f:
#                 yield Document(
#                     page_content=line,
#                     metadata={"line_number": line_number, "source": self.file_path},
#                 )
#                 line_number += 1

#     # alazy_load is OPTIONAL.
#     # If you leave out the implementation, a default implementation which delegates to lazy_load will be used!
#     async def alazy_load(
#         self,
#     ) -> AsyncIterator[Document]:  # <-- Does not take any arguments
#         """An async lazy loader that reads a file line by line."""
#         # Requires aiofiles
#         # Install with `pip install aiofiles`
#         # https://github.com/Tinche/aiofiles
#         import aiofiles

#         async with aiofiles.open(self.file_path, encoding="utf-8") as f:
#             line_number = 0
#             async for line in f:
#                 yield Document(
#                     page_content=line,
#                     metadata={"line_number": line_number, "source": self.file_path},
#                 )
#                 line_number += 1

with open("../data/hsi_notes_1to9.txt", "w", encoding="utf-8") as fout: 
    document = "\n".join([doc.text for doc in text_elements])
    fout.write(document)
    
# loader = CustomDocumentLoader("../data/hsi_notes_1to9.txt")

In [7]:
# Implementing recursive text splitter:
with open("../data/hsi_notes_1to9.txt") as fin:
    text_notes = fin.read()
    
seperator_ls = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=300,
    chunk_overlap=50,
    length_function=len,
    separators=seperator_ls
)

text_chunks = text_splitter.create_documents([text_notes])
print(len(text_chunks))

109


In [8]:
# Load embeddings. Need to change from ...co/models/ to ...co/pipeline/feature-extraction/...
HF_API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-mpnet-base-v2"
headers = {"Authorization": f"Bearer {hf_key}"}

def embed_documents(payload):
	response = requests.post(HF_API_URL, headers=headers, json=payload)
	return response.json()

payload = [doc.page_content for doc in text_chunks]
payload_embeddings = embed_documents(payload)
print(f"Dimension of embeddings: {len(payload_embeddings[0])}\nFirst 20 embeddings: {payload_embeddings[0][:20]}")

Dimension of embeddings: 768
First 20 embeddings: [-0.02262585423886776, -0.06885164231061935, -0.000889421091414988, -0.03406470641493797, -0.0012884179595857859, 0.001033041742630303, 0.0027252230793237686, 0.019959641620516777, 0.01079109963029623, -0.00538204051554203, 0.049704715609550476, 0.047944437712430954, 0.0277978777885437, -0.0077078077010810375, 0.04660750553011894, -0.10496175289154053, 0.03672698140144348, -0.007716397289186716, -0.02255360595881939, -0.02787116728723049]


In [13]:
qn = "How long was Dr ignaz's contract at the hospital?"
prompt_embeddings = embed_documents(qn) 
similarities = cosine_similarity([prompt_embeddings], payload_embeddings)[0] 
closest_similarity_index = np.argmax(similarities) 
most_relevant_chunk = text_chunks[closest_similarity_index].page_content
print(f"The most relevant chunk found:\n{most_relevant_chunk}")

Dr. Ignaz Semmelweis (Figure 1) was hired on a three-year contract into the Vienna General Hospital's maternity clinic from 1846 – 1849. At the time “childbed fever”, aka puerperal fever, was running rampant in hospitals all over Europe and the US


In [19]:
from langchain_groq import ChatGroq

def llama_chat(user_question, context):
    chat = ChatGroq(temperature=0, model_name="llama3-8b-8192")
    system = '''
            You are a science professor in a university. Given the user's question and relevant sections from a set of school notes,\
            answer the question by including direct quotes from the notes.
            '''
    human = "{text}"
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system", system
            ),
            (
                "human", human
                )
        ]
    )
    chain = prompt | chat
    return chain.invoke({"text": f"User Question: " + user_question + "\n\nRelevant section in textbook:\n\n" + context})

answer = llama_chat(qn, most_relevant_chunk)
print(answer.content)

According to the relevant section in the textbook, Dr. Ignaz Semmelweis was hired on a three-year contract into the Vienna General Hospital's maternity clinic from 1846 to 1849.


In [20]:
# Table section
table_elements

[Element(type='Table', text='<table><thead><th></th><th>Laptop doesn’t boot</th></thead><tr><td></td><td>Battery dead</td></tr><tr><td>the explanation |</td><td>Plug in external power</td></tr><tr><td>of test</td><td>Laptop seems to boot, so the battery must have been dead</td></tr></table>'),
 Element(type='Table', text='<table><tr><td rowspan="2">Explanation Test the explanation |</td><td>Laptop monitor not working</td></tr><tr><td></td><td>Try connecting the external monitor with HDMI cable 1</td></tr><tr><td rowspan="2">Result of test</td><td>Laptop seems to boot, but there’s nothing on the screen. (a) Either the graphics card or motherboard has issues,</td></tr><tr><td></td><td>or (b) Something was wrong with our test.</td></tr></table>'),
 Element(type='Table', text='<table><tr><td>Observation</td><td>Laptop seems boot, but there’s nothing on the screen</td></tr><tr><td>Explanation</td><td>Laptop monitor not working</td></tr><tr><td>Test the explanation</td><td>| Try connecting t

In [40]:
import time
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings


pinecone_index_name = "hsi-rag"
namespace = 'pages_1to9'
embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
docsearch = PineconeVectorStore.from_documents(
                                        documents=text_chunks,
                                        embedding=embedding_function,
                                        index_name=pinecone_index_name,
                                        namespace = namespace)
time.sleep(1)



In [41]:
text_chunks[0]

Document(page_content='Lecture 1\nHSI1000\n1 The Founding of Modern Science\nIntended Learning Outcomes for Lecture 01 You should be able to do the following after this lecture.\n(1) Describe what is science and explain the scientific method “in a nutshell”, illustrating your explanation with a straightforward example.', metadata={'text': 'Lecture 1\nHSI1000\n1 The Founding of Modern Science\nIntended Learning Outcomes for Lecture 01 You should be able to do the following after this lecture.\n(1) Describe what is science and explain the scientific method “in a nutshell”, illustrating your explanation with a straightforward example.'})