## Semi-structured and multimodal RAG

- We will use the Adobe PDF Services Extract API (in place of Unstructured, which the reference notebook uses) to parse both text and tables from documents (PDFs).
- We will use the multi-vector retriever to store raw tables and text, along with table summaries that are better suited for retrieval.
- We will use LCEL to implement the chains used.

Notebook for reference: https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb


In [10]:
from typing import Any, List, Dict
import os
import sys
import time
import logging
import json
import yaml
import requests
import warnings
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import trange
from dotenv import load_dotenv
from io import BytesIO
import io
import zipfile
import re

# Adobe PDF Services imports
from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.extract_pdf_job import ExtractPDFJob
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_pdf_params import ExtractPDFParams
from adobe.pdfservices.operation.pdfjobs.result.extract_pdf_result import ExtractPDFResult

# Pinecone and Langchain imports
from pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder
from langchain_groq import ChatGroq
from groq import Groq
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Set up logging: INFO level with a timestamped "time - level - message" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Load environment variables (load_dotenv reads them from a .env file, if any).
# os.getenv returns None for any key that is not set.
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
hf_key = os.getenv('HUGGINGFACE_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')

# Initialize clients
# `pc` is the shared Pinecone client used by the upsert and retrieval helpers.
pc = Pinecone(api_key=pinecone_api_key)

# Define model
# NOTE(review): the chat helpers visible in this notebook hard-code the same
# model name instead of reading this variable.
model = "llama3-8b-8192"

## Using the Adobe PDF Services API to extract text and tables

In [5]:
# Path of the PDF to process; ExtractTextTableInfoFromPDF reads this global.
file_path = '../data/HSI1000_1to9_unbocked.pdf'

# Initialize the logger
# NOTE(review): basicConfig does nothing when the root logger already has
# handlers, so this call is a no-op if the setup cell ran first — confirm
# whether it is still needed.
logging.basicConfig(level=logging.INFO)

class ExtractTextTableInfoFromPDF:
    """Run an Adobe PDF Services extract job and keep the result in memory.

    The job requests text and table elements. The service answers with a zip
    archive whose members are stored, unparsed, in ``self.extracted_data`` as
    a ``{member_name: bytes}`` dictionary.

    Args:
        pdf_path: Path of the PDF to upload. When omitted, the notebook-level
            ``file_path`` variable is used, which keeps the original
            zero-argument constructor working.

    Raises:
        OSError: If the PDF cannot be opened or read (for example
            ``FileNotFoundError``).

    Adobe service/SDK errors are logged, not raised; in that case
    ``extracted_data`` stays an empty dictionary.
    """

    def __init__(self, pdf_path=None):
        # Define the attribute up front so it exists even when the service
        # call fails and is only logged below.
        self.extracted_data = {}

        source_path = file_path if pdf_path is None else pdf_path

        # Read the file outside the try block: a local I/O failure is not a
        # service error, and the context manager closes the handle even when
        # read() raises.
        with open(source_path, 'rb') as pdf_file:
            input_stream = pdf_file.read()

        try:
            # Initial setup, create credentials instance
            credentials = ServicePrincipalCredentials(
                client_id=os.getenv('ADOBE_SERVICES_CLIENT_ID'),
                client_secret=os.getenv('ADOBE_SERVICES_CLIENT_SECRET')
            )

            # Creates a PDF Services instance
            pdf_services = PDFServices(credentials=credentials)

            # Creates an asset from the source file and uploads it
            input_asset = pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)

            # Create parameters for the job
            extract_pdf_params = ExtractPDFParams(
                elements_to_extract=[ExtractElementType.TEXT, ExtractElementType.TABLES],
            )

            # Creates a new job instance
            extract_pdf_job = ExtractPDFJob(input_asset=input_asset, extract_pdf_params=extract_pdf_params)

            # Submit the job and get the job result
            location = pdf_services.submit(extract_pdf_job)
            pdf_services_response = pdf_services.get_job_result(location, ExtractPDFResult)

            # Get content from the resulting asset
            result_asset: CloudAsset = pdf_services_response.get_result().get_resource()
            stream_asset: StreamAsset = pdf_services.get_content(result_asset)

            # The result is a zip archive; read every member into memory.
            zip_bytes = io.BytesIO(stream_asset.get_input_stream())
            with zipfile.ZipFile(zip_bytes, 'r') as zip_ref:
                self.extracted_data = {name: zip_ref.read(name) for name in zip_ref.namelist()}

        except (ServiceApiException, ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while executing operation: {e}')

# Run the extraction once (uploads the PDF to Adobe) and keep the in-memory
# archive members for the cells below.
extractor = ExtractTextTableInfoFromPDF()
extracted_data = extractor.extracted_data

2024-06-16 00:56:25,295 - INFO - Started uploading asset
2024-06-16 00:56:27,851 - INFO - Finished uploading asset
2024-06-16 00:56:27,856 - INFO - Started submitting EXTRACT_PDF job
2024-06-16 00:56:29,167 - INFO - Started getting job result
2024-06-16 00:56:35,957 - INFO - Finished polling for status
2024-06-16 00:56:35,960 - INFO - Finished getting job result
2024-06-16 00:56:35,961 - INFO - Started getting content
2024-06-16 00:56:36,304 - INFO - Finished getting content


In [105]:
def eval_table_index_llama(table_str):
    """Ask the Groq-hosted Llama 3 model which axis of a table is its header.

    Args:
        table_str: Text preview of the table that is substituted into the
            prompt (the prompt refers to "the first 2 entries").

    Returns:
        Whatever the JSON output parser produces for the model's reply;
        presumably a dict with an ``index`` key (0 = first row is the header,
        1 = first column is the header) — verify against the parser output.
    """
    class Header(BaseModel):
        index: int = Field(description="Header of the table, 0 for first row as the header, 1 for first column as the header")

    output_parser = JsonOutputParser(pydantic_object=Header)
    llm = ChatGroq(temperature=0, model_name="llama3-8b-8192")

    instructions = '''You will assist me in deciding, based on the first 2 entries of a table, whether the first row or the first colum should be the header. 
            You are to output an int, 0 or 1. Where 0 if the first row is header, and 1 if the first column is the header.
            Follow the format instructions carefully.
            Table:
            {table}
            
            {format_instructions}
            '''
    # The parser's format instructions are bound once; only the table varies.
    header_prompt = PromptTemplate(
        input_variables=["table"],
        partial_variables={"format_instructions": output_parser.get_format_instructions()},
        template=instructions,
    )

    # LCEL pipeline: prompt -> chat model -> JSON parser.
    pipeline = header_prompt | llm | output_parser
    return pipeline.invoke({"table": table_str})

def clean_values(x):
    """Strip the ``_x000D_`` marker and surrounding whitespace from strings.

    Non-string values are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    without_marker = x.replace('_x000D_', '')
    return without_marker.strip()

def get_table_check_string(df):
    """Render the first two rows of ``df`` as a short text preview.

    Each row becomes ``"Row <i>: [<values>]"`` and rows are separated by a
    newline (no trailing newline). Tables with fewer than two rows yield
    fewer lines instead of raising ``IndexError``; an empty table yields
    ``""``.

    Args:
        df: DataFrame to preview.

    Returns:
        The preview string.
    """
    preview_rows = min(2, len(df))
    return "\n".join(
        f"Row {i}: {df.iloc[i].values.tolist()}" for i in range(preview_rows)
    )

def convert_table_to_str(df):
    """Flatten a table into ``"<column>: <text>"`` fields, one line group per row.

    Within a row, every cell is split after each full stop and each piece is
    terminated with a newline; the fields of the row are joined with ``", "``.
    Rows are joined with a newline.

    Differences from the previous implementation:
      * every row is included (previously only the last row was returned);
      * an empty table returns ``""`` instead of raising ``UnboundLocalError``;
      * non-string cells are converted with ``str`` and missing values
        (``None``/NaN) are treated as empty text instead of raising
        ``TypeError``.

    For a single-row table of strings the output is unchanged.

    Args:
        df: DataFrame whose column labels are used as field names.

    Returns:
        The formatted table string.
    """
    # Split right after a full stop, swallowing any whitespace that follows.
    sentence_boundary = re.compile(r'(?<=\.)\s*')

    row_strings = []
    # Positional iteration keeps this working with duplicate column labels.
    for cells in df.itertuples(index=False, name=None):
        fields = []
        for col, cell in zip(df.columns, cells):
            if isinstance(cell, str):
                text = cell
            else:
                text = "" if pd.isna(cell) else str(cell)
            sentences = sentence_boundary.split(text)
            fields.append(f"{col}: " + "".join(s + "\n" for s in sentences))
        row_strings.append(", ".join(fields))
    return "\n".join(row_strings)
    
def get_table_meta(elements):
    """Map the first file path of each element to its page.

    Elements without a non-empty ``'filePaths'`` entry are skipped. A missing
    ``'Page'`` is recorded as ``'Unknown'``. When several elements share the
    same first file path, the last one wins.

    Returns:
        ``{first_file_path: {"Page": page}}``
    """
    return {
        element['filePaths'][0]: {"Page": element.get('Page', 'Unknown')}
        for element in elements
        if element.get('filePaths')
    }

# Process JSON data
# Parse the structured extraction output when the archive contains it.
# NOTE(review): `json_data` stays undefined when the member is missing, so
# later cells that use it would fail with NameError — confirm that is acceptable.
if 'structuredData.json' in extracted_data:
    json_data = json.loads(extracted_data['structuredData.json'])
    # Dumps the entire parsed document into the cell output.
    print("JSON Data:", json_data)
    
def get_table_pages_and_text_chunks(json_data, text_splitter=None):
    """Collect per-page text chunks and table page numbers from extraction JSON.

    Text elements outside ``//Document/Table`` are concatenated per page (one
    element per line, list labels prefixed to the following list body) and
    each page's text is split into chunks.

    Fixes over the previous implementation:
      * the element that starts a new page is no longer dropped;
      * chunks are labelled with the page they came from, not the next one;
      * the text of the final page is flushed after the loop;
      * a file path without a number is reported explicitly.

    Args:
        json_data: Parsed extraction output; must contain an ``'elements'``
            list of dicts.
        text_splitter: Optional object exposing ``split_text(str) -> list``.
            Defaults to a ``RecursiveCharacterTextSplitter`` with 300-character
            chunks and 50 characters of overlap.

    Returns:
        ``(table_file_pages, all_chunks)`` where ``table_file_pages`` maps the
        first number found in a table's file path to ``{"Page": <raw Page
        value>}`` and ``all_chunks`` is a list of
        ``{'Page': <Page + 1>, 'text': <chunk>}`` dicts. The ``+ 1`` assumes
        the extractor's ``Page`` is zero-based — TODO confirm.

    Raises:
        ValueError: If ``json_data`` has no ``'elements'`` key.
    """
    if 'elements' not in json_data:
        logging.error("Missing 'elements' key in json_data")
        raise ValueError("Missing 'elements' key in json_data")

    if text_splitter is None:
        # NOTE(review): entries listed after "" are probably never reached by
        # the splitter — confirm and reorder if ")" / "(" should be used.
        separator_list = ["\n\n", "\n", ". ", "!", "?", ",", " ", "", ")", "("]
        try:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=300,
                chunk_overlap=50,
                length_function=len,
                separators=separator_list)
        except Exception as e:
            logging.error(f"Failed to initialize text splitter: {e}")
            raise

    table_file_pages = {}
    all_chunks = []

    def _flush(page_index, parts):
        """Split the buffered text of one page and record its chunks."""
        text = "".join(parts)
        if not text:
            return
        for chunk in text_splitter.split_text(text):
            if chunk not in (". ", "."):  # skip fragments that are only a full stop
                all_chunks.append({'Page': page_index + 1, 'text': chunk})

    buffer_page = 0   # page whose text is currently being accumulated
    page_parts = []   # text lines collected for buffer_page
    list_label = ""   # pending list label waiting for its list body

    for i, element in enumerate(json_data['elements']):
        try:
            current_page = element['Page']
        except KeyError:
            logging.warning(f"Missing 'Page' key in element at index {i}")
            continue

        if current_page > buffer_page:
            # A new page starts: emit the finished page first, then fall
            # through so this element is added to the new page's buffer.
            _flush(buffer_page, page_parts)
            page_parts = []
            list_label = ""
            buffer_page = current_page

        try:
            if 'Text' in element and not element['Path'].startswith("//Document/Table"):
                if element['Path'].endswith("Lbl"):
                    list_label = element['Text']
                else:
                    page_parts.append(f"{list_label}{element['Text']}\n")
                    list_label = ""
        except KeyError as e:
            logging.warning(f"Key error in element processing at index {i}: {e}")

        # Obtaining the table metadata (best effort: problems are logged only)
        try:
            file_paths = element.get('filePaths')
            if file_paths:
                match = re.search(r'\d+', file_paths[0])
                if match is None:
                    logging.error(
                        f"Error processing file paths at index {i}: "
                        f"no table number in {file_paths[0]!r}"
                    )
                else:
                    table_file_pages[int(match.group(0))] = {"Page": current_page}
        except Exception as e:
            logging.error(f"Error processing file paths at index {i}: {e}")

    # The last page is never followed by a page change, so flush it here.
    _flush(buffer_page, page_parts)

    return table_file_pages, all_chunks

table_file_pages, all_chunks = get_table_pages_and_text_chunks(json_data)

# Every extracted table arrives as its own .xlsx member of the archive.
excel_files = {k: v for k, v in extracted_data.items() if k.endswith('.xlsx')}

# Formatted table strings keyed by table number. The key is the first number
# in the member name, i.e. the same rule get_table_pages_and_text_chunks uses
# for table_file_pages, so the two dictionaries can be joined by key.
table_dataframes = {}

for position, (filename, content) in enumerate(excel_files.items()):
    # Read the workbook once, without a header, and clean every cell.
    raw_df = pd.read_excel(BytesIO(content), header=None)
    raw_df = raw_df.applymap(clean_values)

    # TODO: let the LLM choose the header orientation instead of hard-coding:
    # dic = eval_table_index_llama(get_table_check_string(raw_df))
    # header_index = dic['index']
    header_index = 1

    # A non-zero header_index means the first column holds the headers, so
    # transpose to move them into the first row.
    oriented = raw_df.T if header_index else raw_df

    # Promote the first row to column labels and keep the rest as data.
    df = oriented.iloc[1:].reset_index(drop=True)
    df.columns = oriented.iloc[0]

    # Fall back to the enumeration position when the name has no number.
    match = re.search(r'\d+', filename)
    table_index = int(match.group(0)) if match else position
    table_dataframes[table_index] = convert_table_to_str(df)
table_dataframes

JSON Data: {'version': {'json_export': '202', 'page_segmentation': '5', 'schema': '1.1.0', 'structure': '1.1093.0', 'table_structure': '5'}, 'extended_metadata': {'ID_instance': '8F 5E 6E 56 1D BA B2 11 0A 00 67 45 8B 6B C6 23 ', 'ID_permanent': '46 39 20 34 38 20 36 39 20 35 36 20 31 44 20 42 41 20 42 32 20 31 31 20 30 41 20 30 30 20 36 37 20 34 35 20 38 42 20 36 42 20 43 36 20 32 33 20 ', 'has_acroform': False, 'has_embedded_files': False, 'is_XFA': False, 'is_certified': False, 'is_encrypted': False, 'is_digitally_signed': False, 'language': 'en', 'page_count': 9, 'pdf_version': '1.6', 'pdfa_compliance_level': '', 'pdfua_compliance_level': ''}, 'elements': [{'Bounds': [63.263397216796875, 728.4153594970703, 395.4293975830078, 756.5929870605469], 'Font': {'alt_family_name': 'Calibri', 'embedded': True, 'encoding': 'Custom', 'family_name': 'Calibri', 'font_type': 'TrueType', 'italic': False, 'monospaced': False, 'name': 'AAAAAE+Calibri-Light', 'subset': True, 'weight': 300}, 'HasClip'

{0: 'Observation: Laptop seems boot, but there’s nothing on the screen\n, Explanation: Laptop monitor not working\n, Test the explanation: Try connecting the external monitor with HDMI cable 2\n, Result of test: Laptop definitely boots and the external monitor shows the start screen.\n\n',
 1: 'Observation: Laptop doesn’t boot\n, Explanation: Battery dead\n, Test the explanation: Plug in external power\n, Result of test: Laptop seems to boot, so the battery must have been dead\n',
 2: 'Observations: (1) Tea bag bloats and floats on top of the water when boiling water is poured directly on top of it.\n(2) Tea bag doesn’t bloat and sinks in the water when boiling water is poured on the side and not directly onto it.\n\n, Explanation: Water poured on top of the tea bag fills the pores of the teabag itself, trapping any gas inside before it can escape.\nThe hot water heats the trapped air, causing it to expand.\nThe trapped air prevents the tea bag from being dunked.\nPores of the tea bag 

In [107]:
# Display the formatted table strings as the cell output.
table_dataframes

{0: 'Observation: Laptop seems boot, but there’s nothing on the screen\n, Explanation: Laptop monitor not working\n, Test the explanation: Try connecting the external monitor with HDMI cable 2\n, Result of test: Laptop definitely boots and the external monitor shows the start screen.\n\n',
 1: 'Observation: Laptop doesn’t boot\n, Explanation: Battery dead\n, Test the explanation: Plug in external power\n, Result of test: Laptop seems to boot, so the battery must have been dead\n',
 2: 'Observations: (1) Tea bag bloats and floats on top of the water when boiling water is poured directly on top of it.\n(2) Tea bag doesn’t bloat and sinks in the water when boiling water is poured on the side and not directly onto it.\n\n, Explanation: Water poured on top of the tea bag fills the pores of the teabag itself, trapping any gas inside before it can escape.\nThe hot water heats the trapped air, causing it to expand.\nThe trapped air prevents the tea bag from being dunked.\nPores of the tea bag 

In [108]:
# Prefix every formatted table with the page recorded for it.
# Both dictionaries must share the same integer table keys: a key missing
# from table_file_pages raises KeyError here.
meta_table_batch = [f"Page {table_file_pages[table_index]['Page']}: {table_str}" for table_index, table_str in table_dataframes.items()]
meta_table_batch

['Page 1: Observation: Laptop seems boot, but there’s nothing on the screen\n, Explanation: Laptop monitor not working\n, Test the explanation: Try connecting the external monitor with HDMI cable 2\n, Result of test: Laptop definitely boots and the external monitor shows the start screen.\n\n',
 'Page 1: Observation: Laptop doesn’t boot\n, Explanation: Battery dead\n, Test the explanation: Plug in external power\n, Result of test: Laptop seems to boot, so the battery must have been dead\n',
 'Page 2: Observations: (1) Tea bag bloats and floats on top of the water when boiling water is poured directly on top of it.\n(2) Tea bag doesn’t bloat and sinks in the water when boiling water is poured on the side and not directly onto it.\n\n, Explanation: Water poured on top of the tea bag fills the pores of the teabag itself, trapping any gas inside before it can escape.\nThe hot water heats the trapped air, causing it to expand.\nThe trapped air prevents the tea bag from being dunked.\nPores 

In [None]:
# TODO
def get_table_chunks(table_dataframes):
    """Placeholder for chunking the formatted tables; not implemented yet.

    The argument is currently ignored and ``None`` is returned.
    """
    return None

In [None]:
# TODO
def upsert_table_pinecone(table_documents):
    """Placeholder for upserting table documents to Pinecone; not implemented yet.

    The argument is currently ignored and ``None`` is returned.
    """
    return None

In [30]:
def upsert_text_pinecone(text_documents):
    """Embed text chunks and upsert them into the ``hsi-notes`` Pinecone index.

    Each record gets a dense vector (Hugging Face feature-extraction endpoint
    for ``all-mpnet-base-v2``, computed from ``text``) and a sparse BM25 vector
    (computed from ``"Page <Page>: <text>"``). Records are sent in batches of
    32 to the ``page-1to9-with-metadata`` namespace with the record itself as
    metadata and ids ``vec<row number>``.

    Changes from the previous implementation:
      * BM25 is fitted once on the whole corpus instead of being refitted on
        every batch, so all sparse vectors share the same statistics;
      * the embedding response is validated before a batch is assembled;
      * the index handle is created once;
      * the progress message reports the real starting row of the batch;
      * empty input returns immediately.

    Args:
        text_documents: Records convertible to a DataFrame with ``Page`` and
            ``text`` columns (e.g. a list of ``{'Page': ..., 'text': ...}``).

    Returns:
        None. Stops early, after printing a message, when the embedding
        endpoint does not return a list.
    """
    # Convert text_documents to DataFrame
    df = pd.DataFrame(text_documents)
    if df.empty:
        return

    # Load embeddings. Need to change from ...co/models/ to ...co/pipeline/feature-extraction/...
    HF_API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-mpnet-base-v2"
    headers = {"Authorization": f"Bearer {hf_key}"}

    def dense_embed(payload):
        """POST ``payload`` to the embedding endpoint and return the parsed JSON."""
        response = requests.post(HF_API_URL, headers=headers, json=payload)
        return response.json()

    # Page-prefixed text used for the sparse (BM25) representation.
    combined_texts = [
        f"Page {page}: {text}" for page, text in zip(df['Page'], df['text'])
    ]
    bm25 = BM25Encoder()
    bm25.fit(combined_texts)

    index = pc.Index('hsi-notes')
    batch_size = 32

    # Loop through the DataFrame 'df' in batches of size 'batch_size'
    for i in trange(0, len(df), batch_size):
        i_end = min(i + batch_size, len(df))
        df_batch = df.iloc[i:i_end]

        # Dense embeddings come from the raw text only.
        dense_embeddings = dense_embed(df_batch['text'].tolist())
        if not isinstance(dense_embeddings, list):
            # The endpoint answers with a non-list payload on failure.
            print("Embedding model not connected properly. Dense embeddings not generated. ")
            return

        sparse_embeddings = bm25.encode_documents(combined_texts[i:i_end])
        records = df_batch.to_dict(orient="records")

        # Pause between batches (kept from the original; presumably to stay
        # under the embedding API's rate limit).
        time.sleep(2)

        pinecone_batch_upserts = [
            {
                'id': 'vec' + str(row_number),
                'values': dense,
                'sparse_values': sparse,
                'metadata': meta,
            }
            for row_number, (sparse, dense, meta) in enumerate(
                zip(sparse_embeddings, dense_embeddings, records), start=i
            )
        ]

        # NOTE(review): the retrieval helper queries a different namespace
        # ('page-1to9-texts') — confirm which one is intended.
        index.upsert(vectors=pinecone_batch_upserts, namespace='page-1to9-with-metadata')
        print(f"Batch starting with index {i} upserted")
    return

 33%|███▎      | 1/3 [00:17<00:35, 17.61s/it]

Batch starting with index 0 upserted


 67%|██████▋   | 2/3 [00:34<00:17, 17.24s/it]

Batch starting with index 32 upserted


100%|██████████| 3/3 [00:50<00:00, 17.00s/it]

Batch starting with index 64 upserted





### Using the Pinecone index to test retrieval


In [56]:
def _embed_query(text):
    """Return the parsed JSON reply of the embedding endpoint for ``text``."""
    response = requests.post(
        "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-mpnet-base-v2",
        headers={"Authorization": f"Bearer {hf_key}"},
        json=text,
    )
    return response.json()


def get_relevant_chunks(query, top_k, namespace='page-1to9-texts'):
    """Query the ``hsi-notes`` Pinecone index with a dense embedding of ``query``.

    The previous version called ``dense_embed``, which only exists as a nested
    function inside ``upsert_text_pinecone`` and is therefore not reachable
    from here; the query is now embedded by a private helper that uses the
    same endpoint.

    Args:
        query: User question to embed.
        top_k: Number of matches to request.
        namespace: Pinecone namespace to search. Defaults to the value that
            was previously hard-coded.
            NOTE(review): the upsert helper writes to
            'page-1to9-with-metadata' — confirm which namespace holds the data.

    Returns:
        The response of ``index.query`` (matches include metadata).

    Raises:
        RuntimeError: If the embedding endpoint does not return a list.
    """
    index = pc.Index('hsi-notes')
    # Create dense vector of user query
    dense_query = _embed_query(query)
    if not isinstance(dense_query, list):
        raise RuntimeError(f"Embedding request failed: {dense_query}")
    return index.query(
        namespace=namespace,
        top_k=top_k,
        vector=dense_query,
        include_metadata=True,
    )

def pretty_print_matches(result):
    """Print a readable summary of a query result.

    Prints the searched namespace, the number of matches and, for each match,
    its page (as an int), its text and its score, followed by a divider line.
    """
    print(f"Namespace searched: {result['namespace']}\n")
    hits = result['matches']
    print(f"Top {len(hits)} relevant chunks found:\n")
    divider = "-" * 80
    for hit in hits:
        meta = hit['metadata']
        print(f"Found on page {int(meta['Page'])}:")
        print(f"{meta['text']}")
        print(f"Dotproduct score: {hit['score']}")
        print(divider)

def get_llm_context(query, top_k):
    """Build the context string passed to the LLM from the top matches.

    Checks that the index named by the ``PINECONE_INDEX_NAME`` environment
    variable is ready, retrieves the ``top_k`` matches for ``query`` and
    concatenates them as ``"Page number: <page>\\n<text>\\n"`` blocks.

    Changes from the previous implementation:
      * an index that is not ready raises a clear ``RuntimeError`` instead of
        failing later with ``UnboundLocalError``;
      * the page number and the chunk text are separated by a newline; they
        used to be fused, so a chunk starting with a digit changed the
        apparent page number.

    Args:
        query: User question.
        top_k: Number of chunks to retrieve.

    Returns:
        The concatenated context string (empty when there are no matches).

    Raises:
        KeyError: If ``PINECONE_INDEX_NAME`` is not set.
        RuntimeError: If the index is not ready.
    """
    index_stats = pc.describe_index(os.environ['PINECONE_INDEX_NAME'])
    status = index_stats['status']
    if not (status['ready'] and status['state'] == "Ready"):
        raise RuntimeError(f"Pinecone index is not ready: {status}")

    relevant_matches = get_relevant_chunks(query, top_k)
    # ideally its just to combine the first 2 matches. Or maybe to go by dotproduct score and difference
    return "".join(
        f"Page number: {int(match['metadata']['Page'])}\n{match['metadata']['text']}\n"
        for match in relevant_matches['matches']
    )

In [13]:
from langchain_groq import ChatGroq

def llama_chat(user_question, k):
    """Answer ``user_question`` with the Groq-hosted Llama 3 chat model.

    The retrieval step is currently disabled: a fixed sample passage is used
    as context and ``k`` is not used.

    Returns:
        The result of invoking the prompt-to-model chain.
    """
    # context = get_llm_context(user_question, k)
    passage = '''Then a rather unfortunate incident occurred in 1847 in an anatomical pathology lab. Dr 
                Semmelweis’ friend, who he greatly admired, died after being accidentally pricked by a 
                scalpel being used by a student doctor while he was assisting in performing an autopsy. 
                Professor  Kolletschka  (Figure  3)  suffered  identical  signs  and  symptoms  as  the  mothers 
                who died of childbed fever. Dr Semmelweis wrote about the incident'''
    system_prompt = '''
            You are a science professor in a university. 
            Given the user's question and relevant sections from a set of school notes about scientific methodology and the history of science.
            You will also answer the question by including direct quotes from the notes, \
            along with the page number where the answer or answers can be found.
            '''
    llm = ChatGroq(temperature=0, model_name="llama3-8b-8192")

    # System message carries the instructions; the human message is filled
    # with the question plus the context passage.
    messages = [("system", system_prompt), ("human", "{text}")]
    pipeline = ChatPromptTemplate.from_messages(messages) | llm

    human_text = "User Question: " + user_question + "\n\nRelevant section in textbook:\n\n" + passage
    return pipeline.invoke({"text": human_text})

# Smoke test of the chat chain; the second argument (k) is currently unused
# because llama_chat works from a fixed sample passage.
answer = llama_chat("How did Dr Semmelweis' friend die?", 5)
print(answer.content)

2024-06-16 01:13:47,201 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


According to the notes, Dr. Semmelweis' friend, Professor Kolletschka, died after being accidentally pricked by a scalpel being used by a student doctor while assisting in an autopsy. This incident occurred in 1847.


In [219]:
def show_element_paths(json_data):
    """Print ``"<position>: <Path>"`` for every entry of ``json_data['elements']``."""
    position = 0
    for element in json_data['elements']:
        print(f"{position}: {element['Path']}")
        position += 1

0: //Document/Title
1: //Document/Aside/P
2: //Document/Aside/P[2]
3: //Document/Aside/L/LI/Lbl
4: //Document/Aside/L/LI/LBody
5: //Document/Aside/L/LI[2]/Lbl
6: //Document/Aside/L/LI[2]/LBody
7: //Document/Aside/L/LI[3]/Lbl
8: //Document/Aside/L/LI[3]/LBody
9: //Document/Aside/L/LI[4]/Lbl
10: //Document/Aside/L/LI[4]/LBody
11: //Document/Aside/L/LI[5]/Lbl
12: //Document/Aside/L/LI[5]/LBody
13: //Document/Aside/L/LI[6]/Lbl
14: //Document/Aside/L/LI[6]/LBody
15: //Document/H1
16: //Document/P
17: //Document/P[2]
18: //Document/H2
19: //Document/P[3]
20: //Document/H3
21: //Document/P[4]
22: //Document/P[5]
23: //Document/P[6]
24: //Document/P[7]
25: //Document/P[8]
26: //Document/H3[2]
27: //Document/P[9]
28: //Document/P[10]
29: //Document/P[11]
30: //Document/P[12]
31: //Document/P[13]
32: //Document/H1[2]
33: //Document/H2[2]
34: //Document/P[14]
35: //Document/Table
36: //Document/Table/TR/TH
37: //Document/Table/TR/TH/P
38: //Document/Table/TR/TD
39: //Document/Table/TR/TD/P
40: //