In [1]:
import os
import logging
import json
import uuid
from datetime import datetime
import time
import sys
import warnings
import pandas as pd
from tqdm import trange
from dotenv import load_dotenv
from io import BytesIO
import io
import zipfile
import re

# Adobe PDF Services imports
from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.extract_pdf_job import ExtractPDFJob
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_pdf_params import ExtractPDFParams
from adobe.pdfservices.operation.pdfjobs.result.extract_pdf_result import ExtractPDFResult

# Pinecone and Langchain imports
from pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder
from langchain_groq import ChatGroq
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
warnings.filterwarnings("ignore")

# Load environment variables
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
hf_key = os.getenv('HUGGINGFACE_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')
dense_embedder_api = os.getenv("HF_API_URL")

# Initialize clients
pc = Pinecone(api_key=pinecone_api_key)

# Define model
chat_model = "llama3-8b-8192"
index = pc.Index('hsi-notes')
namespace = 'Chapter-1'

from typing import Optional, Union, TypeAlias

# Import other necessary modules
from llama_index.legacy import Document
from llama_index.legacy import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, PromptTemplate
from llama_index.legacy.node_parser import SimpleNodeParser, SentenceWindowNodeParser
from llama_index.legacy.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank
from llama_index.legacy.schema import MetadataMode

In [2]:
# Initialize the logger
logging.basicConfig(level=logging.INFO)

class ExtractTextTableInfoFromPDF:
    def __init__(self, file_path):
        self.unique_id = str(uuid.uuid4())
        try:
            file = open(file_path, 'rb')
            input_stream = file.read()
            file.close()

            # Initial setup, create credentials instance
            credentials = ServicePrincipalCredentials(
                client_id=os.getenv('ADOBE_SERVICES_CLIENT_ID'),
                client_secret=os.getenv('ADOBE_SERVICES_CLIENT_SECRET')
            )

            # Creates a PDF Services instance
            pdf_services = PDFServices(credentials=credentials)

            # Creates an asset(s) from source file(s) and upload
            input_asset = pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)

            # Create parameters for the job
            extract_pdf_params = ExtractPDFParams(
                elements_to_extract=[ExtractElementType.TEXT, ExtractElementType.TABLES],
            )

            # Creates a new job instance
            extract_pdf_job = ExtractPDFJob(input_asset=input_asset, extract_pdf_params=extract_pdf_params)

            # Submit the job and gets the job result
            location = pdf_services.submit(extract_pdf_job)
            pdf_services_response = pdf_services.get_job_result(location, ExtractPDFResult)

            # Get content from the resulting asset(s)
            result_asset: CloudAsset = pdf_services_response.get_result().get_resource()
            stream_asset: StreamAsset = pdf_services.get_content(result_asset)
            
            zip_bytes = io.BytesIO(stream_asset.get_input_stream())
            with zipfile.ZipFile(zip_bytes, 'r') as zip_ref:
                # Extract all the contents into memory
                self.extracted_data = {name: zip_ref.read(name) for name in zip_ref.namelist()}
                
        except (ServiceApiException, ServiceUsageException, SdkException) as e:
            logging.exception(f'Exception encountered while executing operation: {e}')
    
    # Generates a string containing a directory structure and file name for the output file using unique_id
    @staticmethod
    def create_output_file_path(unique_id: str) -> str:
        os.makedirs("../data/Extracted_data", exist_ok=True)
        return f"../data/Extracted_data/{unique_id}.zip"

    @classmethod
    def create_with_unique_id(cls, file_path):
        instance = cls(file_path)
        return instance, instance.unique_id

In [3]:
# Get the extracted data from the extractor
def get_extracted_data(extracted_data):
    if 'structuredData.json' in extracted_data:
        json_data = json.loads(extracted_data['structuredData.json'])
    return json_data

# Function to initialise a flexible text splitter
def initialise_text_splitter(chunk_size, chunk_overlap):
    separator_list = ["\n\n", "\n", ". ", "!", "?", ",", " ", "", ")", "("]
    try:
        text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=len,
                separators=separator_list)
    except Exception as e:
        logging.error(f"Failed to initialize text splitter: {e}")
        return None
    return text_splitter

# Function to obtain text chunks using the text splitter
def get_text_chunks(file_path, json_data, text_splitter):
    if 'elements' not in json_data:
        logging.error("Missing 'elements' key in json_data")
        raise ValueError("Missing 'elements' key in json_data")

    if not text_splitter:
        logging.error("Text splitter not initialised properly. ")
        sys.exit()  
        
    file_name = file_path.split("/")[-1]

    # Chunks are split by pages here
    page_text = ""
    start_page = 0
    all_chunks = []
    
    list_label = ""
    for i in range(len(json_data['elements'])):
        try:
            current_page = json_data['elements'][i]['Page']
        except KeyError:
            logging.warning(f"Missing 'Page' key in element at index {i}")
            continue

        try:
            if current_page > start_page:
                # Update the new page number
                start_page = current_page               
                # Generate the chunks for the previous page
                separated_list = text_splitter.split_text(page_text)
                for chunk in separated_list:
                    if chunk not in [". ", "."]:  # Simplified condition
                        all_chunks.append({'ElementType': 'Text', 'file_name': file_name, 'Page': current_page, 'Text': chunk})
                # Update the string of text 
                page_text = ""
                list_label = ""
            else:
                if 'Text' in json_data['elements'][i]:  # Check if Text is not empty
                    if json_data['elements'][i]['Path'].endswith("Lbl") and not json_data['elements'][i]['Path'].startswith("//Document/Table"):
                        list_label = json_data['elements'][i]['Text']
                    else:
                        if list_label:
                            page_text += list_label + json_data['elements'][i]['Text']
                            list_label = ""  # Reset list label to empty string
                        else:
                            page_text += json_data['elements'][i]['Text'] + "\n"
        except KeyError as e:
            logging.warning(f"Key error in json_data['elements'][i] processing at index {i}: {e}")
    
    
    # Processing the last page of the text
    if page_text:
        separated_list = text_splitter.split_text(page_text)
        for chunk in separated_list:
            if chunk not in [". ", "."]:
                all_chunks.append({'ElementType': 'Text', 'file_name': file_name, 'Page': current_page, 'Text': chunk})

    return all_chunks

# Function to derive the nodes from the text chunks
def convert_textchunks_to_nodes(text_chunks):
    # Conversion of text chunks to Documents
    text_documents = [Document(text=chunk['Text'],
                            metadata={
                                "file_name": chunk['file_name'],
                                "page": chunk['Page']
                                },
                            excluded_llm_metadata_keys=["file_name"],
                            metadata_seperator="::",
                            metadata_template="{key}=>{value}",
                            text_template="Metadata: {metadata_str}\n-----\nContent: {content}")

                                for chunk in text_chunks]
    print(
        "The LLM sees this: \n",
        text_documents[0].get_content(metadata_mode=MetadataMode.LLM),
    )
    print("-"* 80)
    print(
        "The Embedding model sees this: \n",
        text_documents[0].get_content(metadata_mode=MetadataMode.EMBED),
    )

    # create the sentence window node parser w/ default settings
    SW_node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=1,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )

    # Create the nodes
    nodes = SW_node_parser.get_nodes_from_documents(text_documents)
    return nodes

# Function to get the unique tables from all the table elements
def extract_unique_tables(table_elements):
    tables = set()
    for item in table_elements:
        match = re.search(r'/Table(\[\d+\])?', item['Path'])
        if match:
            tables.add('Table' + (match.group(1) if match.group(1) else ''))
    
    unique_tables = list(tables)
    unique_tables[0] += "/"

    extracted_tables = {}
    i=0
    for table_name in unique_tables:
        table = []
        for el in table_elements:
            if table_name in el['Path']:
                # ADjust this here if u need to extract more information from the table elements
                table.append({"path": el['Path'], "text": el['Text'], "Page": el["Page"]})
        extracted_tables[i+1] = table
        i += 1

    return dict(sorted(extracted_tables.items()))

# Function to take in table elements from a specific table and convert it to a pandas dataframe
def transform_table(table, output_file_path):
    # only need to look at the first row
    first_row = [el for el in table if "TR/" in el['path']]
    # Function to decide if table has 2 indexes
    def is_2_index(first_row):
        for el in first_row:
            if "TH/" in el['path']:
                return False
            else:
                return True
    
    # Function to decide if the table's row is the header   
    def is_row_header(first_row):
        for el in first_row:
            if "TH" not in el['path']:
                return False
        return True
    
    # If the table has 2 indexes
    if is_2_index(first_row):
        print("This table has 2 indexes")
        # Function to produce table which has 2 indexes
        def get_2_index_table(table):
            table_rows = set([re.search(r'Table(?:\[\d+\])?/TR(?:\[\d+\])?', item['path']).group() for item in table if re.search(r'Table(?:\[\d+\])?/TR(?:\[\d+\])?', item['path'])])
            uniq_rows = sorted([row_name.split("/")[-1] for row_name in list(table_rows)])

            # Get indexes from the first row:
            row_indexes = [el['text'].strip() for el in table if "TR/TH" in el['path']]

            data = {}

            # Only look at second row onwards
            for i in range(1,len(uniq_rows)):
                row_name = uniq_rows[i]
                row = [el for el in table if row_name in el['path']]
                row_key = row[0]['text'].strip()
                
                unique_tds = set()
                for item in row:
                    path_parts = item['path'].split('/')
                    for part in path_parts:
                        if 'TD' in part:
                            unique_tds.add(part)
                # Convert the set to a list and sort it for consistent output
                unique_tds_list = sorted(list(unique_tds))

                sections_of_row = []
                for td in unique_tds_list:
                    td_section = ""
                    for i in range(len(row)):
                        if i ==0:
                            td += "/"
                        if td in row[i]['path']:
                            td_section += row[i]['text'].strip()
                            
                    sections_of_row.append(td_section)
                
                data[row_key] = sections_of_row
                
            df = pd.DataFrame(data, index=row_indexes).T
            df.to_csv(output_file_path)
            return df    
        
        df = get_2_index_table(table)
        return df
    
    # If the table only has one index
    else:
        print("This table has 1 index")

        # If the header for this df is the row
        if is_row_header(first_row):
            print("This is a row indexed table")
            def get_row_header_table(table):
                table_rows = set([re.search(r'Table(?:\[\d+\])?/TR(?:\[\d+\])?', item['path']).group() for item in table if re.search(r'Table(?:\[\d+\])?/TR(?:\[\d+\])?', item['path'])])
                uniq_rows = sorted([row_name.split("/")[-1] for row_name in list(table_rows)])

                # Table headers, also the keys
                headers = [el['text'].strip() for el in table if uniq_rows[0]+"/" in el['path']]
                rows = []
                for i in range(1,len(uniq_rows)):
                    row_name = uniq_rows[i]
                    row = [el for el in table if row_name in el['path']]
                    # rows.append(row)
                    
                    unique_tds = set()
                    for item in row:
                        path_parts = item['path'].split('/')
                        for part in path_parts:
                            if 'TD' in part:
                                unique_tds.add(part)
                    # Convert the set to a list and sort it for consistent output
                    unique_tds_list = sorted(list(unique_tds))

                    sections_of_row = []
                    for td in unique_tds_list:
                        td_section = ""
                        for i in range(len(row)):
                            if i ==0:
                                td += "/"
                            if td in row[i]['path']:
                                td_section += row[i]['text'].strip()
                                
                        sections_of_row.append(td_section)
                    rows.append(sections_of_row)
                                
                df = pd.DataFrame(rows, columns=headers)
                return df
            
            df = get_row_header_table(table)
            df.to_csv(output_file_path, index=False)
            return df
                
        # The header for this df is the column
        else:
            print("This table is a column indexed table...")
            def get_column_header_table(table):
                table_rows = set([re.search(r'Table(?:\[\d+\])?/TR(?:\[\d+\])?', item['path']).group() for item in table if re.search(r'Table(?:\[\d+\])?/TR(?:\[\d+\])?', item['path'])])
                uniq_rows = sorted([row_name.split("/")[-1] for row_name in list(table_rows)])

                data = {}

                for i in range(len(uniq_rows)):
                                    row_name = uniq_rows[i]
                                    if i == 0:
                                        row = [el for el in table if row_name+"/" in el['path']]
                                    else:
                                        row = [el for el in table if row_name in el['path']]
                                    
                                    unique_tds = set()
                                    for item in row:
                                        path_parts = item['path'].split('/')
                                        for part in path_parts:
                                            if 'TD' in part:
                                                unique_tds.add(part)
                                    # Convert the set to a list and sort it for consistent output
                                    unique_tds_list = sorted(list(unique_tds))
                                    
                                    sections_of_row = []
                                    for td in unique_tds_list:
                                        td_section = ""
                                        for i in range(len(row)):
                                            if i ==0:
                                                td += "/"
                                            if td in row[i]['path']:
                                                td_section += row[i]['text'].strip()
                                                
                                        sections_of_row.append(td_section)
                                                
                                    row_key = row[0]['text'].strip()
                                    data[row_key] = sections_of_row
                df = pd.DataFrame.from_dict(data, orient='index').T
                return df
                                
            df = get_column_header_table(table)
            df.to_csv(output_file_path, index=False)
            return df

# Function that saves the tables in CSV format to a unique job id       
def save_tables_to_csv(extracted_tables, table_output_directory):
    
    os.makedirs(table_output_directory, exist_ok=True)
    for table_num, table in extracted_tables.items():
        output_file_path = os.path.join(table_output_directory, f"table_{table_num}.csv")
        df = transform_table(table, output_file_path)


## Runing the script

In [None]:
file_path = '../PDF/HSI1000-chapter1.pdf'
extractor, unique_id = ExtractTextTableInfoFromPDF.create_with_unique_id(file_path)
print("\nUnique ID:", unique_id)
extracted_data = extractor.extracted_data
pdf_data = get_extracted_data(extracted_data)

# Section here to derive the csvs from the table elements
table_elements = [el for el in pdf_data['elements'] if "Table" in el['Path'] and 'Text' in el and "TR" in el['Path']]
print("\nUnique ID:", unique_id)

# IF there are even any table elements in the PDF
if table_elements:
    table_output_directory = f"../data/{unique_id}" 
    extracted_tables = extract_unique_tables(table_elements)    
    save_tables_to_csv(extracted_tables, table_output_directory)
    
# Section here to derive the nodes from the text elements
text_splitter = initialise_text_splitter(600, 50)
text_chunks = get_text_chunks(file_path, pdf_data, text_splitter)
text_nodes = convert_textchunks_to_nodes(text_chunks)

In [17]:
table = extracted_tables[2]
table

[{'path': '//Document/Table[4]/TR/TD/P', 'text': 'Observations ', 'Page': 3},
 {'path': '//Document/Table[4]/TR/TD[2]/L/LI/Lbl', 'text': '(1) ', 'Page': 3},
 {'path': '//Document/Table[4]/TR/TD[2]/L/LI/LBody',
  'text': 'Tea bag bloats and floats on top of the water when boiling water is poured directly on top of it. ',
  'Page': 3},
 {'path': '//Document/Table[4]/TR/TD[2]/L/LI[2]/Lbl',
  'text': '(2) ',
  'Page': 3},
 {'path': '//Document/Table[4]/TR/TD[2]/L/LI[2]/LBody',
  'text': 'Tea bag doesn’t bloat and sinks in the water when boiling water is poured on the side and not directly onto it. ',
  'Page': 3},
 {'path': '//Document/Table[4]/TR[2]/TD/P', 'text': 'Explanation ', 'Page': 3},
 {'path': '//Document/Table[4]/TR[2]/TD[2]/P',
  'text': 'Water poured on top of the tea bag fills the pores of the teabag itself, trapping any gas inside before it can escape. The hot water heats the trapped air, causing it to expand. The trapped air prevents the tea bag from being dunked. ',
  'Page

In [15]:
# only need to look at the first row
first_row = [el for el in table if "TR[2]/" in el['path']]
    # Function to decide if table has 2 indexes

# Function to check if the API could not identofy the index of the table
def is_unidentified():
    pass

# Function to transform the unidentified table by determining its index using LLM
def transform_unidentified_table():
    
    # Need to use the groq method
    for el in table:
        pass
        pass


def is_2_index(first_row):
        for el in first_row:
            if "TH/" in el['path']:
                return False
            else:
                return True
    
    # Function to decide if the table's row is the header   
def is_row_header(first_row):
        for el in first_row:
            if "TH" not in el['path']:
                return False
        return True
    
is_2_index(first_row)
# is_row_header(first_row)

True

In [16]:
first_row

[{'path': '//Document/Table[4]/TR[2]/TD/P', 'text': 'Explanation ', 'Page': 3},
 {'path': '//Document/Table[4]/TR[2]/TD[2]/P',
  'text': 'Water poured on top of the tea bag fills the pores of the teabag itself, trapping any gas inside before it can escape. The hot water heats the trapped air, causing it to expand. The trapped air prevents the tea bag from being dunked. ',
  'Page': 3},
 {'path': '//Document/Table[4]/TR[2]/TD[2]/P[2]/Sub',
  'text': 'Pores of the tea bag can get sealed up with water ',
  'Page': 3},
 {'path': '//Document/Table[4]/TR[2]/TD[2]/P[2]/Sub[2]',
  'text': 'and prevent air from escaping. ',
  'Page': 3}]

In [8]:
df = transform_table(table, "nice")

ValueError: Length of values (2) does not match length of index (0)

## Next thing to do:
- Clean the documents 

In [None]:
def clean_up_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    :param content: Text input.

    :return: Cleaned version of original text input.
    """

    # Fix hyphenated words broken by newline
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # Remove specific unwanted patterns and characters
    unwanted_patterns = [
        "\\n", "  —", "——————————", "—————————", "—————",
        r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
    ]
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Fix improperly spaced hyphenated words and normalize whitespace
    content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
    content = re.sub(r'\s+', ' ', content)

    return content

# Call function
cleaned_docs = []
for d in documents:
    cleaned_text = clean_up_text(d.text)
    d.text = cleaned_text
    cleaned_docs.append(d)

## Old functions 

In [4]:

# Get the extracted data from the extractor
def get_extracted_data(extracted_data):
    if 'structuredData.json' in extracted_data:
        json_data = json.loads(extracted_data['structuredData.json'])
    return json_data



# This function converts tables to strings to be able to be processed by LLMs. 
def get_and_save_table_strings(extractor, table_output_directory):
    
    # Function to convert each row in a raw unprocessed table (ie index of table not decided) into a string
    def get_raw_table_string(df):
        table_str = ""
        for i in range(2):
            if i ==1:
                table_str += f"Row {i}: {df.iloc[i].values.tolist()}"  
            else:
                table_str += f"Row {i}: {df.iloc[i].values.tolist()}\n"
        return table_str
    
    # Function to decide if header is first row or first column
    def evaluate_table_index_llama(table_str):
        class Header(BaseModel):
            index: int = Field(description="Header of the table, 0 for first row as the header, 1 for first column as the header")
            
        parser = JsonOutputParser(pydantic_object=Header)

        chat = ChatGroq(temperature=0, model_name="llama3-8b-8192")
        
        template = '''You will assist me in deciding, based on the first 2 entries of a table, whether the first row or the first colum should be the header. 
                You are to output an int, 0 or 1. Where 0 if the first row is header, and 1 if the first column is the header.
                Follow the format instructions carefully.
                Table:
                {table}
                
                {format_instructions}
                '''
        prompt = PromptTemplate(
            template=template,
            input_variables=["table"],
            partial_variables={"format_instructions": parser.get_format_instructions()},
        )
        chain = prompt | chat | parser
        return chain.invoke({"table": table_str})

    # Tables need procecssing when extraced from BytesIO
    def clean_table_values(x):
        if isinstance(x, str):
            return x.replace('_x000D_', '').strip()
        return x
    
    # Code adapted from a medium blog on how to represent rows of tables in strings
    def convert_table_to_string(df):
        for _, row in df.iterrows():
            row_str = ""
            for col in df.columns:
                sentences = re.split(r'(?<=\.)\s*', row[col])
                row_sentence = ""
                for i in range(len(sentences)):
                    row_sentence += sentences[i] + "\n"
                row_str += f"{col}: {row_sentence}, "
            formatted = row_str[:-2]
        return formatted

    os.makedirs(table_output_directory, exist_ok=True)
    
    # The literal extraction of the file itself
    excel_files = {k: v for k, v in extractor.extracted_data.items() if k.endswith('.xlsx')}
    
    table_dataframes = {}

    num_tables =0
    for _, content in excel_files.items():
        excel_stream = BytesIO(content)
        df = pd.read_excel(excel_stream, header=None)
        
        df = df.applymap(clean_table_values)
        # # Uncomment bottom code to ensure that Groq decides table header index 
        
        # df_str = get_raw_table_string(df) 
        # dic = evaluate_table_index_llama(df_str)
        # header_index = dic['index']
        
        # Uncomment this if u uncomment the code above
        header_index = 1
        
        # If header_index is non zero
        if header_index == 1:
            df = pd.read_excel(excel_stream, header=None)
            df = df.applymap(clean_table_values)
            df = df.T
            # Set the first row as the new header
            new_header = df.iloc[0]  # Take the first row for the header
            df = df[1:]  # Take the data less the header row
            df.columns = new_header  # Set the header row as the df header
            # Optionally, reset index if necessary
            df.reset_index(drop=True, inplace=True)
        else:
            df = pd.read_excel(excel_stream, header=0)
        
        output_file_path = os.path.join(table_output_directory, f"table_{num_tables}.csv")

        # Writing the table to the corresponding csv file. 
        df.to_csv(output_file_path, index=False)
            
        table_str = convert_table_to_string(df)
        table_dataframes[num_tables] = table_str
        num_tables += 1
        
    return table_dataframes



# Function to initialise a flexible text splitter
def initialise_text_splitter(chunk_size, chunk_overlap):
    separator_list = ["\n\n", "\n", ". ", "!", "?", ",", " ", "", ")", "("]
    try:
        text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=len,
                separators=separator_list)
    except Exception as e:
        logging.error(f"Failed to initialize text splitter: {e}")
        return None
    return text_splitter



# Function to obtain text chunks using the text splitter
def get_text_chunks(file_path, json_data, text_splitter):
    if 'elements' not in json_data:
        logging.error("Missing 'elements' key in json_data")
        raise ValueError("Missing 'elements' key in json_data")

    if not text_splitter:
        logging.error("Text splitter not initialised properly. ")
        sys.exit()  
        
    file_name = file_path.split("/")[-1]

    # Chunks are split by pages here
    page_text = ""
    start_page = 0
    all_chunks = []
    
    list_label = ""
    for i in range(len(json_data['elements'])):
        try:
            current_page = json_data['elements'][i]['Page']
        except KeyError:
            logging.warning(f"Missing 'Page' key in element at index {i}")
            continue

        try:
            if current_page > start_page:
                # Update the new page number
                start_page = current_page               
                # Generate the chunks for the previous page
                separated_list = text_splitter.split_text(page_text)
                for chunk in separated_list:
                    if chunk not in [". ", "."]:  # Simplified condition
                        all_chunks.append({'ElementType': 'Text', 'file_name': file_name, 'Page': current_page, 'Text': chunk})
                # Update the string of text 
                page_text = ""
                list_label = ""
            else:
                if 'Text' in json_data['elements'][i]:  # Check if Text is not empty
                    if json_data['elements'][i]['Path'].endswith("Lbl") and not json_data['elements'][i]['Path'].startswith("//Document/Table"):
                        list_label = json_data['elements'][i]['Text']
                    else:
                        if list_label:
                            page_text += list_label + json_data['elements'][i]['Text']
                            list_label = ""  # Reset list label to empty string
                        else:
                            page_text += json_data['elements'][i]['Text'] + "\n"
        except KeyError as e:
            logging.warning(f"Key error in json_data['elements'][i] processing at index {i}: {e}")
    
    
    # Processing the last page of the text
    if page_text:
        separated_list = text_splitter.split_text(page_text)
        for chunk in separated_list:
            if chunk not in [". ", "."]:
                all_chunks.append({'ElementType': 'Text', 'file_name': file_name, 'Page': current_page, 'Text': chunk})

    return all_chunks


# Further enhancement to include the tables with the metadata so that it can be parsed to the 
    # function that takes in the tables + metadata for embeddings generation
def get_table_strings_with_metadata(table_dataframes, json_data):
    
    # Function to obtain the page number of each table
    def get_table_pages(json_data):
        table_file_pages = {}
        # Obtaining the table metadata
        for i in range(len(json_data['elements'])):
            try:
                file_paths = json_data['elements'][i].get('filePaths')
                if file_paths:
                    page = json_data['elements'][i].get('Page', 'Unknown')
                    match = re.search(r'\d+', file_paths[0])
                    table_index = match.group(0)
                    table_file_pages[int(table_index)] = {"Page": page}
            except Exception as e:
                logging.error(f"Error processing file paths at index {i}: {e}")
        return table_file_pages
    
    table_file_pages = get_table_pages(json_data)
    meta_table_batch = []
    table_dfs = []
    for table_index,_ in table_dataframes.items():
        dic = {}
        dic['ElementType'] = 'Table'
        dic['Page'] = table_file_pages[table_index]['Page']
        dic['Table'] = table_dataframes[table_index]
        table_dfs.append(dic)

        #  Obtain metadata for sparse embeddings
        meta_table_batch.append(f"ElementType 'Table', Page {table_file_pages[table_index]['Page']}, {table_dataframes[table_index]}")
    return table_dfs, meta_table_batch

In [4]:
file_path = '../PDF/HSI1000-chapter1.pdf'
extractor, unique_id = ExtractTextTableInfoFromPDF.create_with_unique_id(file_path)
print("\nUnique ID:", unique_id)
table_output_directory = f"../data/{unique_id}"
extracted_data = extractor.extracted_data
pdf_data = get_extracted_data(extracted_data)
table_dataframes = get_and_save_table_strings(extractor, table_output_directory)

# Use some form of evaluator to decide chunk size?
text_splitter = initialise_text_splitter(300, 50)

# Get out important information
text_chunks = get_text_chunks(file_path, pdf_data, text_splitter)
# table_dfs, meta_table_batch= get_table_strings_with_metadata(table_dataframes, pdf_data)

2024-07-02 18:38:10,933 - INFO - Started uploading asset
2024-07-02 18:38:15,314 - INFO - Finished uploading asset
2024-07-02 18:38:15,316 - INFO - Started submitting EXTRACT_PDF job
2024-07-02 18:38:16,553 - INFO - Started getting job result
2024-07-02 18:38:27,172 - INFO - Finished polling for status
2024-07-02 18:38:27,176 - INFO - Finished getting job result
2024-07-02 18:38:27,178 - INFO - Started getting content
2024-07-02 18:38:27,486 - INFO - Finished getting content



Unique ID: 0e4dcde5-fe65-4391-ade1-d1389e41d635
