In [1]:
import yaml, os, camelot
from typing import List, Dict
from llama_index.schema import IndexNode
from llama_index.llms import AzureOpenAI
from llama_index.llm_predictor import LLMPredictor
from llama_index import set_global_service_context
from llama_index.node_parser import SimpleNodeParser
from llama_index.retrievers import RecursiveRetriever
from llama_hub.file.pymu_pdf.base import PyMuPDFReader
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer
from llama_index.query_engine import PandasQueryEngine, RetrieverQueryEngine
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext

In [2]:
# !wget "https://www.dropbox.com/scl/fi/waoz9bo9yiemnhnqvu0cc/billionaires_page.pdf?rlkey=4i08msa7zr1lpnuq2y1vs2xgw&dl=1" -O ./data/billionaires_page.pdf

In [3]:
# Load the Azure OpenAI settings from a local YAML file.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run as-is on the original author's machine.
with open('/Users/1zuu/Desktop/LLM RESEARCH/LLMPro/cadentials.yaml') as f:
    # safe_load only builds plain Python types (dict/list/str/number), which is
    # all a key/value credentials file needs, and never constructs arbitrary
    # Python objects from YAML tags.
    credentials = yaml.safe_load(f)

# Expose the API key through the process environment as well.
os.environ['AD_OPENAI_API_KEY'] = credentials['AD_OPENAI_API_KEY']

In [4]:
# Embedding model handed to the service context as `embed_model`.
embedding_llm = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Azure OpenAI client; every connection setting comes from the credentials mapping.
llm = AzureOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model=credentials['AD_ENGINE'],
    api_key=credentials['AD_OPENAI_API_KEY'],
    api_version=credentials['AD_OPENAI_API_VERSION'],
    azure_endpoint=credentials['AD_OPENAI_API_BASE'],
)
chat_llm = LLMPredictor(llm)

# Bundle the embedding model and the LLM predictor, then register the bundle
# as the global service context.
service_context = ServiceContext.from_defaults(
    embed_model=embedding_llm,
    llm_predictor=chat_llm,
)
set_global_service_context(service_context)

  from .autonotebook import tqdm as notebook_tqdm


# Load and Process Data

## Load tables and text separately

In [5]:
def get_tables(path: str, pages: List[int]) -> List:
    """Extract the first table found on each requested page of a PDF.

    Args:
        path: Path of the PDF file, passed straight to ``camelot.read_pdf``.
        pages: Page numbers to scan; each page is read separately.

    Returns:
        One DataFrame per entry in ``pages``, in the same order. The first
        extracted row of each table is promoted to the column header and
        removed from the data, and the index is reset.

    Raises:
        IndexError: If no table is detected on one of the requested pages.
    """
    table_dfs = []
    for page in pages:
        table_list = camelot.read_pdf(path, pages=str(page))
        # Fail with a message that names the page instead of a bare
        # "list index out of range" from table_list[0].
        if len(table_list) == 0:
            raise IndexError(f"No table detected on page {page} of {path!r}")

        # Only the first table on the page is used.
        raw_df = table_list[0].df
        header_row = raw_df.iloc[0]
        table_df = (
            raw_df.rename(columns=header_row)  # first row holds the column titles
            .drop(raw_df.index[0])             # ...so remove it from the data
            .reset_index(drop=True)
        )
        table_dfs.append(table_df)
    return table_dfs

def get_texts(path: str, pages: List[int]):
    """Load a PDF with ``PyMuPDFReader`` and return the loaded documents.

    Args:
        path: Path of the PDF file, passed to ``PyMuPDFReader.load``.
        pages: Accepted for symmetry with ``get_tables`` but currently unused;
            the reader is always called on the whole file.

    Returns:
        Whatever ``PyMuPDFReader().load(path)`` returns.
    """
    return PyMuPDFReader().load(path)

In [6]:
# Pull the first table from page 3 and from page 25 of the PDF (one DataFrame per page).
table_dfs = get_tables('./data/billionaires_page.pdf', pages=[3, 25])

In [7]:
# Preview the table extracted from the first requested page (page 3).
table_dfs[0]

Unnamed: 0,No.,Name,Net worth\n(USD),Age,Nationality,Primary source(s) of wealth
0,1,Bernard Arnault &\nfamily,$211 billion,74,France,LVMH
1,2,Elon Musk,$180 billion,51,United\nStates,"Tesla, SpaceX, X Corp."
2,3,Jeff Bezos,$114 billion,59,United\nStates,Amazon
3,4,Larry Ellison,$107 billion,78,United\nStates,Oracle Corporation
4,5,Warren Buffett,$106 billion,92,United\nStates,Berkshire Hathaway
5,6,Bill Gates,$104 billion,67,United\nStates,Microsoft
6,7,Michael Bloomberg,$94.5 billion,81,United\nStates,Bloomberg L.P.
7,8,Carlos Slim & family,$93 billion,83,Mexico,"Telmex, América Móvil, Grupo\nCarso"
8,9,Mukesh Ambani,$83.4 billion,65,India,Reliance Industries
9,10,Steve Ballmer,$80.7 billion,67,United\nStates,Microsoft


In [8]:
# Preview the table extracted from the second requested page (page 25).
table_dfs[1]

Unnamed: 0,Year,Number of billionaires,Group's combined net worth
0,2023[2],2640.0,$12.2 trillion
1,2022[6],2668.0,$12.7 trillion
2,2021[11],2755.0,$13.1 trillion
3,2020,2095.0,$8.0 trillion
4,2019,2153.0,$8.7 trillion
5,2018,2208.0,$9.1 trillion
6,2017,2043.0,$7.7 trillion
7,2016,1810.0,$6.5 trillion
8,2015[18],1826.0,$7.1 trillion
9,2014[67],1645.0,$6.4 trillion


## Create a query engine for each table

In [9]:
# One PandasQueryEngine per extracted table, in the same order as table_dfs.
df_query_engines = [
    PandasQueryEngine(frame, service_context=service_context)
    for frame in table_dfs
]
df_query_engines

[<llama_index.query_engine.pandas_query_engine.PandasQueryEngine at 0x303d2ac70>,
 <llama_index.query_engine.pandas_query_engine.PandasQueryEngine at 0x303d2a820>]

In [10]:
# NOTE: get_texts does not use its `pages` argument, so `pages=[3, 25]` has no
# effect here: the reader is called on the whole file (the Document metadata in
# the output reports total_pages: 33).
docs = get_texts('./data/billionaires_page.pdf', pages=[3, 25])
docs

[Document(id_='911ade7c-6c15-4383-a446-292691a9be9e', embedding=None, metadata={'total_pages': 33, 'file_path': './data/billionaires_page.pdf', 'source': '1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='3085460f77d5c834fa568bf3cc234045a1f64f107d0d73bb7fcead11b0bd9385', text="7/1/23, 11:31 PM\nThe World's Billionaires - Wikipedia\nhttps://en.wikipedia.org/wiki/The_World%27s_Billionaires\n1/33\nThe World's Billionaires\nList of the world's billionaires, ranked in order of net worth\nThe net worth of the world's billionaires increased from\nless than US$1 trillion in 2000 to over $7 trillion in 2015.\nPublication details\nPublisher\nWhale Media Investments\nForbes family\nPublication\nForbes\nFirst published\nMarch 1987[1]\nLatest publication\nApril 4, 2023\nCurrent list details (2023)[2]\nWealthiest\nBernard Arnault\nNet worth (1st)\n\xa0US$211\xa0billion\nNumber of\nbillionaires\n\xa02,640 (from 2668)\nTotal list net worth\nvalue\n\xa0US$12.2

## Sample query on one table

In [11]:
# Sanity check: ask the first table's engine a question whose answer can be
# read directly off the table preview shown earlier.
response = df_query_engines[0].query("What's the net worth of the second richest billionaire in 2023?")
print(response)

$180 billion


# Build Vector Index

In [12]:
# Split the loaded documents into nodes with the node parser attached to the service context.
doc_nodes = service_context.node_parser.get_nodes_from_documents(docs)
doc_nodes

[TextNode(id_='8d0fd192-b8d1-4862-8b20-40a1632803b0', embedding=None, metadata={'total_pages': 33, 'file_path': './data/billionaires_page.pdf', 'source': '1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='911ade7c-6c15-4383-a446-292691a9be9e', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'total_pages': 33, 'file_path': './data/billionaires_page.pdf', 'source': '1'}, hash='3085460f77d5c834fa568bf3cc234045a1f64f107d0d73bb7fcead11b0bd9385'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='c24df129-0abc-4f8e-adb0-2cca9e476f8f', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='0c0b25b567e251feec434e1ad30fb3dfcb8d46133aff64e1411171c3a0dfc863')}, hash='99ec3aac27a0aa8f985b19c741957be94b353bebcb4f7716db9b197dc2723998', text="7/1/23, 11:31 PM\nThe World's Billionaires - Wikipedia\nhttps://en.wikipedia.org/wiki/The_World%27s_Billionaires\n1/33\nThe World's Billionaires\nList of the world's bi

### Set a description for each table so that the recursive retriever can route queries to the right table

In [13]:
# Natural-language description of each table, in the same order as table_dfs.
# NOTE(review): the second table previewed above only lists 2014-2023, while its
# summary says "2000 to 2023" - confirm the wording matches the extracted data.
summaries = [
    "This node provides information about the world's richest billionaires"
    " in 2023",
    "This node provides information on the number of billionaires and"
    " their combined net worth from 2000 to 2023.",
]

# One IndexNode per summary; its index_id ("pandas0", "pandas1", ...) is the key
# used in the mapping below.
df_nodes = [
    IndexNode(text=description, index_id=f"pandas{position}")
    for position, description in enumerate(summaries)
]

# index_id -> query engine, keyed with the same "pandas{n}" scheme as df_nodes.
df_id_query_engine_mapping = {
    f"pandas{position}": engine
    for position, engine in enumerate(df_query_engines)
}

In [14]:
# Index built from the text nodes only.
vector_index_text = VectorStoreIndex(doc_nodes)
# Index built from the text nodes plus the table-summary IndexNodes (df_nodes).
vector_index = VectorStoreIndex(doc_nodes + df_nodes)

In [24]:
# save the vector index
vector_index_text.storage_context.persist(persist_dir="./db/text_index")
vector_index.storage_context.persist(persist_dir="./db/recursive_index")