In [1]:
import requests
import yaml, os
from pathlib import Path
from typing import List, Dict
from llama_index.llms import AzureOpenAI
from llama_index.llm_predictor import LLMPredictor
from llama_index import set_global_service_context
from llama_index.text_splitter import SentenceSplitter
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.response_synthesizers import get_response_synthesizer
from llama_index import SimpleDirectoryReader, ServiceContext, SimpleDirectoryReader, VectorStoreIndex

## Sentence-window retrieval: very useful when you have large documents

In [2]:
# Patch asyncio via nest_asyncio before any llama_index calls are made.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and later calls use asyncio internally — confirm it is still required.
import nest_asyncio
nest_asyncio.apply()

In [3]:
# Load the Azure OpenAI settings from a local YAML file.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine until the location is made configurable.
with open('/Users/1zuu/Desktop/LLM RESEARCH/LLMPro/cadentials.yaml') as f:
    # safe_load only builds plain YAML types (dict/list/str/int/...), which is
    # all a flat credentials mapping needs. yaml.load with FullLoader can
    # construct additional Python objects and is not recommended for files
    # that are not fully trusted.
    credentials = yaml.safe_load(f)

# Also expose the key through the process environment; the same value is passed
# explicitly when the AzureOpenAI client is created.
os.environ['AD_OPENAI_API_KEY'] = credentials['AD_OPENAI_API_KEY']

In [13]:
# Embedding model used for every node that goes into the vector indexes.
embedding_llm = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Azure OpenAI chat model; every connection setting comes from `credentials`.
llm = AzureOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model=credentials['AD_ENGINE'],
    api_key=credentials['AD_OPENAI_API_KEY'],
    api_version=credentials['AD_OPENAI_API_VERSION'],
    azure_endpoint=credentials['AD_OPENAI_API_BASE'],
)
chat_llm = LLMPredictor(llm)

# Sentence-window parser configured with window_size=3. The surrounding window
# and the original sentence are stored in node metadata under the keys
# "window" and "original_text" respectively.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# About the SentenceSplitter:
#
# In general, this class tries to keep sentences and paragraphs together.
# Compared to the original TokenTextSplitter, hanging sentences or sentence
# fragments at the end of a node chunk are therefore less likely.
text_splitter = SentenceSplitter()

# Shared service context: embedding model + LLM predictor. The node parser is
# left unset here (kept commented out below); both parsers are invoked
# explicitly when the nodes are extracted.
service_context = ServiceContext.from_defaults(
    embed_model=embedding_llm,
    llm_predictor=chat_llm,
    # node_parser=node_parser
)
# Make this context the default for llama_index components created afterwards.
set_global_service_context(service_context)

In [12]:
# !wget "https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf" -O ./data/IPCC_AR6_WGII_Chapter03.pdf

--2023-12-21 19:42:38--  https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf
Resolving www.ipcc.ch (www.ipcc.ch)... 172.67.16.59, 104.20.23.161, 104.20.24.161
Connecting to www.ipcc.ch (www.ipcc.ch)|172.67.16.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21752444 (21M) [application/pdf]
Saving to: ‘./data/IPCC_AR6_WGII_Chapter03.pdf’


2023-12-21 19:42:41 (7.55 MB/s) - ‘./data/IPCC_AR6_WGII_Chapter03.pdf’ saved [21752444/21752444]



In [5]:
# Read the downloaded IPCC AR6 WGII Chapter 3 PDF into llama_index documents.
pdf_paths = ["./data/IPCC_AR6_WGII_Chapter03.pdf"]
documents = SimpleDirectoryReader(input_files=pdf_paths).load_data()

# Display how many documents the reader produced.
len(documents)

172

#### We extract the sets of nodes that will be stored in each `VectorStoreIndex`: the nodes produced by the sentence-window parser, as well as the “base” nodes produced by the standard sentence splitter.

In [6]:
# "Base" nodes: chunks produced by the standard sentence splitter.
base_nodes = text_splitter.get_nodes_from_documents(documents)

# Sentence-window nodes: produced by the sentence-window parser.
nodes = node_parser.get_nodes_from_documents(documents)

In [18]:
# Compare how many nodes each parser produced: (sentence-window, base).
(len(nodes), len(base_nodes))

(11087, 461)

In [11]:
# Build one vector index per node set; both use the same service context
# (embedding model + LLM) configured earlier in the notebook.
sentence_index = VectorStoreIndex(
    nodes,
    service_context=service_context,
)
base_index = VectorStoreIndex(
    base_nodes,
    service_context=service_context,
)

In [12]:
# Persist both indexes to disk, each in its own directory under ./db.
for index_to_save, target_dir in (
    (sentence_index, "./db/sentence_index"),
    (base_index, "./db/base_index"),
):
    index_to_save.storage_context.persist(persist_dir=target_dir)