In [3]:
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

In [1]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

 ········


In [58]:
response = OpenAI().complete("What is Capital of India?")
print(response)

The capital of India is New Delhi.


## Document Loading

In [59]:
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader("D:\Data\LLM\llamaindex\IndianEconomy").load_data()

In [60]:
len(documents)

9

In [61]:
documents[0].metadata

{'page_label': '1',
 'file_name': 'Indian_Economy_Overview.pdf',
 'file_path': 'D:\\Data\\LLM\\llamaindex\\IndianEconomy\\Indian_Economy_Overview.pdf',
 'file_type': 'application/pdf',
 'file_size': 7918,
 'creation_date': '2024-06-09',
 'last_modified_date': '2024-06-09'}

In [62]:
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.0)

In [18]:
#%pip install llama-index-embeddings-openai

In [63]:
import os

from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline, IngestionCache

In [64]:
# This will be the model we use both for Node parsing and for vectorization
embed_model = OpenAIEmbedding()

# Define the initial pipeline
pipeline = IngestionPipeline(
    transformations=[
        SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=95, 
            embed_model=embed_model,
            ),
        embed_model,
        ],
    )

In [65]:
from pinecone import Pinecone, ServerlessSpec

In [66]:
import os
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.environ.get('80d261ee-5381-4e8f-b3c3-e30a86f5d39d') or '80d261ee-5381-4e8f-b3c3-e30a86f5d39d'

# configure client
pc = Pinecone(api_key=api_key)

os.environ['PINECONE_API_KEY'] = '80d261ee-5381-4e8f-b3c3-e30a86f5d39d'

In [67]:
from pinecone import ServerlessSpec

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

In [68]:
index_name = 'gen-qa-openai-fast'

In [69]:
# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric='cosine',
        spec=spec
    )
# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [70]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext

In [72]:
vector_store = PineconeVectorStore(pinecone_index=index)

In [73]:
# Our pipeline with the addition of our PineconeVectorStore
pipeline = IngestionPipeline(
    transformations=[
        SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=95, 
            embed_model=embed_model,
            ),
        embed_model,
        ],
        vector_store=vector_store  # Our new addition
    )

# Now we run our pipeline!
pipeline.run(documents=documents)


Upserted vectors:   0%|          | 0/17 [00:00<?, ?it/s]

[TextNode(id_='16b8fa58-c94d-4502-8740-ada2ec531fc8', embedding=[-0.0010383485350757837, 0.006870504934340715, -0.002203427953645587, -0.019114399328827858, -0.024318816140294075, -0.00032738817390054464, -0.0059648011811077595, -0.0029587442986667156, -0.03084799274802208, -0.01165248453617096, 0.026495208963751793, 0.007651167456060648, -0.003338936949148774, 0.02406197413802147, -0.005184139125049114, 0.016437843441963196, 0.015086047351360321, -0.027427947148680687, 0.007272664457559586, 0.0017911301692947745, -0.0251298937946558, 0.008164850063621998, 0.00022663710115011781, -0.006025632377713919, -0.002845531329512596, 0.018087035045027733, 0.014193861745297909, -0.024102529510855675, 0.006975268945097923, 0.027360357344150543, -0.030604669824242592, -0.0238321702927351, -0.0022879152093082666, -0.018397947773337364, -0.005177380051463842, -0.011666002683341503, 0.0012951898388564587, -0.018587199971079826, 0.045447394251823425, -0.0230751633644104, 0.011172596365213394, 0.003798

In [75]:
index

<pinecone.data.index.Index at 0x1beb419ca50>

In [85]:
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever

# Instantiate VectorStoreIndex object from your vector_store object
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Grab 5 search results
retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=2)

# Query vector DB
answer = retriever.retrieve('What are the Key Sectors of the Indian Economy?')

# Inspect results
print([i.get_content() for i in answer])

# >>> ['some relevant search result 1', 'some relevant search result 1'...]


["Indian Economy: An Overview\nKey Sectors of the Indian Economy\nAgriculture: Agriculture has historically been the backbone of the Indian economy. It employs around\n42% of the workforce and contributes approximately 16% to the GDP. Key agricultural products\ninclude rice, wheat, pulses, spices, and cotton. Despite its importance, the sector faces challenges\nsuch as low productivity, fragmented land holdings, and dependence on monsoons.\nIndustry: The industrial sector has witnessed significant growth post-reforms. It contributes about\n25% to the GDP and includes sub-sectors like manufacturing, mining, and construction. The 'Make\nin India' initiative launched in 2014 aims to boost manufacturing, attract FDI, and create jobs,\npositioning India as a global manufacturing hub.\nServices: The services sector is the largest contributor to the Indian economy, accounting for around\n55% of the GDP. It encompasses a wide range of activities including IT and IT-enabled services,\ntelecommu

## Querying

In [86]:
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [87]:
response_synthesizer = get_response_synthesizer()

In [88]:
query_engine_res = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.8)],
)

In [89]:
response = query_engine_res.query("Future Prospects of Indian Economy?")
print(response)

The future prospects of the Indian economy are positive due to factors such as a young and growing population providing a vast labor force and consumer market, rapid digitalization and technological adoption transforming various sectors, continued focus on economic reforms and infrastructure development, and increasing integration with the global economy through trade agreements and FDI. To fully realize its potential, India must address structural issues, invest in human capital, and ensure inclusive growth.
