In [1]:
'''
LLMs can be used in a variety of ways for different purposes - 
Indexing - Relevance of data/Summarize the raw data/Index summaries 
Querying - Retrieval and Response Synthesis 
'''

'\nLLMs can be used in a variety of ways for different purposes - \nIndexing - Relevance of data/Summarize the raw data/Index summaries \nQuerying - Retrieval and Response Synthesis \n'

In [2]:
# API key
# Never hardcode the secret in the notebook. An OPENAI_API_KEY that is already
# present in the environment is kept as-is; only when it is missing or empty is
# the user prompted for it (getpass reads the value without echoing it, so the
# key does not end up in the saved cell output).
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [3]:
from llama_index.llms.openai import OpenAI

# Smoke test: ask an OpenAI LLM built with no arguments to complete a short prompt
# and print whatever comes back.
response = OpenAI().complete(
    "Paul Graham is "
)
print(response)

a computer scientist, entrepreneur, and venture capitalist. He is best known for co-founding the startup accelerator Y Combinator and for his work in the field of programming languages and software development. Graham is also a prolific writer and has published several influential essays on topics ranging from technology and startups to philosophy and culture.


In [4]:
# Change the model
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex

# Set the LLM on the shared Settings object.
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.2)

# Run on your own documents: load them from the "data" directory and index them.
documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)

In [5]:
# Loading data
#
# Ingestion pipeline - three main stages:
#   1) Loading data
#   2) Transformation of data
#   3) Indexing and storing of data
#
# Loader
#   Reader - data connector.
#   Ingests data from different sources and changes it into Documents.
#   Document - collection of raw data with metadata - text/image/audio.
from llama_index.core import Document, SimpleDirectoryReader

# Simple directory reader
documents = SimpleDirectoryReader("./data").load_data()

# We can also collect data from a SQL database:
#
#   from llama_index.core import download_loader
#
#   from llama_index.readers.database import DatabaseReader
#
#   reader = DatabaseReader(
#       scheme=os.getenv("DB_SCHEME"),
#       host=os.getenv("DB_HOST"),
#       port=os.getenv("DB_PORT"),
#       user=os.getenv("DB_USER"),
#       password=os.getenv("DB_PASS"),
#       dbname=os.getenv("DB_NAME"),
#   )
#
#   query = "SELECT * FROM users"
#   documents = reader.load_data(query=query)
#
# More connectors (to fetch data) are available on LlamaHub.

# Creating a document from scratch
doc = Document(text="text")

In [6]:
# Transformation
#
# Transformation - chunking, metadata extraction, embedding each chunk.
# The input of a transformation is a node (a Document is a subclass of a node).
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter

# --- High-level transformation: not much control ---
vector_index = VectorStoreIndex.from_documents(documents)
vector_index.as_query_engine()

# --- Custom text splitter: a small change to the high-level transformation ---
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=10)

# Global change
Settings.text_splitter = text_splitter
index = VectorStoreIndex.from_documents(documents, transformations=[text_splitter])  # new part
index.as_query_engine()

# --- Low-level transformation: every component is defined explicitly ---

# First, nodes (chunks) are created by splitting the documents:
# create documents -- use text splitter -- create nodes
documents = SimpleDirectoryReader("./data").load_data()
pipeline = IngestionPipeline(transformations=[TokenTextSplitter()])
nodes = pipeline.run(documents=documents)

# Add metadata -- manually or with a metadata extractor
document = Document(
    text="text",
    metadata={"filename": "<doc_file_name>", "category": "<category>"},
)

# Add embedding
# More on this later

In [7]:
# Indexing
#
# Indexes are created over Documents so that they can be used for querying.
# An Index is a data structure composed of Documents.
#
# Index types -
# 1) Vector Store Index - creates a vector embedding of each node.
#     a) Vector embedding - numerical representation of the semantics, or meaning, of your text
#     b) By default LlamaIndex uses text-embedding-ada-002
#     c) Top_k - the number of embeddings it returns. This is also known as top-k semantic retrieval
#
# 2) Summary Index - generates a summary of the documents. It returns all the documents as summarized text
#
# 3) Knowledge Graph Index - for interconnected (graph) data
#     1) Converts unstructured data into entity-based querying
from llama_index.core import VectorStoreIndex

# Vector store indexing: the docs are first converted into nodes, then the index is made.
index = VectorStoreIndex.from_documents(documents)

# Build the index over nodes instead.
index = VectorStoreIndex(nodes)



In [8]:
# Storing
#
# Persist the indexed data to avoid re-indexing. By default it is only kept in
# local memory.
from llama_index.core import StorageContext, load_index_from_storage

# One named location shared by the persist and the load step. An empty string is
# only a placeholder and leaves it unclear where the index files end up.
PERSIST_DIR = "./storage"

# Store the index in that directory.
index.storage_context.persist(persist_dir=PERSIST_DIR)

# For a graph:
# graph.root_index.storage_context.persist(persist_dir="<persist_dir>")

# Loading the index back from disk:
# rebuild the storage context from the persisted directory ...
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
# ... and load the index from it.
index = load_index_from_storage(storage_context)


In [9]:
# Vector stores
#
# Qdrant
# Pinecone
#
# Explanation is very generic
from llama_index.core import VectorStoreIndex

# Inserting more data (documents/nodes): start from an index built over an
# empty list and insert the documents one at a time.
index = VectorStoreIndex([])
for doc in documents:
    index.insert(doc)


In [10]:
# Querying
#
# Most significant part of an LLM application: querying.
#
# At its simplest it is just a prompt call to the LLM.
# Complex querying requires repeated/chained prompts or even a reasoning loop
# across multiple components.

# Simple query - high-level API call
query_engine = index.as_query_engine()
response = query_engine.query("Explain emperical risk minimization.")
print(response)

Empirical risk minimization involves minimizing a certain quantity known as the empirical risk, denoted as Rs(f), over a set of labeled data points. This empirical risk is defined as the average loss over all data points in the set S. Mathematically, it is expressed as Rs(f) = (1/n) * Σ l(f(xi), yi) for i from 1 to n, where n is the number of data points, xi and yi are the input and corresponding output for each data point, and l is the loss function. The process of finding the minimum empirical risk is referred to as optimization within a given function class F.


In [12]:
# Stages of a query -
# Retrieval - find and return the most relevant documents
# Postprocessing - nodes are reranked/filtered based on the metadata that they have (keywords)
# Response synthesis - query + relevant data + prompt are sent to the LLM to generate the response

# Low-level API call: granular control over the data and the API.
# Known as the retriever.
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever

# Build the index.
index = VectorStoreIndex.from_documents(documents)

# Configure the retriever.
retriever = VectorIndexRetriever(index=index, similarity_top_k=10)

# Configure the response synthesizer.
response_synthesizer = get_response_synthesizer()

# Assemble the query engine from its three parts.
query_engine = RetrieverQueryEngine(
    retriever=retriever,  # retrieve
    response_synthesizer=response_synthesizer,  # query + relevant docs
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

# Query
response = query_engine.query("Explain eperical risk minimizer")
print(response)



Empirical Risk Minimizer (ERM) involves minimizing the empirical risk, which is the average loss over all data points in a given set. This is achieved by finding the function within a function class that minimizes this average loss. The process of minimizing this quantity is referred to as optimization. The goal of ERM is to learn a predictor function that can accurately map input data to output data, enabling predictions for any given input.


In [17]:
# Define components from scratch
from llama_index.core.postprocessor import KeywordNodePostprocessor

# Retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)

# Node postprocessor
#
# Improves the relevancy of the nodes that were fetched:
# 1) Keyword - filter by a required keyword or by a keyword which should be removed
# 2) Similarity - filter by a threshold value/score
# 3) PrevNextNode - augments retrieved nodes based on their neighbours
#
# NOTE - THESE METHODS MIGHT RESULT IN EMPTY RESPONSES
node_postprocessors = [
    # Only a required keyword is configured. exclude_keywords is left at its
    # default: an empty-string entry is not a real term to filter on.
    KeywordNodePostprocessor(required_keywords=["ERM"])
]
query_engine = RetrieverQueryEngine.from_args(
    retriever, node_postprocessors=node_postprocessors
)
response = query_engine.query("Explain emperical risk minimization")

In [18]:
# Print the most recent `response`; in notebook order that is the result of the
# keyword-filtered query engine from the cell above.
print(response)

Empirical risk minimization involves minimizing a certain quantity known as the empirical risk, denoted as Rs(f), over a set of labeled data points. This empirical risk is defined as the average loss over all data points in the set S. Mathematically, it is expressed as the minimum of Rs(f) over all functions f in the function class F. The process of finding this minimum is referred to as optimization.


In [22]:
# Evaluation
from llama_index.core import VectorStoreIndex
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.llms.openai import OpenAI

# For the async error - https://github.com/run-llama/llama_index/issues/9978
import nest_asyncio

nest_asyncio.apply()

# Create the LLM and hand it to the evaluator.
llm = OpenAI(model="gpt-4", temperature=0.0)
evaluator = FaithfulnessEvaluator(llm=llm)

# Produce a response to evaluate.
query_engine = index.as_query_engine()
response = query_engine.query("Explain emperical risk minimization.")
print(response)

# Evaluate the response and print the pass/fail flag.
eval_result = evaluator.evaluate_response(response=response)
print(eval_result.passing)

# print(eval_result)

Empirical risk minimization involves minimizing a certain quantity known as the empirical risk, denoted as Rs(f), over a set of labeled data points. This empirical risk is defined as the average loss over all data points in the set S. The goal is to find the function f that minimizes this empirical risk, which is achieved through optimization by selecting the function that yields the minimum empirical risk among all functions in the function class F.
True
query=None contexts=['The\nimage\nexplains\nthe\nconcept\nof\nEmpirical\nRisk\nMinimizer\n(ERM).\nIt\nstarts\nwith\na\ndefinition:\nGiven\na\nfunction\nclass\nF,\nwhich\nmaps\nfrom\nthe\nset\nX\nto\nthe\nset\nY,\nempirical\nrisk\nminimization\nover\na\nset\nof\nlabeled\ndata\npoints\nS\ncorresponds\nto\nminimizing\na\ncertain\nquantity.\nThis\nquantity\nis\nthe\nempirical\nrisk,\ndenoted\nas\nRs(f).\nMathematically ,\nthis\nis\nexpressed\nas\nthe\nminimum\nof\nRs(f)\nover\nall\nfunctions\nf\nin\nthe\nfunction\nclass\nF.\nThe\nempirica