### Setup

In [1]:
# NOTE: This is ONLY necessary in jupyter notebook.
# Details: Jupyter runs an event-loop behind the scenes. 
#          This results in nested event-loops when we start an event-loop to make async queries.
#          This is normally not allowed, we use nest_asyncio to allow it for convenience.  
import nest_asyncio
nest_asyncio.apply()

In [2]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import (
    GPTVectorStoreIndex,
    GPTListIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext
)
from llama_index.data_structs import Node

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


### Load Data

We first show how to convert a Document into a set of Nodes, and insert into a DocumentStore.

In [3]:
# load documents
# documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()

In [4]:
import os
os.environ["OPENAI_API_KEY"] = "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"

from llama_index import download_loader, GPTVectorStoreIndex, ServiceContext, StorageContext, load_index_from_storage
from pathlib import Path
# from utils import *
from langchain.agents import Tool, initialize_agent

urls = [
    "https://stanford-cs324.github.io/winter2022/lectures/introduction/",
    "https://stanford-cs324.github.io/winter2022/lectures/harms-1/",
    "https://stanford-cs324.github.io/winter2022/lectures/harms-2/",
    "https://stanford-cs324.github.io/winter2022/lectures/capabilities/",
]
table_urls = ["https://github.com/Hannibal046/Awesome-LLM#milestone-papers"]

# UnstructuredURLLoader = download_loader("UnstructuredURLLoader")
# loader = UnstructuredURLLoader(urls=urls, continue_on_failure=False, headers={"User-Agent": "value"})
# print(loader.load())

BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
loader = BeautifulSoupWebReader()
documents = loader.load_data(urls=urls)

from llama_index import GPTTreeIndex, SimpleDirectoryReader


### Support for other data_types

In [None]:
# Image


In [5]:
# initialize service context (set chunk size)
service_context = ServiceContext.from_defaults(chunk_size_limit=1024)
nodes = service_context.node_parser.get_nodes_from_documents(documents)

In [6]:
# initialize storage context (by default it's in-memory)
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

### Define List Index and Vector Index over Same Data 

In [7]:
list_index = GPTListIndex(nodes, storage_context=storage_context)
vector_index = GPTVectorStoreIndex(nodes, storage_context=storage_context)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 24846 tokens
> [build_index_from_nodes] Total embedding token usage: 24846 tokens


### Define Node/Query Engine for these Indices

We define a Node and Query Engine for each Index. We then define an outer "tool" index to store
these Nodes, which can be treated as metadata.

In [8]:
list_index_node = Node(
    "Lecture notes in Introduction.",
    doc_id="list_index"
)
list_query_engine = list_index.as_query_engine(
    response_mode="tree_summarize", use_async=True
)
vector_index_node = Node(
    "Useful for questions around the author's education, from Paul Graham essay on What I Worked On.",
    doc_id="vector_index"
)
vector_query_engine = vector_index.as_query_engine(
    response_mode="tree_summarize", use_async=True
)

### Define a Vector Index Retriever for these Nodes

Define a vector index on top of these Nodes which in turn correspond to the underlying query engines.

In [9]:
# create an outer "tool" index to store the underlying index information
tool_index = GPTVectorStoreIndex([list_index_node, vector_index_node])
# get retriever
tool_retriever = tool_index.as_retriever(similarity_top_k=1)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 40 tokens
> [build_index_from_nodes] Total embedding token usage: 40 tokens


### Define Router Query Engine

We define a router query engine using the vector index retriever as input. This retriever will be used to retrieve "Nodes" which contain metadata for query engines. We also take as input a function that maps a Node to a query engine.

In [10]:
def node_to_query_engine(node: Node):
    """Convert node to query engine."""
    # NOTE: hardcode mapping in this case
    mapping = {
        "list_index": list_query_engine,
        "vector_index": vector_query_engine
    }
    return mapping[node.get_doc_id()]

In [11]:
from llama_index.query_engine.router_query_engine import RetrieverRouterQueryEngine


query_engine = RetrieverRouterQueryEngine(
    tool_retriever,
    node_to_query_engine
)

In [31]:
q = 'Which models did Google release in Oct 2018'
response = query_engine.query(q)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 8 tokens
> [retrieve] Total embedding token usage: 8 tokens
INFO:llama_index.indices.common_tree.base:> Building index from nodes: 7 chunks
> Building index from nodes: 7 chunks
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1501 request_id=9e8d6ea8f052c41efcf6fcf32ff55580 response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1501 request_id=9e8d6ea8f052c41efcf6fcf32ff55580 response_code=200
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1758 request_id=a56b3f1c3adfe30101911e3e7e3b49e0 response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1758 request_id=a56b3f1c3ad

In [32]:
print(str(response))
response.get_formatted_sources()


Google released the BERT (Bidirectional Encoder Representations from Transformers) model and the Perspective API in October 2018.


'> Source (Doc id: 48d7e737-91ef-495c-b521-578cfa7297ff): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n  Introduction | CS324  ...\n\n> Source (Doc id: 10e531be-5f00-4661-bd6c-f3442c4c50bf): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n\\(p\\). In practice, we d...\n\n> Source (Doc id: 116a265b-3f5f-4237-ba86-fb50106a08bb): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n= 0.4, \\quad\\quad\\quad p...\n\n> Source (Doc id: 96e02cf9-2239-44eb-835e-f0929568395c): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nrepresent an element \\(x...\n\n> Source (Doc id: 157f6610-6790-4161-aadb-b39011d58bbf): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n\\underbrace{p(\\text{spee...\n\n> Source (Doc id: c1270b0b-e6fe-4c7e-9073-68889202dfdb): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nvalues of \\(n\\).Now, the...\n\n> Source (D

In [28]:
q = "What are some milestone model architectures and papers in the last few years"
response = query_engine.query(q)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 13 tokens
> [retrieve] Total embedding token usage: 13 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 2041 tokens
> [get_response] Total LLM token usage: 2041 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 2041 tokens
> [get_response] Total LLM token 

In [29]:
print(str(response))
response.get_formatted_sources()


Some milestone model architectures and papers in the last few years include Recurrent Neural Networks (RNNs), including Long Short Term Memory (LSTMs) (2003), Transformers (2017), ELMo (2018), GPT (2018), BERT (2018), XLM (2019), GPT-2 (2019), RoBERTa (2019), Megatron-LM (2019), T5 (2019), Turing-NLG (2020), GPT-3 (2020), Megatron-Turing NLG (2020), and Gopher (2021).


'> Source (Doc id: f5fccc85-2c2e-4708-a844-1f65cf9375e0): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nis access. Whereas small...\n\n> Source (Doc id: c1270b0b-e6fe-4c7e-9073-68889202dfdb): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nvalues of \\(n\\).Now, the...'

In [21]:
q = 'Which models did Google release in Oct 2018'
q = "What are the layers in a transformer block"
response = query_engine.query(q)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 8 tokens
> [retrieve] Total embedding token usage: 8 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1978 tokens
> [get_response] Total LLM token usage: 1978 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1978 tokens
> [get_response] Total LLM token us

In [22]:
print(str(response))


The layers in a transformer block are typically a multi-head attention layer, a feed-forward layer, and a layer normalization layer.


In [23]:
response.get_formatted_sources()

'> Source (Doc id: 157f6610-6790-4161-aadb-b39011d58bbf): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n\\underbrace{p(\\text{spee...\n\n> Source (Doc id: c1270b0b-e6fe-4c7e-9073-68889202dfdb): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nvalues of \\(n\\).Now, the...'

In [24]:
response = query_engine.query('Tell me about datasets used to train LLMs and how they’re cleaned')

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 16 tokens
> [retrieve] Total embedding token usage: 16 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1459 tokens
> [get_response] Total LLM token usage: 1459 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1459 tokens
> [get_response] Total LLM token 

In [25]:
print(str(response))


Large language models (LLMs) are trained on datasets that contain a huge amount of Internet data, such as Reddit. To ensure that the data is clean and free of bias, careful curation is necessary. Datasets such as RealToxicityPrompts are used to evaluate a language model’s propensity for producing toxic content. Additionally, datasets such as GPT-3 have been demonstrated to output anti-Muslim stereotypes. To mitigate these issues, data poisoning attacks can be used to inject poison documents into the training set. Furthermore, legal considerations must be taken into account when training language models on copyright data.


In [26]:
response.get_formatted_sources()

'> Source (Doc id: 58bbc077-ae7e-4fbe-915e-1caba0ffef70): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nfinished the program. He...\n\n> Source (Doc id: d06a0301-f8b0-458f-b678-63d4d477270e): URL: https://stanford-cs324.github.io/winter2022/lectures/harms-1/\n\nmentions 21 definitions). Unf...'