# Take Home Task


### Setup

In [1]:
# NOTE: This is ONLY necessary in jupyter notebook.
# Details: Jupyter runs an event-loop behind the scenes. 
#          This results in nested event-loops when we start an event-loop to make async queries.
#          This is normally not allowed, we use nest_asyncio to allow it for convenience.  
import nest_asyncio
nest_asyncio.apply()

In [2]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import (
    GPTVectorStoreIndex,
    GPTListIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext
)
from llama_index.data_structs import Node

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


### Load Data

We first show how to convert a Document into a set of Nodes, and insert into a DocumentStore.

In [3]:
# load documents
import os
os.environ["OPENAI_API_KEY"] = "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"

from llama_index import download_loader, GPTVectorStoreIndex, ServiceContext, StorageContext, load_index_from_storage
from pathlib import Path
from utils import *
from langchain.agents import Tool, initialize_agent

urls = [
    "https://stanford-cs324.github.io/winter2022/lectures/introduction/",
    "https://stanford-cs324.github.io/winter2022/lectures/harms-1/",
    "https://stanford-cs324.github.io/winter2022/lectures/harms-2/",
    "https://stanford-cs324.github.io/winter2022/lectures/capabilities/",
]
table_urls = ["https://github.com/Hannibal046/Awesome-LLM#milestone-papers"]

# UnstructuredURLLoader = download_loader("UnstructuredURLLoader")
# loader = UnstructuredURLLoader(urls=urls, continue_on_failure=False, headers={"User-Agent": "value"})
# print(loader.load())

BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
loader = BeautifulSoupWebReader()
documents = loader.load_data(urls=urls)

from llama_index import GPTTreeIndex, SimpleDirectoryReader


## Support for other data_types

In [4]:
# Image
from utils import *

image_metadata = get_img_metadata(urls)
all_images = []
for image_m in image_metadata:
    image_doc = Document(text=image_m)
    all_images.append(image_doc)
documents.extend(all_images)



  soup = BeautifulSoup(page)


In [5]:
# Tables
# Direct Querying
from utils import *
table_loader = get_table_metadata(table_urls)
table_docs = [Document(text=t.to_string()) for t in table_loader]
documents.extend(table_docs)

# Alternatives
# Google's TAPAS Model
# PandasAIReader = download_loader("PandasAIReader")

In [6]:
#pdfs
from utils import *
pdf_loader = get_pdf_metadata(urls)
pdf_docs = [Document(text=t) for t in pdf_loader]
documents.extend(pdf_docs)

# fetch detail/title about paper from google search api, knowing author's name, year    
# Alternatives
# Short Summary (One-line or 50-60 words) for PDFs from Abstract or Complete Document but Abstract and Introduction should be fine



  soup = BeautifulSoup(page)


  soup = BeautifulSoup(page)


In [7]:
#weblinks
from utils import *
link_loader = get_url_metadata(urls)
link_docs = [Document(text=t) for t in link_loader]
documents.extend(link_docs)




  soup = BeautifulSoup(page)


In [10]:
# initialize service context (set chunk size)
service_context = ServiceContext.from_defaults(chunk_size_limit=1024)
nodes = service_context.node_parser.get_nodes_from_documents(documents)

In [11]:
# initialize storage context (by default it's in-memory)
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

### Define List Index and Vector Index over Same Data 

In [13]:
list_index = GPTListIndex(nodes, storage_context=storage_context)
vector_index = GPTVectorStoreIndex(nodes, storage_context=storage_context)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:openai:error_code=None error_message='You exceeded your current quota, please check your plan and billing details.' error_param=None error_type=insufficient_quota message='OpenAI API error received' stream_error=False
error_code=None error_message='You exceeded your current quota, please check your plan and billing details.' error_param=None error_type=insufficient_quota message='OpenAI API error received' stream_error=False
INFO:openai:error_code=None error_message='You exceeded your current quota, please check your plan and billing details.' error_param=None error_type=insufficient_quota message='OpenAI API error received' stream_error=False
error

RetryError: RetryError[<Future at 0x1c3a2908c40 state=finished raised RateLimitError>]

### Define Node/Query Engine for these Indices

We define a Node and Query Engine for each Index. We then define an outer "tool" index to store
these Nodes, which can be treated as metadata.

In [9]:
list_index_node = Node(
    "Lecture notes in Introduction.",
    doc_id="list_index"
)
list_query_engine = list_index.as_query_engine(
    response_mode="tree_summarize", use_async=True
)
vector_index_node = Node(
    "Useful for questions around the author's education, from Paul Graham essay on What I Worked On.",
    doc_id="vector_index"
)
vector_query_engine = vector_index.as_query_engine(
    response_mode="tree_summarize", use_async=True
)

NameError: name 'list_index' is not defined

### Define a Vector Index Retriever for these Nodes

Define a vector index on top of these Nodes which in turn correspond to the underlying query engines.

In [10]:
# create an outer "tool" index to store the underlying index information
tool_index = GPTVectorStoreIndex([list_index_node, vector_index_node])
# get retriever
tool_retriever = tool_index.as_retriever(similarity_top_k=1)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 28 tokens
> [build_index_from_nodes] Total embedding token usage: 28 tokens


### Define Router Query Engine

We define a router query engine using the vector index retriever as input. This retriever will be used to retrieve "Nodes" which contain metadata for query engines. We also take as input a function that maps a Node to a query engine.

In [11]:
def node_to_query_engine(node: Node):
    """Convert node to query engine."""
    # NOTE: hardcode mapping in this case
    mapping = {
        "list_index": list_query_engine,
        "vector_index": vector_query_engine
    }
    return mapping[node.get_doc_id()]

In [12]:
from llama_index.query_engine.router_query_engine import RetrieverRouterQueryEngine


query_engine = RetrieverRouterQueryEngine(
    tool_retriever,
    node_to_query_engine
)

### Test Runs

In [None]:
q = "What are some milestone model architectures and papers in the last few years"
response = query_engine.query(q)

In [None]:
print(str(response))
response.get_formatted_sources()

In [13]:
q = 'Which models did Google release in Oct 2018'
response = query_engine.query(q)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 8 tokens
> [retrieve] Total embedding token usage: 8 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1670 tokens
> [get_response] Total LLM token usage: 1670 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1670 tokens
> [get_response] Total LLM token us

In [14]:
print(str(response))
response.get_formatted_sources()


Google released the BERT model in October 2018.


'> Source (Doc id: a49c4ab3-63b8-41a6-9264-2d5c6ab04913): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nis access. Whereas small...\n\n> Source (Doc id: 5893ad92-6232-4079-bfee-ee8770085fe4): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nTrevor Gale, Lauren E. G...'

In [15]:
q = "What are some milestone model architectures and papers in the last few years"
response = query_engine.query(q)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 13 tokens
> [retrieve] Total embedding token usage: 13 tokens
INFO:llama_index.indices.common_tree.base:> Building index from nodes: 7 chunks
> Building index from nodes: 7 chunks
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=3013 request_id=fa6c89527d7ce11bff281573de3e6e6d response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=3013 request_id=fa6c89527d7ce11bff281573de3e6e6d response_code=200
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=5214 request_id=91fd4d1694cf984628d484c3a28bb3d9 response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=5214 request_id=91fd4d169

In [16]:
print(str(response))
response.get_formatted_sources()


Some milestone model architectures and papers in the last few years include: 
- BERT (Devlin et al., 2018)
- GPT-2 (Radford et al., 2019)
- XLNet (Yang et al., 2019)
- Transformer-XL (Dai et al., 2019)
- RoBERTa (Liu et al., 2019)
- ALBERT (Lan et al., 2019)
- T5 (Raffel et al., 2019)
- GPT-3 (Brown et al., 2020)
- ELECTRA (Clark et al., 2020)
- Reformer (Kitaev et al., 2020)
- BART (Lewis et al., 2020)
- Longformer (Beltagy et al., 2020)
- CTRL (Keskar et al., 2020)
- DeBERTa (He et al., 2020)
- SpanBERT (Joshi et al., 2020)
- XLM-RoBERTa (Conneau et al., 2020)
- MT-DNN (Liu et al., 2019)
- ERNIE (Zhang et al., 2019)
- XLM (Conneau et al., 2019


'> Source (Doc id: b61ffd5a-d0ab-40fa-9961-b053a0fe7b26): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n  Introduction | CS324  ...\n\n> Source (Doc id: 0c891c44-4121-43e8-aa90-939e3cb06cf0): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n\\(p\\). In practice, we d...\n\n> Source (Doc id: 2314bb85-5f53-4edf-915a-8f85908e3b0e): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n= 0.4, \\quad\\quad\\quad p...\n\n> Source (Doc id: 53d6cd2b-4b54-44bf-82e1-bd4b264e4e8f): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nrepresent an element \\(x...\n\n> Source (Doc id: c143e8b7-fe6b-4945-96e9-0b0bd91bda0b): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n\\underbrace{p(\\text{spee...\n\n> Source (Doc id: 0807b8ef-14c1-491c-ac46-aca95459f1bb): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nvalues of \\(n\\).Now, the...\n\n> Source (D

In [23]:
q = 'Which models did Google release in Oct 2018'
q = "What are the layers in a transformer block"
response = query_engine.query(q)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 8 tokens
> [retrieve] Total embedding token usage: 8 tokens
INFO:llama_index.indices.common_tree.base:> Building index from nodes: 7 chunks
> Building index from nodes: 7 chunks
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1425 request_id=9a76606045b60c014da5246a04a5a8ed response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1425 request_id=9a76606045b60c014da5246a04a5a8ed response_code=200
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1944 request_id=3c7b66ceb9cb88beb20cc527504f9ac9 response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1944 request_id=3c7b66ceb9c

In [24]:
print(str(response))
response.get_formatted_sources()


The layers in a transformer block are typically composed of a multi-head attention layer, a feed-forward layer, and a layer normalization layer.


'> Source (Doc id: b61ffd5a-d0ab-40fa-9961-b053a0fe7b26): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n  Introduction | CS324  ...\n\n> Source (Doc id: 0c891c44-4121-43e8-aa90-939e3cb06cf0): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n\\(p\\). In practice, we d...\n\n> Source (Doc id: 2314bb85-5f53-4edf-915a-8f85908e3b0e): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n= 0.4, \\quad\\quad\\quad p...\n\n> Source (Doc id: 53d6cd2b-4b54-44bf-82e1-bd4b264e4e8f): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nrepresent an element \\(x...\n\n> Source (Doc id: c143e8b7-fe6b-4945-96e9-0b0bd91bda0b): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n\\underbrace{p(\\text{spee...\n\n> Source (Doc id: 0807b8ef-14c1-491c-ac46-aca95459f1bb): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nvalues of \\(n\\).Now, the...\n\n> Source (D

In [25]:
response = query_engine.query('Tell me about datasets used to train LLMs and how they’re cleaned')

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 16 tokens
> [retrieve] Total embedding token usage: 16 tokens
INFO:llama_index.indices.common_tree.base:> Building index from nodes: 8 chunks
> Building index from nodes: 8 chunks
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=4285 request_id=2424d154827e47db1739ba6021e0a124 response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=4285 request_id=2424d154827e47db1739ba6021e0a124 response_code=200
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=5350 request_id=0df209add34067cb7bd35cffc15f903e response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=5350 request_id=0df209add

In [26]:
print(str(response))
response.get_formatted_sources()


Datasets used to train language models (LLMs) are typically large collections of text, such as books, articles, or other documents. These datasets are often pre-processed and cleaned to remove any unwanted content, such as profanity, offensive language, and other inappropriate content. Additionally, the datasets are often filtered to remove any duplicates or near-duplicates, as well as any content that is not relevant to the task at hand. This helps to ensure that the language model is trained on only the most relevant and accurate data. Additionally, the datasets are often annotated with labels or tags to help the model better understand the context of the text. Common datasets used to train language models include the Penn Treebank, the Brown Corpus, and the Google Billion Word Corpus. These datasets are typically pre-processed to remove punctuation, capitalization, and other non-textual elements. Additionally, the datasets are often tokenized, which involves splitting the text into

'> Source (Doc id: b61ffd5a-d0ab-40fa-9961-b053a0fe7b26): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n  Introduction | CS324  ...\n\n> Source (Doc id: 0c891c44-4121-43e8-aa90-939e3cb06cf0): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n\\(p\\). In practice, we d...\n\n> Source (Doc id: 2314bb85-5f53-4edf-915a-8f85908e3b0e): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n= 0.4, \\quad\\quad\\quad p...\n\n> Source (Doc id: 53d6cd2b-4b54-44bf-82e1-bd4b264e4e8f): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nrepresent an element \\(x...\n\n> Source (Doc id: c143e8b7-fe6b-4945-96e9-0b0bd91bda0b): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\n\\underbrace{p(\\text{spee...\n\n> Source (Doc id: 0807b8ef-14c1-491c-ac46-aca95459f1bb): URL: https://stanford-cs324.github.io/winter2022/lectures/introduction/\n\nvalues of \\(n\\).Now, the...\n\n> Source (D

## Attempt at using Conversational Bot with Conversational Memory
- Not working properly 
- Provided query_engine 
- Still not using the document information to answer

In [35]:
from llama_index.langchain_helpers.agents import LlamaToolkit, create_llama_chat_agent, IndexToolConfig
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain import OpenAI
tool_config = IndexToolConfig(
        query_engine=query_engine, 
        name=f"Vector Index ",
        description=f"useful for when you want to answer queries about the document",
        tool_kwargs={"return_direct": True}
    )
toolkit = LlamaToolkit(
    index_configs=[tool_config],
)

memory = ConversationBufferMemory(memory_key="chat_history")
llm = OpenAI(temperature=0)
agent_chain = create_llama_chat_agent(
    toolkit,
    llm,
    memory=memory,
    verbose=True,
)
chat_history = []

import gradio as gr
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")
    chat_history = []
    
    def user(user_message, history):
        # Get response from QA chain
        response = agent_chain.run(input=user_message)
        # Append user message and response to chat history
        history.append((user_message, response)) 
        return gr.update(value=""), history
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch(debug=True)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? No
AI: Hi Bob, nice to meet you![0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? Yes
Action: Vector Index
Action Input: milestone model architectures and papers[0m
Observation: Vector Index is not a valid tool, try another one.
Thought:[32;1m[1;3m Do I need to use a tool? No
AI: Some of the most notable milestone model architectures and papers in the last few years include Google's BERT (Bidirectional Encoder Representations from Transformers), OpenAI's GPT-2 (Generative Pre-trained Transformer 2), and Microsoft's Transformer-XL. Additionally, there have been a number of papers on various topics such as natural language processing, computer vision, and reinforcement learning.[0m

[1m> Finished chain.[0m
Keyboard interruption in main thread... closing server.
