In [39]:
from llama_index import download_loader, ServiceContext, VectorStoreIndex
from dotenv import load_dotenv, find_dotenv
from llama_index.llms import OpenAI
import openai
import os
# Read the local .env file so OPENAI_API_KEY is available in the environment,
# then hand the key to the openai client.
_ = load_dotenv(find_dotenv())
openai.api_key = os.environ["OPENAI_API_KEY"]

llm = OpenAI(model="gpt-3.5-turbo")

# Wikipedia page titles to load; they are reused later to build source URLs.
pages = [
    "Nicolas_Cage_filmography",
    "The_Best_of_Times_(1981_film)",
    "Nicolas_Cage",
]

# Fetch the loader class by name, instantiate it, and load the pages above.
WikipediaReader = download_loader("WikipediaReader")
loader = WikipediaReader()
documents = loader.load_data(pages=pages, auto_suggest=False, redirect=False)

In [40]:
from langchain.docstore.document import Document
# Convert each loaded page into a LangChain Document whose metadata records the
# Wikipedia URL it came from. Titles and documents are paired positionally, so
# fail loudly if the loader returned a different number of documents than
# requested; otherwise the source URLs would be attached to the wrong text.
assert len(documents) == len(pages), (
    f"expected {len(pages)} documents, got {len(documents)}"
)
docs = [
    Document(
        page_content=document.get_content(),
        metadata={"source": "https://en.wikipedia.org/wiki/" + page},
    )
    for page, document in zip(pages, documents)
]

In [9]:
# Collect the raw text (`page_content`) of every entry currently in `docs`.
# NOTE(review): `text` is not referenced by any other cell visible here — confirm it is still needed.
text = [doc.page_content for doc in docs]

In [44]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
# Configure the splitter: chunks capped at 1000 with no overlap, using the
# separator list below (paragraph break, line break, sentence end, space, "").
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""],
)

# NOTE: `docs` is rebound here, so from this point on it holds the chunks
# rather than the page-level documents.
docs = character_splitter.split_documents(docs)

# Sanity check: show the first chunk and how many chunks were produced.
print(docs[0])
print(f"\nTotal chunks: {len(docs)}")

page_content='Nicolas Cage is an American actor and producer who began his acting career in 1981 with a role in the television pilot The Best of Times. The following year, Cage made his feature film acting debut in Fast Times at Ridgemont High, the second and last time he was credited by his birth name Nicolas Coppola; he later changed his name professionally to avoid allegations of nepotism due to his connection to the Coppola family. In 1983, Cage starred in a leading role in the teen romantic comedy Valley Girl alongside Deborah Foreman; the film was praised by critics and summarized by Rotten Tomatoes as a "goofy yet amiable film" with "engaging performances from its two leads."In 1984, Cage portrayed a fictionalized version of Irish-American mob hitman Mad Dog Coll ("Vincent Dwyer") in The Cotton Club and appeared in Birdy, a feature chosen by the National Board of Review as one of the top ten films of that year' metadata={'source': 'https://en.wikipedia.org/wiki/Nicolas_Cage_film

In [47]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# The question used throughout the rest of the notebook.
query = "Who directed the first movie in which Nicolas Cage appeared?"

# Embed every chunk with OpenAI embeddings and index the vectors with FAISS.
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(docs, embeddings)

# Retriever that returns the top 3 matches (k=3) for a query.
retriever = db.as_retriever(search_kwargs={"k": 3})

# Use the Runnable `invoke` entry point (the same interface the chain below
# relies on) instead of the deprecated `get_relevant_documents`.
retrieved_docs = retriever.invoke(query)

In [48]:
# Display the documents the retriever returned for `query`.
retrieved_docs

[Document(page_content='Nicolas Kim Coppola (born January 7, 1964), known by his stage name Nicolas Cage, is an American actor and film producer. He is the recipient of various accolades, including an Academy Award, a Screen Actors Guild Award, and a Golden Globe Award. Known for his versatility as an actor, his participation in various film genres has gained him a cult following.Born into the Coppola family, Cage began his career in films such as Fast Times at Ridgemont High (1982) and Valley Girl (1983), as well various films by his uncle Francis Ford Coppola such as Rumble Fish (1983), The Cotton Club (1984), and Peggy Sue Got Married (1986). He earned critical success for his roles in Moonstruck (1987) and  Raising Arizona (1987), and earned an Academy Award for Best Actor for his performance in the dramatic film Leaving Las Vegas (1995). He received another Academy Award nomination for his performance as twins Charlie and Donald Kaufman in the comedy-drama film Adaptation (2002).'

In [49]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Prompt text with two placeholders: {context} (retrieved text) and {question}.
# Built from adjacent literals; the resulting string is unchanged.
template = (
    "Answer the question based only on the following context:\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Chat model created with the library's default settings.
model = ChatOpenAI()

def format_docs(retrieved_docs: list):
    """Concatenate the text of retrieved documents into one context string.

    Args:
        retrieved_docs: Objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined by a blank line, in input order;
        an empty string when the list is empty.
    """
    separator = "\n\n"
    contents = []
    for document in retrieved_docs:
        contents.append(document.page_content)
    return separator.join(contents)


# Inputs for the prompt: "context" comes from piping the retriever into
# format_docs; "question" is the original input passed through as-is.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Pipeline: build inputs -> fill prompt -> call chat model -> parse to a string.
chain = rag_inputs | prompt | model | StrOutputParser()

chain.invoke(query)


'The context does not provide information about the director of the first movie in which Nicolas Cage appeared.'

# Expansion with multiple queries

# sub questions

In [60]:
from llama_index.question_gen.openai_generator import OpenAIQuestionGenerator
from llama_index.question_gen.llm_generators import LLMQuestionGenerator
from IPython.display import Markdown, display


def display_prompt_dict(prompts_dict):
    """Show every prompt in ``prompts_dict`` with a Markdown header.

    For each ``key -> prompt`` pair, displays a Markdown header containing the
    key, prints the prompt's template text (from ``get_template()``), and then
    displays a Markdown spacer.

    Args:
        prompts_dict: Mapping of prompt key to an object with ``get_template()``.
    """
    spacer = "<br><br>"
    for key, prompt_obj in prompts_dict.items():
        header = f"**Prompt Key**: {key}<br>**Text:** <br>"
        display(Markdown(header))
        print(prompt_obj.get_template())
        display(Markdown(spacer))

from llama_index.llms import OpenAI
# Rebind `llm` to an OpenAI LLM constructed with no explicit arguments.
llm = OpenAI()
# Build the question generator on top of that LLM and show its prompt templates.
question_gen = OpenAIQuestionGenerator.from_defaults(llm=llm)
display_prompt_dict(question_gen.get_prompts())

**Prompt Key**: question_gen_prompt<br>**Text:** <br>

You are a world class state of the art agent.

You have access to multiple tools, each representing a different data source or API.
Each of the tools has a name and a description, formatted as a JSON dictionary.
The keys of the dictionary are the names of the tools and the values are the descriptions.
Your purpose is to help answer a complex user question by generating a list of sub questions that can be answered by the tools.

These are the guidelines you consider when completing your task:
* Be as specific as possible
* The sub questions should be relevant to the user question
* The sub questions should be answerable by the tools provided
* You can generate multiple sub questions for each tool
* Tools must be specified by their name, not their description
* You don't need to use a tool if you don't think it's relevant

Output the list of sub questions by calling the SubQuestionList function.

## Tools
```json
{tools_str}
```

## User Question
{query_str}



<br><br>