In [2]:
# Download the PDF Reid Hoffman book with GPT-4 from his free download link
!wget -q https://www.impromptubook.com/wp-content/uploads/2023/03/impromptu-rh.pdf

In [5]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 

In [6]:
doc_reader = PdfReader('impromptu-rh.pdf')

In [7]:
# read data from the file and put them into a variable called raw_text
raw_text = ''
for i, page in enumerate(doc_reader.pages):
    text = page.extract_text()
    if text:
        raw_text += text

In [8]:
len(raw_text)

356630

In [9]:
raw_text[:100]

'Impromptu\nAmplifying Our Humanity \nThrough AI\nBy Reid Hoffman  \nwith GPT-4Impromptu: AmplIfyIng our '

In [10]:
# Splitting up the text into smaller chunks for indexing
text_splitter = CharacterTextSplitter(        
    separator = "\n",
    chunk_size = 1000,
    chunk_overlap  = 200, #striding over the text
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

In [11]:
len(texts)

448

In [12]:
texts[20]

'million registered users. \nIn late January 2023, Microsoft1—which had invested $1 billion \nin OpenAI in 2019—announced that it would be investing $10 \nbillion more in the company. It soon unveiled a new version of \nits search engine Bing, with a variation of ChatGPT built into it.\n1 I sit on Microsoft’s Board of Directors. 10Impromptu: Amplifying Our Humanity Through AI\nBy the start of February 2023, OpenAI said ChatGPT had \none hundred million monthly active users, making it the fast-\nest-growing consumer internet app ever. Along with that \ntorrent of user interest, there were news stories of the new Bing \nchatbot functioning in sporadically unusual ways that were \nvery different from how ChatGPT had generally been engaging \nwith users—including showing “anger,” hurling insults, boast-\ning on its hacking abilities and capacity for revenge, and basi-\ncally acting as if it were auditioning for a future episode of Real \nHousewives: Black Mirror Edition .'

In [13]:
embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(texts, embeddings)

In [14]:
docsearch.embedding_function

<bound method OpenAIEmbeddings.embed_query of OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', embedding_ctx_length=8191, openai_api_key=None, openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6)>

In [15]:
query = "how does GPT-4 change social media?"
docs = docsearch.similarity_search(query)

In [16]:
len(docs)

4

In [17]:
docs[0]

Document(page_content='rected ways that tools like GPT-4 and DALL-E 2 enable.\nThis is a theme I’ve touched on throughout this travelog, but \nit’s especially relevant in this chapter. From its inception, social \nmedia worked to recast broadcast media’s monolithic and \npassive audiences as interactive, democratic communities, in \nwhich newly empowered participants could connect directly \nwith each other. They could project their own voices broadly, \nwith no editorial “gatekeeping” beyond a given platform’s terms \nof service.\nEven with the rise of recommendation algorithms, social media \nremains a medium where users have more chance to deter -\nmine their own pathways and experiences than they do in the \nworld of traditional media. It’s a medium where they’ve come \nto expect a certain level of autonomy, and typically they look for \nnew ways to expand it.\nSocial media content creators also wear a lot of hats, especially \nwhen starting out. A new YouTube creator is probably n

# Plain QA chain

In [18]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [19]:
chain = load_qa_chain(OpenAI(), 
                      chain_type="stuff") # we are going to stuff all the docs in at once

In [20]:
chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"

In [21]:
query = "who are the authors of the book?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' Reid Hoffman and Ben Casnocha wrote the book The Startup of You. Chris Yeh co-authored the book, The Alliance, with Reid Hoffman and Ben Casnocha.'

In [22]:
query = "who is the author of the book?"
query_02 = "has it rained this week?"
docs = docsearch.similarity_search(query_02)
chain.run(input_documents=docs, question=query)

' The author of the book is not specified in the given context.'

In [23]:
query = "who is the book authored by?"
docs = docsearch.similarity_search(query,k=4)
chain.run(input_documents=docs, question=query)

' Reid Hoffman and an unidentified co-author.'

# QA chain with Map reduce

In [24]:
chain = load_qa_chain(OpenAI(), 
                      chain_type="stuff")

query = "who is the book authored by?"
docs = docsearch.similarity_search(query,k=20)
chain.run(input_documents=docs, question=query)

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 5435 tokens (5179 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.

In [26]:
chain = load_qa_chain(OpenAI(), 
                      chain_type="map_rerank",
                      return_intermediate_steps=True
                      ) 

query = "who are openai?"
docs = docsearch.similarity_search(query,k=10)
results = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
results

ValueError: Could not parse output:  OpenAI is an artificial intelligence research laboratory founded in San Francisco, California. It is a non-profit organization funded by Elon Musk, Sam Altman, Reid Hoffman, Peter Thiel, Amazon Web Services, Microsoft, and the Government of Ontario. Score: 90

In [27]:
from langchain.chains import RetrievalQA

# set up FAISS as a generic retriever 
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":4})

# create the chain to answer questions 
rqa = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [28]:
rqa("What is OpenAI?")

{'query': 'What is OpenAI?',
 'result': ' OpenAI is a research organization that develops and shares artificial intelligence tools for the benefit of humanity.',
 'source_documents': [Document(page_content='thing that has largely been happening to individuals rather \nthan for them—an under-the-radar force deployed by Big Tech \nwithout much public knowledge, much less consent, via tech -\nnologies like facial recognition and algorithmic decision-mak-\ning on home loans, job applicant screening, social media rec-\nommendations, and more.\nA founding goal of OpenAI was to develop technologies that put \nthe power of AI directly into the hands of millions of people. \nIn this way, AI might function as a decentralized, personally \nempowering force, rather than a top-down, totalizing one. \nBroadly distributed and easily accessible to individuals making \n11  I imagine this is especially true when one of these tools starts be -\nhaving like Microsoft’s Sydney has on at least some occasion

In [29]:
query = "What does gpt-4 mean for creativity?"
rqa(query)['result']

' GPT-4 can be a powerful tool for amplifying creativity, enabling faster and more comprehensive workflows, and providing a versatile brainstorming and production aid. It can be used to generate text, images, music, video, and other forms of media, as well as to help with tasks such as writing a wedding toast, a job offer, a novel in verse, or planning a vacation itinerary.'