In [2]:
import os 
import openai
from PyPDF2 import PdfReader

from langchain.embeddings.openai import OpenAIEmbeddings
from openai.embeddings_utils import get_embedding, cosine_similarity
from langchain.vectorstores import FAISS 
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter


In [3]:
# Azure OpenAI connection settings for the openai SDK.
# NOTE(review): 'XXX' looks like a redacted placeholder -- supply real values
# through the environment rather than committing them to the notebook.
openai.api_type = 'azure'
openai.api_version = '2023-05-15'

# Publish the key and endpoint as environment variables, then read them back
# into notebook-level names.
os.environ.update({"OPENAI_API_KEY": 'XXX', "OPENAI_API_BASE": 'XXX'})
openai_api_key = os.getenv('OPENAI_API_KEY')
openai_api_base = os.getenv('OPENAI_API_BASE')

In [4]:
!pip show langchain
!pip upgrade langchain

Name: langchain
Version: 0.0.134
Summary: Building applications with LLMs through composability
Home-page: https://www.github.com/hwchase17/langchain
Author: 
Author-email: 
License: MIT
Location: /opt/homebrew/lib/python3.11/site-packages
Requires: aiohttp, dataclasses-json, numpy, openapi-schema-pydantic, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


In [5]:
# NOTE(review): '{Document Path}}' looks like a placeholder -- point it at the
# actual PDF before running.
doc_reader = PdfReader('{Document Path}}')

# Extract the text of every page and concatenate it into raw_text; pages for
# which extract_text() yields nothing (empty/None) are skipped.
raw_text = ''.join(filter(None, (page.extract_text() for page in doc_reader.pages)))

# Total number of characters extracted from the document.
print(len(raw_text))

178210


## Split the document into chunks

In [6]:
# Cut the raw text into smaller pieces for indexing. Splits happen on
# newlines; sizes are measured with len(), i.e. in characters. Consecutive
# chunks overlap by up to 200 characters (striding over the text).
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
texts = text_splitter.split_text(raw_text)


In [7]:
# Number of chunks produced by the splitter.
print(len(texts))

222


In [8]:
# Inspect the first chunk to see what the raw extracted text looks like.
print(texts[0])

Seneca
c
. 5 
BC
—
AD
 65SenecaOn the Shortness of Life
TRANSLATED BY
 
C.
 
D.
 
N. COSTA
PENGUIN BOOKS — GREAT IDEASPENGUIN BOOKS
Published by the Penguin Group
Penguin Books Ltd, 80 Strand, London 
WC2R 0RL
, England
Penguin Group (USA) Inc., 375 Hudson Street, New York, New York 10014, USA
Penguin Books Australia Ltd, 250 Camberwell Road, Camberwell, Victoria 3124, Australia
Penguin Books Canada Ltd, 10 Alcorn Avenue, Toronto, Ontario, Canada, 
M4V 3B2
Penguin Books India (P) Ltd, 11 Community Centre, Panchsheel Park, New Delhi – 110 017,
India
Penguin Group (NZ), Cnr Airborne and Rosedale Roads, Albany, Auckland 1310, New
Zealand
Penguin Books (South Africa) (Pty) Ltd, 24 Sturdee Avenue, Rosebank 2196, South Africa
Penguin Books Ltd, Registered Offices: 80 Strand, London, 
WC2R 0RL
, England
www.penguin.com
First published in 
Dialogues and Letters
 in Penguin Classics 1997
This selection first published in Penguin Books 2004
Translation copyright © C. D. N. Costa, 1997


## Normalise and clean the text for embeddings 

In [13]:
import re
# s is input text
def normalize_text(s, sep_token = " \n "):
    """Clean up a chunk of extracted text before it is embedded.

    Steps, in order:
      1. collapse every run of whitespace (spaces, tabs, newlines) into a
         single space and trim the ends;
      2. delete literal ". ," sequences;
      3. replace ".." and ". ." with a single "." (one pass each);
      4. trim the ends again.

    Args:
        s: Input text.
        sep_token: Unused; kept so existing callers that pass it keep working.

    Returns:
        The normalized string.
    """
    # After this step the string contains no newlines or repeated spaces.
    s = re.sub(r'\s+', ' ', s).strip()
    # The dot must be escaped: an unescaped "." matches ANY character, which
    # silently deleted real text (e.g. "0RL , England" -> "0R England").
    s = re.sub(r"\. ,", "", s)
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    return s.strip()




In [14]:
# Run every chunk through normalize_text; the cleaned list replaces `texts`.
texts = [normalize_text(chunk) for chunk in texts]

## Generate embeddings 

In [15]:
import os
import openai

# Configure the openai SDK for Azure before any API call is made.
# NOTE(review): 'XXX' / "XXX" look like redacted placeholders -- do not commit
# real credentials to the notebook.
os.environ["OPENAI_API_KEY"] = 'XXX'
openai.api_key = os.environ.get("OPENAI_API_KEY")

openai.api_type = "azure"
openai.api_base = "XXX"
openai.api_version = "2023-05-15"

In [16]:
# Ask the configured endpoint which models it exposes and print the raw
# response (each entry reports its capabilities, lifecycle status and id).
models = openai.Model.list()
print(models)

{
  "data": [
    {
      "capabilities": {
        "chat_completion": false,
        "completion": true,
        "embeddings": false,
        "fine_tune": false,
        "inference": false
      },
      "created_at": 1646092800,
      "deprecation": {
        "fine_tune": 1709251200,
        "inference": 1709251200
      },
      "id": "ada",
      "lifecycle_status": "preview",
      "object": "model",
      "status": "succeeded",
      "updated_at": 1646092800
    },
    {
      "capabilities": {
        "chat_completion": false,
        "completion": true,
        "embeddings": false,
        "fine_tune": false,
        "inference": false
      },
      "created_at": 1646092800,
      "deprecation": {
        "fine_tune": 1709251200,
        "inference": 1709251200
      },
      "id": "babbage",
      "lifecycle_status": "preview",
      "object": "model",
      "status": "succeeded",
      "updated_at": 1646092800
    },
    {
      "capabilities": {
        "chat_completion": f

In [17]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embedding client using the "text-embedding-ada-002" model.
# NOTE(review): chunk_size=1 presumably limits each embedding request to a
# single text -- confirm against the OpenAIEmbeddings documentation.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", chunk_size=1)
# Embed the normalized chunks and index them in a Chroma store. The store is
# not persisted (see the "data will be transient" message in the cell output).
docsearch = Chroma.from_texts(texts, embeddings)


# from langchain.vectorstores import Chroma
# embeddings = OpenAIEmbeddings()
# docsearch = Chroma.from_texts(texts, embeddings)


Using embedded DuckDB without persistence: data will be transient


## Plain QA Chain

In [18]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import AzureOpenAI

# Completion LLM backed by the Azure deployment named "text-davinci-003";
# the connection settings are forwarded from the values already set on the
# openai module.
llm = AzureOpenAI(deployment_name="text-davinci-003", model_kwargs={
    "api_key": openai.api_key,
    "api_base": openai.api_base,
    "api_type": openai.api_type,
    "api_version": openai.api_version,
})
# Alternative configuration (disabled):
#llm = AzureOpenAI(model='gpt-35-turbo', chunk_size=1)
# Question-answering chain built around the LLM with chain_type="stuff".
chain = load_qa_chain(llm, chain_type="stuff")

In [19]:
# Question to ask about the document.
query = "Is life short?"

# Fetch the chunks most similar to the question from the vector store, then
# let the QA chain answer using those chunks as context.
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' No, life is not considered short, but it is often wasted in heedless luxury and spent on no good activity. It can be long enough if it is managed properly.'

In [51]:
# Show the prompt template the QA chain sends to the LLM; it has a {context}
# slot and a {question} slot.
chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"

In [52]:
# Re-run the retrieval for the same question and print the first returned
# document to see what context the chain is given.
query = "Is life short?"
docs = docsearch.similarity_search(query)
print(docs[0])

page_content='On Tranquillity of MindOn the Shortness of Life Most human beings, Paulinus, * complain about the meanness of nature, because we are born for a brief span of life, and because this spell of time that has been given to us rushes by so swiftly and rapidly that with very few exceptions life ceases for the rest of us just when we are getting ready for it. Nor is it just the man in the street and the unthinking mass of people who groan over this – as they see it – universal evil: the same feeling lies behind complaints from even distinguished men. Hence the dictum of the greatest of doctors: † ‘Life is short, art is long.’ Hence too the grievance, most improper to a wise man, which Aristotle expressed when he was taking nature to task for indulging animals with such long existences that they can live through five or ten human lifetimes, while a far shorter limit is set for men who are born to a great and extensive destiny. It is not that we' metadata={}


In [53]:

# Answer the current `query` again using the `docs` retrieved in the previous cell.
chain.run(input_documents=docs, question=query)

' No, life is not necessarily short. According to the context, some people feel like life is too short for them because they waste their time. However, if life is managed properly, it can be long enough to achieve the highest goals.'

In [None]:
query = "Is life short?"
# Retrieve the chunks most similar to the question from the vector store.
docs = docsearch.similarity_search(query)
# NOTE(review): search_docs is not defined in the visible notebook; kept only
# as a record of the alternative retrieval that was tried.
#docs = search_docs(texts, query, top_n=3, to_print=True)
# The retrieved documents must be passed explicitly as input_documents; they
# fill the {context} slot of the chain's prompt.
chain.run(input_documents=docs, question=query)

In [None]:
# Ask a different question; k=4 is passed to the similarity search
# (presumably the number of chunks to return -- confirm against the vector
# store documentation).
query = "How can you live longer?"
docs = docsearch.similarity_search(query,k=4)
chain.run(input_documents=docs, question=query)

In [57]:
# Question about the author rather than the text itself; the answer is still
# generated only from the retrieved chunks passed as input_documents.
query = "Did Seneca live a meaningful life?"
docs = docsearch.similarity_search(query,k=4)
chain.run(input_documents=docs, question=query)

' Seneca was a philosopher and writer who lived in the 1st century AD. He wrote On the Shortness of Life, which is a reflection on how to live a meaningful life. It is clear that Seneca himself lived a meaningful life given his writings on the subject.'

In [None]:
# Retrieval-focused question: look up chunks similar to the query and have
# the chain answer from them.
query = "In what context was Cicero mentioned in the book?"
docs = docsearch.similarity_search(query,k=4)
chain.run(input_documents=docs, question=query)

In [None]:
# Open-ended question; the chain's prompt tells the model to say it does not
# know when the retrieved context has no answer.
query = "What is the meaning of life?"
docs = docsearch.similarity_search(query,k=4)
chain.run(input_documents=docs, question=query)