In [None]:
!pip install langchain
!pip install pypdf2
!pip install openai
!pip install chromadb
!pip install tiktoken

In [None]:
import getpass
import os

import PyPDF2
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma

Set the OpenAI API key

In [None]:
# Never store the key in the notebook itself. Reuse OPENAI_API_KEY when it is
# already set in the environment; otherwise ask for it with a hidden prompt.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

Load PDF files from a directory

In [None]:
def read_and_textify(files):
    """Extract the text of every page from a collection of PDFs.

    Parameters
    ----------
    files : iterable
        Each item is either an open binary file object exposing ``.name``
        (for example the result of ``open(path, "rb")``) or a ``str`` /
        ``os.PathLike`` path. The item is passed unchanged to
        ``PyPDF2.PdfReader``.

    Returns
    -------
    list
        ``[text_list, sources_list]``: two parallel lists with one entry per
        page. Each source label is ``"<name>_page_<i>"``, where ``<name>`` is
        the file object's ``.name`` (or the path itself for path inputs) and
        ``i`` is the zero-based page index within that PDF.
    """
    text_list = []
    sources_list = []
    for file in files:
        # A str path has no ``.name`` and a Path's ``.name`` is only the
        # basename, so label path inputs with the full path instead.
        if isinstance(file, (str, os.PathLike)):
            label = os.fspath(file)
        else:
            label = file.name
        pdfReader = PyPDF2.PdfReader(file)
        for i, pageObj in enumerate(pdfReader.pages):
            text = pageObj.extract_text()
            # Empty the page object once its text has been taken, as the
            # original code did (presumably to release memory; confirm).
            pageObj.clear()
            text_list.append(text)
            sources_list.append(label + "_page_" + str(i))
    return [text_list, sources_list]

In [None]:
directory = r"/content/drive/MyDrive/ChatGPT/Shared/ChromaDB/Data/"
# os.listdir() returns entries in arbitrary order, so sort the names to get the
# same page order (and the same source labels) on every run. The extension is
# matched case-insensitively so a file such as "REPORT.PDF" is not skipped.
pdf_names = sorted(x for x in os.listdir(directory) if x.lower().endswith(".pdf"))
# The handles are left open here because read_and_textify() reads from them.
files = [open(os.path.join(directory, x), "rb") for x in pdf_names]
print(files)

[<_io.BufferedReader name='/content/drive/MyDrive/ChatGPT/Shared/ChromaDB/Data/Australia - Wikipedia.pdf'>]


In [None]:
# Extract the page texts, then close the handles held in `files` so they are
# not left open for the rest of the session. Because the handles are closed,
# re-running this cell requires re-running the cell that opens `files` first.
try:
    textify_output = read_and_textify(files)
finally:
    for pdf_file in files:
        pdf_file.close()

In [None]:
# Split the two-element result into the page texts and their matching source labels.
documents, sources = textify_output

In [None]:
# Display the extracted texts (one string per PDF page).
documents

['6/3/23, 9:31 PM Australia - Wikipedia\nhttps://en.wikipedia.org/wiki/Australia 1/50Commonwealth of Australia\nFlag\n Coat of arms\nAnthem: "Advance Australia Fair"\nCapital Canberra\n35°18′29″S 149°07′28″E\nLargest city Sydney (metropolitan)\nMelbourne (urban)[a]\nOfficial languages None at the federal level\nNational language English[N 2]\nReligion (2021)[6]43.9% Christianity\n38.9% no religion\n3.2% Islam\n2.7% Hinduism\n2.4% Buddhism\n1.7% other\n7.2% unanswered[5]Australia\nAustralia , officially the Commonwealth of\nAustralia , is a sovereign  country comprising the\nmainland of the Australian contine nt, the islan d\nof Tasmania , and numerous smaller islands .[16]\nAustralia is the largest country by area in Oceania\nand the world\'s sixth-largest count ry. Australia is\nthe oldest,[17] flattest,[18] and driest inha bited\ncontinent,[19][20] with the least fertile soils .[21][22]\nIt is a megadiverse country , and its size gives it a\nwide variety of landscapes and climates, w

In [None]:
# Display the source labels, each of the form "<file name>_page_<index>".
sources

['/content/drive/MyDrive/ChatGPT/Shared/ChromaDB/Data/Australia - Wikipedia.pdf_page_0',
 '/content/drive/MyDrive/ChatGPT/Shared/ChromaDB/Data/Australia - Wikipedia.pdf_page_1',
 '/content/drive/MyDrive/ChatGPT/Shared/ChromaDB/Data/Australia - Wikipedia.pdf_page_2']

In [None]:
# Folder that is handed to Chroma as its persistence location.
persist_directory = "/content/drive/MyDrive/ChatGPT/Shared/ChromaDB/VectorStore/"

# Embedding wrapper; the API key is passed explicitly from the environment variable.
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

In [None]:
# Build the vector store from the page texts. Every text gets a
# {"source": <label>} metadata entry so answers can point back to a page.
metadatas = [{"source": s} for s in sources]
vectordb = Chroma.from_texts(
    documents,
    embeddings,
    metadatas=metadatas,
    persist_directory=persist_directory,
)

# Model choice.
# NOTE(review): no later cell in this notebook reads model_name; the QA chain
# is built with OpenAI() and its defaults.
model_name = "gpt-3.5-turbo"
# model_name = "gpt-4"

In [None]:
# Persist the store (presumably into persist_directory; confirm against the Chroma docs).
vectordb.persist()
# Drop the in-memory handle; the store is re-opened from disk in a later cell.
vectordb = None

In [None]:
# `vectordb` was set to None in the previous cell, so this call cannot succeed.
# Catch and show the error instead of letting it abort a "Run All".
try:
    vectordb.get()
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: ignored

In [None]:
# Re-open the store from `persist_directory` with the same embedding function,
# then list what it contains.
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)
vectordb.get()

{'ids': ['10e2aa54-020d-11ee-a576-0242ac1c000c',
  '10e2ac0c-020d-11ee-a576-0242ac1c000c',
  '10e2acac-020d-11ee-a576-0242ac1c000c'],
 'embeddings': None,
 'documents': ['6/3/23, 9:31 PM Australia - Wikipedia\nhttps://en.wikipedia.org/wiki/Australia 1/50Commonwealth of Australia\nFlag\n Coat of arms\nAnthem: "Advance Australia Fair"\nCapital Canberra\n35°18′29″S 149°07′28″E\nLargest city Sydney (metropolitan)\nMelbourne (urban)[a]\nOfficial languages None at the federal level\nNational language English[N 2]\nReligion (2021)[6]43.9% Christianity\n38.9% no religion\n3.2% Islam\n2.7% Hinduism\n2.4% Buddhism\n1.7% other\n7.2% unanswered[5]Australia\nAustralia , officially the Commonwealth of\nAustralia , is a sovereign  country comprising the\nmainland of the Australian contine nt, the islan d\nof Tasmania , and numerous smaller islands .[16]\nAustralia is the largest country by area in Oceania\nand the world\'s sixth-largest count ry. Australia is\nthe oldest,[17] flattest,[18] and driest

In [None]:
# Question-answering chain over the reloaded vector store, using a default OpenAI() LLM.
# NOTE(review): k=1 presumably limits retrieval to a single document per query; confirm.
qa = VectorDBQAWithSourcesChain.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    vectorstore=vectordb,
    k=1,
)



In [None]:
# Ask using the dict input form; with return_only_outputs=True the returned
# dict holds only the answer and sources, not the input question.
qa({"question": "How large is Australia?"}, return_only_outputs=True)

{'answer': " Australia is the largest country by area in Oceania and the world's sixth-largest country. It has an area of 7,692,024 km2 (2,969,907 sq mi). \n",
 'sources': ' Australia - Wikipedia, https://en.wikipedia.org/wiki/Australia'}

In [None]:
# Same question passed as a plain string; the returned dict also echoes the question.
query = "How large is Australia?"
qa(query)

{'question': 'How large is Australia?',
 'answer': " Australia is the largest country by area in Oceania and the world's sixth-largest country. It has an area of 7,692,024 square kilometres (2,969,907 sq mi).\n",
 'sources': 'Australia - Wikipedia, https://en.wikipedia.org/wiki/Australia'}

In [None]:
# Factual question; the result is a dict with question, answer and sources.
query = "How many language groups are there in Australia?"
qa(query)

{'question': 'How many language groups are there in Australia?',
 'answer': ' There are over 250 language groups in Australia.\n',
 'sources': 'https://en.wikipedia.org/wiki/Australian_Aboriginal_languages'}

In [None]:
# Open-ended summary request (query text kept exactly as it was originally run).
query = "Breifly describe about Indigenous people in Australia"
qa(query)

{'question': 'Breifly describe about Indigenous people in Australia',
 'answer': ' Indigenous Australians comprise two distinct groups: the Aboriginal peoples of the Australian mainland and Torres Strait Islander people. They have an oral culture with spiritual values based on reverence for the land and a belief in the Dreamtime. The oldest human remains found in Australia are the Lake Mungo remains, which have been dated to around 41,000 years ago.\n',
 'sources': '/content/drive/MyDrive/ChatGPT/Shared/ChromaDB/Data/Australia - Wikipedia.pdf_page_2'}