In [1]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

In [3]:
from langchain.document_loaders import PyPDFLoader

# Read budget_speech.pdf into a list of LangChain Document objects
# (the recorded output shows one Document per PDF page, with page metadata).
loader = PyPDFLoader("budget_speech.pdf")
documents = loader.load()

# Dump the loaded Document list, followed by how many entries it contains.
print(documents, len(documents), sep="\n")


[Document(metadata={'producer': 'Adobe Acrobat Pro 10.1.16', 'creator': 'Adobe Acrobat Pro 10.1.16', 'creationdate': '2023-02-01T05:28:04+05:30', 'moddate': '2023-02-01T08:28:21+05:30', 'title': '', 'source': 'budget_speech.pdf', 'total_pages': 58, 'page': 0, 'page_label': '1'}, page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023'), Document(metadata={'producer': 'Adobe Acrobat Pro 10.1.16', 'creator': 'Adobe Acrobat Pro 10.1.16', 'creationdate': '2023-02-01T05:28:04+05:30', 'moddate': '2023-02-01T08:28:21+05:30', 'title': '', 'source': 'budget_speech.pdf', 'total_pages': 58, 'page': 1, 'page_label': '2'}, page_content=''), Document(metadata={'producer': 'Adobe Acrobat Pro 10.1.16', 'creator': 'Adobe Acrobat Pro 10.1.16', 'creationdate': '2023-02-01T05:28:04+05:30', 'moddate': '2023-02-01T08:28:21+05:30', 'title': '', 'source': 'budget_speech.pdf', 'total_pages': 58, 'page': 2, 'page_label': '3'}, page_content='CON

In [4]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=20) for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
texts = text_splitter.split_documents(documents)

# Show every chunk, then the total number of chunks produced.
print(texts, len(texts), sep="\n")

[Document(metadata={'producer': 'Adobe Acrobat Pro 10.1.16', 'creator': 'Adobe Acrobat Pro 10.1.16', 'creationdate': '2023-02-01T05:28:04+05:30', 'moddate': '2023-02-01T08:28:21+05:30', 'title': '', 'source': 'budget_speech.pdf', 'total_pages': 58, 'page': 0, 'page_label': '1'}, page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023'), Document(metadata={'producer': 'Adobe Acrobat Pro 10.1.16', 'creator': 'Adobe Acrobat Pro 10.1.16', 'creationdate': '2023-02-01T05:28:04+05:30', 'moddate': '2023-02-01T08:28:21+05:30', 'title': '', 'source': 'budget_speech.pdf', 'total_pages': 58, 'page': 2, 'page_label': '3'}, page_content='CONTENTS \nPART-A \n Page No. \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \niii. Infrastructure an

In [5]:
# Inspect the first chunk: its page_content followed by its metadata.
first_chunk = texts[0]
print(first_chunk)

page_content='GOVERNMENT OF INDIA
BUDGET 2023-2024
SPEECH
OF
NIRMALA SITHARAMAN
MINISTER OF FINANCE
February 1,  2023' metadata={'producer': 'Adobe Acrobat Pro 10.1.16', 'creator': 'Adobe Acrobat Pro 10.1.16', 'creationdate': '2023-02-01T05:28:04+05:30', 'moddate': '2023-02-01T08:28:21+05:30', 'title': '', 'source': 'budget_speech.pdf', 'total_pages': 58, 'page': 0, 'page_label': '1'}


In [6]:
# Load variables from a local .env file (if one exists) before building the
# OpenAI client. `load_dotenv` is imported at the top of the notebook but no
# visible cell calls it, so on a fresh kernel the API key would only be found
# if it was already exported in the shell. By default load_dotenv() does not
# override variables that are already set, so this call is safe to repeat.
# NOTE(review): assumes the key is provided as OPENAI_API_KEY — confirm.
load_dotenv()

embeddings = OpenAIEmbeddings()

# Smoke test: embed one short string and check the vector size.
vector = embeddings.embed_query('Testing the embedding model')

print(len(vector))  # 1536 dimensions

  embeddings = OpenAIEmbeddings()


1536


In [7]:
# Embed only the first five chunks as a quick check of the batch API.
sample_texts = [chunk.page_content for chunk in texts[:5]]
doc_vectors = embeddings.embed_documents(sample_texts)

# First line: number of vectors returned (5).
# Second line: the first chunk's 1536-dimensional vector.
print(len(doc_vectors), doc_vectors[0], sep="\n")

5
[-0.003956862265749568, -0.032019648750261254, -0.025336947331107144, 0.02894534547614068, 0.012629393507617376, -0.008610653704253778, -0.022041188586918024, -0.005555890974762594, -0.00942482335081701, -0.009007968022390057, 0.038168255298502395, 0.010616767236548419, -0.00799188484737153, -0.0015737891687101473, -0.021428933400132667, -0.009164288654134841, 0.017377628040252342, 0.0003726859582739326, 0.024281783066829845, -0.010538606920676027, 0.005669874613522737, -0.0003551813138996215, 0.016713265355337004, 0.009639763288143513, 0.0019882011723588336, -0.01447267273807034, 0.020725492419926286, -0.025467213282464363, -0.007184229336566479, 0.018211336834461096, -0.016426678772235002, -0.01672629269553079, 0.01692169348521177, -0.02081667821334731, -0.00529209990869327, -0.03707401087628889, -0.007268902856874475, 0.0006773888027482384, 0.009913324393696887, -0.014186084292323183, 0.01666115971985218, -0.0068781017431738, 0.010473473013674841, 0.018576085596080646, -0.02352623

In [14]:
import os

from langchain.vectorstores.pgvector import PGVector

# The database URL contains a username and password, so it must not be
# committed in the notebook. Read it from the environment instead (a local,
# git-ignored .env file is picked up by load_dotenv()).
# Expected format: postgresql+psycopg2://USER:PASSWORD@HOST:PORT/DBNAME
load_dotenv()
CONNECTION_STRING = os.environ.get("PGVECTOR_CONNECTION_STRING")
if not CONNECTION_STRING:
    raise RuntimeError(
        "PGVECTOR_CONNECTION_STRING is not set; expected a URL of the form "
        "postgresql+psycopg2://USER:PASSWORD@HOST:PORT/DBNAME"
    )
COLLECTION_NAME = 'union_budget_vectors'

# Embed every chunk with `embeddings` and store the vectors in the
# COLLECTION_NAME collection of the configured Postgres database.
# NOTE(review): re-running this cell presumably inserts the same chunks again;
# consider PGVector's pre_delete_collection option if duplicates show up — confirm.
db = PGVector.from_documents(
    embedding=embeddings,
    documents=texts,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)


In [15]:
# Ask the vector store for the two stored chunks closest to the question.
query = "what is hapenning with Make AI in India and Make AI work for India"
similar = db.similarity_search_with_score(query, k=2)

# Each hit prints as a tuple whose first element is the matched Document
# (presumably followed by its score — the recorded output is truncated).
for hit in similar:
    print(hit, end="\n\n")

(Document(metadata={'producer': 'Adobe Acrobat Pro 10.1.16', 'creator': 'Adobe Acrobat Pro 10.1.16', 'creationdate': '2023-02-01T05:28:04+05:30', 'moddate': '2023-02-01T08:28:21+05:30', 'title': '', 'source': 'budget_speech.pdf', 'total_pages': 58, 'page': 17, 'page_label': '18'}, page_content='14 \n \n \n \nMission Karmayogi \n58. Under Mission Karmayogi, Centre, States and Union Territories are \nmaking and implementing capacity-building plans for civil servants. The \ngovernment has also launched an integrated online training platform, iGOT \nKarmayogi, to provide continuous learning opportunities for lakhs of \ngovernment employees to upgrade their skills and facilitate people-centric \napproach.   \n59. For enhancing ease of doing business, more than  \n39,000 compliances have been reduced and more than  \n3,400 legal provisions have been decriminalized. For furthering the trust-\nbased governance, we have introduced the Jan Vishwas Bill to amend 42 \nCentral Acts. This Budget pro

In [16]:
# Embed the search question itself with the same model used for the chunks.
# Note: this rebinds `vector`, replacing the test embedding created earlier.
vector = embeddings.embed_query(query)
# Print the raw embedding (a long list of floats).
print(vector)

[0.002085911548899713, -0.010727545108627097, -0.0007401195376065622, -0.01148926451939084, -0.001974827584578654, 0.012815220116799182, -0.017039939542189297, 0.023965943296732288, -0.005480146430790445, -0.03673179400448553, 0.02922744825176408, 0.007920469055615404, -0.007370338680504665, -0.006178389223990543, 0.009246424653023746, 0.0034101039198663227, 0.022907999981112388, -0.020707476618024308, 0.016757821448866998, -0.0002574328649303859, -0.020326617843964997, 0.027196196489727428, 0.008830300418807228, 0.025461168632546934, -0.011524529048225486, -0.0016274695308356572, 0.01671550401426542, -0.034813388734719744, 0.005261504955031791, 0.010325526685944436, -0.005233292866302792, -0.004429256020937473, -0.013823793092050578, -0.002075332190249319, -0.013090285304354552, 0.007483185638436816, 0.007313914968707949, 0.007081167215754156, 0.012039394894501579, -0.011249463860670118, 0.039214431735605654, 0.0011707906227980134, -0.008132058091268409, -0.02091906565367731, -0.03029