In [1]:
# !pip install -U langchain-community
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
import tiktoken
import os
from dotenv import load_dotenv

In [2]:
def load_document(doc):
    """Load a PDF or Word document from disk with the matching loader.

    Args:
        doc: Path to the file. The loader is picked from the file extension,
            compared case-insensitively (``.pdf`` -> ``PyPDFLoader``,
            ``.docx`` -> ``Docx2txtLoader``).

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` when
        the extension is not one of the supported formats.
    """
    _, extension = os.path.splitext(doc)
    # Normalise case so files such as "REPORT.PDF" are not rejected.
    extension = extension.lower()

    if extension == '.pdf':
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        loader_cls = Docx2txtLoader
    else:
        print("Document Format is not Supported")
        return None

    print("Loading: ", doc)
    return loader_cls(doc).load()

In [3]:
def load_wikipedia_loader(query, lang='en', load_max_docs=2):
    """Fetch Wikipedia content for a search query.

    Args:
        query: Search text handed to ``WikipediaLoader``.
        lang: Language code forwarded to the loader (defaults to ``'en'``).
        load_max_docs: Forwarded to the loader under the same name
            (defaults to 2).

    Returns:
        The result of the loader's ``load()`` call.
    """
    print("Loading Query: ", query)
    return WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    ).load()

# PDF Loader

In [4]:
# Load the sample PDF through load_document and keep the result in `response`.
# NOTE(review): absolute, machine-specific path — it will not resolve on another
# machine; consider a configurable data directory.
response= load_document(r"C:\Users\navin\Downloads\CONSTITUTION.pdf")

Loading:  C:\Users\navin\Downloads\CONSTITUTION.pdf


In [5]:
# Show the text of the first loaded entry, terminated by a newline, a line of
# 100 dashes and another newline, then print that entry's metadata.
print("first page: ",response[0].page_content,end='\n'+'-'*100+'\n')
print("metadata: ", response[0].metadata)

first page:  1 
 
CONSTITUTION 
Introduction:- The Indian constitution is unique in its content and spirit. The salient features of 
the constitution are as follows:- 
 Lengthiest written Constitution 
 Blend of Rigidity and Flexibility 
 Federal system with unitary features 
 Parliamentary form of Government 
 Independent judiciary 
 Single citizenship 
 Emergency provision 
Structure:- The Indian Constitution originally consisted of 395 Art, 22 parts, 8 Schedules.  But 
after the Constitution 104th  Amendment Act, 2003, the Indian Con stitution Consists of 44 8 
Art, 25 parts, 12 Schedules.  
Preamble:- The preamble to the constitution is based on the “ objective resolution”  drafted and 
moved by Pandit Nehru and adopted by constituent assembly. It runs as follows:- 
“We THE PEOPLE OF INDIA,  having solemnly resolved to co nstitute India in to a SOVERIGN 
SOCIALIST SECULAR DEMOCRATIC REPUBLIC and to secure to all its citizen:- 
 JUSTICE, social, Economic, and Political; 
 L

In [6]:
# Number of entries the loader returned, reported as pages.
print(f"We have {len(response)} pages in our document")

We have 16 pages in our document


In [7]:
# Character count of the first entry's text.
print(f"We have {len(response[0].page_content)} characters on first page of our document")

We have 2358 characters on first page of our document


In [8]:
# Report the character count of every loaded page, labelled by its 0-based index.
for i, page in enumerate(response):
    print(f"We have {len(page.page_content)} characters on {i} page of our document")

We have 2358 characters on 0 page of our document
We have 2381 characters on 1 page of our document
We have 2568 characters on 2 page of our document
We have 3815 characters on 3 page of our document
We have 2053 characters on 4 page of our document
We have 2672 characters on 5 page of our document
We have 2367 characters on 6 page of our document
We have 2530 characters on 7 page of our document
We have 2074 characters on 8 page of our document
We have 2298 characters on 9 page of our document
We have 2712 characters on 10 page of our document
We have 2970 characters on 11 page of our document
We have 2663 characters on 12 page of our document
We have 2760 characters on 13 page of our document
We have 3560 characters on 14 page of our document
We have 2549 characters on 15 page of our document


# Word Doc Loader

In [9]:
# Load the sample Word document through load_document and keep the result in `res`.
# NOTE(review): absolute, machine-specific path — it will not resolve on another
# machine; consider a configurable data directory.
res= load_document(r"C:\Users\navin\Downloads\budget_speec.docx")

Loading:  C:\Users\navin\Downloads\budget_speec.docx


In [10]:
# Report the character count of every entry loaded from the Word document,
# labelled by its 0-based index.
for i, entry in enumerate(res):
    print(f"We have {len(entry.page_content)} characters on {i} page of our document")

We have 89040 characters on 0 page of our document


# Wikipedia Loader

In [11]:
# Query Wikipedia for "AI" using the helper's defaults (lang='en', load_max_docs=2).
res_wiki= load_wikipedia_loader("AI")

Loading Query:  AI




  lis = BeautifulSoup(html).find_all('li')


In [12]:
# Print the full text of the first Wikipedia result.
print(res_wiki[0].page_content)

Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.
High-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); virtual assistants (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., language models and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go). However, many AI applications are not perceived as AI: "A lot of cutting edge AI has filtered into general applications, often without being calle

# Chunking Strategy

In [13]:
def chunk_data(data, chunk_size=256):
    """Split loaded documents into chunks with no overlap.

    Args:
        data: Documents passed to the splitter's ``split_documents``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``
            (defaults to 256).

    Returns:
        The value returned by ``split_documents``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
    ).split_documents(data)

In [14]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost for a set of documents.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name handed to ``tiktoken.encoding_for_model`` to select
            the tokenizer.
        price_per_1k_tokens: USD price per 1,000 tokens used for the estimate.
            The default keeps the rate this notebook has always used.
            NOTE(review): verify the rate against the provider's current
            pricing and pass it explicitly if it differs.

    Returns:
        None. The token total and the cost estimate are printed.
    """
    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print("Total Tokens: ", total_tokens)
    print(f"Embedding Cost in USD: {total_tokens/1000 * price_per_1k_tokens:.6f}")

In [15]:
# Split the loaded PDF pages with the default chunk_size (256).
chunks= chunk_data(response)

In [16]:
# How many chunks the splitter produced.
print(len(chunks))

203


In [17]:
# Token total and estimated embedding cost for all chunks.
print_embedding_cost(chunks)

Total Tokens:  9387
Embedding Cost in USD: 0.003755


# Vector DB - Pinecone

In [None]:
# Call load_dotenv, then build the Pinecone client from the PINECONE_API_KEY
# environment variable.
# NOTE(review): os.getenv returns None when the variable is unset, so a missing
# key is passed on to Pinecone as api_key=None — confirm how the client reports that.
load_dotenv()
pc= Pinecone(api_key= os.getenv('PINECONE_API_KEY'))

# Working with Pinecone

In [19]:
index_name = 'langchain'

# Create the serverless index only when no index with this name exists yet.
existing_names = pc.list_indexes().names()

if index_name in existing_names:
    print(f"Index with {index_name} Already Exists")
else:
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )
    print("Index Created Successfully!")

Index with langchain Already Exists


In [20]:
# Names of the indexes visible to this client.
pc.list_indexes().names()

['langchain']

In [21]:
# Interactive guard: the index is deleted only when the user types "delete"
# (surrounding whitespace and letter case are ignored); any other answer
# prints the index statistics instead.
user_input = input("If Wanted to Delete Index type = 'delete'")
wants_delete = user_input.strip().lower() == 'delete'

if not wants_delete:
    print("Describing the stats of index")
    index = pc.Index(index_name)
    print(f"Following are stats of {index_name} index")
    print(index.describe_index_stats())
else:
    print(f"Deleting Index with name : {index_name}")
    if index_name not in pc.list_indexes().names():
        print(f"Index with name {index_name} does not exist!")
    else:
        pc.delete_index(index_name)
        print(f"Deleted {index_name} index successfully!")

Describing the stats of index
Following are stats of langchain index
{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [22]:
# Show the description of the 'langchain' index (the name is written out here
# rather than taken from index_name).
pc.describe_index('langchain')

{
    "name": "langchain",
    "metric": "cosine",
    "host": "langchain-lvsncdf.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}

# Working with Vectors

In [32]:
import random

# Five placeholder vectors, each holding 1536 random floats from random.random().
vectors = [
    [random.random() for _component in range(1536)]
    for _row in range(5)
]

print(vectors)

[[0.19269966578842124, 0.6742326128352025, 0.8592854877636491, 0.6967002099443198, 0.6246644112317148, 0.38022040964746917, 0.6971627414233921, 0.861090429379947, 0.7456588163108127, 0.9078006614554927, 0.24794321976475808, 0.6219056311648882, 0.22798417799282011, 0.8692841708047833, 0.8260290178124036, 0.408080595053087, 0.347034980470897, 0.897559245058877, 0.80660074553664, 0.2522289377013045, 0.45646459774058834, 0.30048308790684997, 0.78013800057575, 0.5072869872185846, 0.8835288095318399, 0.5449158582027823, 0.5631682853221552, 0.6920532894590027, 0.17632020768891554, 0.25122613534908667, 0.5573310345682261, 0.5010848984279194, 0.19856000475132451, 0.0812812804470946, 0.35661565726028943, 0.48335800105375337, 0.7048689249069761, 0.2093372610957045, 0.6386688506608615, 0.4733591395558193, 0.8132703805984066, 0.7499093287994951, 0.2867126012908483, 0.13881030889952783, 0.021101381245555606, 0.7955780292935426, 0.9915768992619063, 0.8459370947363413, 0.7114406904871461, 0.1651193608

In [33]:
# One single-letter id per placeholder vector.
ids = ['a', 'b', 'c', 'd', 'e']
print(ids)

['a', 'b', 'c', 'd', 'e']


## Insert Vectors to DB

In [34]:
index_name = 'langchain'

# Handle to the index that the following cells operate on.
index = pc.Index(index_name)

# Pair every id with its vector and upsert them in one call. The pairs are
# materialised into a list because zip() yields a one-shot iterator, while the
# upsert API documents a list of (id, values) tuples.
index.upsert(vectors=list(zip(ids, vectors)))

{'upserted_count': 5}

## Updating Values in DB

In [39]:
# Upsert id 'c' again, this time with a vector of 1536 values that are all 0.5.
index.upsert(vectors=[('c', [0.5] * 1536)])

{'upserted_count': 1}

# Fetching The Values from DB

In [28]:
# Fetch the stored vectors for ids 'c' and 'd' (no namespace argument given).
index.fetch(ids=['c','d'])

FetchResponse(namespace='', vectors={'c': Vector(id='c', values=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 

In [30]:
# Interactive guard: vectors are removed only when the user types "delete"
# (surrounding whitespace and letter case are ignored).
ids_to_delete = ['c', 'd']
user_input = input("Type Here [delete]: ")
if user_input.strip().lower() == 'delete':
    # Report the vector ids being removed; the message previously printed the
    # index name instead.
    print(f"Deleting vector with id : {ids_to_delete}")
    index.delete(ids=ids_to_delete)

Deleting vector with id : langchain


In [31]:
# Index statistics after the delete cell above.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 3}},
 'total_vector_count': 3,
 'vector_type': 'dense'}

In [36]:
# Fetch an id that is not stored: the saved output shows an empty `vectors`
# mapping rather than an error.
index.fetch(ids=['x'])

FetchResponse(namespace='', vectors={}, usage={'read_units': 1})

# Find Similar Vectors in DB

In [37]:
# Random 1536-value query vector. It relies on `random` having been imported by
# an earlier cell.
query_vector= [random.random() for _ in range(1536)]

print(query_vector)

[0.1802484505427031, 0.9318829507095917, 0.8474121202269148, 0.3339049184309424, 0.9573288648196038, 0.671476951330882, 0.8332101093272791, 0.17941235798211197, 0.21472096022227638, 0.2692796958769119, 0.9011263951340521, 0.15284144015684564, 0.7078295015207233, 0.6302096432629346, 0.36147191865863715, 0.18160450771533698, 0.8644932424679553, 0.9177488123767626, 0.6017659884381508, 0.8429950702067606, 0.3287093481097496, 0.1279456709459349, 0.7466496447838961, 0.20033044890045193, 0.03214438470579828, 0.9558337618034042, 0.38177838699634226, 0.2509741550303465, 0.47165481906552953, 0.4591051599147077, 0.7893160336176288, 0.39450923092114787, 0.34688130534651207, 0.23495613616962518, 0.908763717054276, 0.5338139346955378, 0.7490397682522958, 0.8425692190387583, 0.3623590290991263, 0.9731251922942709, 0.4478190132561385, 0.28737075382572874, 0.8563356917350239, 0.006014677535571833, 0.1571578041207874, 0.715198851166649, 0.591464919202748, 0.603089579833811, 0.9121924279497655, 0.8279158

In [40]:
# Ask for the 3 best matches for query_vector. With include_values=False the
# saved output lists each match's id and score with an empty `values` list.
index.query(
    vector=query_vector,
    top_k=3,
    include_values=False
)

{'matches': [{'id': 'c', 'score': 0.864731, 'values': []},
             {'id': 'a', 'score': 0.761441231, 'values': []},
             {'id': 'b', 'score': 0.752476513, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 1}}

# Namespaces

In [41]:
# Index statistics before any named namespace is written to.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5,
 'vector_type': 'dense'}

In [42]:
# Three random 1536-value vectors with ids x, y, z, written to "first-namespace".
ids = list('xyz')
vc = [[random.random() for _ in range(1536)] for _ in range(3)]
# Pass a concrete list of (id, values) tuples rather than a one-shot zip iterator.
index.upsert(vectors=list(zip(ids, vc)), namespace="first-namespace")

{'upserted_count': 3}

In [45]:
# Two random 1536-value vectors with ids p, q, written to "second-namespace".
ids = list('pq')
vc = [[random.random() for _ in range(1536)] for _ in range(2)]
# Pass a concrete list of (id, values) tuples rather than a one-shot zip iterator.
index.upsert(vectors=list(zip(ids, vc)), namespace="second-namespace")

{'upserted_count': 2}

In [48]:
# Index statistics after both namespace upserts; the saved output reports one
# vector count per namespace.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5},
                'first-namespace': {'vector_count': 3},
                'second-namespace': {'vector_count': 2}},
 'total_vector_count': 10,
 'vector_type': 'dense'}

In [49]:
# Remove the "second-namespace" namespace from the index.
index.delete_namespace(namespace='second-namespace')

{}

In [50]:
# Index statistics after deleting "second-namespace".
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5},
                'first-namespace': {'vector_count': 3}},
 'total_vector_count': 8,
 'vector_type': 'dense'}

In [51]:
# Fetch 'x' without a namespace argument: the saved output is empty, because
# 'x' was upserted into "first-namespace".
index.fetch(ids=['x'])

FetchResponse(namespace='', vectors={}, usage={'read_units': 1})

## Specify the Namespace Explicitly in Every Operation

In [52]:
# Same fetch with the namespace given explicitly; this time 'x' is returned.
index.fetch(ids=['x'], namespace='first-namespace')

FetchResponse(namespace='first-namespace', vectors={'x': Vector(id='x', values=[0.35236308, 0.19985649, 0.687280178, 0.986742616, 0.785104215, 0.36542, 0.405518264, 0.31122762, 0.878775716, 0.581108809, 0.520713687, 0.568393588, 0.289077431, 0.344123214, 0.178936109, 0.106466793, 0.800077, 0.517199755, 0.507592916, 0.978749275, 0.569881082, 0.031364128, 0.763770342, 0.0302541461, 0.317531317, 0.948826134, 0.637510896, 0.77795583, 0.982742488, 0.766342103, 0.339690149, 0.505052865, 0.456717908, 0.343368262, 0.788662553, 0.488971621, 0.338482887, 0.655329466, 0.111467972, 0.93746388, 0.818587363, 0.522515, 0.708666086, 0.755623043, 0.957695067, 0.30599314, 0.872869194, 0.236077175, 0.986540914, 0.1281223, 0.834288061, 0.849640667, 0.00472515915, 0.2726928, 0.22905083, 0.0807790235, 0.386660248, 0.122068323, 0.209682763, 0.918386817, 0.258596331, 0.627535939, 0.492555439, 0.89601922, 0.295947164, 0.577468038, 0.733516812, 0.991445959, 0.588416457, 0.236956254, 0.73686409, 0.863334596, 0.7

In [55]:
# Delete id 'z' from "first-namespace" only.
index.delete(ids=['z'], namespace='first-namespace')

{}

In [58]:
# Index statistics after removing 'z' from "first-namespace".
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5},
                'first-namespace': {'vector_count': 2}},
 'total_vector_count': 7,
 'vector_type': 'dense'}