In [29]:
# step 1 load the documents
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("Resume.pdf")
documents = loader.load()


In [30]:
# step 2 split the documents
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(documents)


In [80]:
# step 3 generate embeddings
from langchain.embeddings import SentenceTransformerEmbeddings

model = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")  # 768 dims
text = [doc.page_content for doc in docs]

# Now pass the strings to embed_documents
embeddings = model.embed_documents(text)


print(len(embeddings[0]))  # 768 dims


768


In [56]:
# step 4 setup vector db weaviate 

import weaviate
from weaviate.auth import AuthApiKey
from dotenv import load_dotenv
import os

# Load environment variables from your db.env file
load_dotenv("db.env")

weaviate_url = os.getenv("WEAVIATE_URL")
weaviate_api_key = os.getenv("WEAVIATE_API_KEY")

# Setup authentication using API key
auth = AuthApiKey(api_key=weaviate_api_key)

# Create the Weaviate client (v4 syntax)
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

# Check if the client is ready
print("Client ready:", client.is_ready())

# Close client to clean up resources



Client ready: True


In [69]:
# step 5 create vector db class 
from weaviate.classes.config import Property, DataType, Vectorizers

client.collections.create(
    name="Knowledge",
    properties=[
        Property(name="text", data_type=DataType.TEXT),
    ],
    vectorizer_config={"vectorizer": Vectorizers.NONE}
)


C:\Users\PMLS\anaconda3\Lib\site-packages\weaviate\collections\classes\config.py:1963: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  for cls_field in self.model_fields:


<weaviate.collections.collection.sync.Collection at 0x2cf84cef350>

In [81]:
# step 6 load the data  
collection = client.collections.get("Knowledge")

with collection.batch.fixed_size(batch_size=100) as batch:
    for text_item, embedding_vector in zip(text, embeddings):
        batch.add_object(
            properties={"text": text_item},
            vector=embedding_vector
        )

In [94]:
#step 7 query the data 
collection = client.collections.get("Knowledge")

response = collection.query.near_vector(
    near_vector=embeddings[1],  # your embedding vector, e.g., list of floats
    limit=1
)

for obj in response.objects:
    print(obj.properties)

{'text': 'PROFESSIONAL EXPERIENCE  \n \nPython Developer Intern  | Interncraft, Pakistan                                                                         August 2024-October 2024 \n• Developed a dynamic weather application utilizing the OpenWeatherMap API to deliver accurate, up -to-\ndate weather data . \n• Successfully integrated the solution into a comprehensive agriculture tool, significantly improving \noperational efficiency and decision-making.'}


In [105]:
# step 8

from langchain_community.vectorstores import Weaviate
import weaviate

# Connect to your Weaviate instance


# Create a LangChain vector store from your Weaviate collection
db = Weaviate.from_documents(docs, embeddings, client=collection)

# Now you can use LangChain's retrieval and generation pipeline
# For example, using a RetrievalQA chain (assuming you have a suitable LLM chain set up)
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=model,  # Replace with your LLM, e.g., Google Generative AI if supported
    retriever=db.as_retriever()
)

result = qa_chain.run("Your question here")
print(result)

AttributeError: 'Collection' object has no attribute 'schema'

In [114]:
import google.generativeai as genai  
from dotenv import load_dotenv
import os

# Load environment variables from your db.env file
load_dotenv("db.env")
google_api_key=os.getenv('GEMINI_API_KEY')
genai.configure(api_key=google_api_key) 
model = genai.GenerativeModel('models/gemini-1.5-flash') 
response = model.generate_content("where was newton born , only give me one or two words response in urdu language only") 



In [115]:

print(response.text)

لنکاشائر، انگلینڈ

