In [None]:
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Pinecone as PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

In [51]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [52]:
# Load environment variables.
#
# Credentials belong in the environment (or a local, git-ignored .env file
# picked up by load_dotenv()), never in the notebook itself.
load_dotenv()

# os.getenv() takes the NAME of the variable and returns its value, or None
# when the variable is not set. Expected entries in .env:
#   HUGGINGFACE_API_KEY=<your Hugging Face token>
#   PINECONE_API_KEY=<your Pinecone key>
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Name of the Pinecone index used by the rest of the notebook.
PINECONE_INDEX_NAME = "custom-vectordb"

In [53]:
# Read the source PDF. The path is relative to the working directory, so the
# file must already be present there when this cell runs.
pdf_path = "documents/ABOUT THE WORKCOHOL ORGANIZATION.pdf"
loader = PyPDFLoader(file_path=pdf_path)

# `pages` is what the loader returns; later cells split it into chunks and
# report its length as the page count.
pages = loader.load()


In [54]:
# Cut the loaded pages into chunks, using a chunk size of 1000 and an
# overlap of 200 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
docs = text_splitter.split_documents(pages)


In [55]:
# Initialize Hugging Face Inference API embeddings.
#
# The token is read from the HUGGINGFACE_API_KEY environment variable (set it
# in the shell or in the .env file loaded by load_dotenv()). A missing variable
# raises KeyError here instead of failing later with an opaque auth error.
embedding_model = HuggingFaceInferenceAPIEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    api_key=os.environ["HUGGINGFACE_API_KEY"],
)


In [56]:

import pinecone
from langchain_pinecone import PineconeVectorStore

from pinecone import Pinecone

# Create the Pinecone client. The API key comes from the PINECONE_API_KEY
# environment variable (shell or .env) so that it is never stored in the
# notebook; a missing variable raises KeyError immediately.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['d:\\Final year project 1\\KB DOC\\customkbvenv\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone


In [57]:
# Initialize Pinecone and open a handle on the project index.
#
# The key is taken from PINECONE_API_KEY in the environment rather than from a
# literal, so every cell authenticates with the same credential. The index
# name comes from the shared PINECONE_INDEX_NAME constant instead of a second
# copy of the string.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
pinecone_index = pc.Index(PINECONE_INDEX_NAME)

INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['d:\\Final year project 1\\KB DOC\\customkbvenv\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['d:\\Final year project 1\\KB DOC\\customkbvenv\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference


In [58]:
# Use the maintained langchain_huggingface implementation (the same class the
# first cell imports) rather than the deprecated langchain_community one, so
# the name HuggingFaceEmbeddings refers to a single class throughout.
from langchain_huggingface import HuggingFaceEmbeddings

# Re-bind `embedding_model` to a locally loaded sentence-transformers model;
# from this cell on, this object is what the vector store cells receive.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Sanity check: embed a short query and show the vector length, which must
# match the dimension the Pinecone index was created with.
vectors = embedding_model.embed_query("how are you")
print(len(vectors))

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


384


In [59]:
# Create a Pinecone vector store from the document chunks, targeting the index
# named by the shared PINECONE_INDEX_NAME constant.
#
# NOTE(review): this cell uploads `docs` unconditionally; re-running it
# presumably adds the same chunks to the index again - confirm, and prefer the
# guarded connect-or-create cell further down for repeat runs.
vector_store = PineconeVectorStore.from_documents(
    docs,
    embedding_model,
    index_name=PINECONE_INDEX_NAME,
)

INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['d:\\Final year project 1\\KB DOC\\customkbvenv\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['d:\\Final year project 1\\KB DOC\\customkbvenv\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference


In [60]:
# LLM integration
from langchain_huggingface import HuggingFaceEndpoint

# Hosted Mistral-7B-Instruct endpoint used to generate answers. The token is
# read from the HUGGINGFACE_API_KEY environment variable (shell or .env) so it
# is not embedded in the notebook; a missing variable raises KeyError here.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    huggingfacehub_api_token=os.environ["HUGGINGFACE_API_KEY"],
    temperature=0.5,
    max_new_tokens=512,
)


In [61]:
# Wire the LLM and the vector-store retriever into a RetrievalQA chain of
# type "stuff".
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    llm=llm,
)

In [62]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()


# Request body for POST /ask: a single free-text question.
class Query(BaseModel):
    question: str


@app.post("/ask")
def ask_question(query: Query):
    """Answer a question with the RetrievalQA chain built earlier.

    Args:
        query: Request body whose ``question`` field holds the user's question.

    Returns:
        A dict of the form ``{"answer": <generated text>}``.

    Raises:
        HTTPException: Status 500 carrying the error text when the chain fails.
    """
    try:
        # The chain object defined in this notebook is `qa_chain`; RetrievalQA
        # takes its input under the "query" key and returns the generated text
        # under "result".
        result = qa_chain.invoke({"query": query.question})
        return {"answer": result["result"]}
    except Exception as e:
        # Keep the original exception chained for server-side debugging.
        raise HTTPException(status_code=500, detail=str(e)) from e

In [63]:
# Pick how to obtain `vector_store` depending on whether the Pinecone project
# already lists an index called PINECONE_INDEX_NAME.
if PINECONE_INDEX_NAME in pc.list_indexes().names():
    # Index is present: attach to it without uploading anything.
    vector_store = PineconeVectorStore.from_existing_index(
        embedding=embedding_model,
        index_name=PINECONE_INDEX_NAME,
    )
else:
    # Index is not listed: build the store from the document chunks.
    # NOTE(review): confirm that from_documents can target an index that does
    # not exist yet; it may have to be created on the client first.
    vector_store = PineconeVectorStore.from_documents(
        docs,
        embedding_model,
        index_name=PINECONE_INDEX_NAME,
    )


INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['d:\\Final year project 1\\KB DOC\\customkbvenv\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['d:\\Final year project 1\\KB DOC\\customkbvenv\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference


In [64]:
import logging

# Turn on INFO-level logging for the root logger, then report how many pages
# were loaded and how many chunks they were split into.
logging.basicConfig(level=logging.INFO)
print("Loaded {} pages and split into {} chunks.".format(len(pages), len(docs)))


Loaded 53 pages and split into 117 chunks.


In [65]:
from langchain_huggingface import HuggingFaceEmbeddings
from huggingface_hub import InferenceClient
# Use the langchain_pinecone vector store (as the other cells do) instead of
# importing a class named `Pinecone` from langchain.vectorstores: that import
# re-bound `Pinecone`, which earlier cells use as the Pinecone client class,
# so re-running those cells afterwards would call the wrong class.
from langchain_pinecone import PineconeVectorStore

# Embedding model used to encode the query for retrieval.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Pinecone retriever setup: attach to the existing index.
vectorstore = PineconeVectorStore.from_existing_index(
    index_name=PINECONE_INDEX_NAME,
    embedding=embedding_model,
)
retriever = vectorstore.as_retriever()

# Retrieve with `invoke()` rather than `get_relevant_documents()`.
question = "How to contact HR department?"
# Stored under its own name so the chunk list `docs` built earlier is not
# overwritten; cells that upload `docs` stay safe to re-run.
retrieved_docs = retriever.invoke(question)

# Combine the retrieved chunks into one context string.
context = "\n\n".join(doc.page_content for doc in retrieved_docs)

# Prompt template
full_prompt = f"""You are a helpful assistant. Use the context below to answer the question.

Context:
{context}

Question: {question}
Answer:"""

# Set up the Inference Client with Mistral-7B. The token is read from the
# HUGGINGFACE_API_KEY environment variable (shell or .env) so it is not stored
# in the notebook; a missing variable raises KeyError here.
client = InferenceClient(
    model="mistralai/Mistral-7B-Instruct-v0.3",  # Model to use
    token=os.environ["HUGGINGFACE_API_KEY"],
)

# Generate the answer with the client's `text_generation` method.
response = client.text_generation(
    prompt=full_prompt,
    max_new_tokens=512,
    temperature=0.3,
    do_sample=True,
    return_full_text=False,  # Return only the completion, not the prompt
)


# Print the response
print(response)


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['d:\\Final year project 1\\KB DOC\\customkbvenv\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['d:\\Final year project 1\\KB DOC\\customkbvenv\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference


 You can contact the HR department by sending an email to murali@workcohol.com or by visiting the careers page on the Workcohol Organizations website. Alternatively, you can find Murali Thangavel on LinkedIn for more information.


In [66]:
import asyncio
import sys

# On Windows with Python 3.8 or newer, switch asyncio to the selector event
# loop policy. On any other platform this cell does nothing.
if sys.version_info[:2] >= (3, 8) and sys.platform.startswith('win'):
    selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
    asyncio.set_event_loop_policy(selector_policy)