In [1]:
# Show the root directory of the Python environment this kernel is running from.
import sys
print(sys.prefix)


C:\Users\pc\anaconda3


In [3]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import CTransformers
from langchain.chains import ConversationalRetrievalChain

In [5]:
def load_pdf(pdf_path, chunk_size=500, chunk_overlap=50):
    """Load a PDF and split its contents into overlapping text chunks.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``PyPDFLoader``.
        chunk_size: Maximum chunk length as measured by ``len``.
            Defaults to 500.
        chunk_overlap: Amount of text (measured by ``len``) shared between
            neighbouring chunks. Defaults to 50.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Read the PDF into documents.
    documents = PyPDFLoader(pdf_path).load()

    # Split the documents; sizes are measured with the built-in len.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(documents)

# Create vector store
def create_vector_store(text_chunks):
    """Embed the given chunks on the CPU and index them in a Chroma store.

    Args:
        text_chunks: Documents to embed; forwarded to ``Chroma.from_documents``.

    Returns:
        The vector store built by ``Chroma.from_documents``.
    """
    # Embedding model settings, gathered in one place.
    embedding_settings = {
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
        "model_kwargs": {'device': 'cpu'},
    }
    embedder = HuggingFaceEmbeddings(**embedding_settings)

    # Build and return the store in one step.
    return Chroma.from_documents(documents=text_chunks, embedding=embedder)

In [7]:
def initialize_llm(max_new_tokens=512, temperature=0.7):
    """Create the Llama 2 chat model through the CTransformers wrapper.

    Generation settings are handed over inside ``config``. The LangChain
    ``CTransformers`` wrapper reads generation options from that dict;
    supplying them as top-level keyword arguments leaves the library
    defaults in effect.

    Args:
        max_new_tokens: Upper bound on the number of tokens generated per
            answer. Defaults to 512.
        temperature: Sampling temperature. Defaults to 0.7.

    Returns:
        The configured ``CTransformers`` LLM instance.
    """
    generation_config = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
    }
    return CTransformers(
        model="TheBloke/Llama-2-7B-Chat-GGML",
        model_type="llama",
        config=generation_config,
    )

In [9]:
def create_conversation_chain(vector_store, llm):
    """Wire the LLM and the vector store into a conversational retrieval chain.

    Args:
        vector_store: Store whose ``as_retriever()`` supplies the retriever.
        llm: Language model passed to ``ConversationalRetrievalChain.from_llm``.

    Returns:
        The chain built by ``from_llm``, configured with
        ``return_source_documents=True``.
    """
    chain_options = {
        "llm": llm,
        "retriever": vector_store.as_retriever(),
        "return_source_documents": True,
    }
    return ConversationalRetrievalChain.from_llm(**chain_options)

In [11]:
def chat_with_pdf(conversation_chain):
    """Run an interactive question/answer loop against the PDF chain.

    Each question is sent to ``conversation_chain`` together with the
    accumulated ``(question, answer)`` history. The loop ends when the user
    types ``exit`` (any letter case, surrounding whitespace ignored) or when
    the prompt is closed or interrupted.

    Args:
        conversation_chain: Callable that takes a dict with ``"question"``
            and ``"chat_history"`` keys and returns a mapping containing an
            ``"answer"`` key.
    """
    chat_history = []

    while True:
        # Get user question
        try:
            question = input("Ask a question about your PDF (type 'exit' to end): ")
        except (EOFError, KeyboardInterrupt):
            # A closed stdin or an interrupted prompt ends the chat cleanly
            # instead of surfacing a traceback.
            print()
            break

        question = question.strip()
        if not question:
            # Blank line: re-prompt rather than spend an LLM call on nothing.
            continue
        if question.lower() == 'exit':
            break

        # Get response from chain
        response = conversation_chain(
            {"question": question, "chat_history": chat_history}
        )
        answer = response["answer"]

        # Remember the exchange so follow-up questions have context
        chat_history.append((question, answer))

        # Print response
        print("\nAnswer:", answer)
        print("\n" + "="*50 + "\n")

In [15]:
from langchain_community.embeddings import HuggingFaceEmbeddings

def create_vector_store(text_chunks, model_name="distilbert-base-uncased", device="cpu"):
    """Embed the given chunks and index them in a Chroma vector store.

    Args:
        text_chunks: Documents to embed; forwarded to ``Chroma.from_documents``.
        model_name: Embedding checkpoint handed to ``HuggingFaceEmbeddings``.
            Defaults to ``"distilbert-base-uncased"``.
        device: Device name placed in ``model_kwargs``. Defaults to ``"cpu"``.

    Returns:
        The vector store built by ``Chroma.from_documents``.
    """
    # NOTE(review): the recorded run of this notebook warns "No
    # sentence-transformers model found with name distilbert-base-uncased.
    # Creating a new one with mean pooling." -- consider passing a
    # sentence-transformers checkpoint via model_name; confirm the impact on
    # retrieval quality.
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
    )

    # Create vector store
    return Chroma.from_documents(
        documents=text_chunks,
        embedding=embeddings,
    )

In [20]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Configure a session with automatic retries and a default request timeout.
# NOTE(review): no code visible in this notebook sends traffic through `http`;
# confirm it is actually used by whatever performs the model downloads.
DEFAULT_TIMEOUT_SECONDS = 30

retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)
http.mount("http://", adapter)


def _request_with_default_timeout(*args, **kwargs):
    """Send a request through ``http``, defaulting ``timeout`` when the caller gave none."""
    # setdefault keeps an explicit caller-supplied timeout instead of
    # unconditionally overwriting it.
    kwargs.setdefault("timeout", DEFAULT_TIMEOUT_SECONDS)
    return requests.Session.request(http, *args, **kwargs)


# Instance-level override: calls made via this session go through the
# wrapper above.
http.request = _request_with_default_timeout

In [24]:
from sentence_transformers import SentenceTransformer
import torch
import os

def download_model(model_name='distilbert-base-uncased', cache_folder=None):
    """Pre-download the embedding model so later steps can load it from cache.

    Args:
        model_name: Checkpoint name handed to ``SentenceTransformer``.
            Defaults to ``'distilbert-base-uncased'``.
        cache_folder: Directory for the downloaded files, forwarded to
            ``SentenceTransformer``. The default ``None`` leaves the choice
            to the library, matching ``create_vector_store``, which does not
            set a cache folder either; a separate folder here would not be
            picked up by that later load.

    Returns:
        True when the model was instantiated without error, False otherwise
        (the error is printed rather than raised).
    """
    print("Pre-downloading required models...")
    # NOTE(review): download timeouts are presumably governed by the
    # huggingface_hub configuration (e.g. HF_HUB_DOWNLOAD_TIMEOUT set before
    # the kernel starts) -- confirm for the installed version.
    try:
        # Instantiating the model is what triggers the download; the instance
        # itself is not needed afterwards.
        SentenceTransformer(model_name, cache_folder=cache_folder)
    except Exception as e:
        print(f"Error downloading model: {str(e)}")
        return False
    else:
        print("Model downloaded successfully!")
        return True

def main(pdf_path=r"C:\Users\pc\Downloads\SQLtutorial.pdf"):
    """Pre-download the embedding model, then run the PDF chat pipeline.

    Steps: ``download_model`` -> ``load_pdf`` -> ``create_vector_store`` ->
    ``initialize_llm`` -> ``create_conversation_chain`` -> ``chat_with_pdf``.
    If the model download fails, a message is printed and nothing else runs.

    Args:
        pdf_path: PDF to chat about, forwarded to ``load_pdf``. Defaults to
            the path originally hard-coded in this notebook.
    """
    # First ensure model is downloaded
    if not download_model():
        print("Failed to download required models. Please check your internet connection and try again.")
        return

    print("Loading and processing PDF...")
    text_chunks = load_pdf(pdf_path)

    print("Creating vector store...")
    vector_store = create_vector_store(text_chunks)

    print("Initializing Llama 2...")
    llm = initialize_llm()

    print("Creating conversation chain...")
    conversation_chain = create_conversation_chain(vector_store, llm)

    print("Ready to chat! Ask questions about your PDF.")
    chat_with_pdf(conversation_chain)

# Run the pipeline when this cell/script executes as the main module.
if __name__ == "__main__":
    main()

Pre-downloading required models...


No sentence-transformers model found with name distilbert-base-uncased. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/distilbert-base-uncased/5e3f1108e3cb34ee048634875d8482665b65ac713291a7e32396fb18f6ff0063?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1733585092&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMzU4NTA5Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9kaXN0aWxiZXJ0LWJhc2UtdW5jYXNlZC81ZTNmMTEwOGUzY2IzNGVlMDQ4NjM0ODc1ZDg0ODI2NjViNjVhYzcxMzI5MWE3ZTMyMzk2ZmIxOGY2ZmYwMDYzP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=prMn7oXy5e2I0-1AOA8QcCo6xtYyP7rykzQMZkydelqsBsllbqL8PbPlScbrnDsf-6FPUNcNd9UcL6FdXnBRMG%7EeE%7E8Kzbkzb3mdTLJNgVoVkpVLLIatUzqMyYog68nYbiTcIiUlNSOsnRbaulephcsHYcmruCaxuEo35gkYi7vMmdFGNkkHKQWCHqYyVvZtRJsmmEIjC8k6ypEktCZf7zIKq-6oKBCKDJnbI-jEOlQLUXGYI0w0e98cv%7E5I0aOPZTIUwNyiv4%7EV5g5GyyL8iWPOhPyKsEoVkj7snyYCpZjA3-fLmGO3fo0D8OgoC1uKbz4Q0COspcDjlzvFE9cV5w__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConnecti

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/distilbert-base-uncased/5e3f1108e3cb34ee048634875d8482665b65ac713291a7e32396fb18f6ff0063?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1733585092&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMzU4NTA5Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9kaXN0aWxiZXJ0LWJhc2UtdW5jYXNlZC81ZTNmMTEwOGUzY2IzNGVlMDQ4NjM0ODc1ZDg0ODI2NjViNjVhYzcxMzI5MWE3ZTMyMzk2ZmIxOGY2ZmYwMDYzP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=prMn7oXy5e2I0-1AOA8QcCo6xtYyP7rykzQMZkydelqsBsllbqL8PbPlScbrnDsf-6FPUNcNd9UcL6FdXnBRMG%7EeE%7E8Kzbkzb3mdTLJNgVoVkpVLLIatUzqMyYog68nYbiTcIiUlNSOsnRbaulephcsHYcmruCaxuEo35gkYi7vMmdFGNkkHKQWCHqYyVvZtRJsmmEIjC8k6ypEktCZf7zIKq-6oKBCKDJnbI-jEOlQLUXGYI0w0e98cv%7E5I0aOPZTIUwNyiv4%7EV5g5GyyL8iWPOhPyKsEoVkj7snyYCpZjA3-fLmGO3fo0D8OgoC1uKbz4Q0COspcDjlzvFE9cV5w__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConnecti

Error downloading model: (MaxRetryError('HTTPSConnectionPool(host=\'cdn-lfs.hf.co\', port=443): Max retries exceeded with url: /distilbert-base-uncased/5e3f1108e3cb34ee048634875d8482665b65ac713291a7e32396fb18f6ff0063?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1733585092&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMzU4NTA5Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9kaXN0aWxiZXJ0LWJhc2UtdW5jYXNlZC81ZTNmMTEwOGUzY2IzNGVlMDQ4NjM0ODc1ZDg0ODI2NjViNjVhYzcxMzI5MWE3ZTMyMzk2ZmIxOGY2ZmYwMDYzP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=prMn7oXy5e2I0-1AOA8QcCo6xtYyP7rykzQMZkydelqsBsllbqL8PbPlScbrnDsf-6FPUNcNd9UcL6FdXnBRMG~eE~8Kzbkzb3mdTLJNgVoVkpVLLIatUzqMyYog68nYbiTcIiUlNSOsnRbaulephcsHYcmruCaxuEo35gkYi7vMmdFGNkkHKQWCHqYyVvZtRJsmmEIjC8k6ypEktCZf7zIKq-6oKBCKDJnbI-jEOlQLUXGYI0w0e98cv~5I0aOPZTIUwNyiv4~V5g5GyyL8iWPOhPyKsEoVkj7snyYCpZjA3-fLmGO3fo0D8OgoC

In [22]:
def main(pdf_path=r"C:\Users\pc\Downloads\SQLtutorial.pdf"):
    """Run the PDF chat pipeline end to end.

    Steps: ``load_pdf`` -> ``create_vector_store`` -> ``initialize_llm`` ->
    ``create_conversation_chain`` -> ``chat_with_pdf``.

    Args:
        pdf_path: PDF to chat about, forwarded to ``load_pdf``. Defaults to
            the path originally hard-coded in this notebook.
    """
    # Process PDF
    print("Loading and processing PDF...")
    text_chunks = load_pdf(pdf_path)

    # Create vector store
    print("Creating vector store...")
    vector_store = create_vector_store(text_chunks)

    # Initialize LLM
    print("Initializing Llama 2...")
    llm = initialize_llm()

    # Create conversation chain
    print("Creating conversation chain...")
    conversation_chain = create_conversation_chain(vector_store, llm)

    # Start chat interface
    print("Ready to chat! Ask questions about your PDF.")
    chat_with_pdf(conversation_chain)

# Run the pipeline when this cell/script executes as the main module.
if __name__ == "__main__":
    main()

Loading and processing PDF...
Creating vector store...


No sentence-transformers model found with name distilbert-base-uncased. Creating a new one with mean pooling.


model.safetensors:   8%|7         | 21.0M/268M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/distilbert-base-uncased/5e3f1108e3cb34ee048634875d8482665b65ac713291a7e32396fb18f6ff0063?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1733584814&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMzU4NDgxNH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9kaXN0aWxiZXJ0LWJhc2UtdW5jYXNlZC81ZTNmMTEwOGUzY2IzNGVlMDQ4NjM0ODc1ZDg0ODI2NjViNjVhYzcxMzI5MWE3ZTMyMzk2ZmIxOGY2ZmYwMDYzP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=ElMvfnQKldVC58T-3niVz3XhgQXDSvvw8k1%7EkipJVIbBbKFonVLG1WsJu9PV0r8TlPIkqJwEaGsVogJQtT5WzOMhL-D59ZOsDfpdiH4463oQN%7EnwmkG0xAuTnynwMy53zEmWvezfeD6eA47OOHAg80j4QSv5ZscD%7ELpDvOS%7EZxmqMGSWdHbEiLpVj8nzwmb4nv3g58ljP%7EQgoUeEu0sS5nl7onHLwpLaLxWQcqD%7E1v4OjuJPCU7WZOrsoeqbqndRO64Q3uv0cYPuSyfmxHMBM4jBAcBNRG0hb2jXjphfCLVRgB25knaC4SBedZ9NvptWBghqqq-2Bs-HIt7QSvw7Fw__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConn

model.safetensors:  51%|#####     | 136M/268M [00:00<?, ?B/s]

KeyboardInterrupt: 