In [None]:
!pip install PyPDF2 python-docx
!pip install faiss-cpu
!pip install transformers langchain_community faiss-cpu

In [None]:
from PyPDF2 import PdfReader
from docx import Document
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline

In [None]:
# Build the Hugging Face question-answering pipeline backed by the
# deepset/roberta-base-squad2 checkpoint.
qa_pipeline = pipeline(
    task="question-answering",
    model="deepset/roberta-base-squad2",
)

In [None]:
def load_document(file_path):
    """Extract the plain text of a PDF or Word (.docx) document.

    Args:
        file_path: Path to the document as a string. The extension is matched
            case-insensitively, so ``REPORT.PDF`` is accepted.

    Returns:
        The extracted text as a single string. PDF pages / Word paragraphs are
        joined with newlines; pieces without any text are skipped.

    Raises:
        ValueError: If the path ends in neither ``.pdf`` nor ``.docx``.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        # extract_text() is called exactly once per page; a falsy result
        # (no extractable text) is dropped by the filter below.
        pieces = [page.extract_text() for page in reader.pages]
    elif lowered_path.endswith(".docx"):
        pieces = [para.text for para in Document(file_path).paragraphs]
    else:
        raise ValueError("Unsupported file format. Please use PDF or Word documents.")
    # Join with newlines so the last word of one page/paragraph does not fuse
    # with the first word of the next one.
    return "\n".join(piece for piece in pieces if piece)

In [None]:
# Location of the document to index. The value is passed straight to
# load_document, which only accepts paths ending in .pdf or .docx, so the
# placeholder below must be replaced before this cell can run.
document_link = "<Your document link>"
# Full extracted text of the document as one string.
document_content = load_document(document_link)

In [None]:
# Split the document text into overlapping chunks (up to 600 characters each,
# with 100 characters of overlap) so each chunk can be indexed and retrieved
# on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)
chunks = text_splitter.split_text(document_content)

# Fail early with a clear message: with no chunks there is nothing to
# vectorize or index in the following cells.
if not chunks:
    raise ValueError(
        "No text could be extracted from the document, so there is nothing to index."
    )

In [None]:
# Fit a TF-IDF model (vocabulary capped at 1000 terms) on the chunks and
# materialise the chunk embeddings as a dense float32 matrix, one row per chunk.
vectorizer = TfidfVectorizer(max_features=1000)
vectors = (
    vectorizer
    .fit_transform(chunks)
    .toarray()
    .astype(np.float32)
)

In [None]:
# Create an exact (flat) L2-distance FAISS index sized to the TF-IDF
# dimensionality, then load every chunk vector into it.
d = vectors.shape[1]  # number of TF-IDF features per vector
index = faiss.IndexFlatL2(d)
index.add(vectors)
print(f"Number of vectors in the index: {index.ntotal}")

In [None]:
# Retrieve the most relevant chunks for a question and let the QA model answer from them
def get_answer(query_text, k=5):
    """Answer a question using the indexed document chunks.

    The question is embedded with the fitted TF-IDF ``vectorizer``, its
    nearest chunks are looked up in the FAISS ``index``, and the retrieved
    chunks (joined by spaces) are given to ``qa_pipeline`` as context.

    Args:
        query_text: The user's question.
        k: Maximum number of chunks to retrieve. Clamped to the number of
            vectors stored in the index.

    Returns:
        The ``'answer'`` field of the QA pipeline's response, or ``""`` when
        no chunk could be retrieved.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, and chunks[-1] would silently repeat the last chunk.
    k = min(k, index.ntotal)
    if k < 1:
        return ""

    query_vector = vectorizer.transform([query_text]).toarray().astype('float32')
    _distances, neighbour_ids = index.search(query_vector, k)

    # Keep only valid chunk positions (defensive against -1 padding).
    top_snippets = [chunks[i] for i in neighbour_ids[0] if i >= 0]
    if not top_snippets:
        return ""

    # The QA model extracts the answer from the concatenated snippets.
    context = " ".join(top_snippets)
    response = qa_pipeline(question=query_text, context=context)
    return response['answer']

In [None]:
# Interactive question loop: keep answering until the user types 'exit'.
while True:
    try:
        user_input = input("\nEnter your question (type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: leave the loop quietly.
        break
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Blank line: there is no question to answer, so prompt again.
        continue
    answer = get_answer(user_input)
    print("\nAnswer:\n", answer)