In [1]:
# Notebook cell: Ingest PDFs → FAISS vectorstore
from dotenv import load_dotenv
import os, json
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
# vector store
from langchain.vectorstores.faiss import FAISS
# 1. Load API key
# load_dotenv() loads variables from a local .env file into the environment;
# presumably this is where the OpenAI key used by OpenAIEmbeddings() lives.
load_dotenv()

# 2. Paths
# NOTE(review): these are absolute, machine-specific paths. The notebook will
# only run on another machine after they are edited.
DATA_DIR = "/Users/mousa/Desktop/PipelineIQ/data/raw_documents"
META_PATH = "/Users/mousa/Desktop/PipelineIQ/data/metadata.json"
STORE_DIR = "/Users/mousa/Desktop/PipelineIQ/vectorstore"

# 3. Read metadata
# The loop below reads doc["filename"] and doc["title"] from every entry, so
# the JSON file must be a list of objects carrying at least those two keys.
with open(META_PATH, "r", encoding="utf-8") as f:
    docs = json.load(f)

# 4. Extract & chunk text
# all_texts[i] and all_meta[i] always describe the same chunk.
all_texts, all_meta = [], []
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

for doc in docs:
    path = os.path.join(DATA_DIR, doc["filename"])
    reader = PdfReader(path)
    # Join pages with a newline: with an empty separator the last word of one
    # page is fused with the first word of the next page. `or ""` covers pages
    # for which extract_text() returns nothing.
    text = "\n".join(page.extract_text() or "" for page in reader.pages)
    chunks = splitter.split_text(text)
    all_texts.extend(chunks)
    # Build a fresh dict per chunk. `[meta] * len(chunks)` would make every
    # chunk share one dict object, so editing the metadata of one chunk later
    # would silently change all chunks of the same document.
    all_meta.extend(
        {"title": doc["title"], "source": doc["filename"]} for _ in chunks
    )

print(f"> Prepared {len(all_texts)} chunks from {len(docs)} documents.")

# Fail early with a readable message instead of letting the index build fail
# on an empty list (e.g. empty metadata file or PDFs with no extractable text).
if not all_texts:
    raise ValueError(
        f"No text chunks were extracted from the documents listed in {META_PATH}; "
        "nothing to index."
    )

# 5. Embed & build FAISS index
emb = OpenAIEmbeddings()
index = FAISS.from_texts(all_texts, emb, metadatas=all_meta)

# 6. Save locally
os.makedirs(STORE_DIR, exist_ok=True)
index.save_local(STORE_DIR)
print(f"> Vector store saved to: {STORE_DIR}")

> Prepared 720 chunks from 12 documents.


  emb = OpenAIEmbeddings()


> Vector store saved to: /Users/mousa/Desktop/PipelineIQ/vectorstore


In [3]:
from dotenv import load_dotenv
import os
load_dotenv()

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.schema import HumanMessage

# 1) Load index
# NOTE(review): absolute, machine-specific path; must match the directory the
# ingest cell saved the vector store to.
VECTORSTORE_DIR = "/Users/mousa/Desktop/PipelineIQ/vectorstore"
embeddings = OpenAIEmbeddings()
# SECURITY: allow_dangerous_deserialization=True opts in to loading serialized
# Python objects from disk, which can execute arbitrary code. Only keep this
# flag for a store directory that this notebook created itself; never point it
# at a vector store obtained from an untrusted source.
index = FAISS.load_local(
    VECTORSTORE_DIR,
    embeddings,
    allow_dangerous_deserialization=True
)

# 2) Build QA chain
# temperature=0 keeps the answers as deterministic as the model allows.
llm = ChatOpenAI(temperature=0)
qa_chain = load_qa_with_sources_chain(llm, chain_type="stuff")

# 3) Ask a question
question = "What is the primary function of a blowout preventer?"
# Retrieve the 3 most similar chunks and pass them to the chain as context.
docs = index.similarity_search(question, k=3)
# .invoke() replaces the deprecated direct call `qa_chain({...})`; the result
# is still a dict whose "output_text" entry holds the answer.
result = qa_chain.invoke({"input_documents": docs, "question": question})
print("🤖 Answer:\n", result["output_text"])

# 4) Summarize with a HumanMessage
# .invoke() replaces the deprecated direct call `llm([...])`; the returned
# message exposes the generated text as `.content`.
summary_msg = llm.invoke([HumanMessage(content=f"Summarize this answer in 3 bullet points:\n\n{result['output_text']}")])
print("\n📝 Summary:\n", summary_msg.content)

🤖 Answer:
 The primary function of a blowout preventer is to prevent blowouts by sealing the wellbore. 
SOURCES: Blowout-Preventers-1.pdf, Oil and gas production handbook ed3x0_web.pdf

📝 Summary:
 - Blowout preventers are designed to prevent blowouts by sealing the wellbore
- They are a crucial safety device in oil and gas production
- The information is sourced from Blowout-Preventers-1.pdf and Oil and gas production handbook ed3x0_web.pdf


In [3]:
# Demo: split a short string with RecursiveCharacterTextSplitter and print the pieces.
sample_text = (
    "This is a sample text to demonstrate the RecursiveCharacterTextSplitter functionality. "
    "It splits text into chunks based on the specified chunk size and overlap."
)

# Deliberately tiny limits (50 characters per chunk, 10 characters of overlap)
# so that several chunks are produced from a two-sentence input.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=10, chunk_size=50)
chunks = splitter.split_text(sample_text)

# Report the chunk count, then list each chunk with a 1-based position.
print(f"Generated {len(chunks)} chunks:")
for number, piece in enumerate(chunks, start=1):
    print(f"Chunk {number}: {piece}")

Generated 4 chunks:
Chunk 1: This is a sample text to demonstrate the
Chunk 2: the RecursiveCharacterTextSplitter functionality.
Chunk 3: It splits text into chunks based on the specified
Chunk 4: specified chunk size and overlap.
