In [4]:
import os
import sys
from pathlib import Path

# Resolve the project root ONCE, then make it both the working directory and
# an import root, so that `src.*` imports and relative data paths agree.
# Resolution order:
#   1. the AVIATION_CHATBOT_ROOT environment variable, so the notebook can run
#      on another machine without editing this cell;
#   2. the original development checkout, when it exists on this machine;
#   3. the directory the kernel was started in.
_DEFAULT_PROJECT_ROOT = Path(r"D:\Visual Studio practice\aviation-chatbot")
_root_override = os.environ.get("AVIATION_CHATBOT_ROOT")

if _root_override:
    # Resolve now: a relative override would go stale after the chdir below.
    PROJECT_ROOT = Path(_root_override).expanduser().resolve()
elif _DEFAULT_PROJECT_ROOT.is_dir():
    PROJECT_ROOT = _DEFAULT_PROJECT_ROOT
else:
    PROJECT_ROOT = Path.cwd()

# Change directory only after PROJECT_ROOT is final, so the value printed
# below, the sys.path entry and the real working directory never disagree.
os.chdir(PROJECT_ROOT)

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

print(f"‚úÖ Project root: {PROJECT_ROOT}")

‚úÖ Project root: D:\Visual Studio practice\aviation-chatbot


In [5]:
# Import required libraries
import pdfplumber
import json
import psycopg2
import numpy as np
from tqdm.auto import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from collections import Counter

from src.config import (
    RAW_PDF_DIR,
    DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD,
    EMBEDDING_MODEL_NAME, EMBEDDING_BATCH_SIZE,
    CHUNK_SIZE, CHUNK_OVERLAP
)

# Confirm that every import resolved, then echo the settings read from
# src.config so the recorded output documents the configuration of this run.
startup_report = (
    "‚úÖ All libraries imported successfully",
    f"\nüìÇ PDF Directory: {RAW_PDF_DIR}",
    f"ü§ñ Embedding Model: {EMBEDDING_MODEL_NAME}",
    f"üìè Chunk Size: {CHUNK_SIZE}, Overlap: {CHUNK_OVERLAP}",
)
print(*startup_report, sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


D:\Visual Studio practice\aviation-chatbot
‚úÖ All libraries imported successfully

üìÇ PDF Directory: D:\Visual Studio practice\aviation-chatbot\data\raw_pdfs
ü§ñ Embedding Model: all-MiniLM-L6-v2
üìè Chunk Size: 400, Overlap: 100


In [6]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, one record per non-empty page.

    Args:
        pdf_path: Location of the PDF, as a ``pathlib.Path`` or a plain
            string. It is handed to ``pdfplumber.open`` unchanged.

    Returns:
        A list of dicts in page order, each with the keys ``"text"`` (page
        text with surrounding whitespace removed), ``"page_number"`` (1-based
        position of the page in the PDF) and ``"document_name"`` (the file
        name without its directory). Pages whose ``extract_text()`` result is
        empty or whitespace-only are left out, so page numbers may have gaps.
    """
    # Only the name is derived via Path: a plain string has no `.name`
    # attribute, so string paths previously failed on the first text page.
    document_name = Path(pdf_path).name
    pages = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            # extract_text() may yield nothing for a page; treat that the
            # same as a page that holds only whitespace.
            stripped = text.strip() if text else ""
            if stripped:
                pages.append({
                    "text": stripped,
                    "page_number": page_number,
                    "document_name": document_name
                })
    return pages

# Extract from all PDFs.
# Path.glob yields entries in whatever order the filesystem reports them, so
# the list is sorted to keep page order - and with it the order of the chunks
# and embeddings built from these pages - identical from run to run.
all_pages = []
pdf_files = sorted(RAW_PDF_DIR.glob("*.pdf"))

print(f"üìö Found {len(pdf_files)} PDF files\n")

for pdf_file in pdf_files:
    print(f"üìÑ Processing: {pdf_file.name}")
    pages = extract_text_from_pdf(pdf_file)
    all_pages.extend(pages)
    print(f"   ‚úÖ Extracted {len(pages)} pages")

print(f"\nüéØ Total pages extracted: {len(all_pages)}")

üìö Found 2 PDF files

üìÑ Processing: airport_operations.pdf
   ‚úÖ Extracted 604 pages
üìÑ Processing: scada_manual.pdf
   ‚úÖ Extracted 203 pages

üéØ Total pages extracted: 807


In [7]:
# Build the text splitter from the configured chunk size and overlap.
# Separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Echo the effective settings; the last line keeps its trailing blank line.
print(
    "üìù Text splitter configuration:",
    f"   Chunk size: {CHUNK_SIZE}",
    f"   Chunk overlap: {CHUNK_OVERLAP}\n",
    sep="\n",
)

# Split every extracted page into chunks. Each chunk keeps the document name
# and page number of the page it was cut from.
all_chunks = []

for page in tqdm(all_pages, desc="Chunking pages"):
    all_chunks.extend(
        {
            "text": piece,
            "document_name": page["document_name"],
            "page_number": page["page_number"],
        }
        for piece in text_splitter.split_text(page["text"])
    )

print(f"\n‚úÖ Total chunks created: {len(all_chunks)}")

# Tally how many chunks each source document contributed, then list the
# totals in the order the documents were first encountered.
doc_counts = Counter()
for chunk in all_chunks:
    doc_counts[chunk["document_name"]] += 1

print("\nüìä Chunk distribution by document:")
for doc_name, n_chunks in doc_counts.items():
    print(f"   {doc_name}: {n_chunks} chunks")


üìù Text splitter configuration:
   Chunk size: 400
   Chunk overlap: 100



Chunking pages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 807/807 [00:00<00:00, 14742.50it/s]


‚úÖ Total chunks created: 5011

üìä Chunk distribution by document:
   airport_operations.pdf: 3825 chunks
   scada_manual.pdf: 1186 chunks





In [8]:
# Load the sentence-embedding model named in the project configuration.
print(f"üîÑ Loading embedding model: {EMBEDDING_MODEL_NAME}...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("‚úÖ Embedding model loaded\n")

# Embed the text of every chunk. The insert step later pairs embeddings with
# chunks by position, so `texts` must stay in the same order as `all_chunks`.
texts = [chunk["text"] for chunk in all_chunks]

print(
    f"üßÆ Generating embeddings for {len(texts)} chunks...",
    f"   Batch size: {EMBEDDING_BATCH_SIZE}",
    sep="\n",
)

embeddings = embedding_model.encode(
    texts,
    convert_to_numpy=True,
    show_progress_bar=True,
    batch_size=EMBEDDING_BATCH_SIZE,
)

# Summarise the resulting array: overall shape, vector width and dtype.
print("\n‚úÖ Embeddings generated!")
print(f"   Shape: {embeddings.shape}")
print(f"   Dimension: {embeddings.shape[1]}")
print(f"   Data type: {embeddings.dtype}")


üîÑ Loading embedding model: all-MiniLM-L6-v2...




‚úÖ Embedding model loaded

üßÆ Generating embeddings for 5011 chunks...
   Batch size: 32


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [02:12<00:00,  1.19it/s]


‚úÖ Embeddings generated!
   Shape: (5011, 384)
   Dimension: 384
   Data type: float32





In [9]:
def insert_chunks_to_db(chunks, embeddings, batch_size=100):
    """
    Replace the contents of ``knowledge_chunks`` with the given chunks.

    The DELETE of the old rows and all INSERT batches run in ONE transaction
    that is committed only after the last batch has succeeded. If any step
    fails the transaction is rolled back, so the table is not left empty or
    half-filled. The cursor and connection are closed on every exit path.

    Args:
        chunks: List of chunk dictionaries with the keys "text",
            "document_name" and "page_number"
        embeddings: NumPy array of embeddings; row i is stored with chunks[i]
        batch_size: Number of records passed to each executemany call
            (must be at least 1)

    Returns:
        Number of rows inserted.

    Raises:
        ValueError: If batch_size is smaller than 1, or if chunks and
            embeddings differ in length. Both checks run before the database
            is touched.
    """
    # Validate first: the statements below delete data, so bad arguments must
    # be rejected before a connection is even opened.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    if len(chunks) != len(embeddings):
        # zip() would silently drop the surplus items and mis-report success.
        raise ValueError(
            f"chunks and embeddings differ in length: "
            f"{len(chunks)} != {len(embeddings)}"
        )

    conn = psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT,
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD
    )
    try:
        cursor = conn.cursor()
        try:
            # Clear existing data. Deliberately NOT committed here: the delete
            # only becomes permanent together with the new rows.
            print("üóëÔ∏è  Clearing existing data...")
            cursor.execute("DELETE FROM knowledge_chunks;")
            print("‚úÖ Existing data cleared\n")

            total_inserted = 0

            print(f"üíæ Inserting {len(chunks)} chunks in batches of {batch_size}...")

            for i in tqdm(range(0, len(chunks), batch_size), desc="Inserting batches"):
                batch_chunks = chunks[i:i + batch_size]
                batch_embeddings = embeddings[i:i + batch_size]

                # One parameter tuple per row, in the INSERT's column order.
                batch_data = [
                    (
                        chunk["text"],
                        embedding.tolist(),  # Convert numpy array to list
                        chunk["document_name"],
                        chunk["page_number"]
                    )
                    for chunk, embedding in zip(batch_chunks, batch_embeddings)
                ]

                cursor.executemany("""
                    INSERT INTO knowledge_chunks (content, embedding, document_name, page_number)
                    VALUES (%s, %s, %s, %s)
                """, batch_data)

                total_inserted += len(batch_data)

            # Single commit: old rows disappear and new rows appear atomically.
            conn.commit()
        except Exception:
            # Undo the delete and any partial inserts, then surface the error.
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        conn.close()

    return total_inserted

# Load every chunk together with its embedding into the database and report
# how many rows were written.
inserted = insert_chunks_to_db(
    chunks=all_chunks,
    embeddings=embeddings,
    batch_size=100,
)

print(f"\nüéâ Successfully inserted {inserted} chunks into database!")


üóëÔ∏è  Clearing existing data...
‚úÖ Existing data cleared

üíæ Inserting 5011 chunks in batches of 100...


Inserting batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [03:55<00:00,  4.62s/it]


üéâ Successfully inserted 5011 chunks into database!





In [11]:
def verify_database():
    """Print a verification report for the ``knowledge_chunks`` table.

    Queries the total row count, the number of distinct documents, the chunk
    count per document, the total relation size and one sample row, then
    prints them as a report. All values are fetched first and the cursor and
    connection are closed (also when a query fails) before anything is
    printed. An empty table produces a report without a sample instead of
    raising.

    Returns:
        None. The report is written to standard output.
    """
    conn = psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT,
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD
    )
    try:
        cursor = conn.cursor()
        try:
            # Total count
            cursor.execute("SELECT COUNT(*) FROM knowledge_chunks;")
            total = cursor.fetchone()[0]

            # Unique documents
            cursor.execute("SELECT COUNT(DISTINCT document_name) FROM knowledge_chunks;")
            unique_docs = cursor.fetchone()[0]

            # Documents breakdown
            cursor.execute("""
                SELECT document_name, COUNT(*) as chunk_count
                FROM knowledge_chunks
                GROUP BY document_name
                ORDER BY chunk_count DESC;
            """)
            doc_breakdown = cursor.fetchall()

            # Table size
            cursor.execute("""
                SELECT pg_size_pretty(pg_total_relation_size('knowledge_chunks'));
            """)
            table_size = cursor.fetchone()[0]

            # Sample chunk. ORDER BY makes the sample the same row on every
            # run; LIMIT without ORDER BY may return any row.
            cursor.execute("""
                SELECT id, content, document_name, page_number
                FROM knowledge_chunks
                ORDER BY id
                LIMIT 1;
            """)
            sample = cursor.fetchone()
        finally:
            cursor.close()
    finally:
        conn.close()

    print("=" * 80)
    print("üìä DATABASE VERIFICATION REPORT")
    print("=" * 80)
    print(f"\n‚úÖ Total chunks in database: {total}")
    print(f"‚úÖ Unique documents: {unique_docs}")
    print(f"‚úÖ Table size: {table_size}")

    print("\nüìö Breakdown by document:")
    for doc_name, count in doc_breakdown:
        print(f"   ‚Ä¢ {doc_name}: {count} chunks")

    print("\nüìÑ Sample chunk:")
    if sample is None:
        # fetchone() returns None when the table holds no rows.
        print("   (table is empty - no sample available)")
    else:
        sample_id, sample_content, sample_doc, sample_page = sample
        print(f"   ID: {sample_id}")
        print(f"   Document: {sample_doc}")
        print(f"   Page: {sample_page}")
        print(f"   Content preview:\n   {sample_content[:300]}...")
    print("\n" + "=" * 80)

# Print the verification report for the knowledge_chunks table.
verify_database()


üìä DATABASE VERIFICATION REPORT

‚úÖ Total chunks in database: 5011
‚úÖ Unique documents: 2
‚úÖ Table size: 18 MB

üìö Breakdown by document:
   ‚Ä¢ airport_operations.pdf: 3825 chunks
   ‚Ä¢ scada_manual.pdf: 1186 chunks

üìÑ Sample chunk:
   ID: 3
   Document: airport_operations.pdf
   Page: 2
   Content preview:
   Airport Operations
About the Authors
Norman J. Ashford was Professor of Transport Planning at the Loughborough University
of Technology, England, from 1972 to 1997. He holds bachelor‚Äôs, master‚Äôs, and doctoral
degrees in civil engineering. Dr. Ashford worked as a civil engineer in Canada and taught
a...



In [12]:
def test_similarity_search(query_text, top_k=5):
    """Test vector similarity search"""
    print(f"üîç Testing similarity search...")
    print(f"   Query: '{query_text}'")
    print(f"   Retrieving top {top_k} results\n")
    
    # Generate query embedding
    query_embedding = embedding_model.encode([query_text])[0]
    
    conn = psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT,
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD
    )
    cursor = conn.cursor()
    
    # Perform similarity search using cosine distance
    cursor.execute("""
        SELECT 
            id,
            content,
            document_name,
            page_number,
            1 - (embedding <=> %s::vector) as similarity
        FROM knowledge_chunks
        ORDER BY embedding <=> %s::vector
        LIMIT %s;
    """, (query_embedding.tolist(), query_embedding.tolist(), top_k))
    
    results = cursor.fetchall()
    
    print("=" * 80)
    print("üéØ SEARCH RESULTS")
    print("=" * 80)
    
    for i, (chunk_id, content, doc_name, page_num, similarity) in enumerate(results, 1):
        print(f"\n{i}. [{doc_name} | Page {page_num}] (Similarity: {similarity:.4f})")
        print(f"   {content[:250]}...")
    
    print("\n" + "=" * 80)
    
    cursor.close()
    conn.close()

# Smoke-test retrieval with a single SCADA question, printing the 3 closest
# chunks.
test_similarity_search("What is SCADA and how does it work?", top_k=3)


üîç Testing similarity search...
   Query: 'What is SCADA and how does it work?'
   Retrieving top 3 results

üéØ SEARCH RESULTS

1. [scada_manual.pdf | Page 8] (Similarity: 0.6833)
   Preface to the Third Edition
When the first edition of this book was written, certain trends in SCADA
were already apparent, and I made attempts to identify them in Unit 14,
"What's Next?". Generally, these trends have continued, and their
descriptio...

2. [scada_manual.pdf | Page 13] (Similarity: 0.6543)
   electronics-based technologies, SCAD A is a virtual cornucopia of these
terms and abbreviations. Finally, the solutions to the exercises found at the
end of each unit are given in Appendix C.
1-6. Course Objectives
When you have completed this entire...

3. [scada_manual.pdf | Page 145] (Similarity: 0.6444)
   respectively. Very often people confuse the terms in the mistaken belief
that because SCAD A is associated with so much expensive computer
hardware it must be automatic. In fact, most early 