In [1]:
# Cell 1: Setup and Imports
import pandas as pd
import numpy as np
import os
from pathlib import Path
import json

# Locate the project root: when the kernel was started inside notebooks/,
# the project lives one directory up; otherwise the working directory is it.
_launch_dir = Path.cwd()
if _launch_dir.name == 'notebooks':
    PROJECT_ROOT = _launch_dir.parent
else:
    PROJECT_ROOT = _launch_dir

# Data layout, every location derived from the project root
DATA_PATH = PROJECT_ROOT.joinpath('data')
RAW_DATA_PATH = DATA_PATH.joinpath('raw', 'delphic-strategies')
PROCESSED_DATA_PATH = DATA_PATH.joinpath('processed')
EMBEDDINGS_PATH = DATA_PATH.joinpath('embeddings')

# Echo the resolved locations so a wrong working directory is spotted early
print(f"Project root: {PROJECT_ROOT}")
print(f"Data path: {DATA_PATH}")
print(f"Raw data path: {RAW_DATA_PATH}")

# Ensure the output directories are present (no error if they already exist)
for _output_dir in (PROCESSED_DATA_PATH, EMBEDDINGS_PATH):
    _output_dir.mkdir(parents=True, exist_ok=True)

Project root: d:\Projects\RAG Prototype
Data path: d:\Projects\RAG Prototype\data
Raw data path: d:\Projects\RAG Prototype\data\raw\delphic-strategies


In [2]:
# Cell 2: Debug - Check Files
print("Files in data/raw/delphic-strategies/:")
if not RAW_DATA_PATH.exists():
    # Missing folder: report it, then create it so documents can be added
    print(f"  Folder doesn't exist: {RAW_DATA_PATH}")
    RAW_DATA_PATH.mkdir(parents=True, exist_ok=True)
    print(f"  Created folder: {RAW_DATA_PATH}")
else:
    files_found = [entry for entry in RAW_DATA_PATH.iterdir()]
    if not files_found:
        print("  Folder exists but is empty")
        # Seed the empty folder with a small placeholder document
        test_file = RAW_DATA_PATH / "test_document.txt"
        test_file.write_text("This is a test document about Delphic Strategies. We provide strategic consulting services and business advisory solutions.")
        print(f"  Created test file: {test_file.name}")
    else:
        # One line per directory entry: name followed by its extension
        for file in files_found:
            print(f"  - {file.name} ({file.suffix})")

Files in data/raw/delphic-strategies/:
  - Arlington County Business License 2025.pdf (.pdf)
  - Business Plan v2.docx (.docx)
  - Delphic EIN.pdf (.pdf)
  - Expenses.xlsx (.xlsx)
  - SDVOSB_approval_letter.pdf (.pdf)
  - SOP Executive Communications Intelligence.docx (.docx)
  - Va Articles of Organization.pdf (.pdf)
  - Va Certificate of Organization.pdf (.pdf)
  - Va Certificate.pdf (.pdf)
  - Va SCC Filing.pdf (.pdf)


In [3]:
# Cell 3: Document Loading Function
def _read_text_file(file_path):
    """Return the contents of a plain-text or Markdown file decoded as UTF-8."""
    # 'utf-8-sig' decodes ordinary UTF-8 and additionally drops a leading
    # byte-order mark, so BOM-prefixed files do not start with a stray U+FEFF.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return file.read()


def _read_pdf_file(file_path):
    """Return the extracted text of every PDF page, each followed by a newline.

    Raises:
        ImportError: if PyPDF2 is not installed.
    """
    import PyPDF2  # optional dependency, imported only when a PDF is seen

    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` is defensive: if a page yields no text object, the page is
        # treated as empty instead of failing the whole file on `None + str`.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)


def _read_word_file(file_path):
    """Return the text of every paragraph in a Word file, one per line.

    Raises:
        ImportError: if python-docx is not installed.
    """
    from docx import Document  # optional dependency (python-docx)

    doc = Document(file_path)
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)


def _read_excel_file(file_path):
    """Return a spreadsheet rendered as text via ``DataFrame.to_string()``.

    ``pd.read_excel`` is called with its defaults, so only the sheet pandas
    selects by default is read.

    Raises:
        ImportError: if pandas or the Excel engine it needs is not installed.
    """
    import pandas as pd

    return pd.read_excel(file_path).to_string()


def load_documents(directory_path):
    """Load the text of every supported file found under a directory.

    The directory is searched recursively. Supported extensions (matched
    case-insensitively): .txt/.md, .pdf, .docx/.doc, .xlsx/.xls. Files of any
    other type, and files whose extracted text is blank, are reported and
    skipped. A failure on one file is printed and does not stop the others.

    Args:
        directory_path: Directory to search (str or Path).

    Returns:
        (documents, file_names): two parallel lists holding the extracted
        text and the bare file name of each loaded file, in sorted path order.
    """
    documents = []
    file_names = []

    # Sorted traversal: glob makes no ordering promise, and downstream code
    # derives document ids from list position, so order must be reproducible.
    for file_path in sorted(Path(directory_path).glob('**/*')):
        if not file_path.is_file():
            continue

        file_ext = file_path.suffix.lower()
        try:
            content = ""

            if file_ext in ('.txt', '.md'):
                content = _read_text_file(file_path)

            elif file_ext == '.pdf':
                try:
                    content = _read_pdf_file(file_path)
                except ImportError:
                    print("PyPDF2 not installed. Install with: pip install PyPDF2")
                    continue

            elif file_ext in ('.docx', '.doc'):
                # NOTE(review): legacy .doc is routed to python-docx as before;
                # presumably it fails there and is reported by the handler below.
                try:
                    content = _read_word_file(file_path)
                except ImportError:
                    print("python-docx not installed. Install with: pip install python-docx")
                    continue

            elif file_ext in ('.xlsx', '.xls'):
                try:
                    content = _read_excel_file(file_path)
                except ImportError:
                    print("openpyxl not installed. Install with: pip install openpyxl")
                    continue

            if content.strip():
                documents.append(content)
                file_names.append(file_path.name)
                print(f"Loaded: {file_path.name} ({file_ext})")
            else:
                print(f"No content extracted from: {file_path.name}")

        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the batch
            print(f"Error loading {file_path.name}: {e}")

    return documents, file_names

In [4]:
# Cell 4: Load Documents
print("Loading Delphic Strategies documents...")
# load_documents returns two parallel lists: texts and their file names
_loaded = load_documents(RAW_DATA_PATH)
documents, file_names = _loaded
print(f"Loaded {len(documents)} documents")

# An empty result means nothing readable was found in the raw-data folder
if not documents:
    print("No documents found! Please add PDF, Word, Excel, or text files to data/raw/delphic-strategies/")

Loading Delphic Strategies documents...
Loaded: Arlington County Business License 2025.pdf (.pdf)
Loaded: Business Plan v2.docx (.docx)
Loaded: Delphic EIN.pdf (.pdf)
Loaded: Expenses.xlsx (.xlsx)
Loaded: SDVOSB_approval_letter.pdf (.pdf)
Loaded: SOP Executive Communications Intelligence.docx (.docx)
Loaded: Va Articles of Organization.pdf (.pdf)
Loaded: Va Certificate of Organization.pdf (.pdf)
Loaded: Va Certificate.pdf (.pdf)
Loaded: Va SCC Filing.pdf (.pdf)
Loaded 10 documents


In [5]:
# Cell 5: Document Chunking
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping, whitespace-delimited word chunks.

    The text is split on whitespace; each chunk holds up to ``chunk_size``
    words re-joined with single spaces, and consecutive chunks share
    ``overlap`` words. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: if ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # Without this check an overlap larger than chunk_size makes the range
    # step negative (silently producing no chunks), an equal overlap makes it
    # zero, and a negative overlap skips words between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))

        # This window already reaches the last word; a further step would
        # only emit a chunk fully contained in the one just added.
        if start + chunk_size >= len(words):
            break

    return chunks

# Split every document into chunks, keeping a flat chunk list plus one
# metadata record per chunk that points back at its source document
all_chunks = []
chunk_metadata = []

for idx, (doc, filename) in enumerate(zip(documents, file_names)):
    chunks = chunk_text(doc)
    all_chunks += chunks
    chunk_metadata += [
        {
            'document_id': idx,
            'document_name': filename,
            'chunk_id': chunk_idx,
            'chunk_text': chunk,
        }
        for chunk_idx, chunk in enumerate(chunks)
    ]

print(f"Created {len(all_chunks)} chunks from {len(documents)} documents")

# Tabular view of the chunks; only built when there is something to show
if not chunk_metadata:
    print("No chunks created - no documents were loaded")
else:
    chunks_df = pd.DataFrame(chunk_metadata)
    print(f"DataFrame created with {len(chunks_df)} rows")
    print(chunks_df.head())

Created 27 chunks from 10 documents
DataFrame created with 27 rows
   document_id                               document_name  chunk_id  \
0            0  Arlington County Business License 2025.pdf         0   
1            1                       Business Plan v2.docx         0   
2            1                       Business Plan v2.docx         1   
3            1                       Business Plan v2.docx         2   
4            1                       Business Plan v2.docx         3   

                                          chunk_text  
0  ARLINGTON COUNTY, VIRGINIA BUSINESS LICENSE TA...  
1  Delphic Strategies LLC - Executive Summary Bus...  
2  and operational accounts and a strategic commu...  
3  Michael served as a strategic communications c...  
4  values typically range from $50,000-150,000 de...  


In [6]:
# Cell 6: Simple Search (demo runs only if we have data)
def simple_keyword_search(query, chunks_df, top_k=3):
    """Rank chunks by how often the query's words occur in them.

    The query is lower-cased and split on whitespace. A chunk's score is the
    total number of (substring, case-insensitive) occurrences of each query
    word in its text. The input frame is left untouched; scores are attached
    to a copy.

    Args:
        query: Free-text search string.
        chunks_df: DataFrame with a 'chunk_text' column of strings.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns of ``chunks_df`` plus 'relevance_score',
        holding at most ``top_k`` highest-scoring rows with a score above zero.
    """
    query_words = query.lower().split()
    scores = [
        sum(chunk_lower.count(word) for word in query_words)
        for chunk_lower in (chunk.lower() for chunk in chunks_df['chunk_text'])
    ]

    # assign() returns a new frame, so the caller's DataFrame gains no column
    scored = chunks_df.assign(relevance_score=scores)
    results = scored.nlargest(top_k, 'relevance_score')
    return results[results['relevance_score'] > 0]


if 'chunks_df' in locals() and len(chunks_df) > 0:
    # Test simple search
    test_query = "strategy consulting"
    print(f"Testing search for: '{test_query}'")
    search_results = simple_keyword_search(test_query, chunks_df)
    print(f"Found {len(search_results)} relevant chunks")

    for idx, row in search_results.iterrows():
        print(f"\nDocument: {row['document_name']}")
        print(f"Score: {row['relevance_score']}")
        # Preview only; the full chunk text stays in the DataFrame
        print(f"Text: {row['chunk_text'][:200]}...")
else:
    print("Skipping search test - no chunks available")

Testing search for: 'strategy consulting'
Found 3 relevant chunks

Document: Business Plan v2.docx
Score: 10
Text: overhead, limited government experience, slower response times Boutique Crisis Communications Firms: Levick Strategic Communications, Dezenhall Resources Strengths: Crisis specialization, established ...

Document: Business Plan v2.docx
Score: 8
Text: Strategic Content Development LinkedIn thought leadership: Weekly articles on AI-enhanced strategic communications, crisis management best practices, regulatory communications Industry publications: B...

Document: Business Plan v2.docx
Score: 7
Text: values typically range from $50,000-150,000 depending on organizational size and engagement duration. Strategic Partnerships: The firm maintains relationships with management consulting firms, technol...


In [7]:
# Cell 7: Next Steps
# Status banner: reports success or failure depending on whether the chunk
# DataFrame exists in the notebook namespace and has at least one row.
print("\n" + "="*50)
print("RAG PROTOTYPE STATUS:")
if not ('chunks_df' in locals() and len(chunks_df) > 0):
    print(
        "❌ No documents processed",
        "REQUIRED ACTIONS:",
        "1. Add PDF, Word, Excel, or text files to data/raw/delphic-strategies/",
        "2. Re-run the notebook",
        sep="\n",
    )
else:
    print(
        "✅ Documents loaded and processed successfully!",
        "✅ Text chunking completed",
        "✅ Basic search functionality working",
        "\nNEXT STEPS:",
        "1. Add more documents to data/raw/delphic-strategies/",
        "2. Add OpenAI API integration for embeddings",
        "3. Implement vector similarity search",
        "4. Add LLM for response generation",
        sep="\n",
    )
print("="*50)


RAG PROTOTYPE STATUS:
✅ Documents loaded and processed successfully!
✅ Text chunking completed
✅ Basic search functionality working

NEXT STEPS:
1. Add more documents to data/raw/delphic-strategies/
2. Add OpenAI API integration for embeddings
3. Implement vector similarity search
4. Add LLM for response generation


In [None]:
# Cell 8: OpenAI Integration Setup
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment variables (e.g. from a .env file) into the process
load_dotenv()

# Initialize the OpenAI client and test the connection.
# Client construction sits inside the try block so that a missing or rejected
# key is reported by the handler below instead of escaping as a raw traceback.
# `client` is only bound when construction succeeds.
try:
    if not os.getenv('OPENAI_API_KEY'):
        raise RuntimeError(
            "OPENAI_API_KEY is not set; add it to your .env file or environment"
        )

    client = OpenAI(
        api_key=os.getenv('OPENAI_API_KEY')
    )

    # Test with a simple API call
    models = client.models.list()
    print("✅ OpenAI API connected successfully!")
    print("Available models include: gpt-4, gpt-3.5-turbo, text-embedding-3-small")

except Exception as e:
    # Broad catch is deliberate: this cell is a connectivity check and should
    # print a readable diagnosis whatever the failure mode
    print(f"❌ OpenAI API connection failed: {e}")
    print("Check your API key validity on OpenAI platform")

=== Debugging API Key Loading ===
❌ .env file not found

=== Loading Environment Variables ===
load_dotenv() result: True
✅ API key loaded!
   Length: 164 characters
   Starts with: sk-proj...
   Ends with: ...6cwA

=== All OPENAI Environment Variables ===
OPENAI_API_API_KEY: sk-proj-Uw...
OPENAI_API_KEY: sk-proj-Uw...
