
# **RAG System for Drilling Manual with OpenAI Embeddings**

# This notebook demonstrates a complete RAG (Retrieval-Augmented Generation) system using OpenAI embeddings.
# The system ingests a drilling operations manual and allows you to ask questions about it.


# **1. Install Required Dependencies**


# !pip install openai requests numpy ipywidgets


# **2. Import Libraries and Setup**



# %%
import re
import json
import numpy as np
from typing import List, Dict, Tuple
import requests
import time
import os
from IPython.display import display, Markdown, clear_output
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

# %%
# Configuration: the key is taken from the OPENAI_API_KEY environment variable.
# When that variable is unset the placeholder string below is used instead,
# and setup instructions are printed.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'your-openai-api-key-here')

if OPENAI_API_KEY != 'your-openai-api-key-here':
    print("✅ OpenAI API key is set!")
else:
    # A single print with embedded newlines emits the same four lines.
    print(
        "⚠️  Please set your OpenAI API key!\n"
        "Option 1: Set environment variable: export OPENAI_API_KEY='your-key'\n"
        "Option 2: Modify the OPENAI_API_KEY variable above\n"
        "Get your API key from: https://platform.openai.com/api-keys"
    )

# %% [markdown]
# ## 3. RAG System Class Definition

# %%
class NotebookRAG:
    """RAG System optimized for notebook usage with progress indicators and interactive features.

    The instance keeps three parallel lists that always share the same
    indexing: ``documents`` (chunk texts), ``embeddings`` (one vector per
    chunk) and ``chunk_metadata`` (the dicts produced by ``chunk_text``).
    """

    EMBEDDINGS_URL = "https://api.openai.com/v1/embeddings"
    CHAT_URL = "https://api.openai.com/v1/chat/completions"
    # Seconds to wait for an HTTP response. `requests` has no default timeout,
    # so without this a stalled connection would hang the notebook cell.
    REQUEST_TIMEOUT = 60

    def __init__(self, api_key: str):
        """Store the API key and start with an empty document store."""
        self.api_key = api_key
        self.documents = []
        self.embeddings = []
        self.embedding_model = "text-embedding-3-small"
        self.chat_model = "gpt-3.5-turbo"
        self.chunk_metadata = []  # Store metadata about chunks

    def _auth_headers(self) -> Dict[str, str]:
        """Return the HTTP headers shared by every API request."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

    @staticmethod
    def _overlap_tail(sentences: List[str], budget: int) -> List[str]:
        """Return the trailing sentences whose joined length fits in ``budget`` characters."""
        tail = []
        used = 0
        for sentence in reversed(sentences):
            # One extra character for the space that joins it to the next sentence.
            cost = len(sentence) + (1 if tail else 0)
            if used + cost > budget:
                break
            tail.append(sentence)
            used += cost
        tail.reverse()
        return tail

    def chunk_text(self, text: str, chunk_size: int = 800, overlap: int = 100) -> List[Dict]:
        """Split text into overlapping chunks with metadata.

        The text is first split into sections at lines starting with
        ``CHAPTER`` or a ``N.N`` heading, then each section is packed
        sentence by sentence into chunks of roughly ``chunk_size`` characters.

        Args:
            text: Raw manual text.
            chunk_size: Soft character limit per chunk. A chunk is closed
                before adding a sentence that would push it past this limit.
            overlap: Maximum number of characters of trailing whole sentences
                repeated at the start of the next chunk. It is capped at half
                of ``chunk_size``; ``0`` disables overlap.

        Returns:
            A list of dicts with keys ``id``, ``text``, ``section`` and
            ``word_count``. Ids are consecutive, starting at 0.
        """
        # Split by sections and subsections first
        sections = re.split(r'\n(?=CHAPTER|\d+\.\d+)', text)
        overlap_budget = max(0, min(overlap, chunk_size // 2))

        chunks_with_metadata = []

        def emit(sentences: List[str], section_title: str) -> None:
            body = " ".join(sentences).strip()
            if body:
                chunks_with_metadata.append({
                    'id': len(chunks_with_metadata),
                    'text': body,
                    'section': section_title,
                    'word_count': len(body.split())
                })

        for section in sections:
            stripped = section.strip()
            if not stripped:
                continue

            # Extract section title (the first line, when it is a heading)
            title_match = re.match(r'(CHAPTER \d+:.*|^\d+\.\d+.*)', stripped)
            section_title = title_match.group(1) if title_match else "General"

            # Split section into sentences and pack them into chunks
            sentences = [s for s in re.split(r'(?<=[.!?])\s+', stripped) if s]
            current = []
            current_len = 0

            for sentence in sentences:
                if current and current_len + len(sentence) > chunk_size:
                    emit(current, section_title)
                    # Start the new chunk with whole trailing sentences so
                    # no punctuation is duplicated and words are not glued.
                    current = self._overlap_tail(current, overlap_budget)
                    current_len = sum(len(s) + 1 for s in current)
                current.append(sentence)
                current_len += len(sentence) + 1

            # Add final chunk of the section
            emit(current, section_title)

        return chunks_with_metadata

    def get_embeddings_batch(self, texts: List[str], batch_size: int = 20) -> List[List[float]]:
        """Get embeddings with progress bar.

        Args:
            texts: Strings to embed.
            batch_size: Number of strings sent per API request (must be >= 1).

        Returns:
            One entry per input text, in input order. The entry is ``None``
            for every text whose batch failed, so positions stay aligned
            with ``texts``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if not texts:
            return []

        embeddings = []
        failed_batches = 0
        total_batches = (len(texts) - 1) // batch_size + 1

        # Create progress bar
        progress = widgets.IntProgress(
            value=0,
            min=0,
            max=total_batches,
            description='Embeddings:',
            bar_style='',
            style={'bar_color': 'lightblue'},
            orientation='horizontal'
        )
        display(progress)

        headers = self._auth_headers()

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_num = i // batch_size + 1
            data = {
                "input": batch,
                "model": self.embedding_model
            }

            try:
                response = requests.post(
                    self.EMBEDDINGS_URL,
                    headers=headers,
                    json=data,
                    timeout=self.REQUEST_TIMEOUT
                )
                response.raise_for_status()
                result = response.json()

                batch_embeddings = [item["embedding"] for item in result["data"]]
                # A short reply would silently misalign texts and vectors.
                if len(batch_embeddings) != len(batch):
                    raise ValueError(
                        f"expected {len(batch)} embeddings, got {len(batch_embeddings)}"
                    )
                embeddings.extend(batch_embeddings)

                time.sleep(0.5)  # Rate limiting

            except (requests.exceptions.RequestException, KeyError, TypeError, ValueError) as e:
                print(f"❌ Error in batch {batch_num}: {e}")
                failed_batches += 1
                embeddings.extend([None] * len(batch))

            # Update progress
            progress.value = batch_num
            progress.description = f'Batch {batch_num}/{total_batches}'

        if failed_batches:
            progress.bar_style = 'warning'
            progress.description = 'Incomplete'
        else:
            progress.bar_style = 'success'
            progress.description = 'Complete!'
        return embeddings

    def ingest_manual(self, manual_text: str, show_stats: bool = True):
        """Ingest manual with detailed feedback.

        Chunks ``manual_text``, embeds every chunk and stores only the chunks
        whose embedding succeeded, replacing any previously ingested content.

        Args:
            manual_text: Full text of the manual.
            show_stats: When true, print chunk and word statistics.
        """
        print("📚 Ingesting drilling manual...")

        # Chunk the text
        chunks_with_metadata = self.chunk_text(manual_text)

        if not chunks_with_metadata:
            # Nothing to embed; also avoids averaging an empty list below.
            self.documents = []
            self.embeddings = []
            self.chunk_metadata = []
            print("⚠️  No text chunks were produced; nothing to embed.")
            return

        documents = [chunk['text'] for chunk in chunks_with_metadata]

        if show_stats:
            print(f"✅ Created {len(chunks_with_metadata)} chunks")

            # Show statistics
            sections = set(chunk['section'] for chunk in chunks_with_metadata)
            word_counts = [chunk['word_count'] for chunk in chunks_with_metadata]

            print(f"📊 Statistics:")
            print(f"   • Sections found: {len(sections)}")
            print(f"   • Average words per chunk: {np.mean(word_counts):.0f}")
            print(f"   • Total words: {sum(word_counts)}")

        print("🔄 Generating embeddings...")

        # Get embeddings
        raw_embeddings = self.get_embeddings_batch(documents)

        # Keep only chunks with a successful embedding, preserving alignment
        kept = [
            (doc, emb, meta)
            for doc, emb, meta in zip(documents, raw_embeddings, chunks_with_metadata)
            if emb is not None
        ]
        self.documents = [doc for doc, _, _ in kept]
        self.embeddings = [emb for _, emb, _ in kept]
        self.chunk_metadata = [meta for _, _, meta in kept]

        print(f"✅ Successfully processed {len(self.documents)} chunks with embeddings")

    def cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Calculate cosine similarity.

        Returns ``0.0`` when either vector is ``None`` or has zero length,
        instead of dividing by zero.
        """
        if vec1 is None or vec2 is None:
            return 0.0

        a = np.asarray(vec1, dtype=float)
        b = np.asarray(vec2, dtype=float)
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return float(np.dot(a, b) / denominator)

    def retrieve_relevant_chunks(self, query: str, top_k: int = 3) -> List[Dict]:
        """Retrieve relevant chunks with metadata.

        Args:
            query: Question text to embed and compare against stored chunks.
            top_k: Maximum number of chunks to return.

        Returns:
            Up to ``top_k`` dicts with keys ``index``, ``similarity``,
            ``text`` and ``metadata``, sorted by descending similarity.
            An empty list is returned when nothing is ingested, ``top_k`` is
            not positive, or the query could not be embedded (the error is
            printed).
        """
        if top_k <= 0 or not self.embeddings:
            return []

        # Get query embedding
        data = {
            "input": query,
            "model": self.embedding_model
        }

        try:
            response = requests.post(
                self.EMBEDDINGS_URL,
                headers=self._auth_headers(),
                json=data,
                timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            query_embedding = response.json()["data"][0]["embedding"]
        except (requests.exceptions.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
            # Report the failure instead of hiding it behind a bare except.
            print(f"❌ Error embedding query: {e}")
            return []

        # Calculate similarities
        similarities = [
            {
                'index': i,
                'similarity': self.cosine_similarity(query_embedding, doc_embedding),
                'text': self.documents[i],
                'metadata': self.chunk_metadata[i]
            }
            for i, doc_embedding in enumerate(self.embeddings)
        ]

        # Sort and return top-k
        similarities.sort(key=lambda x: x['similarity'], reverse=True)
        return similarities[:top_k]

    def generate_response(self, query: str, context_chunks: List[str]) -> str:
        """Generate response using OpenAI.

        Args:
            query: The user's question.
            context_chunks: Chunk texts placed in the prompt as context.

        Returns:
            The model's answer, or a string starting with
            ``"Error generating response:"`` when the request fails or the
            reply has an unexpected shape.
        """
        context = "\n\n".join(context_chunks)

        messages = [
            {
                "role": "system",
                "content": "You are an expert drilling engineer assistant. Use the provided context from the drilling manual to answer questions accurately. Include specific values and procedures when available."
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {query}\n\nProvide a detailed answer based on the context."
            }
        ]

        data = {
            "model": self.chat_model,
            "messages": messages,
            "max_tokens": 500,
            "temperature": 0.3
        }

        try:
            response = requests.post(
                self.CHAT_URL,
                headers=self._auth_headers(),
                json=data,
                timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return response.json()["choices"][0]["message"]["content"]
        except (requests.exceptions.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
            return f"Error generating response: {e}"

    def answer_query(self, query: str, show_sources: bool = True, top_k: int = 3) -> str:
        """Answer query with optional source display.

        Args:
            query: The user's question.
            show_sources: When true, print the section and similarity of each
                retrieved chunk.
            top_k: Number of chunks to retrieve as context.

        Returns:
            The generated answer, or a message starting with "❌" when
            nothing is ingested or no chunk could be retrieved.
        """
        if not self.documents:
            return "❌ Please ingest a manual first!"

        print(f"🔍 Processing query: {query}")

        # Retrieve relevant chunks
        relevant_chunks = self.retrieve_relevant_chunks(query, top_k=top_k)

        if not relevant_chunks:
            return "❌ No relevant information found in the manual."

        if show_sources:
            print(f"📋 Found {len(relevant_chunks)} relevant chunks:")
            for i, chunk in enumerate(relevant_chunks):
                print(f"   {i+1}. {chunk['metadata']['section']} (similarity: {chunk['similarity']:.3f})")

        # Generate response
        context_texts = [chunk['text'] for chunk in relevant_chunks]
        print("🤖 Generating response...")
        response = self.generate_response(query, context_texts)

        return response

# %% [markdown]
# ## 4. Sample Drilling Manual Data

# %%
# Demo corpus for the RAG system: a short drilling manual held in one string.
# Chapter headings start with "CHAPTER N:" and subsection headings with "N.N"
# at the beginning of a line.
SAMPLE_DRILLING_MANUAL = """
DRILLING OPERATIONS MANUAL

CHAPTER 1: SAFETY PROCEDURES

1.1 Personal Protective Equipment (PPE)
All personnel on the drilling site must wear appropriate PPE including hard hats, safety glasses, steel-toed boots, and high-visibility clothing. Hearing protection is required in areas where noise levels exceed 85 decibels. PPE inspection must be conducted daily before shift start.

1.2 Gas Detection and Monitoring
Continuous gas monitoring is essential during drilling operations. H2S detectors must be calibrated daily and positioned at strategic locations around the drilling site. Emergency shutdown procedures must be initiated if gas levels exceed safe thresholds of 10 ppm for H2S.

1.3 Blowout Prevention
The blowout preventer (BOP) system must be tested daily according to API standards. All crew members must be trained in emergency procedures including well control and evacuation protocols. BOP stack pressure testing should be performed at 70% of working pressure.

CHAPTER 2: DRILLING EQUIPMENT

2.1 Rotary Table Operations
The rotary table provides the primary rotation for the drill string. Operating parameters include rotation speed typically ranging from 60-200 RPM depending on formation type and bit characteristics. Regular maintenance includes checking for wear on the master bushing and ensuring proper lubrication every 8 hours.

2.2 Mud Pump Systems
Mud pumps circulate drilling fluid through the system. Pump pressure should be monitored continuously with typical operating pressures ranging from 1000-4000 PSI. Pump liner inspection should be performed every 500 hours of operation. Maximum allowable pressure is 5000 PSI.

2.3 Drawworks and Hoisting System
The drawworks controls the hoisting and lowering of the drill string. Load limits must never exceed the rated capacity of 750,000 pounds. Daily inspection of wire rope is mandatory, checking for broken wires, kinks, or excessive wear. Replace wire rope when 6 broken wires are found in one lay.

CHAPTER 3: DRILLING FLUIDS

3.1 Mud Properties
Drilling mud density typically ranges from 8.5 to 18 pounds per gallon depending on formation pressure requirements. Viscosity should be maintained between 35-50 seconds using a marsh funnel. pH levels should be kept between 9-11 for optimal performance. Gel strength should be measured every 4 hours.

3.2 Mud Treatment Chemicals
Common additives include barite for weight increase, bentonite for viscosity, and various polymers for fluid loss control. Chemical additions must be recorded in the daily drilling report with quantities and timing noted. Typical barite addition rate is 100-300 pounds per barrel to increase mud weight.

CHAPTER 4: FORMATION EVALUATION

4.1 Logging While Drilling (LWD)
LWD tools provide real-time formation data including resistivity, density, and neutron readings. Data quality checks should be performed every 50 feet of drilling progress. Calibration procedures must be followed according to service company specifications.

4.2 Casing and Cementing
Casing strings must be designed to withstand formation pressures and provide wellbore integrity. Cement slurry design should achieve compressive strength of at least 2000 PSI within 24 hours. Cement bond logs should be run to verify cement placement quality.

CHAPTER 5: WELL COMPLETION

5.1 Perforating Operations
Perforating guns must be properly assembled and tested before deployment. Shot density typically ranges from 4-16 shots per foot depending on completion design. Safety procedures require personnel to maintain safe distance of at least 500 feet during perforating operations.

5.2 Production Tubing Installation
Production tubing must be inspected for defects before installation. Tubing weight calculations must account for fluid loading and thermal effects. Packer setting depths should be verified against completion drawings.

CHAPTER 6: EMERGENCY PROCEDURES

6.1 Well Control Situations
In case of a kick (influx of formation fluids), the well should be shut in immediately using the BOP. Mud weight should be increased gradually while circulating out the kick. Never allow gas to reach surface equipment. Maximum allowable shut-in pressure is 3000 PSI.

6.2 Equipment Failures
Power failures require immediate activation of backup generators within 30 seconds. Hydraulic failures on BOPs require switching to backup accumulator systems. All equipment failures must be logged and reported to the drilling supervisor immediately.

6.3 Personnel Safety
In case of H2S exposure, affected personnel should be moved to fresh air immediately. Emergency breathing apparatus should be available at all times. Evacuation procedures should be practiced monthly. Wind direction indicators must be visible from all work areas.
"""

# Confirm the constant is defined and report its whitespace-separated word count.
print("📖 Sample drilling manual loaded!")
print(f"Manual contains {len(SAMPLE_DRILLING_MANUAL.split())} words")

# %% [markdown]
# ## 5. Initialize and Setup RAG System

# %%
# Create the RAG helper only when a real key has replaced the placeholder;
# otherwise `rag` stays undefined and later cells report the missing setup.
if OPENAI_API_KEY == 'your-openai-api-key-here':
    print("⚠️  Cannot initialize RAG system without API key")
else:
    rag = NotebookRAG(OPENAI_API_KEY)
    print("🚀 RAG system initialized!")

# %% [markdown]
# ## 6. Ingest the Drilling Manual

# %%
# Chunk and embed the sample manual. This calls the embeddings API, so it
# takes a moment and only runs when the previous cell created `rag`.
if 'rag' not in locals():
    print("⚠️  Please set up your API key first")
else:
    rag.ingest_manual(SAMPLE_DRILLING_MANUAL)

# %% [markdown]
# ## 7. Interactive Query Interface

# %%
def query_interface():
    """Build and display an interactive query form for the RAG system.

    The form offers a dropdown of sample questions, a free-text field that
    becomes visible when "Custom query..." is selected, a checkbox that
    toggles the source listing, and a button that runs the query and renders
    the answer in an output area.

    The RAG instance is read from the notebook-level ``rag`` variable each
    time the button is clicked, so the form may be displayed before the
    manual has been ingested.
    """
    custom_option = "Custom query..."

    # Predefined queries for quick testing
    sample_queries = [
        "What are the PPE requirements?",
        "What is the operating pressure range for mud pumps?",
        "How often should wire rope be inspected?",
        "What are the typical mud density ranges?",
        "What should I do in case of a kick?",
        "What are the cement strength requirements?",
        "How do I handle H2S exposure?",
        custom_option
    ]

    query_dropdown = widgets.Dropdown(
        options=sample_queries,
        value=sample_queries[0],
        description='Select Query:',
        style={'description_width': 'initial'}
    )

    custom_query = widgets.Text(
        value='',
        placeholder='Enter your custom query here...',
        description='Custom Query:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px')
    )

    show_sources = widgets.Checkbox(
        value=True,
        description='Show source chunks',
        style={'description_width': 'initial'}
    )

    output = widgets.Output()

    def on_query_change(change):
        # Only show the free-text field while the custom option is selected.
        is_custom = change['new'] == custom_option
        custom_query.layout.visibility = 'visible' if is_custom else 'hidden'

    query_dropdown.observe(on_query_change, names='value')
    custom_query.layout.visibility = 'hidden'

    def run_query(b):
        with output:
            clear_output()

            # `rag` is a module-level variable, so it must be looked up in
            # globals(); locals() inside this function never contains it.
            rag_system = globals().get('rag')
            if rag_system is None or not getattr(rag_system, 'documents', None):
                print("❌ Please ingest the manual first!")
                return

            # Get the query
            if query_dropdown.value == custom_option:
                query = custom_query.value.strip()
            else:
                query = query_dropdown.value

            if not query:
                print("❌ Please enter a query!")
                return

            # Process the query
            try:
                response = rag_system.answer_query(query, show_sources=show_sources.value)
                print("=" * 60)
                display(Markdown(f"**Query:** {query}"))
                print("=" * 60)
                display(Markdown(f"**Answer:**\n\n{response}"))
                print("=" * 60)
            except Exception as e:
                # Top-level UI boundary: show the error instead of a traceback.
                print(f"❌ Error processing query: {e}")

    button = widgets.Button(description="Ask Question", button_style='success')
    button.on_click(run_query)

    # Layout
    ui = widgets.VBox([
        widgets.HTML("<h3>🤖 Ask Questions About the Drilling Manual</h3>"),
        query_dropdown,
        custom_query,
        show_sources,
        button,
        output
    ])

    display(ui)

# Show the widget form and define the ask() shortcut once a manual is ingested.
if 'rag' not in locals() or not getattr(rag, 'documents', None):
    print("⚠️  Please complete the setup steps above first")
else:
    query_interface()

    # Plain-function alternative to the widget form.
    def ask(question: str):
        """Return the RAG system's answer to ``question``."""
        return rag.answer_query(question)

    print(
        "\n💡 You can also use the ask() function directly:\n"
        "   Example: ask('What are the mud pump pressure ranges?')"
    )

# %% [markdown]
# ## 8. Manual Statistics and Exploration

# %%
def show_manual_stats():
    """Display statistics about the ingested manual.

    Reads the chunk metadata of the notebook-level ``rag`` object and prints
    the number of chunks, the total and average word counts, and a per-section
    breakdown. Prints a hint instead when no RAG instance exists or nothing
    has been ingested yet.
    """
    # `rag` is a module-level variable; locals() inside a function never
    # contains it, so the lookup has to go through globals().
    rag_system = globals().get('rag')
    chunks = getattr(rag_system, 'chunk_metadata', None)
    if not chunks:
        # Also covers an empty list, which would make np.mean return nan.
        print("❌ Please ingest the manual first!")
        return

    word_counts = [chunk['word_count'] for chunk in chunks]

    print("📊 Manual Statistics:")
    print(f"   • Total chunks: {len(chunks)}")
    print(f"   • Total words: {sum(word_counts)}")
    print(f"   • Average words per chunk: {np.mean(word_counts):.1f}")

    # Section breakdown (dict keeps first-seen section order)
    sections = {}
    for chunk in chunks:
        stats = sections.setdefault(chunk['section'], {'count': 0, 'words': 0})
        stats['count'] += 1
        stats['words'] += chunk['word_count']

    print(f"\n📋 Sections in manual:")
    for section, stats in sections.items():
        print(f"   • {section}: {stats['count']} chunks, {stats['words']} words")

# Print the statistics right away when a RAG instance exists in this session.
if 'rag' in locals():
    if hasattr(rag, 'chunk_metadata'):
        show_manual_stats()

# %% [markdown]
# ## 9. Test Queries (Run this cell to see examples)

# %%
# Smoke-test the pipeline with a few fixed questions (needs an ingested manual).
if 'rag' in locals() and getattr(rag, 'documents', None):
    test_queries = [
        "What PPE is required on drilling sites?",
        "What is the maximum mud pump pressure?",
        "How do I handle a well control emergency?",
    ]

    print("🧪 Running test queries...\n")

    for i, query in enumerate(test_queries, start=1):
        # Header: the question framed by two 60-character rules.
        print("=" * 60)
        print(f"Test Query {i}: {query}")
        print("=" * 60)

        try:
            response = rag.answer_query(query, show_sources=False)
            display(Markdown(f"**Answer:** {response}"))
        except Exception as e:
            print(f"❌ Error: {e}")

        # Blank line between consecutive queries.
        print()

# %% [markdown]
# ## 10. Save/Load Functionality (Optional)

# %%
def save_embeddings(filename='drilling_manual_embeddings.json'):
    """Save the embeddings to avoid recomputing.

    Writes the documents, embeddings, chunk metadata and embedding model name
    of the notebook-level ``rag`` object to ``filename`` as JSON.

    Args:
        filename: Path of the JSON file to write (overwritten if present).
    """
    # `rag` is a module-level variable; locals() inside a function never
    # contains it, so the lookup has to go through globals().
    rag_system = globals().get('rag')
    if rag_system is None or not rag_system.documents:
        print("❌ No embeddings to save!")
        return

    data = {
        'documents': rag_system.documents,
        'embeddings': rag_system.embeddings,
        'chunk_metadata': rag_system.chunk_metadata,
        'model': rag_system.embedding_model
    }

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f)

    print(f"💾 Embeddings saved to {filename}")

def load_embeddings(filename='drilling_manual_embeddings.json'):
    """Load pre-computed embeddings.

    Reads a JSON file with the keys ``documents``, ``embeddings`` and
    ``chunk_metadata`` (plus an optional ``model``) and installs its contents
    on the notebook-level ``rag`` object. The file is fully validated before
    ``rag`` is touched, so a bad file leaves the current state unchanged.

    Args:
        filename: Path of the JSON file to read.
    """
    # `rag` is a module-level variable; locals() inside a function never
    # contains it, so the lookup has to go through globals().
    rag_system = globals().get('rag')
    if rag_system is None:
        print("❌ Initialize RAG system first!")
        return

    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)

        documents = data['documents']
        embeddings = data['embeddings']
        chunk_metadata = data['chunk_metadata']
    except FileNotFoundError:
        print(f"❌ File {filename} not found!")
        return
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        print(f"❌ File {filename} is not a valid embeddings file: {e}")
        return

    # The three lists are indexed in parallel, so their lengths must agree.
    if not (len(documents) == len(embeddings) == len(chunk_metadata)):
        print(f"❌ File {filename} is inconsistent: list lengths differ!")
        return

    rag_system.documents = documents
    rag_system.embeddings = embeddings
    rag_system.chunk_metadata = chunk_metadata

    # Queries must be embedded with the model that produced the stored
    # vectors, otherwise the similarity scores are meaningless.
    saved_model = data.get('model')
    if saved_model:
        rag_system.embedding_model = saved_model

    print(f"📁 Loaded {len(rag_system.documents)} documents with embeddings from {filename}")

# Uncomment the line below to save your embeddings to the default JSON file:
# save_embeddings()

# Closing hint that points at the two persistence helpers.
print("💡 Use save_embeddings() to save your work and load_embeddings() to restore it later!")