# Oracle Database Similarity Search on AWS

This notebook demonstrates semantic similarity search, powered by AI text embeddings, over data stored in an Oracle Database hosted on AWS.

## Overview
- **Database**: Oracle Database on AWS (RDS or EC2)
- **Embedding Model**: sentence-transformers (all-MiniLM-L6-v2)
- **Similarity Metric**: Cosine similarity & Euclidean distance
- **Use Cases**: Document retrieval, semantic search, recommendations

## Section 1: Install & Import Required Libraries

In [None]:
import subprocess
import sys

# Third-party packages this notebook imports. matplotlib and seaborn are
# listed because the visualization cell in Section 8 imports them.
packages = [
    'cx_Oracle>=8.0',
    'boto3',
    'pandas',
    'numpy',
    'scikit-learn',
    'sentence-transformers',
    'matplotlib',
    'seaborn',
]

for package in packages:
    print(f"Installing {package}...")
    # Invoke pip through the running interpreter so the package is installed
    # for this kernel; check_call raises if pip exits with an error.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

print("✓ All packages installed successfully!")

In [None]:
import cx_Oracle
import boto3
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer
import warnings
import json

# Suppress every warning category so cell output stays uncluttered.
warnings.filterwarnings(action='ignore')

# Confirmation banner for the import cell.
print("✓ All libraries imported successfully!")

## Section 2: Configure AWS Credentials and Database Parameters

In [None]:
import os
from google.colab import userdata

# Load AWS credentials from Colab Secrets.
# Exception (not a bare ``except``) is caught so KeyboardInterrupt and
# SystemExit still propagate, and the reason for the failure is reported.
try:
    aws_access_key = userdata.get('AWS_ACCESS_KEY_ID')
    aws_secret_key = userdata.get('AWS_SECRET_ACCESS_KEY')
except Exception as exc:
    print("⚠ AWS credentials not found in Colab Secrets")
    print(f"  Reason: {type(exc).__name__}: {exc}")
    # Reset both values so a partially successful lookup is not left behind.
    aws_access_key = None
    aws_secret_key = None
else:
    print("✓ AWS credentials loaded from Colab Secrets")

# Oracle connection settings. Each value comes from the matching ORACLE_*
# environment variable when it is set, otherwise from the default shown here.
DB_CONFIG = dict(
    host=os.getenv('ORACLE_HOST', 'your-oracle-db-endpoint.rds.amazonaws.com'),
    port=int(os.getenv('ORACLE_PORT', 1521)),
    service_name=os.getenv('ORACLE_SERVICE', 'ORCL'),
    user=os.getenv('ORACLE_USER', 'admin'),
    password=os.getenv('ORACLE_PASSWORD', 'your_password_here'),
)

# Echo the settings for confirmation; the password is not printed.
print("\nDatabase Configuration:")
for label, key in (
    ("Host", 'host'),
    ("Port", 'port'),
    ("Service", 'service_name'),
    ("User", 'user'),
):
    print(f"  {label}: {DB_CONFIG[key]}")

## Section 3: Connect to Oracle Database on AWS

In [None]:
def create_oracle_connection(config):
    """
    Establish connection to Oracle Database.

    The DSN is built with ``cx_Oracle.makedsn`` and the credentials are passed
    as separate keyword arguments rather than spliced into a single
    ``user/password@host:port/service`` string, so a password containing
    characters such as ``@`` or ``/`` cannot corrupt the connect string.

    Failures are reported with ``print`` and signalled by returning None;
    no exception is raised to the caller.

    Args:
        config (dict): Database configuration with host, port, service_name, user, password

    Returns:
        cx_Oracle.Connection: Database connection object or None if failed
    """
    try:
        dsn = cx_Oracle.makedsn(
            config['host'],
            config['port'],
            service_name=config['service_name'],
        )
        connection = cx_Oracle.connect(
            user=config['user'],
            password=config['password'],
            dsn=dsn,
        )
        print("✓ Successfully connected to Oracle Database")
        print(f"  Version: {connection.version}")
        return connection
    except cx_Oracle.DatabaseError as e:
        # The first argument is expected to be an error object exposing
        # ``message``; fall back to the plain exception text if it is not,
        # instead of raising a second error inside this handler.
        error = e.args[0] if e.args else e
        print(f"✗ Connection Error: {getattr(error, 'message', error)}")
        return None
    except Exception as e:
        # Covers everything else, e.g. a key missing from ``config``.
        print(f"✗ Unexpected error: {str(e)}")
        return None

# Open the connection that the later cells use
oracle_conn = create_oracle_connection(config=DB_CONFIG)

## Section 4: Load Data from Oracle Table

In [None]:
def load_data_from_oracle(connection, table_name, columns=None):
    """
    Read an Oracle table into a pandas DataFrame.

    Selects the requested columns (or every column when none are given),
    prints a short summary of what was loaded, and on any failure prints
    the error and returns None instead of raising.

    Args:
        connection: cx_Oracle connection object
        table_name (str): Name of the table to load
        columns (list): Specific columns to load (optional)

    Returns:
        pd.DataFrame: Loaded data or None if failed
    """
    try:
        # A missing or empty column list means "select everything".
        select_list = ", ".join(columns) if columns else "*"
        frame = pd.read_sql(f"SELECT {select_list} FROM {table_name}", connection)

        row_count, column_names = len(frame), list(frame.columns)
        print(f"✓ Loaded {row_count} records from {table_name}")
        print(f"  Columns: {column_names}")
        return frame
    except Exception as e:
        print(f"✗ Error loading data: {str(e)}")
        return None

# Fetch the sample documents, provided the connection attempt succeeded
if not oracle_conn:
    print("Cannot load data: Database connection not established")
    df = None
else:
    TABLE_NAME = 'DOCUMENTS'
    COLUMNS = ['ID', 'TITLE', 'CONTENT', 'CATEGORY']
    df = load_data_from_oracle(oracle_conn, TABLE_NAME, COLUMNS)
    if df is not None:
        # Preview the first few rows as a sanity check
        print("\nFirst 3 rows:")
        print(df.head(3))

## Section 5: Prepare Data for Similarity Search

In [None]:
# Name of the pre-trained sentence-transformers model to load. Kept in one
# place so the load call and the printed summary cannot drift apart.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

print("Loading embedding model (first run may take a moment)...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("✓ Embedding model loaded successfully")
print(f"  Model: {EMBEDDING_MODEL_NAME}")
# Ask the loaded model for its vector size instead of hard-coding it, so the
# printed value stays correct if the model name above is changed.
print(f"  Embedding dimension: {embedding_model.get_sentence_embedding_dimension()}")

In [None]:
def generate_embeddings(texts, model, batch_size=32, show_progress_bar=True):
    """
    Generate embeddings for a list of texts.

    Prints the number of texts before encoding and the shape of the result
    afterwards.

    Args:
        texts (list): List of text strings
        model: SentenceTransformer model (any object whose ``encode`` accepts
            the texts plus ``batch_size`` and ``show_progress_bar`` keywords)
        batch_size (int): Number of texts passed to the model per batch.
            Defaults to 32, the value previously hard-coded here.
        show_progress_bar (bool): Whether the model should display a progress
            bar while encoding. Defaults to True.

    Returns:
        np.ndarray: Array of embeddings
    """
    print(f"Generating embeddings for {len(texts)} documents...")
    embeddings = model.encode(
        texts,
        show_progress_bar=show_progress_bar,
        batch_size=batch_size,
    )
    print(f"✓ Generated embeddings with shape: {embeddings.shape}")
    return embeddings

# Prepare and embed data
if df is not None and not df.empty:
    # Choose the column to embed: CONTENT if present, otherwise TITLE,
    # otherwise the first object-dtype column (None when there is none).
    if 'CONTENT' in df.columns:
        text_column = 'CONTENT'
    elif 'TITLE' in df.columns:
        text_column = 'TITLE'
    else:
        text_cols = df.select_dtypes(include=['object']).columns.tolist()
        text_column = text_cols[0] if text_cols else None

    if text_column:
        print(f"Using text column: {text_column}\n")
        # Missing values become empty strings so every row yields a string.
        df['text_clean'] = df[text_column].fillna('').astype(str).str.strip()
        embeddings = generate_embeddings(df['text_clean'].tolist(), embedding_model)
        # Store one vector per row; iterating the result yields its rows in
        # order, so no index loop is needed.
        df['embeddings'] = list(embeddings)
        print("✓ Data preparation complete")
    else:
        print("No text column found in data")
else:
    print("No data available for embedding")

## Section 6: Implement Similarity Search Algorithm

In [None]:
class OracleSimilaritySearch:
    """
    Semantic similarity search engine for Oracle data.
    Supports cosine similarity and euclidean distance metrics.

    The engine keeps a copy of the source DataFrame plus an array built from
    its ``embeddings`` column, and scores every stored vector against the
    encoded query on each search.
    """

    # Working columns that are stripped from search results when present.
    _INTERNAL_COLUMNS = ('embeddings', 'text_clean')

    def __init__(self, dataframe, embedding_model):
        """
        Initialize search engine.

        Args:
            dataframe (pd.DataFrame): Data with embeddings column
            embedding_model: SentenceTransformer model for encoding queries
        """
        self.df = dataframe.copy()
        self.model = embedding_model
        # Stack the per-row vectors into a single array so each search can
        # score all rows in one vectorized operation.
        self.embeddings = np.array(dataframe['embeddings'].tolist())

    def cosine_similarity_search(self, query, top_k=5):
        """
        Search using cosine similarity metric.

        Args:
            query (str): Search query
            top_k (int): Number of top results to return; values below 1
                yield an empty result

        Returns:
            pd.DataFrame: Top k results with similarity scores, ordered from
            most to least similar, with ``rank``, ``similarity_score`` and
            ``metric`` as the leading columns
        """
        # cosine_similarity expects 2-D inputs, hence the (1, dim) reshape.
        query_embedding = self._encode_query(query).reshape(1, -1)
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # Highest similarity first.
        order = np.argsort(similarities)[::-1]
        return self._build_results(order, similarities, top_k, 'similarity_score', 'cosine')

    def euclidean_distance_search(self, query, top_k=5):
        """
        Search using euclidean distance metric.

        Args:
            query (str): Search query
            top_k (int): Number of top results to return; values below 1
                yield an empty result

        Returns:
            pd.DataFrame: Top k results with distance scores, ordered from
            smallest to largest distance, with ``rank``, ``distance_score``
            and ``metric`` as the leading columns
        """
        query_embedding = self._encode_query(query)
        distances = np.sqrt(np.sum((self.embeddings - query_embedding) ** 2, axis=1))

        # Smallest distance first.
        order = np.argsort(distances)
        return self._build_results(order, distances, top_k, 'distance_score', 'euclidean')

    def _encode_query(self, query):
        """Encode a single query string and return its embedding vector."""
        return self.model.encode([query])[0]

    def _build_results(self, order, scores, top_k, score_column, metric):
        """
        Assemble the result frame shared by both search methods.

        Args:
            order (np.ndarray): Row positions sorted best-first
            scores (np.ndarray): Score for every stored row, indexed by position
            top_k (int): Maximum number of rows to keep
            score_column (str): Name of the column that receives the scores
            metric (str): Label written to the ``metric`` column

        Returns:
            pd.DataFrame: Selected rows with a fresh 0..n-1 index
        """
        # Clamp at zero: a negative top_k would otherwise slice from the end
        # (e.g. ``[:-1]``) and silently return almost every row.
        top_indices = order[:max(top_k, 0)]

        results = self.df.iloc[top_indices].copy()
        results[score_column] = scores[top_indices]
        results['rank'] = range(1, len(results) + 1)
        results['metric'] = metric

        # Drop only the working columns that actually exist, so a frame
        # without 'text_clean' does not raise KeyError.
        present = [col for col in self._INTERNAL_COLUMNS if col in results.columns]
        results = results.drop(columns=present)

        leading = ['rank', score_column, 'metric']
        trailing = [col for col in results.columns if col not in leading]
        return results[leading + trailing].reset_index(drop=True)

# Build the search engine only when embedded data is available
if df is None or 'embeddings' not in df.columns:
    search_engine = None
    print("Cannot initialize search engine: embeddings not available")
else:
    search_engine = OracleSimilaritySearch(df, embedding_model)
    print("✓ Similarity search engine initialized")

## Section 7: Execute Similarity Search Queries

In [None]:
# Run a few sample queries through the cosine-similarity search
if search_engine is None:
    print("Search engine not available")
else:
    test_queries = [
        "machine learning and AI",
        "cloud database solutions",
        "data security and privacy",
    ]

    # Section banner: rule, title, rule.
    print("=" * 80, "COSINE SIMILARITY SEARCH RESULTS", "=" * 80, sep="\n")

    for query in test_queries:
        print(f"\nQuery: '{query}'", "-" * 80, sep="\n")
        results = search_engine.cosine_similarity_search(query, top_k=3)
        # Show only the rank and score columns, without index labels.
        print(results[['rank', 'similarity_score']].to_string(index=False))

In [None]:
# Run one sample query through the euclidean-distance search
if search_engine is not None:
    test_query = "enterprise infrastructure"

    # Section banner (preceded by a blank line), then the query header.
    print("\n" + "=" * 80, "EUCLIDEAN DISTANCE SEARCH RESULTS", "=" * 80, sep="\n")
    print(f"\nQuery: '{test_query}'", "-" * 80, sep="\n")

    results = search_engine.euclidean_distance_search(test_query, top_k=3)
    # Show only the rank and distance columns, without index labels.
    print(results[['rank', 'distance_score']].to_string(index=False))

## Section 8: Display Results and Cleanup

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualization of search results
if search_engine is not None:
    query = "data analysis"
    results = search_engine.cosine_similarity_search(query, top_k=5)

    if not results.empty:
        print(f"Top Results for: '{query}'")
        print("=" * 80)
        print(results[['rank', 'similarity_score']].to_string(index=False))

        # Plot one horizontal bar per result.
        fig, ax = plt.subplots(figsize=(10, 6))
        positions = range(len(results))
        ax.barh(positions, results['similarity_score'], color='steelblue')
        ax.set_yticks(positions)
        # Label each bar with the result's own rank rather than a recomputed index.
        ax.set_yticklabels([f"Result {rank}" for rank in results['rank']])
        # barh puts position 0 at the bottom; flip the axis so the
        # best-ranked result is drawn at the top of the chart.
        ax.invert_yaxis()
        ax.set_xlabel('Similarity Score', fontsize=12)
        ax.set_ylabel('Results', fontsize=12)
        ax.set_title(f'Similarity Search Results: "{query}"', fontsize=14, fontweight='bold')
        ax.set_xlim([0, 1])
        plt.tight_layout()
        plt.show()

In [None]:
# Release the Oracle connection if one was opened
if oracle_conn:
    try:
        oracle_conn.close()
    except Exception as e:
        # Closing is best-effort: report the problem and carry on.
        print(f"⚠ Error closing connection: {e}")
    else:
        print("✓ Oracle database connection closed successfully")

# Closing banner (preceded by a blank line), then a recap of the notebook.
print("\n" + "=" * 80, "SUMMARY", "=" * 80, sep="\n")
print("""
This notebook demonstrated:
1. Installing and importing required libraries (cx_Oracle, sentence-transformers, etc.)
2. Configuring AWS credentials and Oracle database parameters
3. Establishing secure connection to Oracle Database on AWS
4. Loading data from Oracle tables into pandas DataFrames
5. Generating vector embeddings using pre-trained transformer models
6. Implementing efficient similarity search algorithms
7. Executing semantic search queries with multiple similarity metrics
8. Visualizing and analyzing search results

Best Practices:
- Always close database connections when done
- Cache embeddings to avoid regeneration
- Use batch processing for large datasets
- Monitor connection performance and timeouts
- Implement proper error handling for production use
""")