In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import sys
import warnings

# Suppress warnings for cleaner output.
# NOTE: with no category/module arguments this filter ignores every warning
# raised after this point, including deprecation notices from imported libraries.
warnings.filterwarnings('ignore')

In [None]:
def load_and_validate_data(csv_path):
    """Load the CSV at ``csv_path`` into a DataFrame and print a short preview.

    Args:
        csv_path: Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The parsed contents of the file.

    Raises:
        SystemExit: With exit code 1 when the file does not exist or cannot be
            read/parsed (an explanatory message is printed first).
    """
    try:
        # Read the caller-supplied location rather than a fixed address, so the
        # loader (and the "not found" message below) refer to the same file.
        df = pd.read_csv(csv_path)
        print(f"CSV loaded successfully. Shape: {df.shape}")
        print("\nColumn names:")
        print(df.columns.tolist())
        print("\nFirst 3 rows:")
        print(df.head(3))
        return df
    except FileNotFoundError:
        print(f"Error: '{csv_path}' not found.")
        print("Please ensure the CSV file is in the same directory as the script.")
        sys.exit(1)
    except Exception as e:
        # Any other read/parse failure is reported and treated as fatal.
        print(f"Error loading CSV: {e}")
        sys.exit(1)

In [None]:
def _normalize_embedding_text(text):
    """Collapse whitespace runs and drop stand-alone 'nan' tokens from ``text``."""
    return ' '.join(token for token in text.split() if token != 'nan')


def prepare_text_for_embedding(df):
    """Build a ``text_for_embedding`` column by joining the descriptive columns.

    The columns listed in ``potential_text_columns`` that exist in ``df`` are
    converted to strings (missing values become empty strings), joined with
    single spaces, and normalised. If none of them exist, every object-dtype
    column is used instead. Rows whose combined text is empty are dropped and
    the index is reset.

    Args:
        df: Input DataFrame. It is not modified; a copy is processed.

    Returns:
        pandas.DataFrame: Copy of ``df`` with the selected columns cast to
        ``str`` and an added ``text_for_embedding`` column.

    Raises:
        SystemExit: With exit code 1 when no usable text column is found.
    """
    # Define text columns that are likely to exist in banking compliance data
    potential_text_columns = [
        'customer_type', 'full_name_en', 'nationality', 'address_line1',
        'city', 'emirate', 'country', 'email_primary', 'kyc_status',
        'risk_rating', 'account_type', 'account_subtype', 'account_name',
        'account_status', 'dormancy_status', 'exclusion_reason'
    ]

    # Filter to only include columns that actually exist in the DataFrame
    text_columns = [col for col in potential_text_columns if col in df.columns]

    if not text_columns:
        print("Warning: None of the expected text columns found in the dataset.")
        print("Available columns:", df.columns.tolist())
        # Fallback: use all string columns
        text_columns = df.select_dtypes(include=['object']).columns.tolist()
        if not text_columns:
            print("Error: No text columns found for embedding.")
            sys.exit(1)

    print(f"\nUsing columns for embedding: {text_columns}")

    # Work on a copy so the caller's frame is left untouched
    df_processed = df.copy()

    # Missing values become '' before the string cast, so NaN never turns into 'nan'
    for col in text_columns:
        df_processed[col] = df_processed[col].fillna('').astype(str)

    # Join the pieces, then normalise. Only whole whitespace-delimited 'nan'
    # tokens are removed; a substring replace would damage ordinary words such
    # as "Fernando" or "Financial" and corrupt the text that gets embedded.
    df_processed['text_for_embedding'] = (
        df_processed[text_columns]
        .agg(' '.join, axis=1)
        .map(_normalize_embedding_text)
    )

    # Remove empty entries
    empty_mask = df_processed['text_for_embedding'].str.len() == 0
    if empty_mask.sum() > 0:
        print(f"Warning: {empty_mask.sum()} entries have empty text after processing.")
        df_processed = df_processed[~empty_mask].reset_index(drop=True)

    print(f"\nFinal dataset shape: {df_processed.shape}")
    print("\nSample prepared text (first 3 entries):")
    for i, text in enumerate(df_processed['text_for_embedding'].head(3)):
        print(f"Entry {i+1}: {text[:200]}{'...' if len(text) > 200 else ''}")

    return df_processed

In [None]:
def load_model_and_generate_embeddings(df_processed):
    """Instantiate a sentence-embedding model and encode the prepared text.

    'BAAI/bge-large-en-v1.5' is attempted first; if constructing it raises,
    'all-MiniLM-L6-v2' is used instead.

    Args:
        df_processed: DataFrame providing a ``text_for_embedding`` column.

    Returns:
        tuple: ``(embeddings, model)`` - the value returned by ``model.encode``
        for the column's texts, and the model instance that produced it.

    Raises:
        SystemExit: With exit code 1 when neither model can be loaded or when
            encoding fails.
    """
    print("\nLoading BGE-large model...")
    try:
        # Preferred model
        encoder = SentenceTransformer('BAAI/bge-large-en-v1.5')
        print("Model loaded successfully.")
    except Exception as primary_error:
        print(f"Failed to load BAAI/bge-large-en-v1.5: {primary_error}")
        print("Trying fallback model...")
        try:
            # Smaller substitute used when the preferred model is unavailable
            encoder = SentenceTransformer('all-MiniLM-L6-v2')
            print("Fallback model loaded successfully.")
        except Exception as fallback_error:
            print(f"Failed to load fallback model: {fallback_error}")
            print("Please ensure you have 'sentence-transformers' installed:")
            print("pip install sentence-transformers")
            sys.exit(1)

    print("Generating embeddings for the text data...")
    # Progress bar on; batch size can be tuned to the available memory
    encode_options = {'show_progress_bar': True, 'batch_size': 32}
    try:
        sentences = df_processed['text_for_embedding'].tolist()
        vectors = encoder.encode(sentences, **encode_options)
        print(f"Embeddings generated successfully. Shape: {vectors.shape}")
        return vectors, encoder
    except Exception as encode_error:
        print(f"Error generating embeddings: {encode_error}")
        sys.exit(1)


In [None]:
def find_similar_entries(df_processed, embeddings, query_idx=0, top_n=5):
    """Print the entries whose embeddings are most similar to the query entry.

    Row ``i`` of ``embeddings`` is taken to describe the ``i``-th row of
    ``df_processed`` *by position*, so all lookups are positional and do not
    depend on the DataFrame's index labels. The query entry itself is always
    left out of the ranking.

    Args:
        df_processed: DataFrame with a ``text_for_embedding`` column.
        embeddings: 2-D array-like with one embedding per row.
        query_idx: Zero-based position of the query entry.
        top_n: Maximum number of neighbours to print (at least 1).

    Returns:
        None. The ranking is printed to stdout.

    Raises:
        IndexError: If ``query_idx`` is not a valid position in ``embeddings``.
    """
    embeddings = np.asarray(embeddings)
    n_entries = len(embeddings)

    if n_entries <= 1:
        print("Not enough entries in the dataset to calculate similarity.")
        return

    if top_n < 1:
        # A non-positive top_n would otherwise slice the whole ranking
        print("top_n must be at least 1.")
        return

    if not 0 <= query_idx < n_entries:
        raise IndexError(
            f"query_idx {query_idx} is out of range for {n_entries} entries."
        )

    print(f"\nFinding top {top_n} most similar entries...")

    # Cosine similarity between the query and every entry (including itself)
    query_embedding = embeddings[query_idx].reshape(1, -1)
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Rank every entry except the query. Removing the query's position outright
    # (instead of overwriting its score with a sentinel) keeps genuinely
    # dissimilar neighbours in the list and keeps rank numbers contiguous.
    candidates = np.delete(np.arange(n_entries), query_idx)
    descending = np.argsort(-similarities[candidates], kind='stable')
    top_indices = candidates[descending][:top_n]

    # Display results
    text_column = df_processed['text_for_embedding']
    query_text = text_column.iloc[query_idx]
    print(f"\nQuery (Entry {query_idx}): {query_text[:200]}{'...' if len(query_text) > 200 else ''}")
    print(f"\nTop {top_n} most similar entries:")

    # Only show the descriptive columns that are actually present
    info_columns = ['customer_id', 'full_name_en', 'kyc_status', 'risk_rating', 'account_type']
    available_columns = [col for col in info_columns if col in df_processed.columns]

    for rank, idx in enumerate(top_indices, start=1):
        score = similarities[idx]

        # "Index" is the positional row number shared with ``embeddings``
        print(f"\nRank {rank}: Index {idx}, Similarity Score: {score:.4f}")
        for col in available_columns:
            print(f"  {col}: {df_processed[col].iloc[idx]}")

        # Show a snippet of the text
        similar_text = text_column.iloc[idx]
        print(f"  Text snippet: {similar_text[:150]}{'...' if len(similar_text) > 150 else ''}")


In [None]:
def main():
    """Run the pipeline end to end: load, prepare text, embed, report neighbours.

    After the similarity report is printed, the user is asked on stdin whether
    the embeddings should be saved. Answering 'y' writes a pickled DataFrame
    (with an added ``embedding`` column) and a ``.npy`` array, both to paths
    relative to the current working directory.
    """
    dataset_file = 'banking_compliance_dataset_500_rows.csv'

    # Steps 1-4: load -> prepare text -> embed -> nearest neighbours of entry 0
    raw_frame = load_and_validate_data(dataset_file)
    prepared_frame = prepare_text_for_embedding(raw_frame)
    vectors, _encoder = load_model_and_generate_embeddings(prepared_frame)
    find_similar_entries(prepared_frame, vectors, query_idx=0, top_n=5)

    # Optional persistence, decided interactively
    answer = input("\nWould you like to save the embeddings to a file? (y/n): ")
    wants_save = answer.lower().strip() == 'y'
    if wants_save:
        try:
            # Frame plus one embedding per row
            prepared_frame['embedding'] = list(vectors)
            prepared_frame.to_pickle('banking_compliance_with_embeddings.pkl')
            print("Embeddings saved to 'banking_compliance_with_embeddings.pkl'")

            # Bare embedding matrix
            np.save('banking_compliance_embeddings.npy', vectors)
            print("Raw embeddings saved to 'banking_compliance_embeddings.npy'")
        except Exception as save_error:
            print(f"Error saving embeddings: {save_error}")

    print("\nAnalysis complete!")



In [None]:
# Run the pipeline only when this code is executed as the top-level program.
if __name__ == "__main__":
    main()