In [46]:
#Import necessary libraries,
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import io

In [3]:
@st.cache_resource
def load_bge_model():
    """Load BGE model with caching for Streamlit"""
    return SentenceTransformer('BAAI/bge-large-en-v1.5')

In [4]:
def initialize_embedding_model():
    try:
        model = load_bge_model()
        st.success("BGE Large model loaded successfully!")
        return model
    except Exception as e:
        st.error(f"Failed to load BGE model: {str(e)}")
        return None


In [5]:
# Usage in your app
embedding_model = initialize_embedding_model()

2025-06-09 15:02:46.858 
  command:

    streamlit run C:\Users\Admin\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]



In [6]:
if embedding_model:
    # Use the model for embeddings
    text = "Sample banking compliance text"
    embedding = embedding_model.encode([text])
    print(f"Embedding generated: {embedding.shape}")

Embedding generated: (1, 1024)


In [7]:
try:
    df = pd.read_csv('banking_compliance_dataset_500_rows.csv')
    print("CSV loaded successfully. First 5 rows:")
    print(df.head())
except FileNotFoundError:
    print("Error: 'banking_compliance_dataset_500_rows.csv' not found.")
    print("Please ensure the CSV file is in the same directory as the script.")
    exit()

CSV loaded successfully. First 5 rows:
  customer_id customer_type       full_name_en       full_name_ar  \
0   CUS770487    INDIVIDUAL       Ali Al-Zaabi       Ali Al-Zaabi   
1   CUS865179    INDIVIDUAL   Fatima Al-Shamsi   Fatima Al-Shamsi   
2   CUS133659     CORPORATE    Fatima Al-Zaabi    Fatima Al-Zaabi   
3   CUS216970    INDIVIDUAL   Sara Al-Mansouri   Sara Al-Mansouri   
4   CUS377932     CORPORATE  Hassan Al-Suwaidi  Hassan Al-Suwaidi   

        id_number      id_type date_of_birth  nationality  \
0  78419988078673  EMIRATES_ID    1971-01-03        EGYPT   
1  78419814823498  EMIRATES_ID    1994-05-03        INDIA   
2  78419845443951  EMIRATES_ID    1974-04-24  PHILIPPINES   
3  78420006007072  EMIRATES_ID    1996-11-17        SYRIA   
4  78419949683481  EMIRATES_ID    1984-02-08        SYRIA   

             address_line1 address_line2  ... waiting_period_start  \
0  Building 224, Street 15       Area 17  ...           2021-12-27   
1  Building 876, Street 15        Area 

In [8]:
# Step 2: Select relevant text columns for embedding
# For a banking compliance app, relevant text fields might include:
# 'full_name_en', 'address_line1', 'address_line2', 'city', 'country', 'email_primary', 'kyc_status', 'risk_rating', 'account_type', 'account_subtype', 'account_name', 'account_status', 'dormancy_status', 'exclusion_reason'
# You might want to combine these into a single text string for each customer/account
# Let's create a combined 'text_for_embedding' column.
# We'll fill NaN values with empty strings to avoid errors during string concatenation.

text_columns = [
    'customer_type', 'full_name_en', 'nationality', 'address_line1',
    'city', 'emirate', 'country', 'email_primary', 'kyc_status',
    'risk_rating', 'account_type', 'account_subtype', 'account_name',
    'account_status', 'dormancy_status', 'exclusion_reason'
]


In [9]:
# Create a copy to avoid SettingWithCopyWarning
df_processed = df.copy()

In [10]:
# Combine relevant text columns into a new column for embedding
# Join non-null string values with a space
df_processed['text_for_embedding'] = df_processed[text_columns].astype(str).agg(' '.join, axis=1)


In [11]:
# Clean up redundant spaces that might result from concatenation
df_processed['text_for_embedding'] = df_processed['text_for_embedding'].str.replace(r'\s+', ' ', regex=True).str.strip()


In [12]:
print("\nPrepared text for embedding (first 5 entries):")
for i, text in enumerate(df_processed['text_for_embedding'].head()):
    print(f"Entry {i+1}: {text}")


Prepared text for embedding (first 5 entries):
Entry 1: INDIVIDUAL Ali Al-Zaabi EGYPT Building 224, Street 15 Dubai FUJAIRAH UAE ali.al-zaabi@hotmail.com PENDING LOW CURRENT JOINT CURRENT - Ali Al-Zaabi DORMANT FLAGGED nan
Entry 2: INDIVIDUAL Fatima Al-Shamsi INDIA Building 876, Street 15 Ajman SHARJAH UAE fatima.al-shamsi@outlook.com PENDING HIGH CURRENT CORPORATE CURRENT - Fatima Al-Shamsi DORMANT FLAGGED nan
Entry 3: CORPORATE Fatima Al-Zaabi PHILIPPINES Building 575, Street 35 Ajman FUJAIRAH UAE fatima.al-zaabi@yahoo.com COMPLIANT HIGH INVESTMENT SECURITIES INVESTMENT - Fatima Al-Zaabi DORMANT FLAGGED nan
Entry 4: INDIVIDUAL Sara Al-Mansouri SYRIA Building 624, Street 13 Sharjah UMM_AL_QUWAIN UAE sara.al-mansouri@gmail.com COMPLIANT LOW INVESTMENT MUTUAL_FUND INVESTMENT - Sara Al-Mansouri DORMANT FLAGGED nan
Entry 5: CORPORATE Hassan Al-Suwaidi SYRIA Building 231, Street 5 Dubai FUJAIRAH UAE hassan.al-suwaidi@gmail.com COMPLIANT LOW INVESTMENT MUTUAL_FUND INVESTMENT - Hassan Al-Su

In [47]:
# Step 3: Load the BGE-large model and generate embeddings
print("\nLoading BGE-large model. This may take a while for the first time...")
# 'BAAI/bge-large-en-v1.5' is a common identifier for BGE-large.
# You might need to specify a different version if 'bge-large-en-v1.5' is not the exact one you intend.
try:
    model = SentenceTransformer('BAAI/bge-large-en-v1.5')
    print("Model loaded successfully.")
    print("Generating embeddings for the text data...")
    # Generate embeddings for all text entries
    embeddings = model.encode(df_processed['text_for_embedding'].tolist(), show_progress_bar=True)
    # Alternative print statement to avoid f-string syntax for debugging purposes
    embeddings = np.array(embeddings)
    print("Embeddings shape:", embeddings.shape)
except Exception as e:
    print("❌ Error:", str(e))


Loading BGE-large model. This may take a while for the first time...
Model loaded successfully.
Generating embeddings for the text data...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Embeddings shape: (500, 1024)


In [48]:
 # Store embeddings in the DataFrame (optional, but useful for later use)
df_processed['embedding'] = list(embeddings)

In [49]:
# Step 4: Generate similarity scores
print("\nCalculating similarity scores...")


Calculating similarity scores...


In [50]:
# Example: Find the most similar customer/account to the first entry
    # (This is a demonstration; in a real app, you'd embed a query string
    # and compare it against all document embeddings.)

if len(embeddings) > 1:
        query_embedding = embeddings[0] # Using the first entry as a query example


In [51]:
 # Calculate cosine similarity between the query embedding and all other embeddings
        # Reshape query_embedding for cosine_similarity if it's a single vector
query_embedding_reshaped = query_embedding.reshape(1, -1)
        

In [52]:
# Calculate similarity with all other embeddings
similarities = cosine_similarity(query_embedding_reshaped, embeddings)


In [53]:
# The result is a 2D array, take the first row for a 1D array of similarities
similarity_scores = similarities[0]

In [54]:
# Get top N most similar
top_n = 5
        # Exclude self-similarity if the query is from the dataset
if len(similarity_scores) > 0 and np.isclose(similarity_scores[0], 1.0):
             # Sort indices by similarity score in descending order, exclude the first one (self)
    most_similar_indices = similarity_scores.argsort()[-top_n-1:][::-1][1:]
else:
            # Sort indices by similarity score in descending order
    most_similar_indices = similarity_scores.argsort()[-top_n:][::-1]


In [59]:
    print(f"\nTop {top_n} most similar entries to '{df_processed.loc[0, 'text_for_embedding']}' (excluding itself if applicable):")
    for rank, idx in enumerate(most_similar_indices):
            score = similarity_scores[idx]
            original_text = df_processed.loc[idx, 'text_for_embedding']
            customer_id = df_processed.loc[idx, 'customer_id']
            kyc_status = df_processed.loc[idx, 'kyc_status']
            risk_rating = df_processed.loc[idx, 'risk_rating']

            print(f"Rank {rank+1}: Customer ID: {customer_id}, Score: {score:.4f}, KYC Status: {kyc_status}, Risk: {risk_rating}")
            # print(f"    Original Text: {original_text}") # Uncomment to see the full text

else:
        print("Not enough entries in the dataset to calculate similarity.")




Top 5 most similar entries to 'INDIVIDUAL Ali Al-Zaabi EGYPT Building 224, Street 15 Dubai FUJAIRAH UAE ali.al-zaabi@hotmail.com PENDING LOW CURRENT JOINT CURRENT - Ali Al-Zaabi DORMANT FLAGGED nan' (excluding itself if applicable):
Rank 1: Customer ID: CUS181896, Score: 0.8961, KYC Status: EXPIRED, Risk: LOW
Rank 2: Customer ID: CUS590110, Score: 0.8935, KYC Status: PENDING, Risk: LOW
Rank 3: Customer ID: CUS335113, Score: 0.8864, KYC Status: COMPLIANT, Risk: HIGH
Rank 4: Customer ID: CUS625600, Score: 0.8835, KYC Status: PENDING, Risk: LOW
Rank 5: Customer ID: CUS476701, Score: 0.8829, KYC Status: EXPIRED, Risk: MEDIUM
Not enough entries in the dataset to calculate similarity.
