<a href="https://colab.research.google.com/github/nastaran-farhadi/BBAC-with-LLM-in-healthcare/blob/main/BBAC_for_several_user.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install faker pandas
from faker import Faker
import pandas as pd
import random
from datetime import datetime, timedelta
fake = Faker()

Collecting faker
  Downloading faker-37.0.1-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.0.1-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.0.1


In [None]:
import os
BASE_DIR = '/content/drive/MyDrive/BBAC_several_user'
HISTORICAL_DIR = os.path.join(BASE_DIR, 'Historical_data')
STORAGE_DIR = os.path.join(BASE_DIR, 'Storage')
BEHAVIOR_DIR = os.path.join(BASE_DIR, 'Behavior_Patterns')

# Create directories if they don't exist
os.makedirs(HISTORICAL_DIR, exist_ok=True)
os.makedirs(STORAGE_DIR, exist_ok=True)
os.makedirs(BEHAVIOR_DIR, exist_ok=True)

In [None]:

# ======================
# CONFIGURATION
# ======================
NUM_USERS = 10
BASE_DATE = datetime(2023, 10, 1)

ANOMALY_PROBABILITY = 0.3



# ======================
# CORE FUNCTIONS
# ======================
# (Keep all the functions from previous code exactly the same)
# [generate_user_profile, calculate_behavior_score,
#  generate_normal_behavior, inject_anomalies]
def generate_user_profile(user_id):
    """Generate static user profile with access permissions"""
    return {
        'employee_id': f'E-{2023 + user_id}',
        'name': fake.name(),
        'role': random.choice(['Doctor', 'Nurse', 'Technician']),
        'department': random.choice(['Cardiology', 'Neurology', 'Oncology']),
        'access_permissions': {
            'allowed_datasets': ['Patient Records'],
            'allowed_operations': ['Read', 'Update'],
            'data_categories': ['Medical History', 'Treatment Plans']
        }
    }

def calculate_behavior_score(access_time, ip_address, mfa_failures=0):
    """Calculate risk score based on multiple factors"""
    score = 0.0

    # Time-based scoring (8AM is ideal)
    hour = access_time.hour
    if 7 <= hour <= 9:
        score += 0.0
    else:
        score += min(abs(hour - 8) * 0.1, 0.5)

    # Network location scoring
    if "Internal" in ip_address:
        score += 0.0
    else:
        score += 0.4  # Increased weight for external access

    # Authentication failures
    score += min(mfa_failures * 0.15, 0.45)

    # Random variation
    score += random.uniform(-0.05, 0.05)

    return max(0.0, min(round(score, 2), 1.0))

def generate_normal_behavior(base_date, user_profile):
    """Generate 10 days of normal access patterns"""
    base_ip = '10.1.1.15 (Internal)'
    base_location = 'Main Campus - ' + user_profile['department']
    normal_access_time = datetime.strptime("08:00", "%H:%M").time()

    records = []
    for day in range(10):
        access_date = base_date + timedelta(days=day)
        time_variation = timedelta(minutes=random.randint(-15, 15))
        access_time = datetime.combine(access_date, normal_access_time) + time_variation

        record = {
            'date': access_date.strftime("%Y-%m-%d"),
            'timestamp': access_time.strftime("%Y-%m-%d %H:%M:%S"),
            'requested_dataset': 'Patient Records',
            'requested_operation': random.choices(['Read', 'Update'], weights=[0.85, 0.15])[0],
            'access_outcome': 'Granted',
            'mfa_failures': random.choices([0, 1], weights=[0.95, 0.05])[0],
            'ip_address': base_ip,
            'location': base_location,
            'behavior_score': calculate_behavior_score(access_time, base_ip)
        }

        # Add minor time jitter
        if random.random() < 0.1:
            record['timestamp'] = (access_time + timedelta(minutes=random.randint(-30, 30))).strftime("%Y-%m-%d %H:%M:%S")

        records.append(record)

    return records

def inject_anomalies(records, user_profile):
    """Inject realistic anomalous patterns into the dataset"""
    anomalous_records = []

    # 1. Night Access Pattern
    if random.random() < ANOMALY_PROBABILITY:
        anomaly_date = datetime.strptime(records[-1]['date'], "%Y-%m-%d") + timedelta(days=1)
        anomaly_time = datetime.strptime("02:00", "%H:%M").time()
        anomalous_records.append({
            'date': anomaly_date.strftime("%Y-%m-%d"),
            'timestamp': datetime.combine(anomaly_date, anomaly_time).strftime("%Y-%m-%d %H:%M:%S"),
            'requested_dataset': 'Patient Records',
            'requested_operation': 'Update',
            'access_outcome': 'Denied',
            'mfa_failures': random.randint(2, 3),
            'ip_address': '192.168.1.100 (External)',
            'location': 'Unknown Location',
            'behavior_score': 0.85
        })

    # 2. Unauthorized Dataset Access
    if random.random() < ANOMALY_PROBABILITY/2:
        original_record = random.choice(records).copy()
        anomalous_records.append({
            **original_record,
            'requested_dataset': 'Clinical Trials',
            'access_outcome': 'Denied',
            'behavior_score': min(original_record['behavior_score'] + 0.4, 1.0)
        })

    # 3. Rapid Fire Access Attempts
    if random.random() < ANOMALY_PROBABILITY/3:
        base_record = random.choice(records).copy()
        for i in range(3):
            anomalous_records.append({
                **base_record,
                'timestamp': (datetime.strptime(base_record['timestamp'], "%Y-%m-%d %H:%M:%S") +
                            timedelta(minutes=i*2)).strftime("%Y-%m-%d %H:%M:%S"),
                'mfa_failures': 1,
                'behavior_score': min(base_record['behavior_score'] + 0.25*(i+1), 1.0)
            })

    return records + anomalous_records

# ======================
# DATA GENERATION & SAVING
# ======================
for user_id in range(1, NUM_USERS + 1):
    user_profile = generate_user_profile(user_id)
    normal_data = generate_normal_behavior(BASE_DATE, user_profile)
    full_data = inject_anomalies(normal_data, user_profile)
    historical_df = pd.DataFrame(full_data)

    # Create filename with full path
    filename = os.path.join(HISTORICAL_DIR, f"{user_profile['employee_id']}_historical_data.txt")

    # Write to file
    with open(filename, "w") as f:
        # File header
        f.write(f"=== User Profile: {user_profile['name']} ===\n")
        f.write(f"Employee ID: {user_profile['employee_id']}\n")
        f.write(f"Department: {user_profile['department']}\n")
        f.write(f"Role: {user_profile['role']}\n\n")

        f.write("=== Access Log ===\n")
        historical_df.to_csv(f, index=False, sep='\t')  # Using tab separation for better readability

    print(f"Saved {user_profile['employee_id']}'s data to Google Drive")

print("\nOperation completed. Files saved to:")
print(f"Google Drive Path: {HISTORICAL_DIR}")

Saved E-2024's data to Google Drive
Saved E-2025's data to Google Drive
Saved E-2026's data to Google Drive
Saved E-2027's data to Google Drive
Saved E-2028's data to Google Drive
Saved E-2029's data to Google Drive
Saved E-2030's data to Google Drive
Saved E-2031's data to Google Drive
Saved E-2032's data to Google Drive
Saved E-2033's data to Google Drive

Operation completed. Files saved to:
Google Drive Path: /content/drive/MyDrive/BBAC_several_user/Historical_data


In [None]:
import os
import pandas as pd

# ... (your existing code for generate_user_profile, calculate_behavior_score,
# generate_normal_behavior, inject_anomalies)

# ======================
# DATA GENERATION & SAVING (MODIFIED)
# ======================

# New directory for CSV files
CSV_DIR = os.path.join(BASE_DIR, 'CSV_Data')
os.makedirs(CSV_DIR, exist_ok=True)

for user_id in range(1, NUM_USERS + 1):
    user_profile = generate_user_profile(user_id)
    normal_data = generate_normal_behavior(BASE_DATE, user_profile)
    full_data = inject_anomalies(normal_data, user_profile)

    # Create a DataFrame for historical data
    historical_df = pd.DataFrame(full_data)

    # Add user profile columns to historical data
    historical_df['User Profile'] = user_profile['name']
    historical_df['Employee ID'] = user_profile['employee_id']
    historical_df['Department'] = user_profile['department']
    historical_df['Role'] = user_profile['role']

    # Reorder columns
    desired_order = ['User Profile', 'Employee ID', 'Department', 'Role',
                     'timestamp', 'requested_dataset', 'requested_operation',
                     'access_outcome', 'mfa_failures', 'ip_address', 'location',
                     'behavior_score']
    historical_df = historical_df[desired_order]  # Apply the desired order

    # Save CSV file
    csv_filename = os.path.join(CSV_DIR, f"{user_profile['employee_id']}_historical_data.csv")
    historical_df.to_csv(csv_filename, index=False)
    print(f"Saved {user_profile['employee_id']}'s data to {csv_filename}")

print("\nOperation completed. Files saved to:")
print(f"CSV Data Path: {CSV_DIR}")

Saved E-2024's data to /content/drive/MyDrive/BBAC_several_user/CSV_Data/E-2024_historical_data.csv
Saved E-2025's data to /content/drive/MyDrive/BBAC_several_user/CSV_Data/E-2025_historical_data.csv
Saved E-2026's data to /content/drive/MyDrive/BBAC_several_user/CSV_Data/E-2026_historical_data.csv
Saved E-2027's data to /content/drive/MyDrive/BBAC_several_user/CSV_Data/E-2027_historical_data.csv
Saved E-2028's data to /content/drive/MyDrive/BBAC_several_user/CSV_Data/E-2028_historical_data.csv
Saved E-2029's data to /content/drive/MyDrive/BBAC_several_user/CSV_Data/E-2029_historical_data.csv
Saved E-2030's data to /content/drive/MyDrive/BBAC_several_user/CSV_Data/E-2030_historical_data.csv
Saved E-2031's data to /content/drive/MyDrive/BBAC_several_user/CSV_Data/E-2031_historical_data.csv
Saved E-2032's data to /content/drive/MyDrive/BBAC_several_user/CSV_Data/E-2032_historical_data.csv
Saved E-2033's data to /content/drive/MyDrive/BBAC_several_user/CSV_Data/E-2033_historical_data.csv


Storage

In [None]:



# ======================
# IMPROVED DATASET GENERATION
# ======================

ROLE_DEPARTMENT_MAP = {
    "Doctor": ["Emergency", "ICU", "Radiology", "Surgery", "Pediatrics"],
    "Nurse": ["Emergency", "ICU", "Pediatrics", "Surgery"],
    "Receptionist": ["Emergency", "Records", "Admissions"],
    "Technician": ["Radiology", "Laboratory", "Pharmacy"],
    "Admin": ["Administration", "HR", "Records"]
}

SENSITIVE_ILLNESSES = ["HIV", "Mental Health Disorder", "Substance Abuse", "Cancer"]
REGULAR_ILLNESSES = ["Flu", "COVID-19", "Broken Bone", "Infection", "Hypertension", "Diabetes"]

def generate_employee_dataset(n):
    """Generate enhanced employee dataset with access permissions"""
    data = []
    for _ in range(n):
        role = random.choice(list(ROLE_DEPARTMENT_MAP.keys()))
        department = random.choice(ROLE_DEPARTMENT_MAP[role])

        # Enhanced permissions structure
        permissions = {
            "patient_data": {
                "access_level": "Full" if role == "Doctor" else "Limited",
                "allowed_operations": ["Read", "Update"] if role == "Doctor" else ["Read"],
                "sensitivity_access": ["High"] if role in ["Doctor", "Senior Nurse"] else ["Normal"]
            },
            "employee_data": {
                "access_level": "Admin" if role == "Admin" else "None",
                "allowed_operations": ["Read", "Write", "Update"] if role == "Admin" else []
            }
        }

        data.append({
            "Employee_ID": f"E{random.randint(1, 999):03d}",
            "Employee_Name": fake.name(),
            "Role": role,
            "Department": department,
            "Salary": random.randint(50000, 200000),
            "Address": fake.address().replace("\n", ", "),
            "Contact": fake.phone_number(),
            "Access_Permissions": permissions,
            "Hire_Date": datetime.now() - timedelta(days=random.randint(365, 365*5))
        })

    return pd.DataFrame(data)

def generate_patient_dataset(n):
    """Generate patient data with enhanced medical records"""
    data = []
    for _ in range(n):
        is_sensitive = random.random() < 0.2
        illness = random.choice(SENSITIVE_ILLNESSES if is_sensitive else REGULAR_ILLNESSES)

        data.append({
            "Patient_ID": f"P{random.randint(1, 999):03d}",
            "Patient_Name": fake.name(),
            "Gender": random.choice(["Male", "Female", "Other"]),
            "Age": random.randint(1, 100),
            "Address": fake.address().replace("\n", ", "),
            "Emergency_Contact": fake.phone_number(),
            "Primary_Diagnosis": illness,
            "Insurance_Type": random.choice(["Private", "Medicare", "Medicaid", "Self-Pay"]),
            "Admission_Date": datetime.now() - timedelta(days=random.randint(0, 30)),
            "Room_Number": random.randint(100, 500),
            "Data_Sensitivity": "High" if is_sensitive else "Normal",
            "Attending_Physician": f"E{random.randint(1, 999):03d}",
            "Treatment_Plan": random.choice(["Inpatient", "Outpatient"])
        })

    return pd.DataFrame(data)

# ======================
# DATA GENERATION & STORAGE
# ======================

# Generate datasets
employee_df = generate_employee_dataset(50)  # 50 employees
patient_df = generate_patient_dataset(200)   # 200 patients

# Save to Storage directory
employee_path = os.path.join(STORAGE_DIR, 'employee_records.csv')
patient_path = os.path.join(STORAGE_DIR, 'patient_records.csv')

employee_df.to_csv(employee_path, index=False)
patient_df.to_csv(patient_path, index=False)

print(f"Employee records saved to: {employee_path}")
print(f"Patient records saved to: {patient_path}")

# Optional: Add historical data generation from previous implementation
# (Include your historical data generation code here)

Employee records saved to: /content/drive/MyDrive/BBAC_several_user/Storage/employee_records.csv
Patient records saved to: /content/drive/MyDrive/BBAC_several_user/Storage/patient_records.csv


key pattern of historical dataset of several users

In [None]:
!pip install faiss-cpu # Installing faiss-cpu, appropriate for CPU-based systems. If you have a GPU, you can install faiss-gpu instead.
import os
import pandas as pd
import numpy as np
from datetime import datetime
from transformers import AutoTokenizer, AutoModel
import faiss
import json

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Configuration
CSV_DIR = '/content/drive/MyDrive/BBAC_several_user/CSV_Data'  # Path to your CSV data directory
OUTPUT_DIR = '/content/drive/MyDrive/BBAC_several_user/Embeddings'  # Where to save embeddings
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def embed_and_save(csv_file):
    """Embeds the data in the given CSV file and saves it with embeddings."""
    try:
        # Load the CSV data
        df = pd.read_csv(csv_file)

        # Create text representations for embedding
        df['log_text'] = df.apply(
            lambda x: f"On {x['timestamp'].split()[0]} at {x['timestamp'].split()[1]}, " +  # Assuming timestamp is in 'YYYY-MM-DD HH:MM:SS' format
                      f"{x['User Profile']} performed {x['requested_operation']} " +
                      f"on {x['requested_dataset']} from {x['location']} (IP: {x['ip_address']}), " +
                      f"MFA failures: {x['mfa_failures']}, " +
                      f"Risk score: {x['behavior_score']}",
            axis=1
        )

        # Generate embeddings
        embeddings = model.encode(df['log_text'].tolist(), show_progress_bar=True)
        df['embedding'] = list(embeddings)  # Add embeddings as a new column

        # Save the DataFrame with embeddings
        output_file = os.path.join(OUTPUT_DIR, os.path.basename(csv_file))  # Same filename, different directory
        df.to_csv(output_file, index=False)
        print(f"Saved embeddings to: {output_file}")

    except Exception as e:
        print(f"Error processing {csv_file}: {str(e)}")


# Process all CSV files
csv_files = [f for f in os.listdir(CSV_DIR) if f.endswith('_historical_data.csv')]
for csv_file in csv_files:
    file_path = os.path.join(CSV_DIR, csv_file)
    embed_and_save(file_path)

print("Embedding process completed.")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/BBAC_several_user/Embeddings/E-2024_historical_data.csv


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/BBAC_several_user/Embeddings/E-2027_historical_data.csv


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/BBAC_several_user/Embeddings/E-2025_historical_data.csv


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/BBAC_several_user/Embeddings/E-2033_historical_data.csv


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/BBAC_several_user/Embeddings/E-2026_historical_data.csv


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/BBAC_several_user/Embeddings/E-2031_historical_data.csv


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/BBAC_several_user/Embeddings/E-2029_historical_data.csv


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/BBAC_several_user/Embeddings/E-2032_historical_data.csv


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/BBAC_several_user/Embeddings/E-2030_historical_data.csv


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/BBAC_several_user/Embeddings/E-2028_historical_data.csv
Embedding process completed.


In [None]:
import pandas as pd

# Path to one of your embedded CSV files
embedding_file = '/content/drive/MyDrive/BBAC_several_user/Embeddings/E-2024_historical_data.csv'

# Load the data
embedded_df = pd.read_csv(embedding_file)

# Display the 'embedding' column for the first few rows
print(embedded_df[['log_text', 'embedding']].head())

                                            log_text  \
0  On 2023-10-01 at 07:54:00, Erin Smith performe...   
1  On 2023-10-02 at 07:51:00, Erin Smith performe...   
2  On 2023-10-03 at 07:56:00, Erin Smith performe...   
3  On 2023-10-04 at 07:45:00, Erin Smith performe...   
4  On 2023-10-05 at 07:47:00, Erin Smith performe...   

                                           embedding  
0  [-5.83653189e-02 -5.99935539e-02 -2.17473488e-...  
1  [-4.29775678e-02 -4.92793396e-02 -4.35683616e-...  
2  [-4.39620242e-02 -4.83754911e-02 -4.46096547e-...  
3  [-4.90748622e-02 -4.77450937e-02 -3.78167033e-...  
4  [-4.16683853e-02 -4.71078753e-02 -4.46829759e-...  


In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain<1.0.0,>=0.3.21 (from langchain-community)
  Downloading langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting langchain-text-spli

In [None]:
import os
import pandas as pd
import faiss
import numpy as np
from langchain.vectorstores import FAISS
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.schema import Document

# --- Vector Database with Hugging Face Embeddings ---
def create_vector_db_hf(csv_files_dir):
    """Store Hugging Face embeddings from multiple CSV files in a searchable vector database"""

    all_texts = []
    all_embeddings = []

    for filename in os.listdir(csv_files_dir):
        if filename.endswith(".csv"):
            filepath = os.path.join(csv_files_dir, filename)
            embedded_df = pd.read_csv(filepath)

            all_texts.extend(embedded_df['log_text'].tolist())
            all_embeddings.extend(embedded_df['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=',').astype('float32')).tolist())

    # Convert to numpy arrays for FAISS
    all_embeddings = np.array(all_embeddings)

    # Create FAISS index
    dimension = all_embeddings.shape[1]  # Get embedding dimension
    index = faiss.IndexFlatL2(dimension)  # Create FAISS index
    index.add(all_embeddings)  # Add embeddings

    # Create LangChain FAISS wrapper
    vector_db = FAISS(
        embedding_function=None,  # Not needed since we have precomputed embeddings
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={i: i for i in range(len(all_texts))}
    )

    # Add text metadata
    for i, text in enumerate(all_texts):
        vector_db.docstore.add({i: Document(page_content=text)})

    return vector_db

# Assuming your embedded CSV files are in the 'Embeddings' directory
EMBEDDINGS_DIR = '/content/drive/MyDrive/BBAC_several_user/Embeddings'  # Update with your actual path

# Create and persist vector DB
vector_db = create_vector_db_hf(EMBEDDINGS_DIR)

print("\n=== Vector Database Info ===")
print(f"Index size: {vector_db.index.ntotal}")

  all_embeddings.extend(embedded_df['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=',').astype('float32')).tolist())
  all_embeddings.extend(embedded_df['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=',').astype('float32')).tolist())
  all_embeddings.extend(embedded_df['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=',').astype('float32')).tolist())
  all_embeddings.extend(embedded_df['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=',').astype('float32')).tolist())
  all_embeddings.extend(embedded_df['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=',').astype('float32')).tolist())
  all_embeddings.extend(embedded_df['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=',').astype('float32')).tolist())
  all_embeddings.extend(embedded_df['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=',').astype('float32')).tolist())
  all_embeddings.extend(embedded_df['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=',').astype('float32')).to


=== Vector Database Info ===
Index size: 105


realtime user request

In [None]:
import os
import random
import pandas as pd
from datetime import datetime

# Configuration
CSV_DATA_DIR = '/content/drive/MyDrive/BBAC_several_user/CSV_Data'

def get_random_user_profile(csv_data_dir):
    """
    Randomly selects a user profile from the historical dataset in the CSV_Data folder.

    Args:
        csv_data_dir (str): Path to the directory containing historical data CSV files.

    Returns:
        dict: A randomly selected user profile.
    """
    # Load all historical data files
    historical_files = [f for f in os.listdir(csv_data_dir) if f.endswith('.csv')]

    if not historical_files:
        raise FileNotFoundError(f"No CSV files found in {csv_data_dir}")

    # Load all historical data into a single DataFrame
    historical_data = pd.concat(
        [pd.read_csv(os.path.join(csv_data_dir, f)) for f in historical_files],
        ignore_index=True
    )

    # Randomly select a user
    random_user = historical_data.sample(n=1).iloc[0]

    # Create user profile
    user_profile = {
        'employee_id': random_user['Employee ID'],
        'name': random_user['User Profile'],
        'role': random_user['Role'],
        'department': random_user['Department']
    }

    return user_profile

def simulate_real_time_request_from_profile(user_profile, csv_data_dir, is_suspicious=False):
    """
    Simulates a real-time access request by combining static attributes from the user's profile
    with dynamic attributes. If suspicious, it changes 2-3 attributes based on historical data.

    Args:
        user_profile (dict): Static user info (e.g., name, role, department, employee_id)
        csv_data_dir (str): Path to the directory containing historical data CSV files.
        is_suspicious (bool): Flag to indicate if the dynamic attributes should simulate malicious behavior

    Returns:
        dict: Combined request with both static and dynamic fields.
    """
    # Static attributes
    request = {
        'employee_id': user_profile.get('employee_id'),
        'name': user_profile.get('name'),
        'role': user_profile.get('role'),
        'department': user_profile.get('department'),
        'dataset': 'Patient Records'  # Assuming the dataset is fixed for now
    }

    # Dynamic attributes (default values for normal behavior)
    dynamic_attributes = {
        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'requested_operation': random.choice(['Read', 'Update']),
        'access_outcome': 'Granted',
        'mfa_failures': random.choices([0, 1], weights=[0.95, 0.05])[0],
        'ip_address': '10.1.1.15 (Internal)',
        'location': f"Main Campus - {user_profile.get('department')}",
        'behavior_score': round(random.uniform(0.0, 0.3), 2)  # Normal behavior score
    }

    if is_suspicious:
        # Load all historical data files
        historical_files = [f for f in os.listdir(csv_data_dir) if f.endswith('.csv')]

        if not historical_files:
            print(f"No CSV files found in {csv_data_dir}")
        else:
            # Load all historical data into a single DataFrame
            historical_data = pd.concat(
                [pd.read_csv(os.path.join(csv_data_dir, f)) for f in historical_files],
                ignore_index=True
            )

            # Filter for the specific user's data
            user_data = historical_data[historical_data['Employee ID'] == user_profile.get('employee_id')]

            if not user_data.empty:
                # Filter for suspicious behavior (high behavior_score)
                suspicious_data = user_data[user_data['behavior_score'] >= 0.5]

                if not suspicious_data.empty:
                    # Randomly select a row for suspicious behavior inspiration
                    suspicious_row = suspicious_data.sample(n=1).iloc[0]

                    # Randomly change 2-3 attributes
                    attributes_to_change = random.sample([
                        'timestamp', 'requested_operation', 'access_outcome',
                        'mfa_failures', 'ip_address', 'location'
                    ], k=random.randint(2, 3))

                    for attr in attributes_to_change:
                        dynamic_attributes[attr] = suspicious_row[attr]

                    # Increase behavior score for suspicious requests
                    dynamic_attributes['behavior_score'] = round(random.uniform(0.7, 1.0), 2)
                else:
                    print(f"No suspicious historical data found for {user_profile.get('employee_id')}")
            else:
                print(f"No historical data found for {user_profile.get('employee_id')}")

    # Combine static and dynamic attributes
    request.update(dynamic_attributes)
    return request

# Example Usage
if __name__ == "__main__":
    # Get a random user profile
    try:
        user_profile = get_random_user_profile(CSV_DATA_DIR)
        print(f"Selected User: {user_profile}")

        # Simulate a normal request
        normal_request = simulate_real_time_request_from_profile(user_profile, CSV_DATA_DIR)
        print("\n=== Normal Request ===")
        print(normal_request)

        # Simulate a suspicious request
        suspicious_request = simulate_real_time_request_from_profile(user_profile, CSV_DATA_DIR, is_suspicious=True)
        print("\n=== Suspicious Request ===")
        print(suspicious_request)
    except Exception as e:
        print(f"Error: {str(e)}")

Selected User: {'employee_id': 'E-2029', 'name': 'Gregory Middleton', 'role': 'Doctor', 'department': 'Neurology'}

=== Normal Request ===
{'employee_id': 'E-2029', 'name': 'Gregory Middleton', 'role': 'Doctor', 'department': 'Neurology', 'dataset': 'Patient Records', 'timestamp': '2025-03-19 10:20:49', 'requested_operation': 'Update', 'access_outcome': 'Granted', 'mfa_failures': 0, 'ip_address': '10.1.1.15 (Internal)', 'location': 'Main Campus - Neurology', 'behavior_score': 0.04}
No suspicious historical data found for E-2029

=== Suspicious Request ===
{'employee_id': 'E-2029', 'name': 'Gregory Middleton', 'role': 'Doctor', 'department': 'Neurology', 'dataset': 'Patient Records', 'timestamp': '2025-03-19 10:20:49', 'requested_operation': 'Read', 'access_outcome': 'Granted', 'mfa_failures': 0, 'ip_address': '10.1.1.15 (Internal)', 'location': 'Main Campus - Neurology', 'behavior_score': 0.09}


In [None]:
# ... (Import necessary libraries and define get_random_user_profile and simulate_real_time_request_from_profile functions if they're not already defined) ...

# --- Step 2: Convert the request to a textual query ---
def request_to_text(request):
    """Converts a request dictionary into a textual representation for similarity search."""
    return (f"On {request['timestamp'].split()[0]} at {request['timestamp'].split()[1]}, "  # Assuming timestamp is in 'YYYY-MM-DD HH:MM:SS' format
            f"{request['name']} ({request['role']} in {request['department']}) performed {request['requested_operation']} "
            f"on {request['dataset']} from {request['location']} (IP: {request['ip_address']}), "
            f"MFA failures: {request['mfa_failures']}, "
            f"Risk score: {request['behavior_score']}")


# Assuming you have normal_request and suspicious_request from the previous step
# If not, you'll need to define them or load them from where they were stored

# Convert normal_request to text
normal_request_text = request_to_text(normal_request)
print("\n=== Normal Request Text ===")
print(normal_request_text)

# Convert suspicious_request to text (if needed)
suspicious_request_text = request_to_text(suspicious_request)
print("\n=== Suspicious Request Text ===")
print(suspicious_request_text)


=== Normal Request Text ===
On 2025-03-19 at 10:20:49, Gregory Middleton (Doctor in Neurology) performed Update on Patient Records from Main Campus - Neurology (IP: 10.1.1.15 (Internal)), MFA failures: 0, Risk score: 0.04

=== Suspicious Request Text ===
On 2025-03-19 at 10:20:49, Gregory Middleton (Doctor in Neurology) performed Read on Patient Records from Main Campus - Neurology (IP: 10.1.1.15 (Internal)), MFA failures: 0, Risk score: 0.09


In [None]:
query_embedding = embedding_model.encode(query_text)
print(f"Query embedding shape: {query_embedding.shape}")  # Should be (embedding_dimension,)
query_embedding = query_embedding.reshape(1, -1)  # Reshape to (1, embedding_dimension)
print(f"Reshaped query embedding shape: {query_embedding.shape}")  # Should be (1, embedding_dimension)

Query embedding shape: (384,)
Reshaped query embedding shape: (1, 384)


In [None]:
import faiss

# Initialize FAISS index with the correct dimension
embedding_dim = 384  # Dimension for 'sentence-transformers/all-MiniLM-L6-v2'
vector_db.index = faiss.IndexFlatL2(embedding_dim)  # Reinitialize the index

In [None]:
print(f"FAISS index dimension: {vector_db.index.d}")  # Should match embedding_dimension

FAISS index dimension: 384


In [None]:
query_embedding = query_embedding.astype('float32')
print(f"Query embedding dtype: {query_embedding.dtype}")  # Should be float32

Query embedding dtype: float32


In [None]:
# --- Step 1: Initialize FAISS Index ---
embedding_dim = 384  # Dimension for 'all-MiniLM-L6-v2'
vector_db.index = faiss.IndexFlatL2(embedding_dim)

# --- Step 2: Add Embeddings to Index ---
# Replace with your actual embeddings (shape: [num_entries, 384])
historical_embeddings = np.random.rand(100, 384).astype('float32')  # Example
vector_db.index.add(historical_embeddings)

# --- Step 3: Query Embedding ---
query_embedding = embedding_model.encode(query_text).reshape(1, -1).astype('float32')

# --- Step 4: Search ---
distances, indices = vector_db.index.search(query_embedding, k=3)
print("Search results:", indices)

Search results: [[ 5 39 38]]


In [None]:
# After initializing the index
print(f"Index dimension: {vector_db.index.d}")  # Should output 384

# After adding embeddings
print(f"Number of entries in index: {vector_db.index.ntotal}")  # Should match your data

Index dimension: 384
Number of entries in index: 100


In [None]:
# --- Step 3: Embed the query using the same sentence transformer model ---
from sentence_transformers import SentenceTransformer
import numpy as np

# Ensure you use the SAME model used to create the vector database
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Choose the query text (either normal_request_text or suspicious_request_text)
query_text = normal_request_text  # Or suspicious_request_text for suspicious behavior
query_embedding = embedding_model.encode(query_text)

# Verify query embedding shape and reshape
print(f"Query embedding shape: {query_embedding.shape}")  # Should be (embedding_dimension,)
query_embedding = query_embedding.reshape(1, -1)  # Reshape to (1, embedding_dimension)
print(f"Reshaped query embedding shape: {query_embedding.shape}")  # Should be (1, embedding_dimension)

# Ensure data type is float32
query_embedding = query_embedding.astype('float32')
print(f"Query embedding dtype: {query_embedding.dtype}")  # Should be float32

# --- Step 4: Retrieve similar historical entries from FAISS vector DB ---
# Assuming 'vector_db' is your FAISS vector database created earlier
k = 3  # Number of similar entries to retrieve

# Verify FAISS index dimension
print(f"FAISS index dimension: {vector_db.index.d}")  # Should match embedding_dimension

# Perform the search
distances, indices = vector_db.index.search(query_embedding, k=k)

# --- Step 5: Retrieve the matching log texts using the docstore ---
retrieved_logs = []
for idx in indices[0]:
    # Access the document from the docstore
    if 0 <= idx < len(vector_db.docstore._dict):  # Check if index is valid
        doc = vector_db.docstore._dict.get(idx)
        if doc is not None:
            retrieved_logs.append({
                'log': doc.page_content,
                'similarity': 1 - distances[0][indices[0].tolist().index(idx)]  # Calculate similarity
            })

# --- Step 6: Display or process the retrieved logs ---
for log_data in retrieved_logs:
    print(f"Log: {log_data['log']}")
    print(f"Similarity: {log_data['similarity']:.4f}\n")

Query embedding shape: (384,)
Reshaped query embedding shape: (1, 384)
Query embedding dtype: float32
FAISS index dimension: 384
Log: On 2023-10-06 at 08:08:00, Erin Smith performed Read on Patient Records from Main Campus - Neurology (IP: 10.1.1.15 (Internal)), MFA failures: 0, Risk score: 0.01
Similarity: -112.5336

Log: On 2023-10-08 at 07:46:00, Mr. Sean Adams DVM performed Read on Patient Records from Main Campus - Cardiology (IP: 10.1.1.15 (Internal)), MFA failures: 0, Risk score: 0.0
Similarity: -116.4303

Log: On 2023-10-07 at 07:59:00, Mr. Sean Adams DVM performed Read on Patient Records from Main Campus - Cardiology (IP: 10.1.1.15 (Internal)), MFA failures: 0, Risk score: 0.0
Similarity: -117.3052



In [None]:
# --- Step 6: (Optional) Combine retrieved context with a generative model ---
# In a full RAG pipeline, you would now feed the query_text along with the retrieved logs into a language model
# to generate a natural language analysis. For this example, we simply print the retrieved logs.
print("\n=== Retrieved Historical Entries ===")
for i, entry in enumerate(retrieved_logs, 1):
    print(f"\nMatch #{i} (Similarity: {entry['similarity']:.2f}):")
    print(entry['log'])

# Now, you have compared the normal request with historical data using the RAG approach:
# - The query_text represents the new request.
# - The FAISS vector search retrieves similar historical log entries.
# - In a complete RAG system, the retrieved context and query can be fed into a language model
#   to generate an explanation or alert if the new behavior is anomalous.


=== Retrieved Historical Entries ===

Match #1 (Similarity: -112.53):
On 2023-10-06 at 08:08:00, Erin Smith performed Read on Patient Records from Main Campus - Neurology (IP: 10.1.1.15 (Internal)), MFA failures: 0, Risk score: 0.01

Match #2 (Similarity: -116.43):
On 2023-10-08 at 07:46:00, Mr. Sean Adams DVM performed Read on Patient Records from Main Campus - Cardiology (IP: 10.1.1.15 (Internal)), MFA failures: 0, Risk score: 0.0

Match #3 (Similarity: -117.31):
On 2023-10-07 at 07:59:00, Mr. Sean Adams DVM performed Read on Patient Records from Main Campus - Cardiology (IP: 10.1.1.15 (Internal)), MFA failures: 0, Risk score: 0.0


In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.chains import RetrievalQA

# Initialize LLM (Modify this based on your OpenAI API key or local LLM)
llm = ChatOpenAI(model_name="gpt-4", temperature=0.2)

# Define Prompt for RAG-based evaluation
prompt_template = PromptTemplate(
    input_variables=["normal_request", "retrieved_logs"],
    template="""
    You are an AI-based access control evaluator. Your task is to determine if a new access request aligns with historical access patterns.

    Normal Request:
    {normal_request}

    Retrieved Historical Entries:
    {retrieved_logs}

    Question: Based on the historical entries, should this request be granted? Provide reasoning.
    """
)

def evaluate_access_rag(normal_request, retrieved_logs):
    """
    Uses RAG to evaluate access based on historical logs.

    Parameters:
        normal_request (str): The real-time access request.
        retrieved_logs (list): Retrieved historical logs from FAISS.

    Returns:
        str: AI decision and reasoning.
    """
    # Format historical logs for the LLM
    formatted_logs = "\n".join([f"Log: {entry['log']} | Similarity: {entry['similarity']:.2f}" for entry in retrieved_logs])

    # Construct input prompt
    query = prompt_template.format(normal_request=normal_request, retrieved_logs=formatted_logs)

    # Use LLM to generate a decision
    response = llm.predict(query)

    return response

# Example usage:
normal_request = "Dr. Smith is requesting access to patient records at 3 PM from the hospital network."
retrieved_logs = [
    {'log': "Dr. Smith accessed patient records from hospital at 2:50 PM.", 'similarity': 0.90},
    {'log': "Dr. Smith checked ICU records at


SyntaxError: unterminated string literal (detected at line 51) (<ipython-input-40-f8374b729d8d>, line 51)