In [None]:
# Mount Google Drive at /content/drive. The `google.colab` module is used here,
# so this cell is expected to run inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import random
import datetime
import pandas as pd

**Step 1:** Generate Historical Data. Simulate a dataset that contains historical user behavior information. For example, each record stores the employee’s role, department, access timestamp, number of MFA rejections (which we use as a signal for anomaly), and feedback from the last access. We also assign a “behavior score” to indicate risk.

In [None]:
!pip install faker



In [None]:
from faker import Faker

In [None]:
# Shared Faker generator; it supplies the synthetic person names below.
fake = Faker()

def generate_hospital_access_dataset(n):
    """
    Build a pandas DataFrame holding ``n`` simulated hospital access-log records.

    Each record pairs a Faker-generated name with randomly drawn values:
    role, department, access timestamp (1-30 days plus 0-23 hours before the
    current time), access outcome, MFA failure count (0-3), feedback label,
    IP address, location, a behavior score and an ID. Staff roles receive an
    "E###" ID; Patient and Visitor receive a "P###" ID.
    """
    staff_roles = ["Doctor", "Nurse", "Receptionist", "Technician", "Admin"]
    role_pool = staff_roles + ["Patient", "Visitor"]
    department_pool = ["Emergency", "ICU", "Radiology", "Surgery", "Pediatrics", "Pharmacy", "Laboratory", "Records"]
    ip_pool = [
        "10.1.1.10 (Internal)",
        "10.1.1.20 (Internal)",
        "192.168.2.15 (Internal)",
        "203.0.113.5 (External)",
        "198.51.100.25 (External)",
    ]
    site_pool = ["Main Campus", "North Wing", "South Wing", "East Wing", "West Wing"]
    feedback_pool = ["Normal", "After-hours access", "Multiple MFA failures", "Unusual access location"]

    records = []
    for _ in range(n):
        person = fake.name()
        role = random.choice(role_pool)
        dept = random.choice(department_pool)

        # Shift "now" back by a random number of days and hours.
        moment = datetime.datetime.now()
        moment -= datetime.timedelta(days=random.randint(1, 30), hours=random.randint(0, 23))

        outcome = random.choice(["Allowed", "Denied"])
        failed_mfa = random.randint(0, 3)
        ip = random.choice(ip_pool)
        site = random.choice(site_pool)
        note = random.choice(feedback_pool)

        # A denied request, two or more MFA failures, or an external IP widens
        # the score range to 0.1-1.0; otherwise it is capped at 0.5.
        risky = outcome == "Denied" or failed_mfa >= 2 or "External" in ip
        score_ceiling = 1.0 if risky else 0.5
        score = round(random.uniform(0.1, score_ceiling), 2)

        # "E" prefix for staff roles, "P" for patients and visitors.
        prefix = "E" if role in staff_roles else "P"
        person_id = f"{prefix}{random.randint(100, 999)}"

        records.append([person_id, person, role, dept, moment, outcome, failed_mfa, note, ip, site, score])

    return pd.DataFrame(
        records,
        columns=[
            "Employee_ID", "Name", "Role", "Department", "Access_Timestamp", "Access_Outcome",
            "MFA_Failure_Count", "Feedback", "IP_Address", "Location", "Behavior_Score",
        ],
    )

# Simulate 15 hospital access-log records and print them as a plain-text table
hospital_df = generate_hospital_access_dataset(n=15)

print("\nHospital Access Data:", hospital_df.to_string(index=False), sep="\n")



Hospital Access Data:
Employee_ID               Name         Role Department           Access_Timestamp Access_Outcome  MFA_Failure_Count                Feedback               IP_Address    Location  Behavior_Score
       E159        Jesse Smith Receptionist        ICU 2025-03-03 07:05:04.501771        Allowed                  0 Unusual access location  192.168.2.15 (Internal) Main Campus            0.21
       E944      Tracy Russell        Nurse    Records 2025-03-08 08:05:04.502194        Allowed                  0      After-hours access     10.1.1.10 (Internal)   East Wing            0.18
       E790    Jennifer Bowers        Admin   Pharmacy 2025-02-10 23:05:04.502523        Allowed                  0   Multiple MFA failures  192.168.2.15 (Internal)  North Wing            0.32
       P494     Kaylee Sherman      Visitor  Emergency 2025-02-21 08:05:04.502813        Allowed                  1                  Normal  192.168.2.15 (Internal)   East Wing            0.15
       E587 

**Step 2:** Create the datasets for access: a simulated employee dataset and a simulated patient dataset.

In [None]:


fake = Faker()

def generate_employee_dataset(n):
    """
    Generate a simulated employee dataset for a hospital using Faker.

    Each record contains personal and professional details of an employee:
    ID, name, role, department, salary, address and phone number.

    Employee IDs are drawn without replacement, so every row has a distinct
    "E###" identifier. The ID pool is 1-999; if more than 999 records are
    requested the pool grows to ``n`` so uniqueness still holds (IDs then
    exceed three digits).

    Args:
        n: Number of employee records to generate. Values <= 0 yield an
            empty DataFrame with the expected columns.

    Returns:
        pandas.DataFrame with columns Employee_ID, Employee_Name,
        Employee_Role, Employee_Department, Employee_Salary,
        Employee_Address, Employee_Phone.
    """
    roles = ["Doctor", "Nurse", "Receptionist", "Technician", "Admin"]
    departments = ["Emergency", "ICU", "Radiology", "Surgery", "Pediatrics", "Pharmacy", "Laboratory", "Records"]

    record_count = max(n, 0)
    # Independent randint() draws can repeat; sampling without replacement
    # keeps Employee_ID usable as a unique key.
    id_numbers = random.sample(range(1, max(999, record_count) + 1), record_count)

    data = []
    for id_number in id_numbers:
        employee_id = f"E{id_number:03d}"
        employee_name = fake.name()
        employee_role = random.choice(roles)
        employee_department = random.choice(departments)
        employee_salary = random.randint(50000, 200000)
        employee_address = fake.address()
        employee_phone = fake.phone_number()

        data.append([
            employee_id, employee_name, employee_role, employee_department,
            employee_salary, employee_address, employee_phone
        ])

    columns = [
        "Employee_ID", "Employee_Name", "Employee_Role", "Employee_Department",
        "Employee_Salary", "Employee_Address", "Employee_Phone"
    ]
    return pd.DataFrame(data, columns=columns)

def generate_patient_dataset(n):
    """
    Generate a simulated patient dataset for a hospital using Faker.

    Each record contains personal and clinical details of a patient: ID,
    name, gender, age, address, phone, illness, insurance type, admission
    date (0-30 days before the current time) and room number.

    Patient IDs are drawn without replacement, so every row has a distinct
    "P###" identifier. The ID pool is 1-999; if more than 999 records are
    requested the pool grows to ``n`` so uniqueness still holds (IDs then
    exceed three digits).

    Args:
        n: Number of patient records to generate. Values <= 0 yield an
            empty DataFrame with the expected columns.

    Returns:
        pandas.DataFrame with columns Patient_ID, Patient_Name,
        Patient_Gender, Patient_Age, Patient_Address, Patient_Phone,
        Patient_Illness, Patient_Insurance_Type, Patient_Admission_Date,
        Patient_Room_Number.
    """
    illnesses = ["Flu", "COVID-19", "Broken Bone", "Infection", "Hypertension", "Diabetes", "Cardiac Arrest"]
    insurance_types = ["Private", "Medicare", "Medicaid", "Self-Pay", "Other"]

    record_count = max(n, 0)
    # Independent randint() draws can repeat; sampling without replacement
    # keeps Patient_ID usable as a unique key.
    id_numbers = random.sample(range(1, max(999, record_count) + 1), record_count)

    data = []
    for id_number in id_numbers:
        patient_id = f"P{id_number:03d}"
        patient_name = fake.name()
        patient_gender = random.choice(["Male", "Female"])
        patient_age = random.randint(1, 100)
        patient_address = fake.address()
        patient_phone = fake.phone_number()
        patient_illness = random.choice(illnesses)
        patient_insurance = random.choice(insurance_types)
        admission_date = datetime.datetime.now() - datetime.timedelta(days=random.randint(0, 30))
        room_number = random.randint(100, 500)

        data.append([
            patient_id, patient_name, patient_gender, patient_age,
            patient_address, patient_phone, patient_illness, patient_insurance,
            admission_date, room_number
        ])

    columns = [
        "Patient_ID", "Patient_Name", "Patient_Gender", "Patient_Age",
        "Patient_Address", "Patient_Phone", "Patient_Illness", "Patient_Insurance_Type",
        "Patient_Admission_Date", "Patient_Room_Number"
    ]
    return pd.DataFrame(data, columns=columns)

# Build 20 employee rows and 20 patient rows, then print each table
employee_df = generate_employee_dataset(20)
patient_df = generate_patient_dataset(20)

print("\nEmployee Data", employee_df.to_string(index=False), sep="\n")

print("\nPatient Data", patient_df.to_string(index=False), sep="\n")



Employee Data
Employee_ID     Employee_Name Employee_Role Employee_Department  Employee_Salary                                         Employee_Address         Employee_Phone
       E943    Jessica Murray  Receptionist           Emergency           178792                 0304 Jennifer Lakes\nSarahview, UT 66084             4229425456
       E955       John Wilcox    Technician             Surgery           113860                             USNS Hernandez\nFPO AE 34195          (690)250-4851
       E962    Timothy Gordon         Nurse           Radiology           100151       240 Jennifer Mill Apt. 448\nTimothyburgh, VI 40486    +1-953-589-9804x759
       E976       Cody Cherry  Receptionist          Laboratory           125527                 520 Bishop Manors\nWilliambury, IN 74063             7142552473
       E501    Michael Savage         Admin                 ICU            52783          1207 Torres Cape Suite 385\nWoodmouth, GU 24957     957.467.6858x94244
       E294 Alexand

**Step 3:** Convert Historical Data to Embeddings. To use the LLM and perform similarity searches, we convert each historical record into a text summary and then create an embedding (a numerical vector) from that text using a SentenceTransformer model. We then store all the embeddings in a FAISS index (FAISS is a vector similarity-search library) for efficient similarity search.

In [None]:
!pip install -q sentence_transformers
from sentence_transformers import SentenceTransformer

In [None]:
!pip install -q faiss-cpu
import faiss
import numpy as np

# Dimensionality of the vectors the index will hold. It has to equal the output
# size of the SentenceTransformer model used to encode the records.
embedding_dim = 768  # Most BERT-based models use 768 dimensions

# Flat FAISS index that compares vectors by L2 (Euclidean) distance.
# NOTE(review): the embedding cell further down rebinds `faiss_index` to an
# IndexFlatIP before any vectors are added, so this L2 index appears to be
# unused — confirm whether this cell is still needed.
faiss_index = faiss.IndexFlatL2(embedding_dim)  # L2 distance (Euclidean)

In [None]:
def create_text_summary(employee_id, role, department, mfa_failures, feedback, ip_address, location, behavior_score, access_outcome, access_timestamp=None):
    """
    Generate a structured text summary focused on security patterns.

    Args:
        employee_id: Accepted for interface compatibility; not included in the summary.
        role: Role label written into the summary.
        department: Department label written into the summary.
        mfa_failures: MFA failure count written into the summary.
        feedback: Accepted for interface compatibility; not included in the summary.
        ip_address: IP description; classified as "External" when it contains
            the substring "External", otherwise "Internal".
        location: Location label written into the summary.
        behavior_score: Numeric score; values above 0.5 are labelled High-Risk.
        access_outcome: Outcome label written into the summary.
        access_timestamp: Optional datetime-like object with an ``hour``
            attribute. When given, the time context is derived from the
            record's own access time. When omitted, the current wall-clock
            time is used, which preserves the previous behavior.

    Returns:
        A single-line, pipe-separated summary string.
    """
    # Classify by the record's access time when available. Falling back to
    # "now" labels every record with the time the notebook happens to run.
    reference_time = access_timestamp if access_timestamp is not None else datetime.datetime.now()
    # Hours 08:00-17:59 count as normal working hours.
    time_context = "Normal Hours" if 8 <= reference_time.hour < 18 else "After-Hours"
    ip_type = "External" if "External" in ip_address else "Internal"
    risk_label = "High-Risk" if behavior_score > 0.5 else "Low-Risk"
    risk_context = f"{risk_label} (Score: {behavior_score:.2f})"

    return (
        f"Role: {role} | Department: {department} | "
        f"Access Outcome: {access_outcome} | "
        f"MFA Failures: {mfa_failures} | IP Type: {ip_type} | "
        f"Location: {location} | Time: {time_context} | "
        f"Risk: {risk_context}"
    )

In [None]:
# Load the SentenceTransformer model (e.g., 'all-mpnet-base-v2')
embedding_model = SentenceTransformer('all-mpnet-base-v2')

# Size the index from the loaded model instead of hard-coding 768, so swapping
# the model cannot silently produce a dimension mismatch.
embedding_dim = embedding_model.get_sentence_embedding_dimension()
# Inner Product index: equals cosine similarity when vectors are L2-normalized
faiss_index = faiss.IndexFlatIP(embedding_dim)

# One text summary per historical record, in row order
record_summaries = [
    create_text_summary(
        row.Employee_ID, row.Role, row.Department,
        row.MFA_Failure_Count, row.Feedback,
        row.IP_Address, row.Location,
        row.Behavior_Score, row.Access_Outcome
    )
    for row in hospital_df.itertuples(index=False)
]

# Encode all summaries in one batched call rather than one model call per row
raw_embeddings = embedding_model.encode(record_summaries, convert_to_numpy=True)

# Keep the raw (un-normalized) vector for each record alongside its row
hospital_df["Embedding"] = pd.Series(list(raw_embeddings), index=hospital_df.index)

# Normalize a float32 copy before adding to FAISS; normalize_L2 works in place,
# so the copy keeps the vectors stored in the DataFrame untouched.
embeddings = np.array(raw_embeddings, dtype="float32", order="C")
faiss.normalize_L2(embeddings)  # Critical for cosine similarity
faiss_index.add(embeddings)
print("\nFAISS index created with historical embeddings.")


FAISS index created with historical embeddings.


**Process a Real-Time Access Request:** When a new user requests access, we capture the request details (name, role, department, timestamp, MFA failures, IP address, location, etc.), convert this information into a text summary, and generate an embedding from it. We then use the FAISS index to retrieve the most similar historical records, which provide context on typical behavior.

In [None]:
def process_realtime_request(realtime_request, historical_df, faiss_index, embedding_model):
    """
    Processes real-time access requests with security context awareness.

    Args:
        realtime_request: Mapping that must contain the keys 'Employee_ID',
            'Role', 'Department', 'Geo_Location',
            'IP_Address & Network_Info' and 'MFA_Failure_Count'.
        historical_df: DataFrame of historical records with a
            'Behavior_Score' column. Rows are looked up by position, so row
            i is assumed to correspond to vector i in ``faiss_index``
            (NOTE(review): confirm against how the index was built).
        faiss_index: FAISS index searched for nearest neighbours.
        embedding_model: Object with an ``encode(list_of_str)`` method.

    Returns:
        Tuple ``(security_context, similar_records, risk_status)``:
        the text summary of the request, a DataFrame of the matched
        historical rows with an added 'Similarity_Score' column (empty when
        nothing matched or on error), and one of "HIGH RISK", "MEDIUM RISK",
        "LOW RISK", "No similar records found" or "Error in processing".

    Errors (including a missing required field) are printed and reported via
    the "Error in processing" status rather than raised.
    """
    # Bound before the try block so the error path can always return it, even
    # when validation fails before the summary has been built.
    security_context = ""
    try:
        # 1. Validate input structure
        required_fields = ['Employee_ID', 'Role', 'Department', 'Geo_Location',
                          'IP_Address & Network_Info', 'MFA_Failure_Count']
        for field in required_fields:
            if field not in realtime_request:
                raise ValueError(f"Missing required field: {field}")

        # 2. Create security-focused text summary
        security_context = (
            f"Role: {realtime_request['Role']} | "
            f"Department: {realtime_request['Department']} | "
            f"Location: {realtime_request['Geo_Location']} | "
            f"Network: {realtime_request['IP_Address & Network_Info']} | "
            f"MFA Failures: {realtime_request['MFA_Failure_Count']}"
        )

        # 3. Generate and normalize embedding
        embedding = embedding_model.encode([security_context])
        embedding = np.array(embedding).astype("float32")
        faiss.normalize_L2(embedding)  # For cosine similarity

        # 4. Similarity search with safety checks: never ask for more
        #    neighbours than either the DataFrame or the index can supply.
        k = min(3, len(historical_df), faiss_index.ntotal)
        if k <= 0:
            return security_context, pd.DataFrame(), "No similar records found"
        distances, indices = faiss_index.search(embedding, k)

        # 5. Process results. FAISS pads missing neighbours with -1; drop
        #    those and any position that falls outside the DataFrame.
        neighbour_ids = indices[0]
        usable = (neighbour_ids >= 0) & (neighbour_ids < len(historical_df))
        if not usable.any():
            return security_context, pd.DataFrame(), "No similar records found"

        similar_records = historical_df.iloc[neighbour_ids[usable]].copy()
        scores = distances[0][usable]
        if faiss_index.metric_type == faiss.METRIC_INNER_PRODUCT:
            # Inner product of unit vectors already is the cosine similarity.
            similarity = scores
        else:
            # L2 indexes report squared distance; for unit vectors
            # cos = 1 - d^2 / 2 (assumes the indexed vectors were normalized).
            similarity = 1 - scores / 2
        similar_records['Similarity_Score'] = similarity

        # 6. Risk assessment from the mean behavior score of the neighbours
        avg_risk = similar_records['Behavior_Score'].mean()
        risk_status = "HIGH RISK" if avg_risk > 0.7 else "MEDIUM RISK" if avg_risk > 0.4 else "LOW RISK"

        return security_context, similar_records, risk_status

    except Exception as e:
        print(f"Error processing request: {str(e)}")
        return security_context, pd.DataFrame(), "Error in processing"

# Example usage
realtime_request = {
    "Employee_ID": "E10",
    "Role": "IT Specialist",
    "Department": "Information Technology",
    "Geo_Location": "Berlin Campus",
    "IP_Address & Network_Info": "External (203.0.113.42)",
    "MFA_Failure_Count": 2
}

# Search against the historical access log whose embeddings were added to
# `faiss_index` in the embedding cell above. Row positions in `hospital_df`
# match the order in which the vectors were added. Creating a fresh, empty
# index here would discard those vectors and every search would come back
# with "No similar records found".
historical_df = hospital_df

# Process request
context, records, risk = process_realtime_request(
    realtime_request,
    historical_df,
    faiss_index,
    embedding_model
)

# Output results
print("\n🔒 Security Context:")
print(context)

if not records.empty:
    print("\n📋 Similar Historical Records:")
    print(records[["Employee_ID", "Access_Timestamp", "MFA_Failure_Count",
                 "Behavior_Score", "Similarity_Score"]].to_string(index=False))

print(f"\n🚨 Risk Assessment: {risk}")


🔒 Security Context:
Role: IT Specialist | Department: Information Technology | Location: Berlin Campus | Network: External (203.0.113.42) | MFA Failures: 2

🚨 Risk Assessment: No similar records found
