In [9]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

# Initialize faker for realistic data generation
fake = Faker()

# Set random seeds for reproducibility.
# Three independent generators are used in this file (numpy, the stdlib
# `random` module, and Faker's own generator), so each one is seeded
# explicitly; seeding only the first two leaves every `fake.*` call unseeded.
# NOTE(review): timestamps are generated relative to 'now', so they will
# presumably still shift with the wall clock between runs.
np.random.seed(42)
random.seed(42)
Faker.seed(42)

# =============================================
# Generate Node Data (Data Assets)
# =============================================

def generate_node_data(num_nodes=10000):
    """Generate a synthetic table of data-asset nodes.

    Every node gets a random asset type, a log-normally distributed size,
    three timestamps (each drawn between the previous one and 'now'), a
    sensitivity score derived from its type, a Poisson-distributed access
    frequency and three categorical attributes.

    Relies on the module-level ``fake`` (Faker), ``random`` and ``np``
    generators, so results depend on how those were seeded by the caller.

    Args:
        num_nodes: Number of rows to generate. Node ids are
            ``0 .. num_nodes - 1``.

    Returns:
        pandas.DataFrame with one row per node and the columns ``node_id``,
        ``type``, ``size``, ``sensitivity_score``, ``created_date``,
        ``last_modified``, ``last_accessed``, ``access_frequency``,
        ``owner_department``, ``storage_location``, ``encryption_status``
        and ``sensitive_label``. Timestamps are strings formatted as
        ``'%Y-%m-%d %H:%M:%S'``. The columns are present even when
        ``num_nodes`` is 0.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    # Output schema, spelled out so an empty result still carries its columns.
    node_columns = [
        'node_id', 'type', 'size', 'sensitivity_score', 'created_date',
        'last_modified', 'last_accessed', 'access_frequency',
        'owner_department', 'storage_location', 'encryption_status',
        'sensitive_label',
    ]

    # Baseline sensitivity (0-100) per asset type. Built once instead of on
    # every iteration; it is also the single source of truth for the list of
    # asset types (dict order is the order the types are sampled from).
    base_sensitivity_by_type = {
        'employee_record': 80,
        'financial_report': 90,
        'customer_data': 85,
        'source_code': 75,
        'marketing_material': 30,
        'meeting_minutes': 60,
        'product_spec': 50,
        'contract': 70,
        'email': 40,
        'database_backup': 95,
    }
    asset_types = list(base_sensitivity_by_type)

    departments = (
        'HR', 'Finance', 'Engineering', 'Marketing', 'Legal', 'Operations'
    )
    storage_locations = (
        'Cloud_Storage_A', 'Cloud_Storage_B', 'OnPrem_Server_1',
        'OnPrem_Server_2', 'Endpoint_Device'
    )
    encryption_states = ('encrypted', 'unencrypted')

    nodes = []
    for node_id in range(num_nodes):
        # Randomly select asset type
        asset_type = random.choice(asset_types)

        # File size in bytes
        size = int(np.random.lognormal(mean=8, sigma=2))

        # Each timestamp starts where the previous one ended.
        created_date = fake.date_time_between(start_date='-2y', end_date='now')
        last_modified = fake.date_time_between(start_date=created_date, end_date='now')
        last_accessed = fake.date_time_between(start_date=last_modified, end_date='now')

        # Type baseline plus +/-15 of jitter, clamped to the 0-100 scale.
        sensitivity_score = base_sensitivity_by_type[asset_type] + random.randint(-15, 15)
        sensitivity_score = max(0, min(100, sensitivity_score))

        # Higher access frequency for less sensitive data. The rate stays
        # at or above 30 because the score is capped at 100.
        access_frequency = int(np.random.poisson(lam=100 - sensitivity_score * 0.7))

        # Binary label: scores above 65 count as sensitive.
        is_sensitive = 1 if sensitivity_score > 65 else 0

        nodes.append({
            'node_id': node_id,
            'type': asset_type,
            'size': size,
            'sensitivity_score': sensitivity_score,
            'created_date': created_date.strftime(timestamp_format),
            'last_modified': last_modified.strftime(timestamp_format),
            'last_accessed': last_accessed.strftime(timestamp_format),
            'access_frequency': access_frequency,
            'owner_department': fake.random_element(elements=departments),
            'storage_location': fake.random_element(elements=storage_locations),
            'encryption_status': fake.random_element(elements=encryption_states),
            'sensitive_label': is_sensitive
        })

    return pd.DataFrame(nodes, columns=node_columns)

# =============================================
# Generate Edge Data (Relationships)
# =============================================

def generate_edge_data(node_df, num_edges=30000):
    """Generate synthetic relationship edges between existing nodes.

    About 70% of the edges pick both endpoints from one randomly chosen
    department; the rest pick both endpoints from the whole node table.
    Self-loops are avoided by redrawing the target. Edge attributes
    (access frequency, relationship type, sprawl risk, last access time)
    are derived from the two endpoint nodes.

    Relies on the module-level ``fake`` (Faker), ``random`` and ``np``
    generators.

    Args:
        node_df: Node table. The columns read here are ``node_id``, ``type``,
            ``sensitivity_score``, ``access_frequency``, ``owner_department``
            and ``last_accessed`` (a ``'%Y-%m-%d %H:%M:%S'`` string).
        num_edges: Number of edges to generate.

    Returns:
        pandas.DataFrame with one row per edge and the columns ``source``,
        ``target``, ``relationship_type``, ``access_frequency``,
        ``sharing_level``, ``last_accessed``, ``permission_type``,
        ``data_flow_direction`` and ``sprawl_risk`` (a value in [0, 1]).
        The columns are present even when no edges are generated.

    Raises:
        ValueError: If ``num_edges`` is positive but ``node_df`` holds fewer
            than two distinct node ids, since every edge would be a
            self-loop and the redraw could never succeed.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    edge_columns = [
        'source', 'target', 'relationship_type', 'access_frequency',
        'sharing_level', 'last_accessed', 'permission_type',
        'data_flow_direction', 'sprawl_risk',
    ]

    # Index the node table once. A dict lookup per endpoint replaces a
    # full-column boolean scan per endpoint per edge.
    records = node_df.to_dict('records')
    all_node_ids = [record['node_id'] for record in records]
    node_by_id = {}
    # Department -> node ids, in order of first appearance (simulates
    # clusters of highly connected nodes inside a department).
    department_groups = {}
    for record in records:
        # First row wins if an id appears more than once.
        node_by_id.setdefault(record['node_id'], record)
        department_groups.setdefault(record['owner_department'], []).append(record['node_id'])
    departments = list(department_groups)

    if num_edges > 0 and len(node_by_id) < 2:
        raise ValueError(
            "generate_edge_data needs at least two distinct node ids to "
            "create edges without self-loops"
        )

    edges = []
    for _ in range(num_edges):
        # 70% chance of intra-department edge, 30% inter-department
        if random.random() < 0.7:
            dept_nodes = department_groups[random.choice(departments)]
            source = random.choice(dept_nodes)
            target = random.choice(dept_nodes)
        else:
            # Draw from the ids actually present instead of assuming they
            # are exactly 0..N-1.
            source = random.choice(all_node_ids)
            target = random.choice(all_node_ids)

        # Ensure no self-loops. The redraw uses the whole node table, so a
        # colliding intra-department pick may end up crossing departments.
        while target == source:
            target = random.choice(all_node_ids)

        # Get node features to influence edge properties
        src_node = node_by_id[source]
        tgt_node = node_by_id[target]

        # Edge access rate is a quarter of the endpoints' combined frequency.
        access_frequency = int(np.random.poisson(
            lam=((src_node['access_frequency'] + tgt_node['access_frequency']) / 4)
        ))

        sharing_level = random.choice(['internal', 'restricted', 'confidential'])

        # Relationship type based on node types; 'email' takes precedence
        # over 'database_backup' when both are involved.
        if src_node['type'] == 'email' or tgt_node['type'] == 'email':
            relationship_type = 'email_attachment'
        elif src_node['type'] == 'database_backup' or tgt_node['type'] == 'database_backup':
            relationship_type = 'backup_dependency'
        else:
            relationship_type = random.choice([
                'access_grant', 'data_flow', 'version_history',
                'reference', 'permission_share'
            ])

        # Sprawl risk: 40% per endpoint sensitivity, 10% for confidential
        # sharing, 10% for access volume.
        risk_score = (
            0.4 * (src_node['sensitivity_score'] / 100) +
            0.4 * (tgt_node['sensitivity_score'] / 100) +
            0.1 * (1 if sharing_level == 'confidential' else 0) +
            0.1 * (access_frequency / 100)
        )

        # Add up to +/-0.1 of noise, then clamp back into [0, 1].
        risk_score = min(1, max(0, risk_score + random.uniform(-0.1, 0.1)))

        # The edge cannot have been accessed before either endpoint was.
        earliest_access = max(
            datetime.strptime(src_node['last_accessed'], timestamp_format),
            datetime.strptime(tgt_node['last_accessed'], timestamp_format)
        )

        edges.append({
            'source': source,
            'target': target,
            'relationship_type': relationship_type,
            'access_frequency': access_frequency,
            'sharing_level': sharing_level,
            'last_accessed': fake.date_time_between(
                start_date=earliest_access,
                end_date='now'
            ).strftime(timestamp_format),
            'permission_type': fake.random_element(elements=(
                'read', 'write', 'admin', 'share'
            )),
            'data_flow_direction': random.choice(['unidirectional', 'bidirectional']),
            'sprawl_risk': risk_score
        })

    return pd.DataFrame(edges, columns=edge_columns)

# =============================================
# Main Execution
# =============================================

if __name__ == "__main__":
    # Nodes come first: the edge generator takes the node table as input.
    print("Generating synthetic node data...")
    node_df = generate_node_data(10000)
    print("Node data generated with shape:", node_df.shape)

    print("\nGenerating synthetic edge data...")
    edge_df = generate_edge_data(node_df, 30000)
    print("Edge data generated with shape:", edge_df.shape)

    # Write both tables to CSV in the working directory, without the index.
    node_file = "enterprise_nodes.csv"
    edge_file = "enterprise_edges.csv"
    for frame, path in ((node_df, node_file), (edge_df, edge_file)):
        frame.to_csv(path, index=False)

    print(f"\nSaved node data to: {node_file}")
    print(f"Saved edge data to: {edge_file}")

    # Summary: share of sensitive nodes, mean edge risk, category counts.
    print("\nData Statistics:")
    print(f"- Sensitive nodes: {node_df['sensitive_label'].sum()} ({node_df['sensitive_label'].mean()*100:.1f}%)")
    print(f"- Average sprawl risk: {edge_df['sprawl_risk'].mean():.3f}")
    print("- Node type distribution:", node_df['type'].value_counts(), sep="\n")
    print("\n- Edge relationship types:", edge_df['relationship_type'].value_counts(), sep="\n")

Generating synthetic node data...
Node data generated with shape: (10000, 12)

Generating synthetic edge data...
Edge data generated with shape: (30000, 9)

Saved node data to: enterprise_nodes.csv
Saved edge data to: enterprise_edges.csv

Data Statistics:
- Sensitive nodes: 5763 (57.6%)
- Average sprawl risk: 0.597
- Node type distribution:
type
product_spec          1063
contract              1040
customer_data         1038
database_backup       1015
financial_report      1003
meeting_minutes        994
email                  985
marketing_material     969
source_code            969
employee_record        924
Name: count, dtype: int64

- Edge relationship types:
relationship_type
email_attachment     5754
backup_dependency    5226
reference            3826
access_grant         3815
permission_share     3798
version_history      3798
data_flow            3783
Name: count, dtype: int64
