In [71]:
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.patches import Patch
import pandas as pd
import numpy as np
from pyvis.network import Network

# =============================================
# 1. Basic Network Visualization (Matplotlib)
# =============================================

def visualize_basic_graph(node_df, edge_df, sample_size=800, min_connections=3, seed=None):
    """Draw a sampled sub-network with matplotlib and save it to ``basic_network.png``.

    Nodes are coloured by sensitivity score (plasma colormap, with a colorbar)
    and sized by the log of their access frequency. Edges are coloured by
    relationship type (with a legend) and their width scales with sprawl risk.

    Parameters
    ----------
    node_df : pandas.DataFrame
        Reads the columns ``node_id``, ``type``, ``sensitivity_score``,
        ``sensitive_label`` and ``access_frequency``.
    edge_df : pandas.DataFrame
        Reads the columns ``source``, ``target``, ``relationship_type`` and
        ``sprawl_risk``.
    sample_size : int, default 800
        Maximum number of nodes to draw; capped at ``len(node_df)``.
    min_connections : int, default 3
        Minimum number of edge endpoints for a node to count as well connected.
        Sampling is restricted to well-connected nodes only when there are at
        least ``sample_size`` of them; otherwise nodes are sampled from all of
        ``node_df``.
    seed : int or None, default None
        Seed for node sampling and for the spring layout. ``None`` keeps the
        run-to-run randomness.

    Returns
    -------
    None
        The figure is written to disk and a summary line is printed.
    """
    sample_size = min(sample_size, len(node_df))
    if sample_size <= 0:
        print("No nodes available to visualize; skipping basic_network.png")
        return

    # Degree of every node = appearances as a source plus appearances as a target.
    node_degrees = pd.concat([
        edge_df['source'].value_counts(),
        edge_df['target'].value_counts()
    ]).groupby(level=0).sum()
    well_connected = node_degrees[node_degrees >= min_connections].index.tolist()

    # Prefer well-connected nodes; fall back to a plain sample of all nodes
    # when there are not enough of them to fill the sample.
    if len(well_connected) >= sample_size:
        rng = np.random.default_rng(seed)
        selected_nodes = rng.choice(well_connected, size=sample_size, replace=False)
    else:
        selected_nodes = node_df['node_id'].sample(
            n=sample_size, replace=False, random_state=seed
        ).values

    node_sample = node_df[node_df['node_id'].isin(selected_nodes)]

    # Keep only edges whose BOTH endpoints are sampled nodes that exist in
    # node_df; an edge with a missing endpoint cannot be drawn.
    sampled_ids = node_sample['node_id']
    edge_sample = edge_df[
        edge_df['source'].isin(sampled_ids) & edge_df['target'].isin(sampled_ids)
    ]

    G = nx.Graph()
    for node_id, node_type, score, label, frequency in zip(
        node_sample['node_id'],
        node_sample['type'],
        node_sample['sensitivity_score'],
        node_sample['sensitive_label'],
        node_sample['access_frequency'],
    ):
        G.add_node(
            node_id,
            type=node_type,
            sensitivity=float(score),
            is_sensitive=bool(label),
            access_frequency=float(frequency),
        )

    for source, target, relationship, risk in zip(
        edge_sample['source'],
        edge_sample['target'],
        edge_sample['relationship_type'],
        edge_sample['sprawl_risk'],
    ):
        G.add_edge(source, target, relationship=relationship, risk=float(risk))

    if G.number_of_nodes() == 0:
        print("Sampled graph is empty; skipping basic_network.png")
        return

    fig, ax = plt.subplots(figsize=(14, 10))

    node_colors = [G.nodes[n]['sensitivity'] for n in G.nodes()]
    node_sizes = [np.log1p(G.nodes[n]['access_frequency']) * 50 for n in G.nodes()]

    pos = nx.spring_layout(G, k=0.5, iterations=100, seed=seed)

    nodes = nx.draw_networkx_nodes(
        G, pos,
        ax=ax,
        node_color=node_colors,
        node_size=node_sizes,
        cmap=plt.cm.plasma,
        alpha=0.8,
        vmin=min(node_colors),
        vmax=max(node_colors)
    )
    # Node colour encodes sensitivity, so a colorbar (not a per-type legend)
    # is the correct key for it.
    fig.colorbar(
        nodes, ax=ax,
        orientation='horizontal',
        label='Node Sensitivity Score',
        shrink=0.5,
        pad=0.02
    )

    # Edges are optional: a sparse sample may contain none at all.
    if G.number_of_edges() > 0:
        relationship_types = sorted(
            {data['relationship'] for _, _, data in G.edges(data=True)},
            key=str
        )
        # One fixed colour per relationship type, shared by the drawn edges
        # and the legend so the two can never disagree.
        palette = plt.cm.tab20
        type_color = {
            rel: palette(i % palette.N) for i, rel in enumerate(relationship_types)
        }
        edge_list = list(G.edges())
        nx.draw_networkx_edges(
            G, pos,
            ax=ax,
            edgelist=edge_list,
            width=[G.edges[e]['risk'] * 5 for e in edge_list],
            edge_color=[type_color[G.edges[e]['relationship']] for e in edge_list],
            alpha=0.6
        )
        ax.legend(
            handles=[Patch(facecolor=type_color[rel], label=rel) for rel in relationship_types],
            title='Relationship type',
            bbox_to_anchor=(1.02, 1),
            loc='upper left'
        )

    ax.set_title(f"Enterprise Data Sprawl Network Sample\n({len(G.nodes())} nodes, {len(G.edges())} edges)")
    ax.set_axis_off()
    fig.tight_layout()
    fig.savefig('basic_network.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved visualization with {len(G.nodes())} nodes and {len(G.edges())} edges to basic_network.png")

# =============================================
# Main Execution
# =============================================

if __name__ == "__main__":
    # Read the generated node and edge tables (nodes first, then edges).
    node_df, edge_df = (
        pd.read_csv(csv_path)
        for csv_path in ("enterprise_nodes.csv", "enterprise_edges.csv")
    )

    print("Visualizing data sprawl network...")

    # Step 1: matplotlib rendering of a sampled sub-network.
    visualize_basic_graph(node_df=node_df, edge_df=edge_df)


Visualizing data sprawl network...
Saved visualization with 800 nodes and 219 edges to basic_network.png


In [93]:
# =============================================
# 3. Attribute Distribution Visualizations
# =============================================

def plot_attribute_distributions(node_df, edge_df):
    """Plot node and edge attribute distributions and save them to
    ``attribute_distributions.png``.

    The figure is a 2x2 grid: the top row shows node attributes (count per
    ``type`` and a histogram of ``sensitivity_score``), the bottom row shows
    edge attributes (count per ``relationship_type`` and a histogram of
    ``sprawl_risk``).

    Parameters
    ----------
    node_df : pandas.DataFrame
        Reads the columns ``type`` and ``sensitivity_score``.
    edge_df : pandas.DataFrame
        Reads the columns ``relationship_type`` and ``sprawl_risk``.

    Returns
    -------
    None
        The figure is written to disk and a confirmation line is printed.
    """
    font_size = 12

    # Exactly four panels are drawn, so a 2x2 grid leaves no blank cells.
    fig, axes = plt.subplots(2, 2, figsize=(18, 12))
    (ax_types, ax_sensitivity), (ax_relationships, ax_risk) = axes

    # --- Node attributes -------------------------------------------------
    node_df['type'].value_counts().plot(kind='bar', color='teal', ax=ax_types)
    ax_types.set_title('Distribution of Data Types', fontsize=font_size)
    ax_types.set_xlabel('Data type', fontsize=font_size)
    ax_types.set_ylabel('Number of nodes', fontsize=font_size)
    ax_types.tick_params(axis='x', labelrotation=45, labelsize=font_size)
    ax_types.tick_params(axis='y', labelsize=font_size)

    ax_sensitivity.hist(node_df['sensitivity_score'], bins=20, color='purple', alpha=0.7)
    ax_sensitivity.set_title('Sensitivity Score Distribution', fontsize=font_size)
    ax_sensitivity.set_xlabel('Sensitivity score', fontsize=font_size)
    ax_sensitivity.set_ylabel('Number of nodes', fontsize=font_size)
    ax_sensitivity.tick_params(axis='both', labelsize=font_size)

    # --- Edge attributes -------------------------------------------------
    edge_df['relationship_type'].value_counts().plot(
        kind='bar', color='steelblue', ax=ax_relationships
    )
    ax_relationships.set_title('Relationship Types', fontsize=font_size)
    ax_relationships.set_xlabel('Relationship type', fontsize=font_size)
    ax_relationships.set_ylabel('Number of edges', fontsize=font_size)
    ax_relationships.tick_params(axis='x', labelrotation=45, labelsize=font_size)
    ax_relationships.tick_params(axis='y', labelsize=font_size)

    ax_risk.hist(edge_df['sprawl_risk'], bins=20, color='crimson', alpha=0.7)
    ax_risk.set_title('Sprawl Risk Distribution', fontsize=font_size)
    ax_risk.set_xlabel('Sprawl risk', fontsize=font_size)
    ax_risk.set_ylabel('Number of edges', fontsize=font_size)
    ax_risk.tick_params(axis='both', labelsize=font_size)

    fig.tight_layout()
    fig.savefig('attribute_distributions.png', dpi=300)
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)
    print("Saved attribute distributions to attribute_distributions.png")

# =============================================
# 4. Department-Level Visualization
# =============================================

def plot_department_network(node_df, edge_df):
    """Draw a department-level flow graph and save it to ``department_network.png``.

    Nodes are grouped by ``owner_department``; every edge is mapped to the
    departments of its source and target node, and edges are then grouped by
    (source department, target department). Connections inside a single
    department are not drawn.

    Visual encoding:
      * node size   - number of nodes owned by the department
      * node colour - mean ``sensitivity_score`` of the department
      * edge width  - number of edges between the two departments
      * edge colour - mean ``sprawl_risk`` of those edges (colour scale fixed to 0..1)

    Parameters
    ----------
    node_df : pandas.DataFrame
        Reads the columns ``node_id``, ``owner_department``,
        ``sensitivity_score`` and ``sensitive_label``.
    edge_df : pandas.DataFrame
        Reads the columns ``source``, ``target`` and ``sprawl_risk``.

    Returns
    -------
    None
        The figure is written to disk and a confirmation line is printed.
    """
    # Per-department node statistics.
    dept_nodes = node_df.groupby('owner_department').agg({
        'node_id': 'count',
        'sensitivity_score': 'mean',
        'sensitive_label': 'sum'
    }).reset_index()

    if dept_nodes.empty:
        print("No departments found; skipping department_network.png")
        return

    # Attach the owning department of both endpoints to every edge. The second
    # merge collides with the columns added by the first one, which is what
    # produces the *_source / *_target suffixed department columns.
    node_departments = node_df[['node_id', 'owner_department']]
    dept_edges = edge_df.merge(
        node_departments,
        left_on='source', right_on='node_id'
    ).merge(
        node_departments,
        left_on='target', right_on='node_id',
        suffixes=('_source', '_target')
    )

    dept_connections = dept_edges.groupby([
        'owner_department_source', 'owner_department_target'
    ]).agg({
        'sprawl_risk': 'mean',
        'source': 'count'
    }).reset_index()

    G = nx.DiGraph()

    for _, row in dept_nodes.iterrows():
        G.add_node(
            row['owner_department'],
            size=row['node_id'],
            sensitivity=row['sensitivity_score'],
            sensitive_count=row['sensitive_label']
        )

    for _, row in dept_connections.iterrows():
        # Skip intra-department traffic: only cross-department flow is shown.
        if row['owner_department_source'] != row['owner_department_target']:
            G.add_edge(
                row['owner_department_source'],
                row['owner_department_target'],
                weight=row['source'],
                risk=row['sprawl_risk']
            )

    fig, ax = plt.subplots(figsize=(12, 8))

    node_sizes = [G.nodes[n]['size'] * 10 for n in G.nodes()]
    node_colors = [G.nodes[n]['sensitivity'] for n in G.nodes()]
    edge_list = list(G.edges())
    edge_widths = [G.edges[e]['weight'] / 100 for e in edge_list]
    edge_colors = [G.edges[e]['risk'] for e in edge_list]

    pos = nx.circular_layout(G)

    nodes = nx.draw_networkx_nodes(
        G, pos,
        ax=ax,
        node_size=node_sizes,
        node_color=node_colors,
        cmap=plt.cm.plasma,
        alpha=0.8
    )

    risk_cmap = plt.cm.YlOrRd
    nx.draw_networkx_edges(
        G, pos,
        ax=ax,
        edgelist=edge_list,
        # Pass the real node sizes so arrowheads stop at the node boundary
        # instead of being hidden underneath large department nodes.
        node_size=node_sizes,
        width=edge_widths,
        edge_color=edge_colors,
        edge_cmap=risk_cmap,
        edge_vmin=0,
        edge_vmax=1,
        arrowstyle='-|>',
        arrowsize=15,
        alpha=0.6
    )

    nx.draw_networkx_labels(G, pos, ax=ax, font_size=10, font_weight='bold')

    fig.colorbar(nodes, ax=ax, label='Average Sensitivity')
    # For a directed graph draw_networkx_edges returns a list of arrow patches,
    # which colorbar() cannot consume. Build a standalone mappable that uses
    # the same colormap and 0..1 range as the edges.
    risk_mappable = plt.cm.ScalarMappable(
        norm=plt.Normalize(vmin=0, vmax=1), cmap=risk_cmap
    )
    fig.colorbar(risk_mappable, ax=ax, label='Average Sprawl Risk')

    ax.set_title("Department-Level Data Flow Network\n(Node Size: Data Assets | Edge Width: Connection Volume)")
    ax.set_axis_off()
    # Single layout pass after all artists (title, colorbars) exist.
    fig.tight_layout()
    fig.savefig('department_network.png', dpi=300)
    plt.close(fig)
    print("Saved department network visualization to department_network.png")



In [95]:
# =============================================
# Main Execution
# =============================================

if __name__ == "__main__":
    # Read the generated node and edge tables (nodes first, then edges).
    node_df, edge_df = (
        pd.read_csv(csv_path)
        for csv_path in ("enterprise_nodes.csv", "enterprise_edges.csv")
    )

    print("Visualizing data sprawl network...")

    # Step 3: histograms / bar charts of node and edge attributes.
    plot_attribute_distributions(node_df=node_df, edge_df=edge_df)

    print("\nAll visualizations generated successfully!")

Visualizing data sprawl network...
Saved attribute distributions to attribute_distributions.png

All visualizations generated successfully!
