# Cookie Consent Management Solution Test
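#### This notebook demonstrates the end-to-end flow for checking cookie consent management on websites: processing the input URLs, collecting consent data for each valid URL, and visualising the network request chains before and after consent.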

## Import required components

In [None]:
from url_processor import URLProcessor
from provider_registry import ProviderRegistry
from browser_manager import BrowserManager
from data_collection import DataCollectionService
import json, visualisation
from IPython.display import display, HTML
from dataclasses import asdict
# Import necessary libraries
import networkx as nx
from urllib.parse import urlparse
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout

## Initialize components

In [None]:
# Build the pipeline objects used by every later cell.
url_processor = URLProcessor()
provider_registry = ProviderRegistry()
# The browser manager is constructed with the provider registry ...
browser_manager = BrowserManager(provider_registry)
# ... and the data collection service is constructed with the browser manager.
data_collector = DataCollectionService(browser_manager)
print("All components initialized")

## Get URLs from user

In [None]:
"""
print("Choose URL input method:")
print("1. Enter comma-separated URLs")
print("2. Use default Abbott URLs")
choice = input("Enter choice (1 or 2): ")

if choice == "1":
   urls_input = input("Enter URLs (comma-separated): ")
   test_urls = [url.strip() for url in urls_input.split(",")]
else:
   test_urls = [
       "it.pediasure.abbott",
       "it.ensure.abbott", 
       "es.ensure.abbott",
       "es.pediasure.abbott"
   ]

# Process URLs to ensure proper format
formatted_urls = []
for url in test_urls:
   if not url.startswith(('http://', 'https://')):
       url = 'https://' + url
   formatted_urls.append(url)

print(f"\nProcessing {len(formatted_urls)} URLs:")
for url in formatted_urls:
   print(f"- {url}")
"""

## Ensure URL format is correct

In [None]:
# Hosts to check; entries without an explicit scheme get https:// prepended.
test_urls = ['it.ensure.abbott']
formatted_urls = [
    url if url.startswith(('http://', 'https://')) else 'https://' + url
    for url in test_urls
]

# Echo the final list so the run is easy to audit.
print(f"\nProcessing {len(formatted_urls)} URLs:")
for url in formatted_urls:
    print(f"- {url}")

## Process URLs

In [None]:
# Hand the formatted URLs to the URL processor and report how many results came back.
print("\nStarting URL processing...")
url_results = url_processor.process_urls(formatted_urls)
print(f"URL processing complete. {len(url_results)} results obtained")

## Collect consent data for each URL

In [None]:
# Collect one consent result per valid URL; invalid URLs are reported and skipped.
all_results = []
for i, url_result in enumerate(url_results, 1):
    print("Processing URL")
    if not url_result.is_valid:
        print(f"Skipping invalid URL. Error: {url_result.error_message}")
        continue

    print("URL is valid, checking for cookie consent...")
    result = data_collector.create_result(url_result)
    all_results.append(result)
    print("Cookie consent check complete")

print(f"Processed {len(all_results)} valid URLs")

## Display Results

In [None]:
# Print a JSON summary for every collected result, followed by any errors it recorded.
print("\nGenerating results...")
for i, result in enumerate(all_results, 1):
    print(f"\n=== Result {i}/{len(all_results)} ===")
    print(f"URL: {result.url_info['requested_url']}")

    # To inspect the complete result instead of the summary:
    #   print(json.dumps(asdict(result), indent=2))

    print("\nSummary:")
    summary = data_collector.generate_cod_results(result, include_network_chains=True)
    print(json.dumps(summary, indent=2))

    # Nothing more to show for results without errors.
    if not result.errors:
        continue

    print("\nErrors encountered:")
    for error in result.errors:
        print(f"- {error}")

## Visualise

### Visualisation Functions

In [23]:
def normalize_url(url: str) -> str:
    """
    Strip the scheme and a leading 'www.' from a URL.

    Everything up to and including the first '://' is dropped, then a
    leading 'www.' (lower-case only) is removed.
    For example: 'https://www.example.com' becomes 'example.com'
    """
    # partition() yields an empty separator when no scheme marker is present.
    _scheme, marker, remainder = url.partition("://")
    without_scheme = remainder if marker else url

    return without_scheme[4:] if without_scheme.startswith("www.") else without_scheme

def shorten_url(url):
    """
    Reduce a URL to its host plus the first path segment.

    For example, 'https://abc.com/page/something' becomes 'abc.com/page'.
    A URL without a path (or with only '/') is reduced to the host alone.
    If anything goes wrong, the URL in its current form is returned
    (already normalized when the failure happened after normalization).
    """
    try:
        url = normalize_url(url)
        # urlparse only recognises the host when a scheme is present.
        pieces = urlparse(f"http://{url}")
        host = pieces.netloc
        path = pieces.path

        if not path or path == "/":
            return host

        # str.split always returns at least one element, so indexing [0] is safe.
        first_segment = path.strip("/").split("/")[0]
        return f"{host}/{first_segment}"
    except Exception:
        return url
    
def collapse_url(url: str, requested_url: str, max_path_sections: int = 2) -> str:
    """
    Collapse a URL into a short node label for the request graph.

    - If the URL is the requested URL, or lies beneath it, ``requested_url``
      itself is returned so that all first-party requests share one node.
    - Otherwise the URL is reduced to its host plus at most
      ``max_path_sections`` leading path segments.

    The first-party check is a prefix match that must end on a URL boundary
    (end of string, '/', '?', '#' or ':'). A plain ``startswith`` would also
    fold unrelated hosts such as 'example.com.evil.net' or 'example.community'
    into a requested 'example.com'.

    Args:
        url: The raw URL to collapse.
        requested_url: The page URL the scan was started with.
        max_path_sections: Maximum number of path segments kept in the label.

    Returns:
        ``requested_url``, a collapsed 'host/seg1/seg2' label, or the
        unmodified ``url`` when it cannot be processed.
    """
    try:
        # Normalize both URLs before comparison
        normalized_url = normalize_url(url)
        # A trailing slash on the requested URL must not stop 'site' from matching 'site/'.
        normalized_requested = normalize_url(requested_url).rstrip("/")

        # An empty requested URL would otherwise be a prefix of everything.
        if normalized_requested and normalized_url.startswith(normalized_requested):
            tail = normalized_url[len(normalized_requested):]
            if not tail or tail[0] in "/?#:":
                return requested_url

        parsed = urlparse(f"http://{normalized_url}")  # Add scheme to make urlparse work correctly
        netloc = parsed.netloc or "unknown"
        # Split path into segments, ignoring empty parts
        path_parts = [p for p in parsed.path.strip("/").split("/") if p]
        # Keep only up to 'max_path_sections' segments
        collapsed = "/".join(path_parts[:max_path_sections])
        return f"{netloc}/{collapsed}" if collapsed else netloc
    except (AttributeError, TypeError, ValueError):
        # Non-string input or an unparsable URL: fall back to the value as given.
        return url

def get_node_color(short_label: str) -> str:
    """Pick a node color from the first known keyword found in the label.

    Matching is case-insensitive and substring-based; the table is scanned
    top to bottom, so earlier entries win. Labels without any known keyword
    are drawn in gray.
    """
    haystack = short_label.lower()

    # (keyword, color) pairs in priority order.
    palette = (
        ("facebook", "blue"),
        ("amazon", "orange"),
        ("tiktok", "black"),
        ("abbott", "royalblue"),
        ("hubspot", "orange"),
        ("hs-analytics", "orange"),
        ("google", "yellow"),
        ("googletagmanager", "yellow"),
        ("doubleclick", "yellow"),
        ("trustarc", "green"),
    )

    return next((color for keyword, color in palette if keyword in haystack), "gray")

def _chains_for_phase(result_dict, phase):
    """Return the request-chain list stored for *phase*, or [] when any level is missing."""
    paths = {
        "Post-consent; Cookies Rejected": ("reject_flow", "network_state", "request_chains"),
        "Post-consent; Cookies Accepted": ("accept_flow", "network_state", "request_chains"),
    }
    # Any other phase label falls back to the landing-page (pre-consent) capture.
    path = paths.get(phase, ("page_landing", "state", "network_state", "request_chains"))

    node = result_dict
    for key in path:
        if not isinstance(node, dict):
            # A level that is absent or None means no chains, not a crash.
            return []
        node = node.get(key)
    return node or []


def _belongs_to_requested_page(raw_url, normalized_requested):
    """Return True when *raw_url* is the requested page or lies beneath it."""
    if not normalized_requested:
        return False
    candidate = normalize_url(raw_url)
    if not candidate.startswith(normalized_requested):
        return False
    # Require a URL boundary after the prefix so that e.g. 'example.com.evil.net'
    # is not mistaken for the requested 'example.com'.
    tail = candidate[len(normalized_requested):]
    return not tail or tail[0] in "/?#:"


def draw_network_graph(result, hierarchical=False, collapse=False, make_url_short=False, phase="Pre-consent"):
    """
    Draw the request chains of one scan result as a directed graph.

    Args:
        result: Dataclass instance holding the scan result; it is converted
            with ``asdict`` and read via 'url_info', 'page_landing',
            'accept_flow' and 'reject_flow'.
        hierarchical: Use a top-down graphviz 'dot' layout (requires graphviz)
            instead of the default spring layout.
        collapse: Merge URLs that share host and leading path segments into
            one node (see ``collapse_url``).
        make_url_short: Label nodes with ``shorten_url(node)`` instead of the
            full node name.
        phase: "Post-consent; Cookies Rejected", "Post-consent; Cookies Accepted",
            or anything else for the pre-consent landing-page capture.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Convert the dataclass to a dictionary for dictionary-style access
    result_dict = asdict(result)
    requested_url = result_dict["url_info"]["requested_url"]
    chains = _chains_for_phase(result_dict, phase)
    normalized_requested = normalize_url(requested_url).rstrip("/")

    G = nx.DiGraph()

    # Node label -> set of distinct raw URLs represented by that node
    node_map = {}

    for chain in chains:
        # Missing or empty endpoints are grouped under a single "unknown" node.
        raw_src = chain.get("source") or "unknown"
        raw_tgt = chain.get("target") or "unknown"

        if collapse:
            src = collapse_url(raw_src, requested_url, max_path_sections=2)
            tgt = collapse_url(raw_tgt, requested_url, max_path_sections=2)
        else:
            # Even when not collapsing, all first-party URLs share the requested_url node
            src = requested_url if _belongs_to_requested_page(raw_src, normalized_requested) else raw_src
            tgt = requested_url if _belongs_to_requested_page(raw_tgt, normalized_requested) else raw_tgt

        # Track raw URLs (not labels) so the count shown on a merged node is accurate.
        node_map.setdefault(src, set()).add(raw_src)
        node_map.setdefault(tgt, set()).add(raw_tgt)

        G.add_edge(src, tgt, type=chain.get("type", "unknown"))

    # Find orphan nodes (nodes with no incoming edges except requested_url)
    nodes_with_incoming = {v for _, v in G.edges()}
    orphan_nodes = set(G.nodes()) - nodes_with_incoming - {requested_url}

    # Add edges from requested_url to orphan nodes
    for orphan in orphan_nodes:
        G.add_edge(requested_url, orphan, type="unknown")

    # requested_url may have entered the graph only through the edges just added;
    # give it an entry so the label lookup below cannot raise KeyError.
    node_map.setdefault(requested_url, {requested_url})

    # Build labels and node colors
    labels = {}
    node_colors = []
    for node in G.nodes():
        raw_count = len(node_map[node])  # Distinct raw URLs
        shown = shorten_url(node) if make_url_short else node
        labels[node] = f"{shown} ({raw_count})"  # e.g. "abc.com/page (3)"
        node_colors.append(get_node_color(node))

    # Choose a layout
    if hierarchical:
        # Requires graphviz installed; rankdir=TB means top-to-bottom flow
        pos = graphviz_layout(G, prog='dot', args='-Grankdir=TB')
    else:
        # Default: force-directed layout with a fixed seed
        pos = nx.spring_layout(G, seed=42)

    # Draw
    plt.figure(figsize=(24, 16))
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=500)
    nx.draw_networkx_edges(G, pos, arrowstyle='->', arrowsize=20, edge_color='gray')
    nx.draw_networkx_labels(G, pos, labels=labels, font_size=8)

    plt.title(f"Network Request Chains for {requested_url} - {phase} phase")
    plt.axis("off")
    plt.show()


In [None]:
"""
def debug_network_data(result, phase="Pre-consent"):
    
    import json
    from IPython.display import display, HTML
    
    # Get your data structure
    data = visualisation.prepare_data_for_d3_tree(result, phase)
    
    # Print summary
    print(f"Root node: {data['name']}")
    print(f"Number of direct children: {len(data.get('children', []))}")
    
    # Show first level in formatted HTML
    children_preview = json.dumps([
        {"name": child["name"], "childCount": len(child.get("children", []))}
        for child in data.get("children", [])[:5]
    ], indent=2)
    
    display(HTML(f"<pre>{children_preview}</pre>"))
    
    return data"
"""

### Visualise Pre-consent Phase


In [None]:
# Standard library; imported here because only this cell needs it.
from html import escape as html_escape

# Static graph of the pre-consent request chains: spring layout, no collapsing, short labels.
draw_network_graph(
    all_results[0],
    hierarchical=False,
    collapse=False,
    make_url_short=True,
    phase="Pre-consent",
)

# Debug the data before visualization
print("Debugging network data structure:")
data = visualisation.prepare_data_for_d3_network(all_results[0], "Pre-consent")
print(f"Root node: {data['name']}")
print(f"Number of direct children: {len(data.get('children', []))}")
children_preview = json.dumps([{"name": child["name"]} for child in data.get("children", [])[:5]], indent=2)
# The names are presumably derived from URLs requested by the scanned site (confirm in
# visualisation.prepare_data_for_d3_network), so escape them before embedding in HTML:
# a '<' or '&' would otherwise be interpreted as markup instead of being shown.
display(HTML(f"<pre>{html_escape(children_preview)}</pre>"))

visualisation.save_visualization_html(all_results[0], "network_viz.html", "Pre-consent")
print("Visualization saved to network_viz.html - open this file in your browser")



### Visualise Pre-consent Phase (Enhanced)

In [None]:
"""
Robust Enhanced Visualization - Handles both object and dictionary formats


## Enhanced Visualization for Pre-consent Phase
# Import the enhanced visualization module
from enhanced_visualisation import DomainNetworkVisualizer
from IPython.display import IFrame
import json
from dataclasses import asdict

# Create the domain network visualizer with the provider registry
domain_visualizer = DomainNetworkVisualizer(provider_registry)

# Function to safely get nested attribute/key from mixed object/dict structure
def safe_get(obj, *keys):

    current = obj
    for key in keys:
        if current is None:
            return None
        
        # Handle dict case
        if isinstance(current, dict):
            current = current.get(key, None)
        # Handle object case
        elif hasattr(current, key):
            current = getattr(current, key, None)
        else:
            return None
    return current

# Extract data directly for visualization to avoid circular references
try:
    # Get result and convert to dict for exploration
    result = all_results[0]
    
    # First try to convert everything to dict
    try:
        result_dict = asdict(result)
        print("Successfully converted result to dictionary")
    except Exception:
        result_dict = result if isinstance(result, dict) else {}
        print("Failed to convert full result to dictionary, proceeding with manual extraction")
    
    # Get the URL
    url = None
    if isinstance(result_dict, dict) and "url_info" in result_dict:
        url = result_dict["url_info"].get("requested_url", None)
    
    # Try direct attribute access if url is still None
    if url is None:
        url = safe_get(result, "url_info", "requested_url")
    
    if url is None:
        url = "Unknown URL"
    
    print(f"Processing visualization for URL: {url}")
    
    # Initialize empty chains
    simple_chains = []
    
    # Try multiple approaches to get request chains
    
    # Approach 1: Navigate through dict structure
    if isinstance(result_dict, dict):
        # Try the expected path for pre-consent
        if ("page_landing" in result_dict and 
            isinstance(result_dict["page_landing"], dict) and 
            "state" in result_dict["page_landing"] and
            isinstance(result_dict["page_landing"]["state"], dict) and
            "network_state" in result_dict["page_landing"]["state"]):
            
            network_state = result_dict["page_landing"]["state"]["network_state"]
            
            if isinstance(network_state, dict) and "request_chains" in network_state:
                simple_chains = network_state["request_chains"]
                print(f"Found {len(simple_chains)} request chains via dict path")
            elif isinstance(network_state, dict) and "requests" in network_state:
                # Create chains from requests
                requests = network_state["requests"]
                print(f"Found {len(requests)} requests to create chains from (dict)")
                for req in requests:
                    if isinstance(req, dict) and "url" in req:
                        simple_chains.append({
                            "source": url,
                            "target": req["url"],
                            "timestamp": req.get("timestamp", "")
                        })
    
    # Approach 2: Direct attribute navigation
    if not simple_chains:
        # Try to navigate using attributes
        network_state = safe_get(result, "page_landing", "state", "network_state")
        
        if network_state:
            # Check if it has request_chains attribute
            request_chains = safe_get(network_state, "request_chains")
            if request_chains:
                if isinstance(request_chains, list):
                    simple_chains = request_chains
                    print(f"Found {len(simple_chains)} request chains via attribute path")
            
            # If still no chains, try to build from requests
            if not simple_chains and hasattr(network_state, "requests"):
                requests = network_state.requests
                if requests:
                    print(f"Found {len(requests)} requests to create chains from (attribute)")
                    for req in requests:
                        if hasattr(req, "url") and req.url:
                            simple_chains.append({
                                "source": url,
                                "target": req.url,
                                "timestamp": getattr(req, "timestamp", "") if hasattr(req, "timestamp") else ""
                            })
    
    # Approach 3: Direct network requests if nothing else worked
    if not simple_chains:
        print("Trying to extract network requests directly")
        
        # Try to get request chains from browser manager logs
        result_browser_state = safe_get(result, "page_landing", "browser_state")
        if result_browser_state:
            result_requests = safe_get(result_browser_state, "network_requests")
            if result_requests and isinstance(result_requests, list):
                print(f"Found {len(result_requests)} direct network requests")
                for req in result_requests:
                    req_url = None
                    if isinstance(req, dict) and "url" in req:
                        req_url = req["url"]
                    elif hasattr(req, "url"):
                        req_url = req.url
                    
                    if req_url:
                        simple_chains.append({
                            "source": url,
                            "target": req_url,
                            "timestamp": ""
                        })
    
    # Create minimal data structure
    minimal_data = {
        "url_info": {"requested_url": url},
        "page_landing": {
            "state": {
                "network_state": {
                    "request_chains": simple_chains
                }
            }
        }
    }
    
    # Debug output
    print(f"Created {len(simple_chains)} request chains for visualization")
    
    # If we have at least one request chain, proceed with visualization
    if simple_chains:
        # Save the enhanced visualization to an HTML file
        output_file = "enhanced_network_viz.html" 
        success = domain_visualizer.save_domain_visualization_html(
            minimal_data, 
            filename=output_file,
            phase="Pre-consent"
        )

        # Display the visualization directly in the notebook if successful
        if success:
            print(f"Enhanced visualization saved to {output_file}")
            display(IFrame(src=output_file, width='100%', height=800))
        else:
            print("Failed to save visualization")
    else:
        print("No request chains found to visualize")
    
except Exception as e:
    import traceback
    print(f"Error creating visualization: {e}")
    traceback.print_exc()
"""

In [None]:
"""
Fixed Enhanced Visualization - Properly handles circular references
"""

## Enhanced Visualization for Pre-consent Phase
# Import the fixed visualization module
from fixed_visualizer import FixedDomainNetworkVisualizer
from IPython.display import IFrame
import json
from dataclasses import asdict

# Create the domain network visualizer with the provider registry.
# It is used further down in this cell to write the HTML visualisation file.
domain_visualizer = FixedDomainNetworkVisualizer(provider_registry)

# Function to safely get nested attribute/key from mixed object/dict structure
def safe_get(obj, *keys):
    """Follow *keys* through nested dicts and/or objects.

    Each key is looked up as a dict key when the current value is a dict and
    as an attribute otherwise. Returns None as soon as a step cannot be
    resolved; with no keys, ``obj`` itself is returned.
    """
    cursor = obj
    for step in keys:
        if cursor is None:
            return None

        if isinstance(cursor, dict):
            # A missing key yields None, which ends the walk on the next pass.
            cursor = cursor.get(step)
            continue

        if not hasattr(cursor, step):
            # Neither a dict nor an object exposing this attribute.
            return None
        cursor = getattr(cursor, step, None)
    return cursor

# Extract data directly for visualization to avoid circular references
# Build, save and embed the enhanced pre-consent visualisation.
# Any failure is printed with a traceback instead of being raised.
try:
    # Only the first collected result is visualised in this cell.
    result = all_results[0]

    # Prefer a plain-dict view of the result; fall back to the result itself
    # when it already is a dict, otherwise to an empty dict.
    try:
        result_dict = asdict(result)
        print("Successfully converted result to dictionary")
    except Exception:
        result_dict = result if isinstance(result, dict) else {}
        print("Failed to convert full result to dictionary, proceeding with manual extraction")

    # Resolve the URL for the log line: dict lookup first, then attribute
    # access, and finally a placeholder.
    if isinstance(result_dict, dict) and "url_info" in result_dict:
        url = result_dict["url_info"].get("requested_url")
    else:
        url = None
    if url is None:
        url = safe_get(result, "url_info", "requested_url")
        if url is None:
            url = "Unknown URL"

    print(f"Processing visualization for URL: {url}")

    # The visualizer is given the original result object, not result_dict.
    output_file = "enhanced_network_viz.html"
    success = domain_visualizer.save_domain_visualization_html(
        result, filename=output_file, phase="Pre-consent"
    )

    if not success:
        print("Failed to save visualization")
    else:
        # Embed the saved HTML file directly in the notebook output.
        print(f"Enhanced visualization saved to {output_file}")
        display(IFrame(src=output_file, width='100%', height=800))

except Exception as e:
    import traceback
    print(f"Error creating visualization: {e}")
    traceback.print_exc()

### Visualise Post-consent On Accept Phase

In [None]:
draw_network_graph(
    all_results[0],
    hierarchical=False,
    collapse=False,
    make_url_short=True,
    phase="Post-consent; Cookies Accepted",
)

# Visualize Post-consent Accept Phase with D3.js
#display(visualisation.display_network_visualization(all_results[0], "Post-consent; Cookies Accepted"))
#visualisation.save_visualization_html(all_results[0], "network_viz.html", "Post-consent; Cookies Accepted")

### Visualise Post-consent On Reject Phase

In [None]:
draw_network_graph(
    all_results[0],
    hierarchical=False,
    collapse=False,
    make_url_short=True,
    phase="Post-consent; Cookies Rejected",
)

# Visualize Post-consent Reject Phase with D3.js
#display(visualisation.display_network_visualization(all_results[0], "Post-consent; Cookies Rejected"))

## Cleanup

In [None]:
# Final step of the notebook: ask the browser manager to clean up.
# NOTE(review): presumably this closes the browser sessions opened during collection —
# confirm in BrowserManager.cleanup.
print("\nCleaning up resources...")
browser_manager.cleanup()
print("Cleanup complete")