In [4]:
import json
import re
import numpy as np
import glob

FILENAME_PATTERN = "dataset/perf*.logcat"

# Collect the message text of every "Tree"-tagged logcat entry from all files
# matching FILENAME_PATTERN, in one flat list.
#
# glob.glob() yields paths in arbitrary (filesystem-dependent) order. The
# messages of all files end up concatenated in `log_data`, so the visiting
# order influences anything that later processes the list sequentially.
# Sorting the paths makes every run see the files in the same order.
log_data = []
for filename in sorted(glob.glob(FILENAME_PATTERN)):
    # Be explicit about the encoding instead of relying on the platform default.
    with open(filename, "r", encoding="utf-8") as file:
        data = json.load(file)
    # Keep only entries whose header tag is exactly "Tree".
    tree_messages_slice = [
        entry["message"]
        for entry in data["logcatMessages"]
        if entry["header"]["tag"] == "Tree"
    ]
    log_data.extend(tree_messages_slice)

In [5]:
from tree import TreeNode, iterate_tree_depth_first

# Input messages and the accumulators filled by this cell
tree_messages = log_data
reconstructed_tree_messages = []
node_descendant_samples = []

# A tree is logged as a run of consecutive messages closed by a terminator
# message. Glue the pieces between two terminators back into one string.
log_terminator = "END-LONG-LOG"
curr_tree = ""
for message in tree_messages:
    if message != log_terminator:
        # Still inside a tree: keep accumulating its pieces.
        curr_tree += message
        continue
    # Terminator reached: store the tree unless nothing was accumulated.
    if curr_tree:
        reconstructed_tree_messages.append(curr_tree)
    curr_tree = ""
# Pieces collected after the last terminator stay in `curr_tree` and are
# never stored.

# Discard the first reconstructed tree - it might have been cropped upon
# logcat startup. Deleting the [:1] slice is a no-op on an empty list.
# NOTE(review): `tree_messages` appears to hold the messages of every loaded
# file back to back, so only the very first tree is dropped and pieces around
# a file boundary may be glued together - confirm this is acceptable.
del reconstructed_tree_messages[:1]
        
# Parse every reconstructed tree and record `n_descendants` of each of its nodes.
print(f"Processing {len(reconstructed_tree_messages)} trees")
malformed = 0
for msg in reconstructed_tree_messages:
    try:
        root = TreeNode.from_parentheses(msg)
    except Exception:
        # Count unparsable trees and move on. `Exception` (rather than a bare
        # `except:`) lets KeyboardInterrupt / SystemExit propagate, so an
        # interrupted run is not silently reported as a malformed tree.
        malformed += 1
        continue
    # NOTE(review): presumably this populates `n_descendants` on the nodes -
    # confirm against the `tree` module.
    root.initialize()
    node_descendant_samples.extend(
        node.n_descendants for node in iterate_tree_depth_first(root)
    )
    # Drop the reference so the tree can be reclaimed before the next one is parsed.
    del root

print(f"There are {malformed} malformed trees")
print(f"There are {len(node_descendant_samples)} total nodes")

Processing 1039 trees
There are 10 malformed trees
There are 6074541 total nodes


In [6]:
import plotly.graph_objects as go
import numpy as np

FONT_SIZE_PERCENTAGES = 26  # font size of the percentage labels drawn on the bars
FONT_SIZE = 29  # font size of axis titles and tick labels

# Convert the collected samples to an array so the bin masks below are vectorized.
node_descendant_samples = np.array(node_descendant_samples)
if node_descendant_samples.size == 0:
    # Without this guard np.max() fails with a generic "zero-size array" error.
    raise ValueError("node_descendant_samples is empty; there is nothing to plot")

# Define static bins; both bounds are inclusive. The last bin is closed by the
# largest observed sample so that it captures everything above 100.
# NOTE(review): samples below 1 fall into no bin, in which case the percentages
# would not add up to 100 - confirm `n_descendants` is always >= 1.
bins = [(1, 1), (2, 10), (11, 100), (101, np.max(node_descendant_samples))]
bin_labels = ["1", "[2, 10]", "[11, 100]", "> 100"]

# Count the samples in each bin. np.count_nonzero works on the whole boolean
# mask at once; the builtin sum() would iterate over it element by element.
frequencies = [
    np.count_nonzero((node_descendant_samples >= low) & (node_descendant_samples <= high))
    for low, high in bins
]

# Convert frequencies to percentages of all samples
frequencies_percent = [freq / len(node_descendant_samples) * 100 for freq in frequencies]

# Create a bar chart with one bar per bin
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=bin_labels,
        y=frequencies_percent,
        marker=dict(color='blue', line=dict(color='black', width=1.5)),
        opacity=0.75,
        text=[f"{freq:.2f}%" for freq in frequencies_percent],  # Percentage label per bar
        textposition="auto",  # Let Plotly choose between inside and outside the bar
        textfont=dict(size=FONT_SIZE_PERCENTAGES)
    )
)

# Layout: categorical x axis, logarithmic y axis with custom ticks, tight margins.
# Axis titles use the nested `title=dict(text=..., font=..., standoff=...)` form
# instead of the deprecated `titlefont` attribute.
fig.update_layout(
    xaxis=dict(
        title=dict(
            text="Number of Node Descendants",
            font=dict(size=FONT_SIZE),
            standoff=20,  # Space between the X axis title and the axis
        ),
        type="category",
        tickfont=dict(size=FONT_SIZE),
    ),
    yaxis=dict(
        title=dict(
            text="Frequency (%)",
            font=dict(size=FONT_SIZE),
            standoff=20,  # Space between the Y axis title and the axis
        ),
        type="log",
        tickmode="array",
        # A logarithmic axis has no position for 0, so only powers of ten are listed.
        tickvals=[1, 10, 100],
        ticktext=["1", "10", "100"],
        tickfont=dict(size=FONT_SIZE),
    ),
    template="plotly_white",
    bargap=0.5,  # Half of each category slot is left empty between bars
    margin=dict(t=5, b=0, l=0, r=0),  # Nearly no padding around the plot area
    width=900,  # Figure width in pixels
    height=450,  # Figure height in pixels
)

# Show the plot and export it as a PDF
fig.show()
fig.write_image("descendants.pdf", format="pdf", engine="kaleido")