In [1]:
import json
import numpy as np
import glob
import re

FILENAME = "claycode_successful_scans.txt"
    
messages = []
with open(FILENAME, "r") as file:
    for line in file:
        line = line.strip()
        if re.fullmatch(r"\d+:[A-Za-z0-9]+", line):  # latency:number:alphanumeric_string
            messages.append(line)

samples = [(int(message.split(":")[0]), len(message.split(":")[1]), message.split(":")[1]) for message in messages]

latencies = np.array([sample[0] for sample in samples])
string_lengths = np.array([sample[1] for sample in samples])
print(f"Num latency {len(latencies)}")
print(f"Num unique string lengths {len(set(string_lengths))}")
print(f"Min latency {np.min(latencies)}")
print(f"Avg latency {np.mean(latencies)}")
print(f"Max latency {np.max(latencies)}")
print(f"Std deviation latency {np.std(latencies)}")


Num latency 7193
Num unique string lengths 14
Min latency 55
Avg latency 121.71903239260392
Max latency 275
Std deviation latency 30.105166157966707


In [38]:
import plotly.express as px
import pandas as pd

# Your original data
data = {
    "latency": latencies,
    "string_length": string_lengths
}

df = pd.DataFrame(data)
df = df[df["string_length"] < 60]
df["bits"] = df["string_length"] * 8 + 16

# Count samples per string length
grouped_counts_df = df.groupby("string_length").size().reset_index(name='count')
min_count = grouped_counts_df["count"].min()
# print(grouped_counts_df)
# print(f"min count = {min_count}")

# Keep only the first `min_count` samples per string length
df_balanced = df.groupby("string_length", group_keys=False).head(min_count)
grouped_counts_df_balanced = df_balanced.groupby("string_length").size().reset_index(name='count')
# print(grouped_counts_df_balanced)

# Calculate and print average latency per string_length
avg_latency_df = df_balanced.groupby("string_length")["latency"].mean().reset_index(name="average_latency")
print("Average latencies")
print(avg_latency_df)

# Create custom label column
df_balanced["label"] = df_balanced.apply(
    lambda row: f"{row['bits']} bits ({row['string_length']} chars)", axis=1
)

# Sort labels based on string_length
sorted_labels = df_balanced.sort_values("string_length")["label"].unique()

# Create the boxplot with updated labels and sorted order
fig = px.box(
    df_balanced,
    x="label",
    y="latency",
    labels={
        "label": "Scanned data size",
        "latency": "Latency (ms)"
    },
    category_orders={"label": sorted_labels},
)

# Update font size
fig.update_layout(
    font=dict(size=16),
    xaxis=dict(tickangle=-25),  # Rotate x-axis labels by 45 degrees down
    margin=dict(l=20, r=20, t=20, b=20)  # Reduce padding (left, right, top, bottom)
)

fig.show()
fig.write_image("latencies_string_length.pdf", format="pdf", engine="kaleido")

Average latencies
    string_length  average_latency
0               5        97.524272
1              10       103.948220
2              15       108.012945
3              20       108.770227
4              25       113.741100
5              30       116.595469
6              35       127.038835
7              40       129.705502
8              45       133.064725
9              50       142.042071
10             55       148.310680




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

