In [None]:
import json
import os
import matplotlib.pyplot as plt
import pandas as pd

# Download data from remote server
# The `!` lines are IPython shell escapes, so this cell only runs inside a
# notebook / IPython session, not under the plain Python interpreter.
# Drop any previous download first so stale files cannot mix with the fresh
# copy (rm prints an error when the directory does not exist yet).
!rm -r processed_data
# Recursive fetch of the remote pdata/ tree into processed_data/: -np stays
# below the start directory, -nH and --cut-dirs=1 strip the host name and the
# first remote path component from the local paths, -R skips index pages.
!wget --compression=auto --header="Accept-Encoding: gzip" -r -np -nH --cut-dirs=1 -R "index.html*" -P processed_data http://redacted.nonexistantdomain/pdata/

# Directory containing the data files
data_dir = "processed_data/"

# File paths
# Bandit, GuardDog and dynamic-analysis results, one file per ecosystem.
# NOTE(review): the "10pct_sample" file names suggest the PyPI inputs are a
# 10% sample while the AccPy inputs cover everything - confirm.
accpy_bandit_file = os.path.join(data_dir, "accpy_bandit_all.json")
pypi_bandit_file = os.path.join(data_dir, "pypi_bandit_10pct_sample.json")
accpy_guarddog_file = os.path.join(data_dir, "accpy_gd_all.json")
pypi_guarddog_file = os.path.join(data_dir, "pypi_gd_10pct_sample.json")
accpy_dynamic_file = os.path.join(data_dir, "accpy_dynamic_all.json")
pypi_dynamic_file = os.path.join(data_dir, "pypi_dynamic_10pct_sample.json")
# Survey answers (CSV), read later with pandas.
survey_file = os.path.join(data_dir, "survey.csv")

# Function to load data from a JSON file
def load_data(file_path):
    """Read a JSON document from ``file_path`` and return the decoded object.

    The file is opened explicitly as UTF-8 rather than with the platform's
    locale-dependent default encoding, so non-ASCII content decodes the same
    way on every machine.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

# Load every result file; map() reads them lazily, in the order listed, while
# the tuple on the left is being unpacked.
(
    accpy_bandit_data,
    pypi_bandit_data,
    accpy_guarddog_data,
    pypi_guarddog_data,
    accpy_dynamic_data,
    pypi_dynamic_data,
) = map(
    load_data,
    (
        accpy_bandit_file,
        pypi_bandit_file,
        accpy_guarddog_file,
        pypi_guarddog_file,
        accpy_dynamic_file,
        pypi_dynamic_file,
    ),
)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Define separate labels for severity and confidence
SEVERITY_LABELS = ["SEVERITY.HIGH", "SEVERITY.MEDIUM", "SEVERITY.LOW"]
CONFIDENCE_LABELS = ["CONFIDENCE.HIGH", "CONFIDENCE.MEDIUM", "CONFIDENCE.LOW"]

# Numeric identifier for each Bandit test name.
# NOTE(review): not referenced by the visible cells; presumably kept as compact
# plot labels, like gd_vuln_to_id for GuardDog - confirm before removing.
vuln_to_id = {
    'assert_used': 1,
    'blacklist': 2,
    'request_without_timeout': 3,
    'try_except_pass': 4,
    'subprocess_without_shell_equals_true': 5,
    'hardcoded_sql_expressions': 6,
    'start_process_with_partial_path': 7,
    'hardcoded_password_string': 8,
    'start_process_with_a_shell': 9,
    'subprocess_popen_with_shell_equals_true': 10,
    'hashlib': 11,
    'hardcoded_tmp_directory': 12,
    'exec_used': 13,
    'hardcoded_password_funcarg': 14,
    'hardcoded_password_default': 15,
    'hardcoded_bind_all_interfaces': 16,
    'yaml_load': 17,
    'django_mark_safe': 18,
    'try_except_continue': 19,
    'request_with_no_cert_validation': 20,
    'jinja2_autoescape_false': 21,
    'tarfile_unsafe_members': 22,
    'any_other_function_with_shell_equals_true': 23,
    'start_process_with_no_shell': 24,
    'set_bad_file_permissions': 25,
    'paramiko_calls': 26,
    'ssh_no_host_key_verification': 27,
    'django_extra_used': 28,
    'flask_debug_true': 29,
    'ssl_with_no_version': 30,
    'use_of_mako_templates': 31,
    'django_rawsql_used': 32,
    'weak_cryptographic_key': 33,
    'ssl_with_bad_version': 34,
    'linux_commands_wildcard_injection': 35,
    'snmp_insecure_version_check': 36,
    'logging_config_insecure_listen': 37
}

# Function to aggregate issues by severity and confidence
def aggregate_issues(data, verbose=False):
    """Sum per-package Bandit reports into one summary and print the totals.

    Args:
        data: Mapping of package name to a Bandit report. Each report is read
            through ``report["results"]`` (a sequence of findings carrying a
            ``"test_name"``), ``report["issues"]`` and ``report["summary"]``
            (one counter per severity/confidence label plus ``"loc"``).
        verbose: When true, additionally print the number of findings per
            Bandit test, most frequent first.

    Returns:
        dict with one summed counter per entry of ``SEVERITY_LABELS`` and
        ``CONFIDENCE_LABELS`` plus ``"total_issues"``, ``"total_loc"`` and
        ``"total_packages"``.

    Raises:
        KeyError: If a report lacks one of the fields listed above.
    """
    ks = SEVERITY_LABELS + CONFIDENCE_LABELS
    summary = {k: 0 for k in ks}
    summary.update({"total_issues": 0, "total_loc": 0, "total_packages": 0})

    # Findings per Bandit test name, over all packages.
    heuristics = {}

    for details in data.values():
        for finding in details["results"]:
            name = finding["test_name"]
            heuristics[name] = heuristics.get(name, 0) + 1

        package_summary = details["summary"]
        for k in ks:
            summary[k] += package_summary[k]
        summary["total_packages"] += 1
        summary["total_issues"] += details["issues"]
        summary["total_loc"] += package_summary["loc"]

    print("----")
    if verbose:
        for name, count in sorted(heuristics.items(), key=lambda e: -e[1]):
            print(f"{name}: {count}")
    # The printed issue total is the number of individual findings.
    tot = sum(heuristics.values())
    npkg = summary["total_packages"]
    # With no packages there is no meaningful average; report NaN instead of
    # failing with ZeroDivisionError.
    loc_per_pkg = summary["total_loc"] / npkg if npkg else float("nan")
    print(f"Total issues: {tot}")
    print(f"Total pkg: {npkg}")
    print(f'Total loc: {summary["total_loc"]}')
    print(f'Total loc/pkg: {loc_per_pkg}')
    print("----")
    return summary

# Function to plot the AccPy vs PyPI comparison, normalized per million lines
def plot_comparison(accpy_data, pypi_data, metric, labels, title, ylabel, ylim=(0, 11000)):
    """Draw a grouped bar chart of issue counts per million lines of code.

    For every entry of ``labels`` one AccPy bar and one PyPI bar are drawn side
    by side, each annotated with its value. No error bars are drawn, and only
    the per-million-lines view is plotted (no per-package view).

    Args:
        accpy_data: Summary dict for AccPy; must contain every label in
            ``labels`` and ``'total_loc'``.
        pypi_data: Same structure for the PyPI sample.
        metric: Unused; kept so existing positional calls keep working.
        labels: Summary keys to plot, in x-axis order.
        title: Title prefix; " (Issues per Million Lines)" is appended.
        ylabel: Y-axis label.
        ylim: ``(bottom, top)`` limits of the y-axis; ``None`` lets matplotlib
            autoscale. Defaults to the previously hard-coded range.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Issues per million lines of code, one value per label
    accpy_per_million_values = [accpy_data[label] / accpy_data['total_loc'] * 1000000 for label in labels]
    pypi_per_million_values = [pypi_data[label] / pypi_data['total_loc'] * 1000000 for label in labels]

    x = range(len(labels))

    # Single axes holding both bar series
    fig, ax = plt.subplots(1, 1, figsize=(5, 3))

    # AccPy bars sit on the integer positions, PyPI bars one bar width to the
    # right; the tick is centred between the two.
    bars1 = ax.bar(x, accpy_per_million_values, width=0.4, label='AccPy', align='center')
    bars2 = ax.bar([i + 0.4 for i in x], pypi_per_million_values, width=0.4, label='PyPI', align='center')
    ax.set_xticks([i + 0.2 for i in x])
    ax.set_xticklabels(labels)
    ax.set_ylabel(ylabel)
    ax.set_title(f'{title} (Issues per Million Lines)')
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.legend()

    ax.bar_label(bars1, fmt='%.1f', padding=3)
    ax.bar_label(bars2, fmt='%.1f', padding=3)

    plt.tight_layout()
    plt.show()

# Aggregate data
# Each call also prints its own "----"-delimited totals block as a side effect
# (verbose output is requested for AccPy only).
accpy_bandit_summary = aggregate_issues(accpy_bandit_data, verbose=True)
pypi_bandit_summary = aggregate_issues(pypi_bandit_data)

# Plot comparisons for severity with error bars
# The third argument ('Severity') is only a tag; the plotted keys come from SEVERITY_LABELS.
plot_comparison(accpy_bandit_summary, pypi_bandit_summary, 'Severity', SEVERITY_LABELS, 'Vulnerabilities by Severity', 'Number of Issues / mloc')

# Plot comparisons for confidence with error bars
#plot_comparison(accpy_bandit_summary, pypi_bandit_summary, 'Confidence', CONFIDENCE_LABELS, 'Vulnerabilities by Confidence', 'Number of Issues / mloc')

# Additional analysis (e.g., total lines of code, total issues)
# Every metric is printed for AccPy first, then for PyPI.
print(f"AccPy total number of issues: {accpy_bandit_summary['total_issues']}")
print(f"PyPI total number of issues: {pypi_bandit_summary['total_issues']}")
print(f"AccPy total lines of code: {accpy_bandit_summary['total_loc']}")
print(f"PyPI total lines of code: {pypi_bandit_summary['total_loc']}")
print(f"AccPy avg lines of code per package: {accpy_bandit_summary['total_loc'] / accpy_bandit_summary['total_packages']}")
print(f"PyPI avg lines of code per package: {pypi_bandit_summary['total_loc'] / pypi_bandit_summary['total_packages']}")
print(f"AccPy issues per 1 million loc: {accpy_bandit_summary['total_issues'] / accpy_bandit_summary['total_loc'] * 1000000}")
print(f"PyPI issues per 1 million loc: {pypi_bandit_summary['total_issues'] / pypi_bandit_summary['total_loc'] * 1000000}")
# len(<data dict>) is the number of packages, i.e. the same value as
# summary['total_packages'], which is incremented once per dict entry.
print(f"AccPy issues per package: {accpy_bandit_summary['total_issues'] / len(accpy_bandit_data)}")
print(f"PyPI issues per package: {pypi_bandit_summary['total_issues'] / len(pypi_bandit_data)}")


In [None]:
# GuardDog statistics

# Static list of GuardDog rule names (npm and Python rules mixed).
# NOTE(review): not read by any visible cell; the labels actually used are
# discovered from the data by get_gd_labels() - confirm whether this is still needed.
GUARDDOG_LABELS = [
    "npm-silent-process-execution", "exec-base64", "silent-process-execution",
    "exfiltrate-sensitive-data", "clipboard-access", "download-executable",
    "obfuscation", "npm-exec-base64", "shady-links", "npm-install-script",
    "code-execution", "cmd-overwrite", "steganography", "npm-serialize-environment"
]
# Filled in by get_gd_labels(); read by the aggregation and plotting functions.
new_gd_labels = []
# Numeric identifier per GuardDog rule. The comparison plot keeps only rules
# listed here and shows these numbers as tick labels instead of the rule names.
gd_vuln_to_id = {
    'empty_information': 1,
    'single_python_file': 2,
    'typosquatting': 3,
    'bundled_binary': 4,
    'release_zero': 5,
    'deceptive_author': 6,
    'shady-links': 7,
    'cmd-overwrite': 8,
    'code-execution': 9,
    'obfuscation': 10,
    'clipboard-access': 11,
    'dll-hijacking': 12,
    'exfiltrate-sensitive-data': 13,
    'bidirectional-characters': 14,
    'exec-base64': 15,
    'silent-process-execution': 16,
    'download-executable': 17
}


def get_gd_labels(data):
    """Collect the GuardDog rule names present in ``data``.

    The result is stored in the module-level ``new_gd_labels`` (which the
    aggregation and plotting functions read) and also returned.

    Labels are sorted so that their order, and therefore the bar order of the
    plots built from them, is the same on every run; iterating a set of strings
    directly gives no stable order between interpreter sessions.

    Args:
        data: Mapping of package name to a report whose ``"results"`` is a
            mapping keyed by rule name.

    Returns:
        Sorted list of rule names, without the excluded ones.
    """
    global new_gd_labels
    # Rules left out of the statistics. "unclaimed_maintainer_email_domain" is
    # deliberately kept. Set difference tolerates data in which an excluded
    # rule never appears (list.remove would raise ValueError).
    excluded = {"potentially_compromised_email_domain"}

    labels = set()
    for details in data.values():
        labels.update(details["results"].keys())

    new_gd_labels = sorted(labels - excluded)
    return new_gd_labels

def aggregate_guarddog_issues(data):
    """Total the GuardDog findings of all packages, per rule and overall.

    The rules to track are taken from the module-level ``new_gd_labels``.

    Args:
        data: Mapping of package name to a report providing ``"issues"`` and a
            ``"results"`` mapping keyed by rule name.

    Returns:
        dict with ``"total_issues"``, ``"total_packages"`` and one counter per
        tracked rule. A list-valued result contributes its length, any other
        truthy result contributes 1, and falsy or missing results contribute 0.
    """
    tracked = new_gd_labels
    summary = {"total_issues": 0, "total_packages": 0}
    for rule in tracked:
        summary[rule] = 0

    for report in data.values():
        summary["total_issues"] += report["issues"]
        summary["total_packages"] += 1

        findings = report["results"]
        for rule in tracked:
            if rule not in findings:
                continue
            outcome = findings[rule]
            if not outcome:
                continue
            if isinstance(outcome, list):
                summary[rule] += len(outcome)
            else:
                summary[rule] += 1

    return summary

# Function to plot GuardDog data
def plot_guarddog_comparison(accpy_data, pypi_data, bandit_accpy_data, bandit_pypi_data):
    """Draw a grouped bar chart of GuardDog findings per million lines of code.

    Only rules that are both in the module-level ``new_gd_labels`` and in
    ``gd_vuln_to_id`` are plotted; the x tick labels are the numeric ids from
    ``gd_vuln_to_id`` rather than the rule names. No error bars are drawn, and
    only the per-million-lines view is plotted (no per-package view).

    Args:
        accpy_data: GuardDog summary for AccPy (one counter per rule).
        pypi_data: GuardDog summary for the PyPI sample.
        bandit_accpy_data: Bandit summary for AccPy; only ``'total_loc'`` is
            read, as the line count used for normalization.
        bandit_pypi_data: Same for the PyPI sample.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    labels = [label for label in new_gd_labels if label in gd_vuln_to_id]

    # Findings per million lines of code, one value per rule
    accpy_per_mloc_values = [accpy_data[label] / bandit_accpy_data['total_loc'] * 1000000 for label in labels]
    pypi_per_mloc_values = [pypi_data[label] / bandit_pypi_data['total_loc'] * 1000000 for label in labels]

    x = range(len(labels))
    tick_labels = [gd_vuln_to_id[label] for label in labels]

    fig, ax = plt.subplots(1, 1, figsize=(6, 3))

    # AccPy bars on the integer positions, PyPI bars one bar width to the
    # right, tick centred between the pair.
    ax.bar(x, accpy_per_mloc_values, width=0.4, label='AccPy', align='center')
    ax.bar([i + 0.4 for i in x], pypi_per_mloc_values, width=0.4, label='PyPI', align='center')
    ax.set_xticks([i + 0.2 for i in x], tick_labels, rotation=30, ha='right')
    ax.set_ylabel('Number of Issues / mloc')
    ax.set_title("GuardDog Vulnerabilities Comparison (Issues per Million Lines of Code)")
    ax.legend()

    fig.tight_layout()
    # plt.show() matches the other plotting code in this file.
    plt.show()


# Aggregate data for both
# Labels are discovered from the union of both datasets so AccPy and PyPI are
# summarized over the same rule set. In the merged dict a package name present
# in both inputs keeps only its PyPI report; labels occurring solely in the
# shadowed AccPy report would therefore not be discovered.
get_gd_labels({**accpy_guarddog_data, **pypi_guarddog_data})
accpy_summary = aggregate_guarddog_issues(accpy_guarddog_data)
pypi_summary = aggregate_guarddog_issues(pypi_guarddog_data)

# Plot comparison
# The Bandit summaries are passed only for their 'total_loc' line counts.
plot_guarddog_comparison(accpy_summary, pypi_summary, accpy_bandit_summary, pypi_bandit_summary)

# Additional analysis
print(f"AccPy total number of issues: {accpy_summary['total_issues']}")
print(f"PyPI total number of issues: {pypi_summary['total_issues']}")
print(f"AccPy total number of packages: {accpy_summary['total_packages']}")
print(f"PyPI total number of packages: {pypi_summary['total_packages']}")
print(f"AccPy issues per package: {accpy_summary['total_issues'] / accpy_summary['total_packages']}")
print(f"PyPI issues per package: {pypi_summary['total_issues'] / pypi_summary['total_packages']}")

In [None]:
from collections import Counter
import pandas as pd

def gd_data_to_vulnlist(gd_data, top_n=30):
    """Tabulate the GuardDog rules that fire most often across packages.

    A rule counts as fired for a package unless its result equals ``None``,
    ``{}`` or ``""``.

    Args:
        gd_data: Mapping of package name to a report whose ``"results"`` maps
            rule name to that rule's result.
        top_n: Number of most frequent rules to keep (default 30); ``None``
            keeps all of them.

    Returns:
        pandas.DataFrame with the columns "Vulnerability", "Count" and
        "Unique Occurrences" (number of packages in which the rule fired),
        ordered by descending count.
    """
    vuln_counter = Counter()
    package_counter = Counter()

    for report in gd_data.values():
        fired = [
            vuln
            for vuln, outcome in report["results"].items()
            if outcome not in (None, {}, "")
        ]
        vuln_counter.update(fired)
        # Count each rule at most once per package, in the same pass, instead
        # of rescanning every package for every rule afterwards.
        package_counter.update(set(fired))

    # Keep the top_n most frequent rules
    top_vulns = vuln_counter.most_common(top_n)

    # Create a DataFrame for better visualization
    df = pd.DataFrame(top_vulns, columns=["Vulnerability", "Count"])
    df["Unique Occurrences"] = df["Vulnerability"].map(dict(package_counter))

    return df

#top_vuln_df = gd_data_to_vulnlist(accpy_guarddog_data)
#display(top_vuln_df)

#top_vuln_df = gd_data_to_vulnlist(pypi_guarddog_data)
#display(top_vuln_df)

import pprint

def bandit_data_to_vulnlist(bandit_data):
    """Pretty-print how often each Bandit test fired across all packages.

    Packages whose ``"issues"`` count is not positive are skipped. The counts
    are printed as a dict in order of first appearance; nothing is returned.
    """
    tally = {}
    for report in bandit_data.values():
        if not report["issues"] > 0:
            continue
        for finding in report["results"]:
            test_name = finding["test_name"]
            tally[test_name] = tally.get(test_name, 0) + 1
    pprint.pp(tally)

# Print the per-test finding counts, AccPy first and then the PyPI sample.
bandit_data_to_vulnlist(accpy_bandit_data)
bandit_data_to_vulnlist(pypi_bandit_data)

In [None]:
# Dynamic statistics

def extract_dynamic_data(data):
    """Flatten the dynamic-analysis mapping into a list of per-package records.

    Args:
        data: Mapping of package name to its dynamic-analysis details.

    Returns:
        One dict per package, in the mapping's order, holding the package name
        under ``"name"`` followed by the six traffic fields copied as they are.

    Raises:
        KeyError: If a package lacks one of the traffic fields.
    """
    traffic_fields = (
        "packets",
        "dep_packets",
        "packets_size",
        "dep_packets_size",
        "packets_domains",
        "dep_packets_domains",
    )
    return [
        {"name": package_name, **{field: details[field] for field in traffic_fields}}
        for package_name, details in data.items()
    ]

# Extract dynamic data for both AccPy and PyPI
accpy_packages = extract_dynamic_data(accpy_dynamic_data)
pypi_packages = extract_dynamic_data(pypi_dynamic_data)

# Create a scatter plot with different colors for AccPy and PyPI, one dot per package
fig, ax = plt.subplots(figsize=(10, 6))

# Collect data for AccPy
# The *_size lists hold the AVERAGE packet size per package (total size divided
# by packet count), with 0 standing in for packages that sent no packets.
accpy_packets = [pkg["packets"] for pkg in accpy_packages]
accpy_packets_size = [pkg["packets_size"] / pkg["packets"] if pkg["packets"] > 0 else 0 for pkg in accpy_packages]
accpy_dep_packets = [pkg["dep_packets"] for pkg in accpy_packages]
accpy_dep_packets_size = [pkg["dep_packets_size"] / pkg["dep_packets"] if pkg["dep_packets"] > 0 else 0 for pkg in accpy_packages]

# Collect data for PyPI
pypi_packets = [pkg["packets"] for pkg in pypi_packages]
pypi_packets_size = [pkg["packets_size"] / pkg["packets"] if pkg["packets"] > 0 else 0 for pkg in pypi_packages]
pypi_dep_packets = [pkg["dep_packets"] for pkg in pypi_packages]
pypi_dep_packets_size = [pkg["dep_packets_size"] / pkg["dep_packets"] if pkg["dep_packets"] > 0 else 0 for pkg in pypi_packages]

# Plot all AccPy data points in one go
ax.scatter(accpy_packets, accpy_packets_size, color='blue', label='AccPy - Packets', s=100, alpha=0.6)
ax.scatter(accpy_dep_packets, accpy_dep_packets_size, color='lightblue', label='AccPy - Dep Packets', s=100, alpha=0.6)

# Plot all PyPI data points in one go
ax.scatter(pypi_packets, pypi_packets_size, color='red', label='PyPI - Packets', s=100, alpha=0.6)
ax.scatter(pypi_dep_packets, pypi_dep_packets_size, color='orange', label='PyPI - Dep Packets', s=100, alpha=0.6)


# Remove duplicate labels in the legend
# Keying a dict by label keeps a single handle per distinct label.
handles, labels = ax.get_legend_handles_labels()
unique_labels = dict(zip(labels, handles))
ax.legend(unique_labels.values(), unique_labels.keys())

# Labeling the plot
ax.set_xlabel('Number of TCP+UDP Packets')
ax.set_ylabel('Avg Packet Size (bytes)')
ax.set_title('Dynamic Analysis: Packets vs. Packet Size Comparison (AccPy vs PyPI)')
# NOTE(review): both axes are logarithmic, so the (0, 0) points of packages
# without traffic cannot be placed on them and presumably do not appear - confirm.
ax.set_yscale('log')
ax.set_xscale('log')
plt.grid(True)

# Show the plot
fig.set_dpi(100)
plt.tight_layout()
plt.show()

# Additional statistics
# Byte totals are summed from the raw per-package "packets_size" /
# "dep_packets_size" fields. The accpy_packets_size / pypi_packets_size lists
# hold per-package AVERAGE packet sizes (built for the scatter plot), so
# summing those does not give a total number of bytes.
accpy_total_size = sum(pkg["packets_size"] for pkg in accpy_packages)
accpy_total_dep_size = sum(pkg["dep_packets_size"] for pkg in accpy_packages)
accpy_unique_domains = {domain for pkg in accpy_packages for domain in pkg["packets_domains"]}

pypi_total_size = sum(pkg["packets_size"] for pkg in pypi_packages)
pypi_total_dep_size = sum(pkg["dep_packets_size"] for pkg in pypi_packages)
pypi_unique_domains = {domain for pkg in pypi_packages for domain in pkg["packets_domains"]}

print(f"AccPy total number of packets: {sum(accpy_packets)}")
print(f"AccPy total number of dependent packets: {sum(accpy_dep_packets)}")
print(f"AccPy total packets size: {accpy_total_size} bytes")
print(f"AccPy total dependent packets size: {accpy_total_dep_size} bytes")
print(f"AccPy total unique domains: {len(accpy_unique_domains)}")

print(f"PyPI total number of packets: {sum(pypi_packets)}")
print(f"PyPI total number of dependent packets: {sum(pypi_dep_packets)}")
print(f"PyPI total packets size: {pypi_total_size} bytes")
print(f"PyPI total dependent packets size: {pypi_total_dep_size} bytes")
print(f"PyPI total unique domains: {len(pypi_unique_domains)}")
print(f"PyPI number of packages with at least one packet: {sum(1 for count in pypi_packets if count > 0)}")

# Print all domains
# Hosts that are not counted as telemetry (presumably the package index hosts
# contacted by any install - confirm). Defined once, outside the loop.
WHITELISTED_DOMAINS = ["pypi.org", "files.pythonhosted.org"]

# A package counts as "with telemetry" when the package itself (not its
# dependencies) contacted at least one non-whitelisted domain.
with_telemetry = 0
without_telemetry = 0
for s in accpy_packages:
    p = s['name']
    d1 = [x for x in s['packets_domains'] if x not in WHITELISTED_DOMAINS]

    if d1:
        print(f"{p} - {d1}")
        with_telemetry += 1
    else:
        without_telemetry += 1

print(f"AccPy packages with telemetry: {with_telemetry}")
print(f"AccPy packages without telemetry: {without_telemetry}")

In [None]:
import numpy as np
from math import comb
import scipy.stats as st

# Build the samples for the Hotelling test: one (packet count, mean packet
# size) row per traffic class of a package, own traffic first and then
# dependency traffic, skipping classes without any packet.
_TRAFFIC_KEYS = (("packets", "packets_size"), ("dep_packets", "dep_packets_size"))

accpy_data = np.array([
    [pkg[count_key], pkg[size_key] / pkg[count_key]]
    for pkg in accpy_packages
    for count_key, size_key in _TRAFFIC_KEYS
    if pkg[count_key] > 0
])

pypi_data = np.array([
    [pkg[count_key], pkg[size_key] / pkg[count_key]]
    for pkg in pypi_packages
    for count_key, size_key in _TRAFFIC_KEYS
    if pkg[count_key] > 0
])

print("AccPy data shape:", accpy_data.shape)
print("PyPI data shape:", pypi_data.shape)

def hotelling_t2_test(X, Y):
    """
    Perform a two-sample Hotelling T^2 test with a pooled covariance matrix.

    Args:
        X: First sample, array-like of shape (n1, d).
        Y: Second sample, array-like of shape (n2, d).

    Returns:
        Tuple ``(T2, F, p_value)``: the T^2 statistic, the equivalent F
        statistic with ``d`` and ``n1 + n2 - d - 1`` degrees of freedom, and
        the upper-tail probability of that F value.

    Raises:
        ValueError: If an input is not 2-D, the samples differ in their number
            of columns, or there are too few observations for the test.
        numpy.linalg.LinAlgError: If the pooled covariance matrix is singular.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    if X.ndim != 2 or Y.ndim != 2:
        raise ValueError("X and Y must be 2-D arrays of shape (n_samples, n_features).")

    n1, d = X.shape
    n2, d_y = Y.shape
    if d != d_y:
        raise ValueError("X and Y must have the same number of columns.")

    numerator_df = d
    denominator_df = n1 + n2 - d - 1

    # Validate before any covariance work so that undersized inputs fail with
    # a clear message instead of NaN warnings.
    if denominator_df <= 0:
        raise ValueError("Not enough observations to perform Hotelling's T^2 test.")

    # Means
    mean_X = X.mean(axis=0)
    mean_Y = Y.mean(axis=0)
    diff_mean = mean_X - mean_Y

    # Pooled covariance built from the within-sample scatter matrices. This
    # equals ((n1 - 1) * Sx + (n2 - 1) * Sy) / (n1 + n2 - 2) and stays finite
    # when one of the samples has a single row.
    centered_X = X - mean_X
    centered_Y = Y - mean_Y
    Sp = (centered_X.T @ centered_X + centered_Y.T @ centered_Y) / (n1 + n2 - 2)

    # Solve Sp z = diff instead of forming the explicit inverse.
    T2 = (n1 * n2) / (n1 + n2) * (diff_mean @ np.linalg.solve(Sp, diff_mean))

    F = (denominator_df / (d * (n1 + n2 - 2))) * T2

    # The survival function keeps precision for very small p-values, where
    # 1 - cdf rounds to exactly 0.
    p_value = st.f.sf(F, numerator_df, denominator_df)

    return T2, F, p_value

# Compare the AccPy and PyPI (packet count, mean packet size) samples and
# report the statistics.
T2, F_stat, p_val = hotelling_t2_test(accpy_data, pypi_data)
print(f"Hotelling T^2 = {T2:.4f}")
print(f"F statistic   = {F_stat:.4f}")
print(f"p-value       = {p_val:.6g}")


In [None]:
import numpy as np
from scipy.spatial.distance import cdist

def random_subsample(X, size=1000, random_state=None):
    """
    Draw ``size`` distinct rows of X at random.

    When ``random_state`` is given, NumPy's global random generator is seeded
    with it first (also when no sampling turns out to be needed). X is
    returned unchanged, as the same object, if it has at most ``size`` rows.
    """
    if random_state is not None:
        np.random.seed(random_state)

    total_rows = len(X)
    if total_rows > size:
        chosen = np.random.choice(total_rows, size=size, replace=False)
        return X[chosen]
    return X

def energy_distance_nd(X, Y):
    """
    Energy distance between the samples X and Y (rows are observations, any
    number of columns), based on Euclidean distances.

    Computed as twice the mean cross-sample distance minus the two mean
    within-sample distances; each mean runs over the full pairwise distance
    matrix, zero diagonal included.
    """
    cross_mean = cdist(X, Y).mean()
    within_x_mean = cdist(X, X).mean()
    within_y_mean = cdist(Y, Y).mean()
    return 2.0 * cross_mean - within_x_mean - within_y_mean

def two_sample_energy_test_nd(X, Y, n_permutations=1000, subsample_size=1000, random_state=None):
    """
    Permutation test for equality of two d-dimensional samples, using the
    energy distance as test statistic.

    Both samples are first cut down to at most ``subsample_size`` rows to keep
    the pairwise distance matrices small. Returns the observed statistic on
    the subsamples and the p-value ``(hits + 1) / (n_permutations + 1)``, where
    ``hits`` counts the permutations whose statistic reaches the observed one.

    NOTE(review): with a ``random_state``, the global NumPy generator is
    seeded here and again inside each ``random_subsample`` call, so both
    subsamples are drawn from the same freshly seeded stream.
    """
    if random_state is not None:
        np.random.seed(random_state)

    sample_x = random_subsample(X, size=subsample_size, random_state=random_state)
    sample_y = random_subsample(Y, size=subsample_size, random_state=random_state)
    observed = energy_distance_nd(sample_x, sample_y)

    # The first `split` rows of the pooled array play the role of X.
    split = len(sample_x)
    pooled = np.vstack([sample_x, sample_y])

    hits = 0
    for _ in range(n_permutations):
        # Row-wise in-place shuffle; each round permutes the previous order.
        np.random.shuffle(pooled)
        permuted = energy_distance_nd(pooled[:split], pooled[split:])
        hits += 1 if permuted >= observed else 0

    return observed, (hits + 1.0) / (n_permutations + 1.0)

# Build the 2-D samples for the energy test: the operands are plain lists, so
# `+` concatenates the per-package values for own traffic and dependency traffic.
# Unlike the Hotelling inputs, packages without packets stay in as (0, 0) rows.
# This rebinds accpy_data / pypi_data from the previous cell.
accpy_x = accpy_packets + accpy_dep_packets
accpy_y = accpy_packets_size + accpy_dep_packets_size
accpy_data = np.column_stack([accpy_x, accpy_y])

pypi_x = pypi_packets + pypi_dep_packets
pypi_y = pypi_packets_size + pypi_dep_packets_size
pypi_data = np.column_stack([pypi_x, pypi_y])


# Fixed seed so the subsampling and the permutations are reproducible.
stat, p_val = two_sample_energy_test_nd(
    X=accpy_data,
    Y=pypi_data,
    n_permutations=1000,
    subsample_size=2000,   # choose a feasible sample size
    random_state=42
)

print("Approx. Energy Distance on subsample:", stat)
print("Permutation test p-value:", p_val)


In [None]:
# Load package lists from files
def load_package_list(file_path):
    """Read a package list (one name per line) into a set.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped, so a trailing newline or an empty separator line does not add an
    empty-string "package" that would inflate the counts.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        Set of the non-empty, stripped lines.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return {name for name in stripped_lines if name}

# NOTE(review): from here on accpy_packages / pypi_packages are SETS of package
# names; they replace the lists of per-package dicts built for the dynamic
# analysis, so those earlier cells cannot be re-run after this one without
# re-extracting the data.
accpy_packages = load_package_list('processed_data/accpy_index_new.list')
pypi_packages = load_package_list('processed_data/index.list')
# Names added by hand on top of the index file.
# NOTE(review): presumably packages missing from accpy_index_new.list - confirm.
accpy_packages.update({"availsim4", "certifi", "cmmnbuild-dep-manager", "comrad", "lhcsmapi", "longitudinal-tomography", "midas", "oaf", "pjlsa", "pybt", "pyda", "pydaq", "pyjapc", "pyrbac", "python-env", "pytimber", "stubgenj"})

# Find AccPy packages not in PyPI using set difference
accpy_not_in_pypi = accpy_packages - pypi_packages

# Calculate the counts (the pie chart derives the percentages itself via autopct)
total_accpy_packages = len(accpy_packages)
total_pypi_packages = len(pypi_packages)
accpy_not_in_pypi_count = len(accpy_not_in_pypi)
accpy_in_pypi_count = total_accpy_packages - accpy_not_in_pypi_count

# Create pie chart
labels = ['Acc-Py packages not in PyPI', 'Acc-Py packages in PyPI']
sizes = [accpy_not_in_pypi_count, accpy_in_pypi_count]
colors = ['lightcoral', 'lightskyblue']
explode = (0.1, 0)  # Explode the first slice (AccPy not in PyPI)

plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140, textprops={'fontsize': 16})
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Acc-Py Package Presence in PyPI', fontsize=20)
plt.show()

print(f"Total AccPy packages: {total_accpy_packages}")
print(f"AccPy packages not in PyPI: {accpy_not_in_pypi_count}")
print(f"Total PyPi packages: {total_pypi_packages}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import textwrap

# Load the data
df = pd.read_csv(survey_file)

# Define the Yes/No and Likert scale columns
# These strings are used as DataFrame column keys, so they must match the CSV
# headers character for character (including the trailing "\n" of the first two).
yes_no_columns = [
    'Prior to this demonstration, did you have any knowledge of how a dependency confusion attack works? (cf. https://owasp.org/www-project-top-10-ci-cd-security-risks/CICD-SEC-03-Dependency-Chain-Abuse)\n',
    'Prior to this demonstration, were you aware that the acc-py environment could be affected by dependency confusion vulnerabilities?\n',
    'Do you think that the solution provided is useful?',
    'Were you aware of the potential privacy problems of telemetry?',
    'Were you aware of all the informations collected by acc-py?'
]

# Display titles for the Yes/No charts; titles[i] belongs to yes_no_columns[i].
titles = [
    '(Q4) Prior to this demonstration, did you have any knowledge of how a dependency confusion attack works?',
    '(Q5) Prior to this demonstration, were you aware that the Acc-Py environment could be targeted by dependency confusion attacks?',
    '(Q8) Do you think that the solution to counter dependency confusion attacks we proposed is useful?',
    '(Q10) Were you aware of the potential privacy problems of telemetry?',
    '(Q11) Were you aware of all the informations collected by Acc-Py?'
]

# Columns holding numeric answers; their mean and standard deviation are plotted below.
likert_columns = [
    'How relevant do you believe this issue is for acc-py?',
    'How severe do you find dependency confusion attacks?',
    'Do you think this might be a problem?'
]
# Display titles for the Likert charts; title_overrides[i] belongs to likert_columns[i].
title_overrides = [
    '(Q6) How relevant do you believe dependency confusion attacks are for Acc-Py?',
    '(Q7) How severe do you find dependency confusion attacks?',
    '(Q13) Do you think the current usage of telemetry in Acc-Py might be a problem?',
]

def wrap_title(text, width=72):
    """Break ``text`` into lines of at most ``width`` characters, joined by newlines."""
    wrapped_lines = textwrap.wrap(text, width)
    return '\n'.join(wrapped_lines)

# Possible responses we want to ensure always appear
yes_no_responses = ["Yes", "No"]

# --- Horizontal Bar Charts for Yes/No Columns ---
# One row per question, all sharing the same x-axis.
fig, axes = plt.subplots(len(yes_no_columns), 1, figsize=(6, 4), sharex=True)

for i, column in enumerate(yes_no_columns):
    ax = axes[i]
    # Count the occurrences of each response
    counts = df[column].value_counts()

    # Reindex so both 'Yes' and 'No' appear, even if one is missing
    # After this, counts.values[0] is the "Yes" count and counts.values[1] the "No" count.
    counts = counts.reindex(yes_no_responses, fill_value=0)

    # Create a horizontal bar chart
    #ax.barh(counts.index.astype(str), counts.values)
    # Diverging layout: "No" answers extend to the left (negative x), "Yes"
    # answers to the right. The "Yes" bar is drawn on a twin axes that shares
    # the x-axis, so its label sits on the opposite y-axis.
    WD = 0.01
    ax.barh(["No"], [-counts.values[1]], color="#a83232", height=WD)
    ax.twinx().barh(["Yes"], [counts.values[0]], color="#42a832", height=WD)

    # Wrap the title to ensure it fits nicely in the width
    ax.set_title(wrap_title(titles[i]), fontsize=10, loc='left')

    # Ticks at -4..4 are relabelled with absolute values so both directions
    # read as counts.
    # NOTE(review): assumes no answer option was chosen more than 4 times - confirm.
    ax.set_xticks([-4,-3,-2,-1,0,1,2,3,4])
    ax.set_xticklabels([4,3,2,1,0,1,2,3,4])

    # Only label the bottom plot with an X-label to save space
    if i < len(yes_no_columns) - 1:
        ax.set_xlabel('')
    else:
        ax.set_xlabel('Answer count')

# Adjust subplot spacing, then use tight_layout
# NOTE(review): tight_layout() recomputes the subplot spacing afterwards, so
# this hspace value likely has no lasting effect - confirm.
plt.subplots_adjust(hspace=0.05)  # Adjust vertical spacing as needed
plt.tight_layout()
plt.show()

# --- Likert Scale Columns (Line + Error Bar) ---
# One row per question: a 0-10 scale line with the mean answer marked on it.
fig, axes = plt.subplots(len(likert_columns), 1, figsize=(6, 4))

# Loop through the Likert columns to calculate and plot averages
for i, column in enumerate(likert_columns):
    ax = axes[i]

    # Calculate the average and standard deviation
    average_score = df[column].mean()
    std_dev = df[column].std()

    # Plot a horizontal line representing the scale (from 0 to 10)
    # y = 1 is an arbitrary height; the y-axis carries no meaning here.
    ax.plot([0, 10], [1, 1], color='gray', lw=2)  # The scale line

    # Mark the average score with a red dot and error bar
    # The horizontal error bar spans one standard deviation to each side.
    ax.errorbar(
        average_score,
        1,
        xerr=std_dev,
        fmt='o',
        color='red',
        markersize=10,
        capsize=5
    )

    # Annotate the average score
    # Mean just above the line, standard deviation just below it.
    ax.text(
        average_score,
        1.02,
        f'Avg: {average_score:.2f}',
        horizontalalignment='center',
        fontsize=10
    )
    ax.text(
        average_score,
        0.96,
        f'SD: {std_dev:.2f}',
        horizontalalignment='center',
        fontsize=10
    )

    # Set the x-axis limits and labels
    ax.set_xlim(0, 10)
    ax.set_xticks(range(0, 11))

    # Remove y-axis ticks and labels (since this is a single line)
    ax.set_yticks([])

    # Set the title of the subplot
    ax.set_title(wrap_title(title_overrides[i]), fontsize=10, loc='left')

plt.tight_layout()
plt.show()
