In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress
from scipy.stats import levene
from scipy.stats import kruskal
from scipy.stats import friedmanchisquare
from scipy.stats import ttest_ind
from statsmodels.stats.oneway import anova_oneway
from itertools import combinations
import scipy.stats as stats
from scipy.stats import entropy
from scipy.stats import pearsonr, spearmanr
from scipy.stats import ttest_ind
from scipy.stats import levene, kruskal

# Initial Experiment Analysis

In [None]:
# Load the TIUS and IIS score tables of the initial (temperature) experiment.
# NOTE(review): 'path' / 'path2' are placeholders — point them at the real files.
# Assumes the tables are stored as CSV; swap in the matching pandas reader otherwise.
tius_scores = pd.read_csv('path')
iis_scores = pd.read_csv('path2')

# Inner join on the shared keys so every row carries the columns of both tables.
merged = tius_scores.merge(iis_scores, on=['Scenario','Temp','Run'])

Visualize TIUS by temperature levels (Fig 5a)

In [None]:
plt.rcParams['font.size'] = 20
plt.figure(figsize=(5, 5))
sns.boxplot(x='Temp', y='TIUS', data=merged)

# Mean TIUS per temperature level, taken from the same frame that is plotted.
means = merged.groupby('Temp')['TIUS'].mean()
# Categorical boxes sit at integer positions 0..n-1; an ndarray (not a range)
# keeps the arithmetic below explicit.
# NOTE(review): assumes the box order matches the sorted groupby order — confirm for non-numeric Temp.
x = np.arange(len(means))

# Least-squares straight line through the per-temperature means (trend line)
slope, intercept = np.polyfit(x, means, 1)
plt.plot(x, slope * x + intercept, color='r', linestyle='-')

plt.xlabel('Temperature')
plt.ylabel('TIUS')
plt.show()

Visualize IIS by temperature level (Fig 5b)

In [None]:
plt.figure(figsize=(5, 5))
sns.boxplot(x='Temp', y='IIS', data=merged)

# Mean IIS per temperature level, taken from the same frame that is plotted.
means = merged.groupby('Temp')['IIS'].mean()
# Categorical boxes sit at integer positions 0..n-1.
# NOTE(review): assumes the box order matches the sorted groupby order — confirm for non-numeric Temp.
x = np.arange(len(means))

# Least-squares straight line through the per-temperature means (trend line)
slope, intercept = np.polyfit(x, means, 1)
plt.plot(x, slope * x + intercept, color='r', linestyle='-')
plt.xlabel('Temperature')
plt.ylabel('IIS')
plt.show()

Visualize TIUS and IIS by Scenarios (Fig. 6)

In [None]:
def plot_boxplots_scenarios(df,goal):
  """Plot per-scenario boxplots of ``goal`` next to TIUS.

  Args:
    df: DataFrame providing the 'Type', 'Scenario', 'Temp', 'TIUS' and
      ``goal`` columns.
    goal: Name of the score column drawn alongside TIUS (e.g. 'IIS').

  Both metrics are drawn on the left axis; the right axis only adds a
  TIUS-labelled scale fixed to 0-100.
  """
  # Work on the frame handed in by the caller, not on a notebook-level global.
  df = df[['Type','Scenario','Temp',goal,'TIUS']]
  # Long format: one row per (scenario, temp, type, metric) so seaborn can split by metric.
  melted = df.melt(id_vars=['Scenario','Temp','Type'], var_name='Metric', value_name='Value')

  custom_order = ['Adding Local PCs', 'Internet Connectivity', 'Adding DMZ', 'Adding DRA', 'Adding Communication Servers', 'Basic Zone Based Firewall', 'Transparent IOS Firewall', 'Time Based Access List', 'IP Traffic Export', 'Role Based CLI Access']
  custom_palette = {goal: '#e07c4f', 'TIUS': '#1C70B9'}
  fig, ax1 = plt.subplots(figsize=(7, 3.5))

  color = 'tab:brown'
  ax1.set_xlabel(" ")
  # Label the left axis with the metric that was actually requested.
  ax1.set_ylabel(goal, color=color)
  sns.boxplot(x='Scenario', y='Value', hue='Metric', data=melted, palette=custom_palette, order=custom_order, ax=ax1)
  ax1.tick_params(axis='y', labelcolor=color)

  plt.xticks(rotation=90, fontsize = 16)

  ax2 = ax1.twinx()  # second Axes sharing the same x-axis

  color = 'tab:blue'
  ax2.set_ylabel('TIUS', color=color)  # the x-label is already handled on ax1
  ax2.tick_params(axis='y', labelcolor=color)
  ax2.set_ylim(bottom= 0 , top = 100)
  # Hide the hue legend if seaborn created one.
  if ax1.legend_ is not None:
    ax1.legend_.set_visible(False)
  plt.show()

plot_boxplots_scenarios(merged,'IIS')

# Human - LLM correlation

Fig. 7: The correlation between human and LLM IIS (Spearman correlation of 0.847 with a p-value of $4.11 \times 10^{-67}$)

In [None]:
plt.rcParams['font.size'] = 20
def plotcorrelations(df):
  """Scatter LLM vs. human scores with a fitted line and report correlations.

  Args:
    df: DataFrame with the columns 'LLM Score' and 'Human Score'.

  Returns:
    dict with the Spearman, Pearson and Kendall coefficients and their
    p-values (keys 'Spearman', 'Spearman_p', 'Pearson', 'pearson_p',
    'Kendall', 'kendall_p').
  """
  llm_scores = df['LLM Score']
  human_scores = df['Human Score']

  # Scatter plot plus the degree-1 least-squares fit
  plt.figure(figsize=(6, 6))
  plt.scatter(llm_scores, human_scores)
  slope, intercept = np.polyfit(llm_scores, human_scores, 1)
  plt.plot(
      llm_scores,
      slope * llm_scores + intercept,
      color='red',
      label=f'Regression Line: y = {slope:.2f}x + {intercept:.2f}',
  )
  plt.xlabel("LLM IIS")
  plt.ylabel("Human IIS")
  plt.show()

  # (printed name, coefficient key, p-value key, test function)
  measures = (
      ('Spearman', 'Spearman', 'Spearman_p', stats.spearmanr),
      ('Pearson', 'Pearson', 'pearson_p', stats.pearsonr),
      ('Kendall Tau', 'Kendall', 'kendall_p', stats.kendalltau),
  )

  # Run every test before printing anything
  outcomes = []
  for title, coef_key, p_key, test in measures:
    coefficient, p = test(llm_scores, human_scores)
    outcomes.append((title, coef_key, p_key, coefficient, p))

  summary = {}
  for title, coef_key, p_key, coefficient, p in outcomes:
    print(f"{title} Correlation: {coefficient:.3f}, p-value: {p}")
    summary[coef_key] = coefficient
    summary[p_key] = p
  return summary


# Full Experiment Analysis

In [None]:
# Load the TIUS and IIS score tables of the full experiment.
# NOTE(review): 'path' / 'path2' are placeholders — point them at the real files.
# Assumes the tables are stored as CSV; swap in the matching pandas reader otherwise.
tius_scores = pd.read_csv('path')
iis_scores = pd.read_csv('path2')

# Inner join on the shared keys so every row carries the columns of both tables.
merged = tius_scores.merge(iis_scores, on=['Scenario','Type'])
plt.rcParams['font.size'] = 20

## RQ1: How effectively does GPT-4 interpret network topology images of varying quality?

Visualizing TIUS by scenario type (Fig. 9a)

In [None]:
# Rescale TIUS to a 0-100 scale for plotting. The guard keeps this cell
# idempotent: re-running it must not multiply by 100 a second time.
# NOTE(review): assumes the raw TIUS values are fractions in [0, 1] — confirm.
if merged['TIUS'].max() <= 1:
    merged['TIUS'] = merged['TIUS']*100
plt.figure(figsize=(5, 5))
sns.boxplot(x='Type', y='TIUS', data=merged, palette='muted')
plt.xlabel('')
plt.ylabel('TIUS',fontsize=18)
plt.show()

Statistical tests

In [None]:
# Rows of each scenario type
df_topo = merged.loc[merged['Type'] == 'Topology']
df_config = merged.loc[merged['Type'] == 'Configuration']
topo_tius = df_topo['TIUS']
config_tius = df_config['TIUS']

# Levene's test: do the two types have equal TIUS variance?
levene_stat, levene_p = levene(topo_tius, config_tius)
for caption, number in (("Levene's test statistic", levene_stat), ("Levene's test p-value", levene_p)):
    print(f"{caption}: {number:.3f}")

# Welch's t-test (equal_var=False): compares the means without assuming equal variances
welch_stat, welch_p = ttest_ind(topo_tius, config_tius, equal_var=False)
for caption, number in (("Welch's t-test statistic", welch_stat), ("Welch's t-test p-value", welch_p)):
    print(f"{caption}: {number:.3f}")

Visualizing TIUS by platform

In [None]:
# Left-to-right order of the platforms on the x-axis
desired_order = ['PowerPoint','GNS3', 'Paper Sketches']

# One TIUS box per platform, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(data=merged, x='Platform', y='TIUS', hue='Platform', order=desired_order, ax=ax)

# Axis cosmetics: no x-label, slightly tilted tick labels
ax.set_xlabel('')
ax.set_ylabel('TIUS')
plt.xticks(rotation=20)
plt.show()

Statistical tests

In [None]:
# TIUS values keyed by platform
platforms = merged['Platform'].unique()
platform_groups = {}
for name in platforms:
    platform_groups[name] = merged.loc[merged['Platform'] == name, 'TIUS']

# Independent two-sample t-test (scipy default: equal variances) for every platform pair
for platform1, platform2 in combinations(platforms, 2):
    t_stat, p_value = ttest_ind(platform_groups[platform1], platform_groups[platform2])
    print(
        f"T-test between {platform1} and {platform2}:",
        f"  t-statistic: {t_stat:.3f}",
        f"  p-value: {p_value:.3f}",
        sep="\n",
    )

Visualizing TIUS by diagram type

In [None]:
# Left-to-right order of the diagram types on the x-axis
desired_order = ['Normal','No Labels on Edges', 'Messy Layout']

# One TIUS box per diagram type, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(data=merged, x='Diagram_Type', y='TIUS', hue='Diagram_Type', order=desired_order, ax=ax)
ax.set_xlabel('')
ax.set_ylabel('TIUS')
plt.xticks(rotation=30)
plt.show()

Statistical tests

In [None]:
# TIUS values keyed by diagram type
types = merged['Diagram_Type'].unique()
platform_groups = {}
for diagram_type in types:
    platform_groups[diagram_type] = merged.loc[merged['Diagram_Type'] == diagram_type, 'TIUS']

# Independent two-sample t-test (scipy default: equal variances) for every pair of diagram types
for first_type, second_type in combinations(types, 2):
    t_stat, p_value = ttest_ind(platform_groups[first_type], platform_groups[second_type])
    print(
        f"T-test between {first_type} and {second_type}:",
        f"  t-statistic: {t_stat:.3f}",
        f"  p-value: {p_value:.3f}",
        sep="\n",
    )

Visualizing TIUS by diagram type and platform (Fig. 8)

In [None]:
# Left-to-right order of the platforms on the x-axis.
desired_order = ['PowerPoint','GNS3', 'Paper Sketches']

def plot_vision_boxplots(df):
  """Draw TIUS boxplots per platform, split by diagram type.

  Args:
    df: DataFrame with the columns 'Platform', 'Diagram_Type' and 'TIUS'.

  Reads the module-level ``desired_order`` for the platform order.
  """
  plt.figure(figsize=(10, 5))
  sns.boxplot(
      data=df,
      x='Platform',
      y='TIUS',
      hue='Diagram_Type',
      palette='muted',
      order = desired_order
  )

  plt.xlabel('')
  plt.ylabel('TIUS')

  plt.legend(title='Diagram Type',  loc='lower left', title_fontsize=16, fontsize=16)
  plt.xticks(rotation=0)
  # Show the plot.
  plt.tight_layout()
  plt.show()

# Actually render the figure; the definition alone draws nothing.
plot_vision_boxplots(merged)

Statistical tests

In [None]:
# TIUS values of every (Platform, Diagram_Type) cell, one array per cell
groups = merged.groupby(['Platform', 'Diagram_Type'])['TIUS']
data = [cell_values.values for _cell_key, cell_values in groups]

# Levene's test over all cells at once
levene_outcome = levene(*data)
statistic, p_value = levene_outcome

print(f"Levene's Test Statistic: {statistic}")
print(f"P-value: {p_value}")

# Compare against the significance level
alpha = 0.05
print(
    "Variances are significantly different."
    if p_value < alpha
    else "Variances are not significantly different."
)

Pairwise Levene's test

In [None]:
plt.rcParams['font.size'] = 14
groups = merged.groupby(['Platform', 'Diagram_Type'])['TIUS']

# (Platform, Diagram_Type) key of every group. Deliberately not named
# `combinations`: that name is itertools.combinations in this notebook and
# rebinding it to a list makes later `combinations(...)` calls fail.
group_keys = list(groups.groups.keys())
n = len(group_keys)

# Symmetric matrix of p-values; the diagonal stays at 1 because a group's
# variance is identical with itself.
p_values = np.ones((n, n))

# Pairwise Levene's test: compute the upper triangle and mirror it
for i in range(n):
    for j in range(i + 1, n):
        group1_data = groups.get_group(group_keys[i])
        group2_data = groups.get_group(group_keys[j])
        p_values[i, j] = p_values[j, i] = levene(group1_data, group2_data).pvalue

# Create a DataFrame for the heatmap
p_values_df = pd.DataFrame(p_values, index=group_keys, columns=group_keys)

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(p_values_df, annot=True, fmt=".2f", cmap="coolwarm", cbar=True, xticklabels=True, yticklabels=True,annot_kws={"size": 10})
plt.title("Pairwise Levene's Test P-Values")
plt.xlabel("Group")
plt.ylabel("Group")
plt.xticks(rotation=30, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

Information gain calculation

In [None]:
def calculate_information_gain(data, target_variable, feature):
    """Return the information gain of ``feature`` with respect to ``target_variable``.

    The gain is the entropy of the target's value distribution minus the
    size-weighted average entropy of the target inside each distinct
    ``feature`` value. Entropies come from ``scipy.stats.entropy`` applied
    to normalized value counts.

    Args:
        data: DataFrame holding both columns.
        target_variable: Name of the target column.
        feature: Name of the column to condition on.

    Returns:
        Target entropy minus conditional entropy.
    """
    def _target_entropy(frame):
        # Entropy of the target's empirical distribution within `frame`
        return entropy(frame[target_variable].value_counts(normalize=True))

    # Unconditional entropy first, then the weighted per-level entropies
    baseline = _target_entropy(data)
    total_rows = len(data)

    weighted_terms = []
    for level in data[feature].unique():
        part = data[data[feature] == level]
        weighted_terms.append((len(part) / total_rows) * _target_entropy(part))

    return baseline - sum(weighted_terms)


# Information gain of each candidate feature on TIUS
target_variable = 'TIUS'
features = ['Platform', 'Scenario', 'Diagram_Type', 'Type']

for feature in features:
  try:
    information_gain = calculate_information_gain(merged, target_variable, feature)
  except KeyError:
    # A missing column is reported and the loop moves on
    print(f"Feature '{feature}' not found in the DataFrame. Skipping.")
  else:
    print(f"Information Gain of {feature} on {target_variable}: {information_gain}")

Pairwise t-tests on platforms and diagram types

In [None]:
# Group data by 'Platform' and 'Diagram_Type'
groups = merged.groupby(['Platform', 'Diagram_Type'])['TIUS']

# (Platform, Diagram_Type) key of every group. Deliberately not named
# `combinations`: that name is itertools.combinations in this notebook and
# rebinding it to a list makes later `combinations(...)` calls fail.
group_keys = list(groups.groups.keys())
num_combinations = len(group_keys)

# Perform pairwise Welch's t-tests
results = {}  # (group1 key, group2 key) -> (t statistic, p-value)
p_values = np.full((num_combinations, num_combinations), np.nan)  # diagonal stays NaN

for i in range(num_combinations):
    for j in range(i + 1, num_combinations):
        group1_name = group_keys[i]
        group2_name = group_keys[j]
        group1_data = groups.get_group(group1_name)
        group2_data = groups.get_group(group2_name)

        t_stat, p_val = ttest_ind(group1_data, group2_data, equal_var=False)  # Welch's t-test
        results[(group1_name, group2_name)] = (t_stat, p_val)
        p_values[i, j] = p_val
        p_values[j, i] = p_val  # Symmetric matrix

# Convert the group keys to readable labels for the heatmap
labels = [f"{platform}-{diagram_type}" for platform, diagram_type in group_keys]

# Create a DataFrame for the heatmap
p_values_df = pd.DataFrame(p_values, index=labels, columns=labels)

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(p_values_df, annot=True, fmt=".2f", cmap="coolwarm", cbar=True,
            xticklabels=labels, yticklabels=labels,annot_kws={"size": 10})
plt.title("Pairwise Welch's t-test p-values")
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## RQ2: How effectively can an LLM-based networking co-pilot handle network intents?

Visualize IIS by scenario type (Fig 9b)

In [None]:
# IIS (column 'Score') per scenario type, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(data=merged, x='Type', y='Score', palette='muted', ax=ax)
ax.set_xlabel('')
ax.set_ylabel('IIS')
plt.show()

Variance equality statistical test

In [None]:
# Scores of each scenario type as plain lists
groups = merged.groupby('Type')['Score'].apply(list)

# Levene's test across the types
levene_outcome = levene(*groups)
statistic, p_value = levene_outcome

print(f"Levene's variance equality test statistic: {statistic}")
print(f"Levene's test p-value: {p_value}")

# Verdict at the chosen significance level
alpha = 0.05
print(
    f"The test is significant (p < {alpha}). Variances are not equal among the groups."
    if p_value < alpha
    else f"The test is not significant (p >= {alpha}). Variances are equal among the groups."
)

Mean equality statistical tests (t-test)

In [None]:
# Scores of each scenario type as plain lists
groups = merged.groupby('Type')['Score'].apply(list)

# Welch's t-test on the first two groups (the test is pairwise, so exactly two are used)
group1 = groups.iloc[0]
group2 = groups.iloc[1]
welch_outcome = ttest_ind(group1, group2, equal_var=False)
statistic, p_value = welch_outcome

print(f"Welch's t-test statistic: {statistic}")
print(f"Welch's t-test p-value: {p_value}")

# Verdict at the chosen significance level
alpha = 0.05
print(
    f"The test is significant (p < {alpha}). The means of the two groups are significantly different."
    if p_value < alpha
    else f"The test is not significant (p >= {alpha}). The means of the two groups are not significantly different."
)

Visualize IIS by platform

In [None]:
# Left-to-right order of the platforms on the x-axis
desired_order = ['PowerPoint','GNS3', 'Paper Sketches']
plt.figure(figsize=(5, 5))
# Plot from `merged`, the frame every other cell of this analysis uses.
sns.boxplot(x='Platform', y='Score', data = merged, hue='Platform',order = desired_order)
plt.xlabel('')
plt.ylabel('IIS')
plt.xticks(rotation=10)
plt.show()

Pairwise mean equality statistical tests (t-test)

In [None]:
# Referenced through the module so this cell keeps working even when the bare
# name `combinations` has been rebound to a non-callable elsewhere in the session.
import itertools

# Scores of each platform as plain lists
groups = merged.groupby('Platform')['Score'].apply(list)

# Performing pairwise Welch's t-tests
print("Performing pairwise Welch's t-tests:")

# Initialize a list to store results
pairwise_results = []
alpha = 0.05  # Significance level

for (group1_name, group1_data), (group2_name, group2_data) in itertools.combinations(groups.items(), 2):
    # Welch's t-test with unequal variances
    stat, p = ttest_ind(group1_data, group2_data, equal_var=False)

    result = {
        'Group 1': group1_name,
        'Group 2': group2_name,
        'Statistic': stat,
        'p-value': p,
        'Conclusion': 'Significant' if p < alpha else 'Not Significant'
    }
    pairwise_results.append(result)

# One row per platform pair
pairwise_results_df = pd.DataFrame(pairwise_results)
print(pairwise_results_df)

Visualizing IIS by diagram type

In [None]:
# Left-to-right order of the diagram types on the x-axis
desired_order = ['Normal','No Labels on Edges', 'Messy Layout']

# One IIS (column 'Score') box per diagram type, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(data=merged, x='Diagram_Type', y='Score', hue='Diagram_Type', order=desired_order, ax=ax)
ax.set_xlabel('')
ax.set_ylabel('IIS')
plt.xticks(rotation=20)
plt.show()

Pairwise mean equality statistical tests (t-test)

In [None]:
# Referenced through the module so this cell keeps working even when the bare
# name `combinations` has been rebound to a non-callable elsewhere in the session.
import itertools

# Scores of each diagram type as plain lists
groups = merged.groupby('Diagram_Type')['Score'].apply(list)

# Performing pairwise Welch's t-tests
print("Performing pairwise Welch's t-tests:")

# Initialize a list to store results
pairwise_results = []
alpha = 0.05  # Significance level

for (group1_name, group1_data), (group2_name, group2_data) in itertools.combinations(groups.items(), 2):
    # Welch's t-test with unequal variances
    stat, p = ttest_ind(group1_data, group2_data, equal_var=False)

    result = {
        'Group 1': group1_name,
        'Group 2': group2_name,
        'Statistic': stat,
        'p-value': p,
        'Conclusion': 'Significant' if p < alpha else 'Not Significant'
    }
    pairwise_results.append(result)

# One row per pair of diagram types
pairwise_results_df = pd.DataFrame(pairwise_results)
print(pairwise_results_df)

Visualizing IIS by diagram type and platform (Fig. 10)

In [None]:
# Left-to-right order of the platforms on the x-axis.
desired_order = ['PowerPoint','GNS3', 'Paper Sketches']  # Replace with your specific order
def plot_boxplots(df,goal,ylabel='IIS'):
  """Draw boxplots of ``goal`` per platform, split by diagram type.

  Args:
    df: DataFrame with the columns 'Platform', 'Diagram_Type' and ``goal``.
    goal: Name of the column plotted on the y-axis.
    ylabel: Text of the y-axis label (defaults to 'IIS').

  Reads the module-level ``desired_order`` for the platform order.
  """
  # Create a boxplot of the requested column (not a hard-coded one).
  plt.figure(figsize=(10, 5))
  sns.boxplot(
      data=df,
      x='Platform',
      y=goal,
      hue='Diagram_Type',
      palette='muted',
      order = desired_order
  )

  # Customize plot aesthetics.
  plt.xlabel('')
  plt.ylabel(ylabel)
  plt.yticks()

  plt.legend(title='Diagram Type', loc='lower left', title_fontsize=16, fontsize=16)
  plt.xticks(rotation=0)
  # Show the plot.
  plt.tight_layout()
  plt.show()
plot_boxplots(merged,'Score')

Pairwise variance equality tests

In [None]:
plt.rcParams['font.size'] = 14

groups = merged.groupby(['Platform', 'Diagram_Type'])['Score']

# (Platform, Diagram_Type) key of every group. Deliberately not named
# `combinations`: that name is itertools.combinations in this notebook and
# rebinding it to a list makes `combinations(...)` calls fail afterwards.
group_keys = list(groups.groups.keys())
n = len(group_keys)

# Symmetric matrix of p-values; the diagonal stays at 1 because a group's
# variance is identical with itself.
p_values = np.ones((n, n))

# Pairwise Levene's test: compute the upper triangle and mirror it
for i in range(n):
    for j in range(i + 1, n):
        group1_data = groups.get_group(group_keys[i])
        group2_data = groups.get_group(group_keys[j])
        p_values[i, j] = p_values[j, i] = levene(group1_data, group2_data).pvalue

# Create a DataFrame for the heatmap
p_values_df = pd.DataFrame(p_values, index=group_keys, columns=group_keys)

plt.figure(figsize=(10, 8))
sns.heatmap(p_values_df, annot=True, fmt=".2f", cmap="coolwarm", cbar=True, xticklabels=True, yticklabels=True,annot_kws={"size": 10})
plt.title("Pairwise Levene's Test P-Values")
plt.xlabel("Group")
plt.ylabel("Group")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

Pairwise mean equality statistical tests (t-test)

In [None]:
# Group data by 'Platform' and 'Diagram_Type'
groups = merged.groupby(['Platform', 'Diagram_Type'])['Score']

# (Platform, Diagram_Type) key of every group. Deliberately not named
# `combinations`: that name is itertools.combinations in this notebook and
# rebinding it to a list makes `combinations(...)` calls fail afterwards.
group_keys = list(groups.groups.keys())
num_combinations = len(group_keys)

# Perform pairwise Welch's t-tests
results = {}  # (group1 key, group2 key) -> (t statistic, p-value)
p_values = np.full((num_combinations, num_combinations), np.nan)  # diagonal stays NaN

for i in range(num_combinations):
    for j in range(i + 1, num_combinations):
        group1_name = group_keys[i]
        group2_name = group_keys[j]
        group1_data = groups.get_group(group1_name)
        group2_data = groups.get_group(group2_name)

        t_stat, p_val = ttest_ind(group1_data, group2_data, equal_var=False)  # Welch's t-test
        results[(group1_name, group2_name)] = (t_stat, p_val)
        p_values[i, j] = p_val
        p_values[j, i] = p_val  # Symmetric matrix

# Convert the group keys to readable labels for the heatmap
labels = [f"{platform}-{diagram_type}" for platform, diagram_type in group_keys]

# Create a DataFrame for the heatmap
p_values_df = pd.DataFrame(p_values, index=labels, columns=labels)

plt.figure(figsize=(10, 8))
sns.heatmap(p_values_df, annot=True, fmt=".2f", cmap="coolwarm", cbar=True,
            xticklabels=labels, yticklabels=labels,annot_kws={"size": 10})
plt.title("Pairwise Welch's t-test p-values")
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

Information gain calculation

In [None]:
# Information gain of each candidate feature (including TIUS) on the IIS score
target_variable = 'Score'
features = ['Platform', 'Scenario', 'Diagram_Type', 'Type','TIUS']

for feature in features:
  try:
    information_gain = calculate_information_gain(merged, target_variable, feature)
  except KeyError:
    # A missing column is reported and the loop moves on
    print(f"Feature '{feature}' not found in the DataFrame. Skipping.")
  else:
    print(f"Information Gain of {feature} on {target_variable}: {information_gain}")

## RQ3: To what extent does the complexity of intent, as perceived by domain experts, align with the intent implementations produced by an LLM-based networking co-pilot?

In [None]:
# Scenarios listed under the complexity bin they are assigned to
scenarios_by_complexity = {
    'Hard': ['Adding DRA', 'Basic Zone Based Firewall'],
    'Medium': [
        'Adding DMZ',
        'IP Traffic Export',
        'Internet Connectivity',
        'Transparent IOS Firewall',  # marked as uncertain ("?") by the author
    ],
    'Easy': [
        'Adding Communication Servers',
        'Adding Local PCs',
        'Role Based CLI Access',
        'Time Based Access List',
    ],
}

# Invert into the scenario -> bin lookup used for the mapping
scenario_to_bin = {
    scenario: complexity
    for complexity, scenarios in scenarios_by_complexity.items()
    for scenario in scenarios
}

# Scenarios missing from the lookup get NaN in the new column
merged['Complexity'] = merged['Scenario'].map(scenario_to_bin)

Visualize IIS by scenario complexity (Fig. 11)

In [None]:
# Fix the box order (and with it the colour assignment) so the figure does
# not depend on the row order of `merged`.
complexity_order = ['Easy', 'Medium', 'Hard']
plt.figure(figsize=(5, 5))
palette = ['#1f77b4', '#ff7f0e', '#d62728']  # one colour per entry of complexity_order
sns.boxplot(x='Complexity', y='Score', data = merged, palette=palette, order=complexity_order)
plt.xlabel('')
plt.ylabel('IIS')
plt.show()
# Mean IIS per complexity bin (displayed as the cell output)
merged.groupby('Complexity')['Score'].mean()

Statistical tests - complexity's impact on IIS

In [None]:
# Scores of each complexity bin as plain lists
groups = merged.groupby('Complexity')['Score'].apply(list)
alpha = 0.05  # Significance level shared by both tests

# --- Levene's test: equality of variances across the bins ---
levene_outcome = levene(*groups)
levene_statistic, levene_p_value = levene_outcome

print(f"Levene's variance equality test statistic: {levene_statistic}")
print(f"Levene's test p-value: {levene_p_value}")
print(
    f"The test is significant (p < {alpha}). Variances are not equal among the groups."
    if levene_p_value < alpha
    else f"The test is not significant (p >= {alpha}). Variances are equal among the groups."
)

# --- Kruskal-Wallis H-test: do the bins share one distribution? ---
kruskal_outcome = kruskal(*groups)
kruskal_statistic, kruskal_p_value = kruskal_outcome

print(f"\nKruskal-Wallis H-test statistic: {kruskal_statistic}")
print(f"Kruskal-Wallis H-test p-value: {kruskal_p_value}")
print(
    f"The Kruskal-Wallis test is significant (p < {alpha}). At least one group distribution is significantly different."
    if kruskal_p_value < alpha
    else f"The Kruskal-Wallis test is not significant (p >= {alpha}). Group distributions are not significantly different."
)