This notebook requires pre-computed data. You can get this data by running:

`python3 -m analysis download`

and then

`python3 -m analysis compute-features`

Together, these commands build the `data.json` and `features.csv` files in the root of this directory.

In [48]:
import pandas as pd
from analysis.models.data import Data
from analysis.performance_gap import top_performers, unresolved_instances

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Passed as `k` to `top_performers` when selecting the target systems.
TOP_K = 3

# Parse the pre-computed data file. JSON is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default.
with open("data.json", encoding="utf-8") as f:
    data = Data.model_validate_json(f.read())

# Pre-computed feature table; later cells key it by its `instance_id` column.
df = pd.read_csv("features.csv")

def good_metric(metric: str) -> bool:
    """Decide whether a column name should be analysed as a metric.

    The identifier column ``"instance_id"`` is always rejected. Any other
    name is accepted when it starts with ``"instance"`` or ``"patch"``, or
    when it ends with ``"diff"``.

    Args:
        metric: Column name to classify.

    Returns:
        True if the column should be kept as a metric, False otherwise.
    """
    is_identifier = metric == "instance_id"
    has_wanted_prefix = metric.startswith(("instance", "patch"))
    has_diff_suffix = metric.endswith("diff")
    return not is_identifier and (has_wanted_prefix or has_diff_suffix)


In [49]:
# Keep only columns with at least one non-zero entry: zero is the default value,
# so an all-zero column is indicative of some kind of failure.
has_nonzero = (df != 0).any(axis=0)
df = df.loc[:, has_nonzero]

# Column names that pass the `good_metric` filter, in column order.
metrics = list(filter(good_metric, df.columns))

In [50]:
# Headline sizes of the loaded data; the last count covers every remaining column of `df`.
n_models = len(data.systems)
n_instances = len(data.dataset.instances)
n_columns = len(df.columns)

print(f"Number of models: {n_models}")
print(f"Number of instances: {n_instances}")
print(f"Number of features: {n_columns}")

Number of models: 56
Number of instances: 500
Number of features: 81


In [51]:
# Source model is always OpenHands
source = data.systems[data.closest_system("OpenHands")]
targets = top_performers(data.systems.values(), k=TOP_K)

In [52]:
print(f"Source model: {source.metadata.name}")

# Build the bullet list outside the f-string: a backslash inside an f-string
# replacement field is a SyntaxError on Python versions before 3.12.
target_lines = "\n".join("  - " + t.metadata.name for t in targets)
print(f"Target models: \n{target_lines}")

Source model: OpenHands + CodeAct v2.1 (claude-3-5-sonnet-20241022)
Target models: 
  - Blackbox AI Agent
  - Learn-by-interact
  - devlo


In [53]:
# Compute the performance gap
gap = unresolved_instances(source, targets, threshold=1)
df['gap'] = df['instance_id'].apply(lambda instance_id: 1 if instance_id in gap else 0)

In [54]:
gap_size = len(gap)
print(f"Number of instances with performance gap: {gap_size}")

Number of instances with performance gap: 125


In [55]:
# Correlate every metric column with the gap label, skipping metrics whose
# correlation comes back as NaN.
correlations = {}
for metric in metrics:
    corr = df[metric].corr(df["gap"])
    if pd.isna(corr):
        continue
    correlations[metric] = corr

In [56]:
# The top 5 metrics ranked by the absolute value of their correlation
ranked = sorted(correlations.items(), key=lambda item: abs(item[1]), reverse=True)
top_metrics = [name for name, _ in ranked[:5]]

for top_metric in top_metrics:
    print(f"{top_metric}: {correlations[top_metric]}")

code/number_of_returns/diff: 0.10408069720830572
error/number_of_broad_excepts/diff: -0.07723789843828396
patch/number_of_added_lines: -0.0705463458223313
code/max_function_length/diff: 0.06952425482539248
instance/problem_statement_length: 0.06912898473206688


In [57]:
from scipy import stats # type: ignore

# Split the frame once instead of re-filtering it for every metric.
no_gap_rows = df[df['gap'] == 0]
gap_rows = df[df['gap'] == 1]

# The U statistic is bounded by the product of the two sample sizes. Take the
# sizes from the rows actually handed to the test so the normalisation stays
# correct even if `df` does not cover every dataset instance.
max_stat = len(no_gap_rows) * len(gap_rows)

results = []
for metric in metrics:
    result = stats.mannwhitneyu(no_gap_rows[metric], gap_rows[metric])
    results.append({
        "metric": metric,
        "p value": result.pvalue,
        # Rescale U to [-0.5, 0.5]; 0 means U is exactly half of its maximum.
        "relative statistic": (result.statistic / max_stat) - 0.5
    })

print("Mann-Whitney U Test Results")
# Show the ten metrics whose rescaled statistic is furthest from 0.
results = sorted(results, key=lambda x: abs(x["relative statistic"]), reverse=True)
for result in results[:10]:
    print(f"Metric: {result['metric']}, relative statistic: {result['relative statistic']*100:0.2f}%, p: {result['p value']:.2f}")

Mann-Whitney U Test Results
Metric: patch/number_of_removed_lines, relative statistic: 4.42%, p: 0.12
Metric: code/number_of_returns/diff, relative statistic: -3.19%, p: 0.30
Metric: type/number_of_custom_types/diff, relative statistic: 3.06%, p: 0.16
Metric: code/max_function_length/diff, relative statistic: -3.05%, p: 0.32
Metric: patch/number_of_files, relative statistic: 2.87%, p: 0.09
Metric: error/number_of_finally_blocks/diff, relative statistic: 2.84%, p: 0.05
Metric: code/number_of_variables/diff, relative statistic: 2.78%, p: 0.33
Metric: error/number_of_raise_statements/diff, relative statistic: -2.41%, p: 0.42
Metric: code/number_of_decorators/diff, relative statistic: 2.31%, p: 0.39
Metric: code/average_function_length/diff, relative statistic: -2.23%, p: 0.48


In [58]:
from sklearn.feature_selection import f_classif

# F-scores (and their p-values) for each feature against the gap label.
# The stray bare `f_scores` expression was removed: it was not the last
# expression in the cell, so it displayed nothing.
f_scores, p_values = f_classif(df[metrics], df['gap'])
feature_scores = pd.DataFrame({
    'feature': df[metrics].columns,
    'F_score': f_scores,
    'p_value': p_values
})

# Ten features with the largest F-score.
top_features = feature_scores.sort_values('F_score', ascending=False)[:10]
print(top_features)

                                     feature   F_score   p_value
11               code/number_of_returns/diff  5.442859  0.020047
22        error/number_of_broad_excepts/diff  2.982744  0.084778
27               patch/number_of_added_lines  2.485835  0.115512
9              code/max_function_length/diff  2.413978  0.120893
29         instance/problem_statement_length  2.386476  0.123026
24  dependency/number_of_function_calls/diff  2.012931  0.156591
3                  code/number_of_lines/diff  1.999483  0.157979
25                     patch/number_of_files  1.869242  0.172180
26                     patch/number_of_lines  1.850264  0.174369
28             patch/number_of_removed_lines  1.733301  0.188597


In [59]:
# Hold out 20% of the instances for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    df[metrics], df['gap'], test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train model
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    max_depth=None,     # no explicit depth limit
    min_samples_leaf=2, # to prevent overfitting
    random_state=42     # same seed as the split, so reruns give the same forest
)
clf.fit(X_train, y_train)

In [60]:
# Score the held-out split: class probabilities (not used by the report) and hard labels.
y_prob = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)

print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.99      0.85        73
           1       0.75      0.11      0.19        27

    accuracy                           0.75       100
   macro avg       0.75      0.55      0.52       100
weighted avg       0.75      0.75      0.67       100



In [61]:
import numpy as np # type: ignore

# One row of importances per tree, used for the spread across the ensemble.
per_tree_importances = np.array([tree.feature_importances_ for tree in clf.estimators_])

feature_importance = pd.DataFrame({
    'feature': df[metrics].columns,
    'importance': clf.feature_importances_,
    'std': per_tree_importances.std(axis=0),  # std across trees
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print(feature_importance.head(10))

                                     feature  importance       std
8          code/average_function_length/diff    0.071909  0.044251
29         instance/problem_statement_length    0.060062  0.043046
26                     patch/number_of_lines    0.056506  0.034055
27               patch/number_of_added_lines    0.054832  0.036246
10   code/number_of_function_parameters/diff    0.052209  0.034493
24  dependency/number_of_function_calls/diff    0.052169  0.036726
3                  code/number_of_lines/diff    0.047634  0.033852
11               code/number_of_returns/diff    0.047332  0.028731
7              code/number_of_variables/diff    0.047151  0.031422
9              code/max_function_length/diff    0.044527  0.033182


In [62]:
from sklearn.inspection import permutation_importance # type: ignore

# Permutation importance on the held-out split. The shuffles are random, so
# seed them to make the reported means and stds reproducible across reruns.
r = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=42)
perm_importance = pd.DataFrame({
    'feature': df[metrics].columns,
    'perm_importance': r.importances_mean,
    'perm_std': r.importances_std
}).sort_values('perm_importance', ascending=False)

print(perm_importance.head(10))


                                     feature  perm_importance  perm_std
14      type/number_of_type_annotations/diff            0.008  0.004000
13            code/number_of_decorators/diff            0.006  0.006633
11               code/number_of_returns/diff            0.006  0.009165
27               patch/number_of_added_lines            0.005  0.012845
6     code/number_of_control_statements/diff            0.005  0.005000
8          code/average_function_length/diff            0.005  0.010247
3                  code/number_of_lines/diff            0.004  0.004899
25                     patch/number_of_files            0.004  0.004899
24  dependency/number_of_function_calls/diff            0.003  0.004583
22        error/number_of_broad_excepts/diff            0.002  0.004000


In [63]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with balanced class weights, fitted on the current training split.
lr = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

# Class probabilities (not used by the report) and hard labels for the held-out split.
y_prob = lr.predict_proba(X_test)
y_pred = lr.predict(X_test)

print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.66      0.71        73
           1       0.32      0.44      0.38        27

    accuracy                           0.60       100
   macro avg       0.54      0.55      0.54       100
weighted avg       0.64      0.60      0.62       100



In [64]:
# Label each instance 0 if the source system resolved it, 1 otherwise.
# The resolved ids are materialised once as a set so the per-row membership
# test does not re-read (and possibly linearly scan) `source.results.resolved`.
# NOTE(review): assumes the resolved ids are hashable (e.g. strings) — confirm.
resolved_ids = set(source.results.resolved)
df['unresolved'] = df['instance_id'].apply(lambda instance_id: 0 if instance_id in resolved_ids else 1)

# Correlation of every metric with the unresolved label
correlations = {metric: df[metric].corr(df["unresolved"]) for metric in metrics}

# filter out the nans
correlations = {metric: corr for metric, corr in correlations.items() if not pd.isna(corr)}

# The top 5 metrics ranked by the absolute value of their correlation
top_metrics = sorted(correlations, key=lambda x: abs(correlations[x]), reverse=True)[:5]
for top_metric in top_metrics:
    print(f"{top_metric}: {correlations[top_metric]}")

patch/number_of_added_lines: 0.25729286547823116
patch/number_of_lines: 0.23324109140208865
code/number_of_lines/diff: -0.22504473518170223
patch/number_of_removed_lines: 0.1951583074246105
code/max_nested_depth/diff: -0.1933921241459966


In [65]:
# Split the frame once instead of re-filtering it for every metric.
resolved_rows = df[df['unresolved'] == 0]
unresolved_rows = df[df['unresolved'] == 1]

# The U statistic is bounded by the product of the two sample sizes. Take the
# sizes from the rows actually handed to the test so the normalisation stays
# correct even if `df` and `source.results.resolved` do not line up exactly.
max_stat = len(resolved_rows) * len(unresolved_rows)

results = []
for metric in metrics:
    result = stats.mannwhitneyu(resolved_rows[metric], unresolved_rows[metric])
    results.append({
        "metric": metric,
        "p value": result.pvalue,
        # Rescale U to [-0.5, 0.5]; 0 means U is exactly half of its maximum.
        "relative statistic": (result.statistic / max_stat) - 0.5
    })

print("Mann-Whitney U Test Results")
# Show the ten metrics whose rescaled statistic is furthest from 0.
results = sorted(results, key=lambda x: abs(x["relative statistic"]), reverse=True)
for result in results[:10]:
    print(f"Metric: {result['metric']}, relative statistic: {result['relative statistic']*100:0.2f}%, p: {result['p value']:.10f}")

Mann-Whitney U Test Results
Metric: patch/number_of_added_lines, relative statistic: -18.28%, p: 0.0000000000
Metric: patch/number_of_lines, relative statistic: -17.90%, p: 0.0000000000
Metric: code/number_of_lines/diff, relative statistic: 17.33%, p: 0.0000000000
Metric: code/number_of_comment_lines/diff, relative statistic: 11.72%, p: 0.0000022231
Metric: code/max_nested_depth/diff, relative statistic: 11.40%, p: 0.0000051045
Metric: patch/number_of_removed_lines, relative statistic: -11.08%, p: 0.0000180728
Metric: error/max_except_depth/diff, relative statistic: 10.37%, p: 0.0000111061
Metric: code/number_of_control_statements/diff, relative statistic: 10.00%, p: 0.0000747549
Metric: error/number_of_try_blocks/diff, relative statistic: 9.21%, p: 0.0001668199
Metric: code/number_of_returns/diff, relative statistic: 9.10%, p: 0.0002773216


In [66]:
# F-scores for each feature
f_scores, p_values = f_classif(df[metrics], df['unresolved'])
f_scores
feature_scores = pd.DataFrame({
    'feature': df[metrics].columns,
    'F_score': f_scores,
    'p_value': p_values
})
top_features = feature_scores.sort_values('F_score', ascending=False)[:10]
print(top_features)

                                   feature    F_score       p_value
27             patch/number_of_added_lines  35.233666  5.492029e-09
26                   patch/number_of_lines  28.592998  1.364220e-07
3                code/number_of_lines/diff  26.513406  3.778012e-07
28           patch/number_of_removed_lines  19.678617  1.129018e-05
2               code/max_nested_depth/diff  19.310269  1.359175e-05
21             error/max_except_depth/diff  18.952553  1.627950e-05
4        code/number_of_comment_lines/diff  14.299276  1.748904e-04
25                   patch/number_of_files  12.066170  5.583150e-04
6   code/number_of_control_statements/diff  11.985822  5.823106e-04
22      error/number_of_broad_excepts/diff  10.687187  1.153557e-03


In [67]:
# Hold out 20% of the instances for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    df[metrics], df['unresolved'], test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train model
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    max_depth=None,     # no explicit depth limit
    min_samples_leaf=2, # to prevent overfitting
    random_state=42     # same seed as the split, so reruns give the same forest
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Importance reported by the forest, with its spread across the individual trees.
feature_importance = pd.DataFrame({
    'feature': df[metrics].columns,
    'importance': clf.feature_importances_,
    'std': np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
}).sort_values('importance', ascending=False)

print(feature_importance.head(10))

# Permutation importance on the held-out split; seeded so the shuffles, and
# therefore the reported means and stds, are reproducible.
r = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=42)
perm_importance = pd.DataFrame({
    'feature': df[metrics].columns,
    'perm_importance': r.importances_mean,
    'perm_std': r.importances_std
}).sort_values('perm_importance', ascending=False)

print(perm_importance.head(10))


Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.81      0.68        43
           1       0.80      0.56      0.66        57

    accuracy                           0.67       100
   macro avg       0.69      0.69      0.67       100
weighted avg       0.71      0.67      0.67       100

                                     feature  importance       std
3                  code/number_of_lines/diff    0.076122  0.051562
26                     patch/number_of_lines    0.075180  0.051572
29         instance/problem_statement_length    0.072062  0.040766
27               patch/number_of_added_lines    0.064727  0.045852
8          code/average_function_length/diff    0.064362  0.039195
24  dependency/number_of_function_calls/diff    0.048130  0.030870
6     code/number_of_control_statements/diff    0.044334  0.035041
2                 code/max_nested_depth/diff    0.042070  0.041499
10   code/number_of_function_parameters/diff  