In [284]:
import pandas as pd
from analysis.models.data import Data
from analysis.performance_gap import top_performers, unresolved_instances

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

TOP_K = 3

with open("data.json") as f:
    data = Data.model_validate_json(f.read())

df = pd.read_csv("features.csv")

def good_metric(metric: str) -> bool:
    if metric == "instance_id":
        return False

    if metric.startswith("instance") or metric.startswith("patch"):
        return True

    if metric.endswith("diff"):
        return True

    return False


In [285]:
# Drop any features that are zero for all instances (this is the default and indicative of some kind of failure)
df = df.loc[:, (df != 0).any(axis=0)]
metrics = [column for column in df.columns if good_metric(column)]

In [286]:
print(f"Number of models: {len(data.systems)}")
print(f"Number of instances: {len(data.dataset.instances)}")
print(f"Number of features: {len(df.columns)}")

Number of models: 56
Number of instances: 500
Number of features: 81


In [287]:
# Source model is always OpenHands
source = data.systems[data.closest_system("OpenHands")]
targets = top_performers(data.systems.values(), k=TOP_K)

In [288]:
print(f"Source model: {source.metadata.name}")
print(f"Target models: \n{'\n'.join(['  - ' + t.metadata.name for t in targets])}")

Source model: OpenHands + CodeAct v2.1 (claude-3-5-sonnet-20241022)
Target models: 
  - Blackbox AI Agent
  - Learn-by-interact
  - devlo


In [289]:
# Compute the performance gap
gap = unresolved_instances(source, targets, threshold=1)
df['gap'] = df['instance_id'].apply(lambda instance_id: 1 if instance_id in gap else 0)

In [290]:
print(f"Number of instances with performance gap: {len(gap)}")

Number of instances with performance gap: 125


In [291]:
correlations = {metric: df[metric].corr(df["gap"]) for metric in metrics}

# filter out the nans
correlations = {metric: corr for metric, corr in correlations.items() if not pd.isna(corr)}

In [292]:
# The top 5 metrics with the highest correlation
top_metrics = sorted(correlations, key=lambda x: abs(correlations[x]), reverse=True)[:5]
for top_metric in top_metrics:
    print(f"{top_metric}: {correlations[top_metric]}")

code/number_of_returns/diff: 0.10408069720830572
error/number_of_broad_excepts/diff: -0.07723789843828396
patch/number_of_added_lines: -0.0705463458223313
code/max_function_length/diff: 0.06952425482539248
instance/problem_statement_length: 0.06912898473206688


In [293]:
from scipy import stats # type: ignore

max_stat = (len(data.dataset.instances) - len(gap))  * len(gap)

results = []
for metric in metrics:
    result = stats.mannwhitneyu(
        df[df['gap']==0][metric], 
        df[df['gap']==1][metric]
    )
    results.append({
        "metric": metric,
        "p value": result.pvalue,
        "relative statistic": (result.statistic / max_stat) - 0.5
    })

print("Mann-Whitney U Test Results")
results = sorted(results, key=lambda x: abs(x["relative statistic"]), reverse=True)
for result in results[:10]:
    print(f"Metric: {result['metric']}, relative statistic: {result['relative statistic']*100:0.2f}%, p: {result['p value']:.2f}")

Mann-Whitney U Test Results
Metric: patch/number_of_removed_lines, relative statistic: 4.42%, p: 0.12
Metric: code/number_of_returns/diff, relative statistic: -3.19%, p: 0.30
Metric: type/number_of_custom_types/diff, relative statistic: 3.06%, p: 0.16
Metric: code/max_function_length/diff, relative statistic: -3.05%, p: 0.32
Metric: patch/number_of_files, relative statistic: 2.87%, p: 0.09
Metric: error/number_of_finally_blocks/diff, relative statistic: 2.84%, p: 0.05
Metric: code/number_of_variables/diff, relative statistic: 2.78%, p: 0.33
Metric: error/number_of_raise_statements/diff, relative statistic: -2.41%, p: 0.42
Metric: code/number_of_decorators/diff, relative statistic: 2.31%, p: 0.39
Metric: code/average_function_length/diff, relative statistic: -2.23%, p: 0.48


In [294]:
from sklearn.feature_selection import f_classif

# F-scores for each feature
f_scores, p_values = f_classif(df[metrics], df['gap'])
f_scores
feature_scores = pd.DataFrame({
    'feature': df[metrics].columns,
    'F_score': f_scores,
    'p_value': p_values
})
top_features = feature_scores.sort_values('F_score', ascending=False)[:10]
print(top_features)

                                     feature   F_score   p_value
11               code/number_of_returns/diff  5.442859  0.020047
22        error/number_of_broad_excepts/diff  2.982744  0.084778
27               patch/number_of_added_lines  2.485835  0.115512
9              code/max_function_length/diff  2.413978  0.120893
29         instance/problem_statement_length  2.386476  0.123026
24  dependency/number_of_function_calls/diff  2.012931  0.156591
3                  code/number_of_lines/diff  1.999483  0.157979
25                     patch/number_of_files  1.869242  0.172180
26                     patch/number_of_lines  1.850264  0.174369
28             patch/number_of_removed_lines  1.733301  0.188597


In [295]:
X_train, X_test, y_train, y_test = train_test_split(
    df[metrics], df['gap'], test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train model
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    max_depth=None,    # to control tree depth
    min_samples_leaf=2 # to prevent overfitting
)
clf.fit(X_train, y_train)

In [296]:
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.97      0.85        73
           1       0.67      0.15      0.24        27

    accuracy                           0.75       100
   macro avg       0.71      0.56      0.55       100
weighted avg       0.73      0.75      0.69       100



In [297]:
import numpy as np # type: ignore

feature_importance = pd.DataFrame({
    'feature': df[metrics].columns,
    'importance': clf.feature_importances_,
    'std': np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)  # Add std across trees
}).sort_values('importance', ascending=False)

print(feature_importance.head(10))

                                     feature  importance       std
8          code/average_function_length/diff    0.072925  0.039613
29         instance/problem_statement_length    0.058920  0.044370
26                     patch/number_of_lines    0.057584  0.037917
7              code/number_of_variables/diff    0.052367  0.039287
6     code/number_of_control_statements/diff    0.051114  0.038918
24  dependency/number_of_function_calls/diff    0.050642  0.036599
10   code/number_of_function_parameters/diff    0.048472  0.032727
11               code/number_of_returns/diff    0.044756  0.036510
9              code/max_function_length/diff    0.043785  0.031144
27               patch/number_of_added_lines    0.042511  0.030434


In [298]:
from sklearn.inspection import permutation_importance # type: ignore

r = permutation_importance(clf, X_test, y_test, n_repeats=10)
perm_importance = pd.DataFrame({
    'feature': df[metrics].columns,
    'perm_importance': r.importances_mean,
    'perm_std': r.importances_std
}).sort_values('perm_importance', ascending=False)

print(perm_importance.head(10))


                                   feature  perm_importance  perm_std
8        code/average_function_length/diff            0.009  0.007000
3                code/number_of_lines/diff            0.008  0.007483
6   code/number_of_control_statements/diff            0.007  0.004583
11             code/number_of_returns/diff            0.007  0.007810
2               code/max_nested_depth/diff            0.006  0.006633
26                   patch/number_of_lines            0.005  0.006708
28           patch/number_of_removed_lines            0.005  0.005000
12             code/number_of_imports/diff            0.005  0.008062
29       instance/problem_statement_length            0.004  0.004899
21             error/max_except_depth/diff            0.004  0.004899


In [299]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight='balanced')
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.66      0.71        73
           1       0.32      0.44      0.38        27

    accuracy                           0.60       100
   macro avg       0.54      0.55      0.54       100
weighted avg       0.64      0.60      0.62       100

