# Results Analysis
## 1. Setup
### 1.1 Imports

In [1]:
from helpers import *
from ipywidgets import HTML
from scipy.stats.mstats import hmean
import pandas as pd

# Default Styler precision: styled tables show 4 decimals unless a cell
# overrides it with .format(precision=...).
pd.set_option('styler.format.precision', 4)

### 1.2 Constants
These are used to filter and rename the columns and metrics.

In [2]:
# Display names of the individual benchmark training sets.
MARKS = ['CodeXGLUE', 'D2A', 'Draper VDISC']

# Display names of the merged training sets.
COMBO = ['All', 'All Balanced', 'CodeXGLUE+D2A']

# Metric columns kept for the correlation analysis (CET score last).
METRICS = ['avg_prec', 'accuracy', 'f1', 'precision', 'recall', 'cet_score']

# (raw identifier, display name) pairs, applied in this order.
RENAMES = [
    # benchmark datasets
    ('codexglue', 'CodeXGLUE'),
    ('d2a', 'D2A'),
    ('draper', 'Draper VDISC'),
    # merged datasets
    ('all', 'All'),
    ('allbalanced', 'All Balanced'),
    ('codexglued2a', 'CodeXGLUE+D2A'),
    # models
    ('codebert', 'CodeBERT'),
    ('cotext', 'CoTexT'),
    ('linevul', 'LineVul'),
    ('regvd', 'ReGVD'),
    ('textcnn', 'TextCNN'),
    ('xgboost', 'XGBoost'),
]

# Short column headers for the metric columns (cet_score is left unnamed).
METRIC_NAMES = {
    'accuracy': 'Acc.',
    'avg_prec': 'AP',
    'f1': 'F1',
    'precision': 'P',
    'recall': 'R',
}

### 1.3 Datasets
Load the code execution task results and predictions.

In [3]:
predictions = load_predictions()

# One add_truth call per benchmark, in the original order.
# NOTE(review): D2A reads valid.csv while the other two read test.csv — confirm this is intentional.
TRUTH_FILES = [
    ('d2a', '../data/csv/d2a/valid.csv'),
    ('codexglue', '../data/csv/codexglue/test.csv'),
    ('draper', '../data/csv/draper/test.csv'),
]
for benchmark, truth_csv in TRUTH_FILES:
    add_truth(predictions, benchmark, truth_csv)

# Per-model scores on the code execution tasks.
tasks = pd.read_csv('tasks.csv')

## 2. Research Questions
### 2.1 RQ1 - How well do models learn code execution tasks?

We will compute a code execution task (CET) score for each model as the harmonic mean of its six task scores and display the results.

In [4]:
# Harmonic mean over the task* columns only. Selecting them by name (rather
# than "every column after the first") keeps this cell idempotent: on a re-run
# the previously added cet_score column is not folded into its own mean.
task_scores = tasks.filter(regex=r'^task\d+$')
tasks['cet_score'] = task_scores.apply(hmean, axis=1)
tasks.style.background_gradient('RdYlGn', axis=None)

Unnamed: 0,model,task1,task2,task3,task4,task5,task6,cet_score
0,codebert,0.9994,0.9994,0.9788,0.9835,0.9979,0.499,0.8516
1,cotext,0.9998,0.9998,0.9479,0.9909,0.9992,0.501,0.8497
2,linevul,0.9994,0.9993,0.9724,0.9223,0.9985,0.499,0.8428
3,regvd,0.9462,0.9263,0.4989,0.5002,0.611,0.499,0.6135
4,textcnn,0.9809,0.4999,0.4989,0.4998,0.4999,0.501,0.5444
5,xgboost,0.8869,0.7405,0.5241,0.5135,0.5639,0.5011,0.5939


All of the models are able to at least partially learn at least one task. Unexpectedly, ReGVD fails to learn quite a few. XGBoost never fully learns a task, but ends up with a higher harmonic mean (CET score) than TextCNN.

### 2.2 RQ2 - Is SVD performance correlated with code execution ability?

Calculate the metrics and display the first few rows.

In [5]:
# Compute the per-run metrics and attach each model's CET score.
metrics = load_metrics(predictions).merge(tasks[['model', 'cet_score']], on='model')

# Swap raw identifiers for display names, one (raw, display) pair at a time.
for raw_name, display_name in RENAMES:
    metrics = metrics.replace(raw_name, display_name)

# Number of result rows per model.
metrics.groupby('model')[['test']].count()

Output()

Unnamed: 0_level_0,test
model,Unnamed: 1_level_1
CoTexT,18
CodeBERT,18
LineVul,18
ReGVD,18
TextCNN,18
XGBoost,18


#### 2.2.1 Correlation by Metric
Let's look at the correlation of each metric with the CET score.

In [6]:
def cet2metric(ds):
    """Correlate every SVD metric with the CET score for runs trained on `ds`.

    Parameters
    ----------
    ds : list of str
        Training-set names; only rows of the global `metrics` frame whose
        `train` value is in this list are used.

    Returns
    -------
    pandas.DataFrame
        One row per metric (index renamed through METRIC_NAMES) and three
        columns holding the correlation with `cet_score`: 'All' (every
        selected run), 'Train = Test' and 'Train ≠ Test'. Rows are sorted by
        'All', highest first.
    """
    trained_on = metrics.train.isin(ds)
    selections = [
        trained_on,
        trained_on & (metrics.train == metrics.test),
        trained_on & (metrics.train != metrics.test),
    ]

    corr = pd.concat(
        [metrics.loc[rows, METRICS].corr()[['cet_score']] for rows in selections],
        axis=1,
    )
    corr.columns = [ 'All', 'Train = Test', 'Train ≠ Test' ]
    corr = corr.rename(index=METRIC_NAMES)

    # Remove the trivial cet_score-vs-itself row by label. Dropping "the first
    # row after sorting" removes the wrong row whenever that correlation is NaN
    # (e.g. a constant CET score) or another metric ties with it.
    return corr.drop(index='cet_score').sort_values(by='All', ascending=False)


# Diverging colour map pinned to [-1, 1] so colours are comparable across tables.
(
    cet2metric(MARKS)
    .style
    .background_gradient(cmap='bwr_r', axis=None, vmin=-1, vmax=1)
    .format(precision=2)
)

Unnamed: 0,All,Train = Test,Train ≠ Test
P,0.16,0.28,0.15
F1,0.15,0.53,0.01
Acc.,0.08,0.14,0.07
AP,0.05,0.23,0.0
R,0.03,0.3,-0.07


AP correlation with the CET score is noticeably higher when training and testing on the same dataset (0.23), but it almost vanishes (0.05) once runs that train and test on different datasets are included.

#### 2.2.2 AP Correlation by Training and Test Dataset

In [7]:
def cet2ap(ds):
    """Correlation between AP and CET score for each (train, test) pair.

    Uses the rows of the global `metrics` frame whose `train` value is in
    `ds`. Returns a DataFrame indexed by training set ('Train') with one
    column per test set ('Test').
    """
    subset = metrics.loc[metrics.train.isin(ds), ['train', 'test', 'avg_prec', 'cet_score']]

    # groupby().corr() yields one small correlation matrix per pair; after
    # reset_index the unnamed matrix-row level shows up as 'level_2'.
    pairwise = subset.groupby(['train', 'test']).corr()[['cet_score']].reset_index()
    ap_rows = pairwise[pairwise['level_2'] == 'avg_prec']

    table = ap_rows.pivot(index='train', columns='test', values='cet_score')
    return table.rename_axis(index='Train', columns='Test')

(
    cet2ap(MARKS)
    .style
    .background_gradient(cmap='bwr_r', axis=None, vmin=-1, vmax=1)
    .format(precision=2)
)

Test,CodeXGLUE,D2A,Draper VDISC
Train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CodeXGLUE,0.2,-0.1,0.28
D2A,-0.65,0.15,0.14
Draper VDISC,0.36,0.47,0.65


This becomes clear when the AP/CET correlation is broken down by training and test dataset. Models trained on Draper appear to be driving the positive results, while models trained on D2A show a strong negative correlation (-0.65) when tested on CodeXGLUE.

#### 2.2.3 Metrics Correlation by Training and Test Datasets

In [24]:
def cet2data(ds):
    """Correlation of each SVD metric with the CET score, per train/test pair.

    Uses the rows of the global `metrics` frame whose `train` value is in
    `ds`. The result is indexed by ('Train', 'Test'), has one column per
    metric (renamed through METRIC_NAMES), and carries an extra 'Total' row
    per training set computed over all of its test sets.
    """
    keys = [ 'train', 'test' ]
    rows = metrics.loc[metrics.train.isin(ds), keys + METRICS]

    # Correlation with cet_score inside every (train, test) group.
    per_pair = (
        rows.groupby(keys).corr()[['cet_score']]
        .reset_index()
        .rename(columns={'level_2': 'corr'})
        .pivot(index=keys, columns='corr', values='cet_score')
        .drop(columns='cet_score')
    )

    # Same, but pooled over all test sets of a training set.
    per_train = (
        rows.groupby('train').corr(numeric_only=True)[['cet_score']]
        .reset_index()
        .rename(columns={'level_1': 'corr'})
        .pivot(index='train', columns='corr', values='cet_score')
        .drop(columns='cet_score')
        .reset_index()
    )
    per_train['test'] = 'Total'
    per_train = per_train.set_index(keys)

    combined = pd.concat([per_pair, per_train], axis=0).sort_values(by=keys)
    combined.columns.name = None
    combined.index.names = [ 'Train', 'Test' ]
    return combined.rename(columns=METRIC_NAMES)

# Summary rows over the individual benchmarks; the 'Test' level is left blank.
marks = cet2metric(MARKS).transpose().rename_axis('Train').reset_index()
marks['Test'] = ''
marks = marks.set_index(['Train', 'Test'])

# Summary rows over the merged datasets, relabelled with a 'Combo' prefix.
combo = cet2metric(COMBO).transpose().rename_axis('Train').reset_index()
combo['Train'] = ['Combo All', 'Combo Train = Test', 'Combo Train != Test']
combo['Test'] = ''
combo = combo.set_index(['Train', 'Test'])

# Per-pair correlations first, then the two groups of summary rows.
corr = pd.concat([cet2data(MARKS + COMBO), marks, combo], axis=0)

(
    corr.style
    .background_gradient(cmap='bwr_r', axis=None, vmin=-1, vmax=1)
    .format(precision=2)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Acc.,AP,F1,P,R
Train,Test,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
All,CodeXGLUE,0.72,0.78,0.4,0.72,0.06
All,D2A,0.61,0.82,0.4,0.51,0.32
All,Draper VDISC,0.78,0.7,0.79,0.85,0.18
All,Total,0.17,0.58,0.49,0.65,0.19
All Balanced,CodeXGLUE,-0.05,0.53,0.76,0.05,0.59
All Balanced,D2A,0.77,0.87,0.76,0.77,0.76
All Balanced,Draper VDISC,0.37,0.4,0.5,0.43,0.32
All Balanced,Total,0.21,0.21,0.22,0.11,0.49
CodeXGLUE,CodeXGLUE,0.43,0.2,0.47,0.31,0.55
CodeXGLUE,D2A,0.34,-0.1,0.15,0.37,-0.07


#### 2.2.4 AP Drop On Train/Test Difference

How much worse do models get when they aren't trained and tested on the same dataset?

In [9]:
group = [ 'train' ]
on_marks = metrics.train.isin(MARKS)
matched = metrics.train == metrics.test
unmatched = metrics.train != metrics.test

# Mean metrics per training set, in-distribution vs. cross-dataset.
same = metrics.loc[on_marks & matched, group + METRICS].groupby(group).mean()
diff = metrics.loc[on_marks & unmatched, group + METRICS].groupby(group).mean()

# Negative values mean the models do worse on a different test dataset.
diff = diff - same
diff.index.name = None

(
    diff.rename(columns=METRIC_NAMES)
    .drop(columns='cet_score')
    .style.background_gradient(cmap='bwr_r', vmin=-1, vmax=1)
    .format(precision=2)
)

Unnamed: 0,AP,Acc.,F1,P,R
CodeXGLUE,-0.31,-0.16,-0.12,-0.29,0.29
D2A,-0.39,-0.09,-0.35,-0.38,-0.17
Draper VDISC,0.09,-0.4,-0.24,-0.01,-0.44


### 2.3 RQ3 - Does combining datasets improve SVD performance?

#### 2.3.1 AP Difference by Train and Test Dataset

Let's look at how the AP changes for each benchmark.

In [10]:
# AP of every model on every test set when trained on a merged dataset.
merged_runs = metrics[metrics.train.isin(COMBO)]
combo = merged_runs.groupby(['model', 'train', 'test'])[['avg_prec']].first()

# Reference AP per model and test set among the single-benchmark runs:
# rows are sorted by AP (descending) before first() is taken.
benchmark_runs = metrics[metrics.train.isin(MARKS)].sort_values(by='avg_prec', ascending=False)
marks = benchmark_runs.groupby(['model', 'test'])[['avg_prec']].first()

diff = (combo - marks).reset_index().pivot(index=['model', 'train'], columns='test', values='avg_prec')

(
    diff.style
    .background_gradient(cmap='bwr_r', vmin=-1, vmax=1)
    .format(precision=2)
)

Unnamed: 0_level_0,test,CodeXGLUE,D2A,Draper VDISC
model,train,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CoTexT,All,-0.0,-0.03,0.02
CoTexT,All Balanced,-0.06,-0.08,-0.07
CoTexT,CodeXGLUE+D2A,0.01,-0.03,-0.28
CodeBERT,All,-0.09,-0.06,0.02
CodeBERT,All Balanced,-0.14,-0.18,-0.17
CodeBERT,CodeXGLUE+D2A,-0.0,-0.08,-0.47
LineVul,All,-0.02,-0.07,0.01
LineVul,All Balanced,-0.09,-0.17,-0.19
LineVul,CodeXGLUE+D2A,-0.01,-0.04,-0.48
ReGVD,All,0.02,-0.1,-0.01


In [11]:
# Raw metric rows for a single model, for spot-checking the table above.
metrics[metrics['model'] == 'CoTexT']

Unnamed: 0,model,train,test,has_logits,avg_prec,accuracy,f1,precision,recall,cet_score
18,CoTexT,All,D2A,False,0.5595,0.5822,0.6448,0.5751,0.7338,0.849695
19,CoTexT,All,Draper VDISC,False,0.3632,0.9453,0.5797,0.5774,0.5821,0.849695
20,CoTexT,All,CodeXGLUE,False,0.5283,0.612,0.5853,0.5749,0.596,0.849695
21,CoTexT,All Balanced,CodeXGLUE,False,0.4762,0.5007,0.63,0.4776,0.9251,0.849695
22,CoTexT,All Balanced,Draper VDISC,False,0.2733,0.8887,0.4691,0.3394,0.7594,0.849695
23,CoTexT,All Balanced,D2A,False,0.5153,0.5134,0.6771,0.5153,0.987,0.849695
24,CoTexT,CodeXGLUE,D2A,False,0.5179,0.5067,0.5702,0.5186,0.6331,0.849695
25,CoTexT,CodeXGLUE,Draper VDISC,False,0.0684,0.5475,0.1261,0.072,0.5038,0.849695
26,CoTexT,CodeXGLUE,CodeXGLUE,False,0.5329,0.6197,0.574,0.5912,0.5578,0.849695
27,CoTexT,CodeXGLUE+D2A,CodeXGLUE,False,0.5455,0.6358,0.5786,0.6175,0.5442,0.849695


#### 2.3.2 Mean AP Difference by Training and Test Datasets

Let's look at the mean AP difference by training dataset to compress the information above.

In [12]:
# Average the per-model AP differences within each merged training set.
mean_diff = diff.reset_index().groupby('train').mean(numeric_only=True)
mean_diff.style.background_gradient(cmap='bwr_r', vmin=-1, vmax=1).format(precision=2)

test,CodeXGLUE,D2A,Draper VDISC
train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All,-0.07,-0.1,0.0
All Balanced,-0.1,-0.15,-0.11
CodeXGLUE+D2A,0.0,-0.05,-0.33


#### 2.3.3 Metric Differences by Train/Test Pair

In [25]:
# Label every run by how its training set relates to its test set.
ap = metrics.copy()
ap['name'] = ''
ap.loc[ap.train.isin(COMBO), 'name'] = 'On Merged'
ap.loc[ap.train == ap.test, 'name'] = '= Test'
ap.loc[ap.train.isin(MARKS) & (ap.train != ap.test), 'name'] = '≠ Test'

# Mean of each SVD metric per label, with the metric columns in alphabetical
# order of their raw names (the order the earlier melt/pivot round trip produced).
ap = ap.groupby('name')[['avg_prec', 'accuracy', 'f1', 'precision', 'recall']].mean()
ap = ap.sort_index(axis=1).rename(columns=METRIC_NAMES)
ap.columns.name = None
ap.index.name = 'Train'
ap = ap.reset_index()

# The previous last statement (`rint(ap.style...` with an unclosed parenthesis)
# was a SyntaxError; a bare Styler expression renders the table.
ap.style.background_gradient(cmap='Blues').format(precision=2)

Unnamed: 0,Train,Acc.,AP,F1,P,R
0,= Test,0.71,0.56,0.49,0.55,0.51
1,On Merged,0.58,0.46,0.49,0.44,0.71
2,≠ Test,0.49,0.35,0.25,0.32,0.4


### 2.4 RQ4 - Does combining datasets improve SVD/CET correlation?

#### 2.4.1 Correlation by Metric

In [14]:
marks = cet2metric(MARKS)
combo = cet2metric(COMBO)

# (heading HTML, frame) pairs rendered one after another with identical styling.
sections = [
    ('<h4 style="margin:0">Correlation with Benchmarks</h4>', marks),
    ('<h4 style="margin:1rem 0 0">Correlation with Merged</h4>', combo),
    ('<h4 style="margin:1rem 0 0">Merged - Benchmarks</h4>', combo - marks),
]
for heading, frame in sections:
    display(HTML(heading))
    display(frame.style.background_gradient(cmap='bwr_r', axis=None, vmin=-1, vmax=1).format(precision=2))

HTML(value='<h4 style="margin:0">Correlation with Benchmarks</h4>')

Unnamed: 0,All,Train = Test,Train ≠ Test
P,0.16,0.28,0.15
F1,0.15,0.53,0.01
Acc.,0.08,0.14,0.07
AP,0.05,0.23,0.0
R,0.03,0.3,-0.07


HTML(value='<h4 style="margin:1rem 0 0">Correlation with Merged</h4>')

Unnamed: 0,All,Train = Test,Train ≠ Test
F1,0.23,,0.23
Acc.,0.22,,0.22
AP,0.18,,0.18
P,0.12,,0.12
R,0.11,,0.11


HTML(value='<h4 style="margin:1rem 0 0">Merged - Benchmarks</h4>')

Unnamed: 0,All,Train = Test,Train ≠ Test
AP,0.13,,0.18
Acc.,0.14,,0.15
F1,0.08,,0.22
P,-0.04,,-0.03
R,0.08,,0.18


In [15]:
# Rename the merged column before concatenating. Patching the label afterwards
# through `t.columns.values[-1] = ...` writes into the Index's backing array,
# which pandas treats as immutable, and was only needed because the concat
# produced two columns called 'All'.
merged = combo[['All']].rename(columns={'All': 'Merged'})
t = pd.concat([marks, merged], axis=1).transpose()

# Metric columns in alphabetical order, rows ranked by their AP correlation.
t = t[sorted(t)]
t.sort_values(by='AP', ascending=False).style.background_gradient(cmap='Blues', axis=None).format(precision=2)

Unnamed: 0,AP,Acc.,F1,P,R
Train = Test,0.23,0.14,0.53,0.28,0.3
Merged,0.18,0.22,0.23,0.12,0.11
All,0.05,0.08,0.15,0.16,0.03
Train ≠ Test,0.0,0.07,0.01,0.15,-0.07


#### 2.4.2 Correlation by Training and Test Dataset

In [28]:
# AP/CET correlation per (train, test) pair for benchmark and merged training sets.
apm = cet2ap(MARKS)
apc = cet2ap(COMBO)
ap = pd.concat([apm, apc])

# Row mean over the test-dataset columns only, computed once. Previously a
# 'mean' column was added to each part and then averaged again together with
# the dataset columns, and each part was sorted before the final sort.
ap['mean'] = ap.mean(axis=1)
ap = ap.sort_values(by='mean', ascending=False)

(
    ap.style
    .background_gradient(cmap='bwr_r', axis=None, vmin=-1, vmax=1)
    .format(precision=2)
)

Test,CodeXGLUE,D2A,Draper VDISC,mean
Train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
All,0.78,0.82,0.7,0.77
All Balanced,0.53,0.87,0.4,0.6
Draper VDISC,0.36,0.47,0.65,0.49
CodeXGLUE,0.2,-0.1,0.28,0.13
CodeXGLUE+D2A,0.33,0.19,-0.18,0.11
D2A,-0.65,0.15,0.14,-0.12


## 3. Miscellaneous
### 3.1 Average Precision

In [17]:
# Mean AP per (model, train, test); select the column before aggregating so
# no other column takes part in the mean.
ap = metrics.groupby(['model', 'train', 'test'])[['avg_prec']].mean().reset_index()

# pivot() arguments are passed by keyword: the positional form is deprecated
# and is keyword-only in pandas 2.x.
ap = ap.pivot(index=['model', 'train'], columns='test', values='avg_prec')
ap['mean'] = ap.mean(axis=1)
ap.style.background_gradient(cmap='Blues')

  ap = ap.pivot(['model', 'train'], 'test', 'avg_prec')


Unnamed: 0_level_0,test,CodeXGLUE,D2A,Draper VDISC,mean
model,train,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CoTexT,All,0.5283,0.5595,0.3632,0.4837
CoTexT,All Balanced,0.4762,0.5153,0.2733,0.4216
CoTexT,CodeXGLUE,0.5329,0.5179,0.0684,0.3731
CoTexT,CodeXGLUE+D2A,0.5455,0.5595,0.0676,0.3909
CoTexT,D2A,0.4625,0.5905,0.0624,0.3718
CoTexT,Draper VDISC,0.4583,0.527,0.3475,0.4443
CodeBERT,All,0.5921,0.6397,0.5542,0.5953
CodeBERT,All Balanced,0.5373,0.5198,0.3718,0.4763
CodeBERT,CodeXGLUE,0.6821,0.5661,0.0705,0.4396
CodeBERT,CodeXGLUE+D2A,0.6816,0.6249,0.0711,0.4592


In [18]:
# NOTE(review): disabled exploratory code — a per-metric variant of the AP
# difference tables in section 2.3. It is never executed; if revived, the
# pivot() call below needs keyword arguments (index=, columns=, values=).
# for metric in METRICS:
#     combo = metrics[metrics.train.isin(COMBO)].groupby(['model', 'train', 'test'])[[metric]].mean()
#     marks = metrics[metrics.train.isin(MARKS)].sort_values(by='avg_prec', ascending=False).groupby(['model', 'test'])[[metric]].first()
#     diff = (combo - marks).reset_index().pivot(['model', 'train'], 'test', metric)
#     # display(combo)
#     # display(marks)
#     display(HTML(f'<h4>{metric}</h4>'))
#     display(diff.reset_index().groupby('train').mean().style.background_gradient(cmap='bwr_r', vmin=-1, vmax=1))
#     display(diff.style.background_gradient(cmap='bwr_r', vmin=-1, vmax=1))

In [19]:
# Highest-AP run for every (test set, model) pair, listed per test set from
# best to worst AP.
# NOTE(review): GroupBy.first() takes the first non-null value per column, so
# a NaN in the top row would be filled from a lower-ranked run — confirm the
# metrics contain no NaNs.
top = (
    metrics
    .sort_values(by='avg_prec', ascending=False)
    .groupby(['test', 'model'])
    .first()
    .reset_index()
    .sort_values(by=['test', 'avg_prec'], ascending=[True, False])
)
top.style.background_gradient(cmap='Blues')

Unnamed: 0,test,model,train,has_logits,avg_prec,accuracy,f1,precision,recall,cet_score
2,CodeXGLUE,LineVul,CodeXGLUE,True,0.7154,0.6526,0.5359,0.6937,0.4367,0.8428
4,CodeXGLUE,TextCNN,CodeXGLUE,True,0.6975,0.6501,0.4963,0.7325,0.3753,0.5444
1,CodeXGLUE,CodeBERT,CodeXGLUE,True,0.6821,0.6373,0.531,0.6538,0.447,0.8516
5,CodeXGLUE,XGBoost,CodeXGLUE,True,0.5945,0.5838,0.4669,0.5672,0.3968,0.5939
3,CodeXGLUE,ReGVD,CodeXGLUE+D2A,True,0.5551,0.5794,0.5499,0.5408,0.5594,0.6135
0,CodeXGLUE,CoTexT,CodeXGLUE+D2A,False,0.5455,0.6358,0.5786,0.6175,0.5442,0.8497
7,D2A,CodeBERT,D2A,True,0.7033,0.6208,0.6523,0.6199,0.6883,0.8516
8,D2A,LineVul,D2A,True,0.6961,0.6124,0.6078,0.637,0.5812,0.8428
10,D2A,TextCNN,D2A,True,0.6602,0.5638,0.3401,0.7791,0.2175,0.5444
11,D2A,XGBoost,D2A,True,0.6602,0.5956,0.644,0.5908,0.7078,0.5939


In [20]:
# Long format: one row per (model, train, test, metric).
table = metrics.melt(['model', 'train', 'test'])

# Keep AP everywhere, accuracy for CodeXGLUE and D2A, and F1 for Draper VDISC.
is_acc = (table.variable == 'accuracy') & table.test.isin(['CodeXGLUE', 'D2A'])
is_f1 = (table.variable == 'f1') & (table.test == 'Draper VDISC')
is_ap = table.variable == 'avg_prec'
table = table.loc[is_acc | is_f1 | is_ap]

table = table.sort_values(by='model').pivot(
    index=['model', 'train'],
    columns=['test', 'variable'],
    values='value',
)

# Sort columns, then swap the last two so AP comes second for Draper VDISC too.
cols = sorted(table)
cols[-2:] = [cols[-1], cols[-2]]
table = table[cols].rename(columns=METRIC_NAMES)

table.columns.names = [ None, None ]
table.index.names = [ 'Model', 'Train' ]
table.style.highlight_max()

Unnamed: 0_level_0,Unnamed: 1_level_0,CodeXGLUE,CodeXGLUE,D2A,D2A,Draper VDISC,Draper VDISC
Unnamed: 0_level_1,Unnamed: 1_level_1,Acc.,AP,Acc.,AP,F1,AP
Model,Train,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
CoTexT,All,0.612,0.5283,0.5822,0.5595,0.5797,0.3632
CoTexT,All Balanced,0.5007,0.4762,0.5134,0.5153,0.4691,0.2733
CoTexT,CodeXGLUE,0.6197,0.5329,0.5067,0.5179,0.1261,0.0684
CoTexT,CodeXGLUE+D2A,0.6358,0.5455,0.5772,0.5595,0.1226,0.0676
CoTexT,D2A,0.5234,0.4625,0.6174,0.5905,0.1074,0.0624
CoTexT,Draper VDISC,0.5348,0.4583,0.5034,0.527,0.5623,0.3475
CodeBERT,All,0.5011,0.5921,0.5117,0.6397,0.5978,0.5542
CodeBERT,All Balanced,0.485,0.5373,0.5168,0.5198,0.3261,0.3718
CodeBERT,CodeXGLUE,0.6373,0.6821,0.5302,0.5661,0.1215,0.0705
CodeBERT,CodeXGLUE+D2A,0.6376,0.6816,0.5201,0.6249,0.1337,0.0711


In [21]:
# One highlighted sub-table per model, each under its own heading.
for model_name, model_rows in table.groupby('Model'):
    heading = HTML(f'<h4>{model_name}</h4>')
    display(heading)
    display(model_rows.style.highlight_max())

HTML(value='<h4>CoTexT</h4>')

Unnamed: 0_level_0,Unnamed: 1_level_0,CodeXGLUE,CodeXGLUE,D2A,D2A,Draper VDISC,Draper VDISC
Unnamed: 0_level_1,Unnamed: 1_level_1,Acc.,AP,Acc.,AP,F1,AP
Model,Train,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
CoTexT,All,0.612,0.5283,0.5822,0.5595,0.5797,0.3632
CoTexT,All Balanced,0.5007,0.4762,0.5134,0.5153,0.4691,0.2733
CoTexT,CodeXGLUE,0.6197,0.5329,0.5067,0.5179,0.1261,0.0684
CoTexT,CodeXGLUE+D2A,0.6358,0.5455,0.5772,0.5595,0.1226,0.0676
CoTexT,D2A,0.5234,0.4625,0.6174,0.5905,0.1074,0.0624
CoTexT,Draper VDISC,0.5348,0.4583,0.5034,0.527,0.5623,0.3475


HTML(value='<h4>CodeBERT</h4>')

Unnamed: 0_level_0,Unnamed: 1_level_0,CodeXGLUE,CodeXGLUE,D2A,D2A,Draper VDISC,Draper VDISC
Unnamed: 0_level_1,Unnamed: 1_level_1,Acc.,AP,Acc.,AP,F1,AP
Model,Train,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
CodeBERT,All,0.5011,0.5921,0.5117,0.6397,0.5978,0.5542
CodeBERT,All Balanced,0.485,0.5373,0.5168,0.5198,0.3261,0.3718
CodeBERT,CodeXGLUE,0.6373,0.6821,0.5302,0.5661,0.1215,0.0705
CodeBERT,CodeXGLUE+D2A,0.6376,0.6816,0.5201,0.6249,0.1337,0.0711
CodeBERT,D2A,0.5051,0.468,0.6208,0.7033,0.1312,0.0738
CodeBERT,Draper VDISC,0.5344,0.4749,0.4933,0.5177,0.5947,0.5371


HTML(value='<h4>LineVul</h4>')

Unnamed: 0_level_0,Unnamed: 1_level_0,CodeXGLUE,CodeXGLUE,D2A,D2A,Draper VDISC,Draper VDISC
Unnamed: 0_level_1,Unnamed: 1_level_1,Acc.,AP,Acc.,AP,F1,AP
Model,Train,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
LineVul,All,0.5948,0.6944,0.5185,0.6227,0.5927,0.5432
LineVul,All Balanced,0.4777,0.6244,0.5168,0.5251,0.3564,0.3441
LineVul,CodeXGLUE,0.6526,0.7154,0.5168,0.4921,0.1216,0.0834
LineVul,CodeXGLUE+D2A,0.6409,0.71,0.5822,0.6602,0.1164,0.0589
LineVul,D2A,0.5183,0.4736,0.6124,0.6961,0.1092,0.063
LineVul,Draper VDISC,0.5403,0.4836,0.4883,0.5303,0.5923,0.5355


HTML(value='<h4>ReGVD</h4>')

Unnamed: 0_level_0,Unnamed: 1_level_0,CodeXGLUE,CodeXGLUE,D2A,D2A,Draper VDISC,Draper VDISC
Unnamed: 0_level_1,Unnamed: 1_level_1,Acc.,AP,Acc.,AP,F1,AP
Model,Train,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
ReGVD,All,0.4671,0.4985,0.5117,0.5207,0.4247,0.4044
ReGVD,All Balanced,0.4594,0.4894,0.5168,0.482,0.1281,0.3255
ReGVD,CodeXGLUE,0.5406,0.4635,0.4832,0.6078,0.0,0.0457
ReGVD,CodeXGLUE+D2A,0.5794,0.5551,0.5403,0.5643,0.1354,0.0863
ReGVD,D2A,0.5165,0.4785,0.6141,0.6255,0.1075,0.07
ReGVD,Draper VDISC,0.5307,0.4661,0.4883,0.5213,0.4281,0.4161


HTML(value='<h4>TextCNN</h4>')

Unnamed: 0_level_0,Unnamed: 1_level_0,CodeXGLUE,CodeXGLUE,D2A,D2A,Draper VDISC,Draper VDISC
Unnamed: 0_level_1,Unnamed: 1_level_1,Acc.,AP,Acc.,AP,F1,AP
Model,Train,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
TextCNN,All,0.4579,0.4683,0.5067,0.5235,0.3911,0.345
TextCNN,All Balanced,0.5132,0.501,0.4631,0.4993,0.3939,0.3243
TextCNN,CodeXGLUE,0.6501,0.6975,0.5168,0.5277,0.1217,0.0636
TextCNN,CodeXGLUE+D2A,0.6175,0.6424,0.5403,0.6229,0.1138,0.0574
TextCNN,D2A,0.5447,0.4837,0.5638,0.6602,0.0987,0.0646
TextCNN,Draper VDISC,0.5212,0.4703,0.4916,0.5228,0.403,0.3447


HTML(value='<h4>XGBoost</h4>')

Unnamed: 0_level_0,Unnamed: 1_level_0,CodeXGLUE,CodeXGLUE,D2A,D2A,Draper VDISC,Draper VDISC
Unnamed: 0_level_1,Unnamed: 1_level_1,Acc.,AP,Acc.,AP,F1,AP
Model,Train,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
XGBoost,All,0.5315,0.4768,0.448,0.452,0.0994,0.2291
XGBoost,All Balanced,0.4821,0.4682,0.4732,0.4828,0.2102,0.1436
XGBoost,CodeXGLUE,0.5838,0.5945,0.5168,0.4825,0.1253,0.0871
XGBoost,CodeXGLUE+D2A,0.6007,0.5865,0.5168,0.5991,0.1281,0.0727
XGBoost,D2A,0.5004,0.4674,0.5956,0.6602,0.1159,0.0623
XGBoost,Draper VDISC,0.537,0.4578,0.4715,0.4879,0.1016,0.2355
