In [129]:
import pandas as pd
import seaborn as sns
import wandb

# Results
Get results from Weights and Biases and produce plots for paper

In [130]:
api = wandb.Api()

# Project is specified by <entity/project-name>
runs = api.runs("messer/JavaDoc-Relevance-Binary-Classifier")

summary_list, config_list, name_list, group_list = [], [], [], []
for run in runs:
    # .summary contains the output keys/values for metrics like accuracy.
    #  We call ._json_dict to omit large files
    summary_list.append(run.summary._json_dict)

    # .config contains the hyperparameters.
    #  We remove special values that start with _.
    config_list.append(
        {k: v for k,v in run.config.items()
         if not k.startswith('_')})

    # .name is the human-readable name of the run.
    name_list.append(run.name)
    group_list.append(run.group)

runs_df = pd.DataFrame({
    "summary": summary_list,
    "config": config_list,
    "name": name_list,
    "group": group_list
})

runs_df.to_csv("data/training_results.csv")
runs_df.head()

Unnamed: 0,summary,config,name,group
0,"{'_step': 8, 'eval/runtime': 43.7549, 'eval/re...","{'bf16': False, 'fp16': False, 'fsdp': '[]', '...",grateful-gorge-1364,LORA:microsoft/codebert-base
1,"{'train/total_flos': 1028463566524416.0, 'eval...","{'bf16': False, 'fp16': False, 'fsdp': '[]', '...",fancy-river-1361,LORA:microsoft/codebert-base
2,"{'_wandb': {'runtime': 5150}, 'train/loss': 0,...","{'bf16': False, 'fp16': False, 'fsdp': '[]', '...",golden-terrain-1356,Fine-Tuned LLM:microsoft/codebert-base
3,"{'test/f1_macro': 0.8903430662282452, 'eval/re...","{'bf16': False, 'fp16': False, 'fsdp': '[]', '...",laced-paper-1355,Fine-Tuned LLM:microsoft/codebert-base
4,"{'test/f1_weighted': 0.88553575990578, 'train/...","{'bf16': False, 'fp16': False, 'fsdp': '[]', '...",fresh-violet-1354,Fine-Tuned LLM:microsoft/codebert-base


### Process WandB API call results

In [131]:
runs_df = pd.read_csv('data/training_results.csv')

In [132]:
df = pd.concat([runs_df[['name', 'group', 'config']], pd.json_normalize(runs_df['summary'])], axis=1)

df.head()

Unnamed: 0,name,group,config
0,grateful-gorge-1364,LORA:microsoft/codebert-base,"{'bf16': False, 'fp16': False, 'fsdp': '[]', '..."
1,fancy-river-1361,LORA:microsoft/codebert-base,"{'bf16': False, 'fp16': False, 'fsdp': '[]', '..."
2,golden-terrain-1356,Fine-Tuned LLM:microsoft/codebert-base,"{'bf16': False, 'fp16': False, 'fsdp': '[]', '..."
3,laced-paper-1355,Fine-Tuned LLM:microsoft/codebert-base,"{'bf16': False, 'fp16': False, 'fsdp': '[]', '..."
4,fresh-violet-1354,Fine-Tuned LLM:microsoft/codebert-base,"{'bf16': False, 'fp16': False, 'fsdp': '[]', '..."


### Group and produce plots

In [133]:
grey = (187/255, 187/255, 187/255)
darkgreen = (51/255, 117/255, 56/255)
teal = (93/255, 168/255, 153/255)
blue = (148/255, 203/255, 236/255)
yellow = (220/255, 205/255, 125/255)

In [134]:
grouped_df = df[['group', 'test/accuracy', 'test/precision_weighted', 'test/recall_weighted', 'test/f1_weighted']].groupby('group').max().reset_index()
grouped_df.columns = ['group', 'accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
grouped_df

KeyError: "['test/accuracy', 'test/precision_weighted', 'test/recall_weighted', 'test/f1_weighted'] not in index"

In [None]:
acc = sns.barplot(grouped_df[['group', 'accuracy']].sort_values('accuracy', ascending=False), y='group', x='accuracy', color=grey)
acc.set(title='Maximum Test Accuracy', xlabel='Accuracy', ylabel='Model Type')
acc.get_figure().savefig('plots/accuracy.pdf', bbox_inches='tight')

In [None]:
recall = sns.barplot(grouped_df[['group', 'recall_weighted']].sort_values('recall_weighted', ascending=False), y='group', x='recall_weighted', color=grey)
recall.set(title='Maximum Test Recall', xlabel='Weighted Recall', ylabel='Model Type')
recall.get_figure().savefig('plots/recall.pdf', bbox_inches='tight')

In [None]:
precision = sns.barplot(grouped_df[['group', 'precision_weighted']].sort_values('precision_weighted', ascending=False), y='group', x='precision_weighted', color=grey)
precision.set(title='Maximum Test Precision', xlabel='Weighted Precision', ylabel='Model Type')
precision.get_figure().savefig('plots/precision.pdf', bbox_inches='tight')

In [None]:
f1 = sns.barplot(grouped_df[['group', 'f1_weighted']].sort_values('f1_weighted', ascending=False), y='group', x='f1_weighted', color=grey)
f1.set(title='Maximum Test F1 Score', xlabel='Weighted F1 Score', ylabel='Model Type')
f1.get_figure().savefig('plots/f1.pdf', bbox_inches='tight')

In [None]:
grouped_df_melted = grouped_df.melt(id_vars=['group'])
grouped_df_melted['variable'] = grouped_df_melted['variable'].map(lambda metric_name: metric_name.title().replace('_Weighted', ''))
grouped_df_melted

In [None]:
metrics = sns.barplot(grouped_df_melted.sort_values(['value'], ascending=False), y='group', x='value', hue='variable', hue_order=['Accuracy', 'Precision', 'Recall', 'F1'])


hatches = ['//', 'x', '\\', 'o']
colors = [grey, teal, blue, darkgreen]
styles = zip(hatches, colors)


for style, these_bars in zip(styles, metrics.containers):
    for this_bar in these_bars:
        this_bar.set_hatch(3 * style[0])
        this_bar.set_facecolor(style[1])
        this_bar.set_edgecolor('black')

metrics.set(xlabel='Maximum Result', ylabel='Model Type')
metrics.legend(title='Metric')

metrics.get_figure().savefig('plots/metric_results.pdf', bbox_inches='tight')