# Parse Weights & Biases Exports
Load a Weights & Biases (W&B) run export (CSV) and turn it into the tables and graphs used for reporting.

In [1]:
import pandas as pd

In [2]:
# CONFIGURATION
# CSV export of the W&B runs to analyse (path is relative to this notebook).
data_filepath = "../wandb_exports/wandb_apv_export.csv"
# Label of interest; per-task metric columns are looked up as test_<label>_<metric>.
label_col = "apv"
# Metric key as it appears in column names, and its display form for table headers.
metric_name, metric_name_long = "f1", "F1"

In [3]:
# Read Data
# Load the full export; every later cell selects its columns from `df`.
df = pd.read_csv(data_filepath)
# Short alias for the configured label, used when building per-task column names.
c = label_col
df.columns

Index(['Name', 'task', 'device', 'base_lr', 'dim_hid', 'shuffle', 'test_run',
       'grad_clip', 'label_col', 'multitask',
       ...
       'gradients/graph_3heads.head2.1.bias', 'test_svo_dist_norm_mae',
       'train_svo_dist_norm_r2', 'parameters/graph_3heads.head2.1.weight',
       'parameters/graph_3heads.head2.1.bias', 'train_svo_dist_norm_mse',
       'train_svo_dist_norm_mae', 'train_batch_svo_dist_norm_r2',
       'gradients/graph_3heads.head1.4.weight',
       'gradients/graph_3heads.head1.4.bias'],
      dtype='object', length=669)

In [4]:
def make_tasks_string(label_cols: str) -> str:
    """Turn a serialized task list into a comma-separated display string.

    Args:
        label_cols: Task list as stored in the export, e.g. '["apv","scv"]'.

    Returns:
        The task names joined by ", " (e.g. "apv, scv"). Square brackets and
        double quotes are removed. Whitespace around each name and empty
        entries are dropped, so '["apv", "scv"]' also yields "apv, scv"
        rather than a double-spaced string.
    """
    # Remove the list/quote punctuation in one pass, then split into names.
    without_punctuation = label_cols.translate(str.maketrans("", "", '[]"'))
    names = (name.strip() for name in without_punctuation.split(","))
    return ", ".join(name for name in names if name)

In [78]:
# Build the presentation table: one row per run with its headline test metric.
metric_col = f"test_{metric_name}"
task_metric_col = f"test_{c}_{metric_name}"
cols = ["multitask", "label_cols", "architecture", "dim_hid", metric_col, "model_n_params", "epoch", "_wandb.runtime", task_metric_col]

# The per-task metric column is only selected when the export actually has it.
some_multitask = task_metric_col in df.columns
# Explicit copy: the columns assigned below must not write into a slice of `df`
# (this is what raised SettingWithCopyWarning before).
df_table = df[cols if some_multitask else cols[:-1]].copy()

if some_multitask:
    # Rows with multitask == False keep their own metric; all other rows take
    # the per-task metric for the configured label.
    is_single_task = df_table["multitask"].eq(False)
    df_table[metric_col] = df_table[metric_col].where(is_single_task, df_table[task_metric_col])

df_table = df_table[["multitask", "label_cols", "architecture", "dim_hid", metric_col]].copy()
# Only rows with multitask == True have a task list to format; the rest show "N/A".
# A list comprehension (instead of a row-wise apply) also works on an empty frame.
df_table["label_cols"] = [
    make_tasks_string(tasks) if is_multitask else "N/A"
    for tasks, is_multitask in zip(df_table["label_cols"], df_table["multitask"].eq(True))
]
# Collapse the architecture name to a short family label.
df_table["architecture"] = df_table["architecture"].map(lambda arch: "BERT" if "BERT" in arch else "BiLSTM")
df_table = df_table.rename(
    columns={
        "multitask": "Multitask?",
        "label_cols": "Tasks",
        "architecture": "Architecture",
        metric_col: f"Test {metric_name_long}",
        "dim_hid": "Hidden Dimension Size",
    }
)
df_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Multitask?,Tasks,Architecture,Hidden Dimension Size,Test R^2
0,False,,BERT,20,0.643181
1,False,,BiLSTM,20,0.563724
2,False,,BiLSTM,20,0.313645
3,False,,BiLSTM,20,
4,False,,BiLSTM,20,
5,False,,BiLSTM,20,
6,False,,BiLSTM,20,
7,False,,BiLSTM,20,
8,False,,BiLSTM,20,
9,False,,BiLSTM,20,


In [79]:
# Emit the table as LaTeX source: floats formatted to 3 decimals, index omitted.
print(
    df_table.to_latex(
        index=False,
        float_format=lambda value: f"{value:.3f}",
    )
)

\begin{tabular}{lllrr}
\toprule
 Multitask? &                       Tasks & Architecture &  Hidden Dimension Size &  Test R\textasciicircum 2 \\
\midrule
      False &                         N/A &         BERT &                     20 &     0.643 \\
      False &                         N/A &       BiLSTM &                     20 &     0.564 \\
      False &                         N/A &       BiLSTM &                     20 &     0.314 \\
      False &                         N/A &       BiLSTM &                     20 &       NaN \\
      False &                         N/A &       BiLSTM &                     20 &       NaN \\
      False &                         N/A &       BiLSTM &                     20 &       NaN \\
      False &                         N/A &       BiLSTM &                     20 &       NaN \\
      False &                         N/A &       BiLSTM &                     20 &       NaN \\
      False &                         N/A &       BiLSTM &            

In [5]:
# Columns needed for the runtime analysis (model size, epoch count, wall-clock runtime).
cols = ["multitask", "label_cols", "architecture", "dim_hid", f"test_{metric_name}", "model_n_params", "epoch", "_wandb.runtime", f"test_{c}_{metric_name}", "early_stopping"]
# Take an explicit copy: later cells add columns to df_table, and doing that on a
# plain slice of `df` raises SettingWithCopyWarning.
df_table = df[cols].copy()
df_table

Unnamed: 0,multitask,label_cols,architecture,dim_hid,test_f1,model_n_params,epoch,_wandb.runtime,test_apv_f1,early_stopping
0,False,,BERTClassifier,20,0.983775,66378323.0,2,771,,True
1,False,,BiLSTMClassifier,256,0.931564,10509827.0,5,300,,True
2,False,,BiLSTMClassifier,128,0.958705,8621059.0,7,367,,True
3,False,,BiLSTMClassifier,64,0.945152,8082179.0,7,430,,True
4,False,,BiLSTMClassifier,32,0.915096,,6,573,,True
5,False,,BiLSTMClassifier,20,0.89445,,4,197,,True
6,True,"[""apv"",""scv""]",BiLSTMMultitask,20,,7868998.0,6,333,0.934863,True
7,True,"[""apv"",""scv""]",BiLSTMMultitask,20,,7868998.0,6,333,0.926762,False
8,True,"[""apv"",""scv"",""hv""]",BiLSTMMultitask,20,,7869481.0,6,373,0.907786,False
9,True,"[""svo_dist_norm"",""apv"",""scv"",""hv""]",BiLSTMMultitask,20,,7869922.0,6,388,0.570454,False


In [6]:
# Derive per-run epoch counts and mean runtime per epoch.
# Written as one chain that returns a new frame, so nothing is assigned onto a
# slice of `df` (no SettingWithCopyWarning) and the arithmetic is vectorized.
df_table = (
    df_table
    # num_epochs is the logged "epoch" value plus one.
    .assign(num_epochs=lambda table: table["epoch"] + 1)
    .drop(columns=["epoch"])
    .rename(columns={"_wandb.runtime": "runtime"})
    .assign(mean_epoch_runtime=lambda table: table["runtime"] / table["num_epochs"])
)

df_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,multitask,label_cols,architecture,dim_hid,test_f1,model_n_params,runtime,test_apv_f1,early_stopping,num_epochs,mean_epoch_runtime
0,False,,BERTClassifier,20,0.983775,66378323.0,771,,True,3,257.0
1,False,,BiLSTMClassifier,256,0.931564,10509827.0,300,,True,6,50.0
2,False,,BiLSTMClassifier,128,0.958705,8621059.0,367,,True,8,45.875
3,False,,BiLSTMClassifier,64,0.945152,8082179.0,430,,True,8,53.75
4,False,,BiLSTMClassifier,32,0.915096,,573,,True,7,81.857143
5,False,,BiLSTMClassifier,20,0.89445,,197,,True,5,39.4
6,True,"[""apv"",""scv""]",BiLSTMMultitask,20,,7868998.0,333,0.934863,True,7,47.571429
7,True,"[""apv"",""scv""]",BiLSTMMultitask,20,,7868998.0,333,0.926762,False,7,47.571429
8,True,"[""apv"",""scv"",""hv""]",BiLSTMMultitask,20,,7869481.0,373,0.907786,False,7,53.285714
9,True,"[""svo_dist_norm"",""apv"",""scv"",""hv""]",BiLSTMMultitask,20,,7869922.0,388,0.570454,False,7,55.428571


In [96]:
# Ratio of two mean_epoch_runtime values copied by hand from the table above:
# row 0 (BERTClassifier, 257.0) over row 4 (BiLSTMClassifier, dim_hid 32, 81.857143).
257.0 / 81.857143

3.1396160503671626

In [7]:
import plotly.express as px

In [8]:
# Quick look at mean epoch runtime per run, keyed by architecture.
px.bar(
    df_table,
    x="mean_epoch_runtime",
    y="architecture",
)

In [17]:
import numpy as np

In [35]:
# Summarise mean_epoch_runtime per architecture; describe() supplies the
# "mean" and "std" columns that the chart uses.
df_fig = (
    df_table.groupby(by=["architecture"])["mean_epoch_runtime"]
    .describe()
    .reset_index()
    .rename(columns={"architecture": "Model", "mean": "Mean Epoch Runtime"})
)
# Bar label: "mean ± std", or just the mean when std is missing.
df_fig["display_text"] = df_fig.apply(
    lambda row: (
        f"{row['Mean Epoch Runtime']:.1f} ± {row['std']:.1f}"
        if pd.notna(row["std"])
        else f"{row['Mean Epoch Runtime']:.1f}"
    ),
    axis=1,
)
fig = px.bar(
    df_fig,
    x="Model",
    y="Mean Epoch Runtime",
    error_y="std",
    text="display_text",
)
fig = fig.update_layout(
    title="Mean Epoch Runtime for Different Model Architectures",
    width=500,
)
fig = fig.update_traces(textposition="outside")
fig