In [None]:
import pathlib
import pandas as pd
import wandb
# W&B public API client; used further down to look up the sweep and its runs.
api = wandb.Api()


### Download all runs from a sweep and store data in a csv.

In [None]:
# Sweep selection for the cells below. The "Labels all" and "Labels single"
# sections assign the same four names, so it looks like one section is meant
# to be active at a time (switch by swapping which one is commented out).
# NOTE: the cell that combines both sweeps reads "labels_all_best.csv" and
# "labels_single_best.csv" by literal name, so keep the file names here in
# sync with it.

# Labels all
sweep_id = "12qeci5o"  # sweep id appended to the path passed to api.sweep
full_csv = "labels_all_full.csv"  # file name of the table holding every run of the sweep
best_csv = "labels_all_best.csv"  # file name of the table holding the selected best runs
plot_title = "Multi Labels"  # inserted into the title of the box-plot figure

# Labels single
# sweep_id = "4yorg0e3"
# full_csv = "labels_single_full.csv"
# best_csv = "labels_single_best.csv"
# plot_title = "Single Label"

# Combined best
combined_csv = "combined_best.csv"  # file name of the concatenated best-run tables of both sweeps

In [None]:
# Fetch every run of the selected sweep and flatten each one into a single
# record (identity + hyperparameters + metrics).
# The sweep path has the form <entity>/<project-name>/<sweep-id>.
sweep = api.sweep(f"LARICS-GNN/MIDS-GNN/{sweep_id}")
runs = sweep.runs

# Metrics copied from each run's summary.
summary_fields = [
    'train_loss', 'val_loss', 'eval_loss', 'best_train_loss', 'best_val_loss',
    'train_acc', 'val_acc', 'eval_accuracy', 'best_train_accuracy', 'best_val_accuracy',
    'mean_error', 'mean_abs_error', 'std_error', 'duration',
]
# Hyperparameters copied from each run's config.
config_fields = [
    'jk', 'activation', 'gnn_layers', 'architecture', 'learning_rate',
    'hidden_channels', 'selected_extra_feature',
]

records = []
skipped_runs = []  # (id, name, missing key) of runs that lack an expected field
for run in runs:
    try:
        summary = {k: run.summary[k] for k in summary_fields}
        config = {k: run.config[k] for k in config_fields}
        # The target label set is nested one level deeper, under the "dataset" config entry.
        config["target"] = run.config["dataset"]["target"]
    except KeyError as missing_key:
        # One incomplete run (presumably crashed or still in progress) must not
        # abort the download of the whole sweep; remember it and move on.
        skipped_runs.append((run.id, run.name, missing_key))
        continue
    info = {"name": run.name, "id": run.id}
    records.append({**info, **config, **summary})

# Report skipped runs explicitly so missing rows in the CSV are never a surprise.
if skipped_runs:
    print(f"Skipped {len(skipped_runs)} run(s) with missing fields:")
    for run_id, run_name, missing_key in skipped_runs:
        print(f"  {run_id} ({run_name}): missing {missing_key}")


In [None]:
# Directory (inside the notebook's working directory) that holds every CSV
# written or read by the cells below.
save_path = pathlib.Path.cwd() / "Results"  # For Jupyter notebook.
# to_csv does not create missing parent folders, so make sure the directory exists.
save_path.mkdir(parents=True, exist_ok=True)

# One row per downloaded run. Missing values become the literal string 'none';
# the summary-table cell later maps that value to the 'None' column label.
runs_df = pd.DataFrame.from_records(records).fillna('none')
runs_df.to_csv(save_path / full_csv)

runs_df

### Load data from all runs, select the best, and store it in a csv.

In [None]:
# Read back the table with every run of the sweep (first CSV column is the index).
loaded_df = pd.read_csv(save_path / full_csv, index_col=0)

# Within each (architecture, selected_extra_feature) group, locate the index
# label of the run whose 'best_val_loss' is lowest, then pull those rows out.
group_keys = ['architecture', 'selected_extra_feature']
best_index_labels = loaded_df.groupby(group_keys)['best_val_loss'].idxmin()
best_rows = loaded_df.loc[best_index_labels]

# Persist the per-group winners.
best_rows.to_csv(save_path / best_csv)

best_rows

### Combine csv files from two sweeps

In [None]:
# Read the best-run tables of both sweeps (first CSV column is the index).
labels_all_best_df, labels_single_best_df = (
    pd.read_csv(save_path / csv_name, index_col=0)
    for csv_name in ("labels_all_best.csv", "labels_single_best.csv")
)

# Stack the two tables on top of each other, keeping their original index labels.
combined_df = pd.concat((labels_all_best_df, labels_single_best_df))

# Write the stacked table next to the per-sweep files.
combined_df.to_csv(save_path / combined_csv)

combined_df

### Make a summarized results table

In [None]:
# Read the stacked best-run table of both sweeps.
combined_best_df = pd.read_csv(save_path / combined_csv, index_col=0)


def _accuracy_cell(row):
    """Format one run as '<best_val_accuracy> $|$ <eval_accuracy>', two decimals each."""
    return f"{row['best_val_accuracy']:.2f} $|$ {row['eval_accuracy']:.2f}"


# One pre-formatted table cell per run.
combined_best_df['formatted_accuracy'] = combined_best_df.apply(_accuracy_cell, axis=1)

# Rows: architecture. Columns: (extra feature, target). Cells: the formatted string;
# "first" simply picks the first string when a cell has more than one candidate.
pivot_table = combined_best_df.pivot_table(
    values='formatted_accuracy',
    index='architecture',
    columns=['selected_extra_feature', 'target'],
    aggfunc="first",
)

# Force the full feature x target column grid so that absent combinations
# still show up as (empty) columns.
feature_values = combined_best_df['selected_extra_feature'].unique()
target_values = ['true_labels_all_padded', 'true_labels_single']
full_columns = pd.MultiIndex.from_product(
    [feature_values, target_values], names=['Feature', 'Target'])
pivot_table = pivot_table.reindex(columns=full_columns)

# Human-readable column labels for the LaTeX output; unknown values pass through unchanged.
feature_labels = {
    'noisy_probability': 'Noisy Probability',
    'true_probability': 'True Probability',
    'predicted_probability': 'Predicted Probability',
    'none': 'None',
}
target_labels = {
    'true_labels_all_padded': 'Multi',
    'true_labels_single': 'Single',
}

pivot_table.columns = pivot_table.columns.set_levels(
    [feature_labels.get(x, x) for x in pivot_table.columns.levels[0]], level=0)

pivot_table.columns = pivot_table.columns.set_levels(
    [target_labels.get(x, x) for x in pivot_table.columns.levels[1]], level=1)

# Fixed presentation order for rows (architectures) and top-level columns (features).
architecture_order = ["MLP", "GCN", "GIN", "GraphSAGE", "GAT", "GATLinNet"]
feature_order = ["None", "Noisy Probability", "Predicted Probability", "True Probability"]
pivot_table = pivot_table.reindex(architecture_order)
pivot_table = pivot_table.reindex(columns=feature_order, level=0)

# Render as LaTeX; escape=False keeps the "$|$" separator as raw LaTeX.
latex_code = pivot_table.to_latex(
    column_format="l|cccccccc",
    multicolumn=True,
    multicolumn_format="c",
    multirow=True,
    escape=False,
)

print(latex_code)

pivot_table

### Draw box plots of the evaluation accuracy for each hyperparameter

In [None]:
import plotly.express as px
import plotly.subplots as sp

# Reload the table with every run of the sweep from disk.
runs_df = pd.read_csv(save_path / full_csv, index_col=0)

# Hyperparameters to plot, one subplot each.
config_params = [
    'jk', 'activation', 'gnn_layers', 'architecture', 'learning_rate',
    'hidden_channels', 'selected_extra_feature',
]

# Metric shown on the y-axis of every subplot.
target_value = 'eval_accuracy'

# 3 x 3 grid, filled row by row; each subplot is titled with its hyperparameter.
fig = sp.make_subplots(rows=3, cols=3, subplot_titles=config_params)

# Build a standalone box plot per hyperparameter and move its traces into the grid.
for position, param in enumerate(config_params):
    grid_row, grid_col = divmod(position, 3)
    box_fig = px.box(runs_df, x=param, y=target_value, color=param)
    for trace in box_fig.data:
        # Subplot coordinates are 1-based.
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# Treat every x-axis as categorical, even for numeric hyperparameters.
fig.update_xaxes(type='category')
fig.update_layout(
    title_text=f'{target_value} for {plot_title}',
    showlegend=False,
    height=1000,
)
fig.show()

### Download saved best models

In [None]:
import os
import subprocess

# Load the DataFrame with best results.
best_df = pd.read_csv(save_path / best_csv, index_col=0)

# Only the best runs that used the 'predicted_probability' extra feature are downloaded.
predicted_probability_df = best_df[best_df['selected_extra_feature'] == 'predicted_probability']

# Remote directory holding the saved models, in scp's user@host:path form.
# NOTE(review): user name and host are hard-coded; adjust before running under another account.
base_path = "mkrizman@login-gpu.hpc.srce.hr:/storage/home/mkrizman/MIDS-GNN/sweep_all_labels/Models/"

# Local target directory. It is created up front: if it were missing, scp would
# write each download to a *file* called "Models" instead of into a folder.
destination_dir = pathlib.Path.cwd() / "Models"
destination_dir.mkdir(parents=True, exist_ok=True)
destination_path = str(destination_dir.resolve())

# Copy one "<run id>_best_model.pth" file per selected run.
failed_downloads = []
for index, row in predicted_probability_df.iterrows():
    file_name = f"{row['id']}_best_model.pth"
    source_path = base_path + file_name

    # Pass an argument list (no shell) so local paths containing spaces or
    # shell metacharacters are handed to scp unchanged.
    result = subprocess.run(["scp", source_path, destination_path])
    if result.returncode != 0:
        # Keep going with the remaining files, but remember the failure.
        failed_downloads.append(file_name)

# Surface failed copies instead of ignoring scp's exit status.
if failed_downloads:
    print(f"scp failed for {len(failed_downloads)} file(s): {', '.join(failed_downloads)}")