In [1]:
import pandas as pd 
import wandb
from tqdm import tqdm
import requests
import os
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
import matplotlib.pyplot as plt
import scipy.stats as stats
import matplotlib.colors as mcolors

In [2]:
from read_wandb import wandb_results
api = wandb_results("206713612", wandb_username="maayan-aytek1")

BASE_METRIC = "accuracy_per_mean_user_and_bot"


In [8]:
def result_metric(sweeps, group_name, drop_list=[0], drop_HPT=False, metric=BASE_METRIC, epoch="best", dummy_group=False):
    df = api.get_sweeps_results(sweeps, metric=metric) 
    config_cols = [c for c in df.columns if "config_" in c and c!="config_wandb_run_id" and c!="config_online_simulation_size"]
    HPT_cols = [col for col in config_cols if df[col].nunique() > 1]
    if drop_HPT:
        df=df.drop([c for c in HPT_cols if not c in ["config_LLM_SIM_SIZE", "config_seed"]], axis=1)
        HPT_cols = ["config_LLM_SIM_SIZE", "config_seed"]  
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    # if 'config_rt_model_file_name' in df.columns: 
    #     df['config_rt_model_file_name'] = df['config_rt_model_file_name'].apply(lambda x: x.split('_seed')[0])
    if not dummy_group:
        grouped = df.groupby([c for c in HPT_cols if c != "config_seed"])
    else:
        df['dummy_group'] = 'New LLM'
        
        grouped = df.groupby('dummy_group')
    
    mean_df = grouped[numeric_cols].mean()
    std_df = grouped[numeric_cols].std()
    
    if epoch=="best":
        best_col = mean_df[[c for c in mean_df.columns if (metric in c and metric[-4:] == c.split("_epoch")[0][-4:])]].idxmax(axis=1)
    else:
        best_col = mean_df[[c for c in mean_df.columns if f"{metric}_epoch{epoch}" in c]].idxmax(axis=1)
    
    result = grouped.apply(lambda x: x[best_col.loc[x.name]].values)
    means = grouped.apply(lambda x: x[best_col.loc[x.name]].mean())
    stds = grouped.apply(lambda x: x[best_col.loc[x.name]].std())
    
    df_cols = {'mean': means, 'std': stds, 'values': result.values}
    
    if epoch == "best":
        df_cols['epoch'] = best_col.apply(lambda x: int(x.split("epoch")[1]) if "epoch" in x else "last")
    
    df_cols['CI'] = result.apply(lambda x: bootstrap_ci(x))
    summary_df = pd.DataFrame(df_cols, index=best_col.index)
    
    for d in drop_list:
        if d in summary_df.index:
            summary_df=summary_df.drop(d)

    if len(summary_df.index.names) == 1:
        return summary_df.rename_axis(group_name)
    else:
        return summary_df

def bootstrap_ci(data, n_bootstrap=1000, ci=0.95):
    bootstrapped_means = []
    for _ in range(n_bootstrap):
        sample = np.random.choice(data, size=len(data), replace=True)
        bootstrapped_means.append(np.mean(sample))
    lower_bound = np.percentile(bootstrapped_means, (1 - ci) / 2 * 100)
    upper_bound = np.percentile(bootstrapped_means, (1 + ci) / 2 * 100)
    return lower_bound, upper_bound


# For HyperParameterTuning

For every configuration that you test in the sweep, you will receive in the table the average, standard deviation, all the values obtained for the different seed values, and also the confidence interval within which the result is located at a confidence level of 95%.

When epoch="best" is defined, you can check in which epoch the best result is obtained. If epoch=5 is defined, you will receive the result obtained for epoch number 5.

You can test multiple sweeps simultaneously by entering them into the list found in the first element of the function result_metric.

In [5]:
sweep_results = result_metric(["gihrejb0"], "LLMs", drop_HPT=False, epoch='best')
sweep_results

Total number of sweeps: 1
Download sweep_id='gihrejb0' data...


100%|██████████| 9/9 [00:00<00:00, 3509.88it/s]


Unnamed: 0_level_0,mean,std,values,epoch,CI
LLMs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
normal,0.839446,0.002792,"[0.8417926790544936, 0.841346422342924, 0.8374...",22,"(0.8370719020261552, 0.8414745227670034)"
uniform,0.838315,0.001065,"[0.8377004597636084, 0.8396278361815124, 0.837...",16,"(0.8374698860177403, 0.8391596827278345)"


In [10]:
sweep_results = result_metric(["nh9eyaq1"], "LLMs", drop_HPT=False, epoch='best', dummy_group=True)
sweep_results

Total number of sweeps: 1
Download sweep_id='nh9eyaq1' data...


Unnamed: 0_level_0,mean,std,values,epoch,CI
LLMs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New LLM,0.83619,0.001682,"[0.8371521808705251, 0.834866139441589, 0.8339...",14,"(0.8348854277003969, 0.8374292824620628)"


In [30]:
sweep_results = result_metric(["rr4bf5bj"], "LLMs", drop_HPT=False, epoch='best', dummy_group=False)
sweep_results.reset_index()

Total number of sweeps: 1
Download sweep_id='rr4bf5bj' data...


Unnamed: 0,config_rt_user_noise_std,config_rt_neutral_sampling,config_rt_frustration_std_method,mean,std,values,epoch,CI
0,200,1000,+,0.836979,0.001992,"[0.8400159701451503, 0.8374672073500151, 0.834...",20,"(0.835508510753624, 0.8386065825734462)"
1,200,1000,/,0.83757,0.002598,"[0.8393964648476003, 0.8355232446906189, 0.839...",last,"(0.8354718042123036, 0.8394873807381027)"
2,200,800,+,0.837341,0.002995,"[0.8378782271688218, 0.8351722705271559, 0.840...",14,"(0.8350102656603005, 0.8396724216486516)"
3,200,800,/,0.837163,0.002275,"[0.8388582202235954, 0.8338600988903725, 0.836...",19,"(0.8353541311863246, 0.8388281644230622)"
4,200,normal,+,0.837345,0.001237,"[0.8365394680610218, 0.8384429625645824, 0.836...",12,"(0.8364254228451934, 0.8383448839529024)"
5,200,normal,/,0.837635,0.002379,"[0.839637143670511, 0.8401320919190824, 0.8344...",11,"(0.8357560426088992, 0.8394628844213561)"
6,300,1000,+,0.838036,0.002364,"[0.8414243308476722, 0.839321243925161, 0.8364...",14,"(0.8363162179299343, 0.839851659987934)"
7,300,1000,/,0.836368,0.000789,"[0.8366641666218375, 0.8354961955228851, 0.837...",14,"(0.8357412774935529, 0.8369838304369315)"
8,300,800,+,0.837205,0.001097,"[0.8390263045485709, 0.8369912642463646, 0.837...",18,"(0.8365180792493341, 0.8381647341407689)"
9,300,800,/,0.836295,0.001916,"[0.8346060943480512, 0.8390550693024155, 0.834...",8,"(0.8348687521753421, 0.8378505506671526)"


In [13]:
sweep_results = result_metric(["hnmkm931"], "LLMs", drop_HPT=False, epoch='best', dummy_group=False)
sweep_results

Total number of sweeps: 1
Download sweep_id='hnmkm931' data...


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean,std,values,epoch,CI
config_rt_model_file_name,config_rt_model_class_weight,config_rt_model_top_features,config_rt_model_min_samples_leaf,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
min_samples_leaf_100_class_weight_balanced_subsample_top_features_20,balanced_subsample,20,100,0.803827,0.007995,"[0.8079651068224895, 0.8089049420416016, 0.794...",21,"(0.7946109517517668, 0.8089049420416016)"
min_samples_leaf_100_class_weight_balanced_subsample_top_features_all,balanced_subsample,all,100,0.800524,0.009434,"[0.8071830597201283, 0.8046608346752931, 0.789...",7,"(0.7897284865772579, 0.8071830597201283)"
min_samples_leaf_100_class_weight_balanced_top_features_20,balanced,20,100,0.803759,0.010876,"[0.8082166202163459, 0.8116984540783388, 0.791...",12,"(0.7913618779335834, 0.8116984540783388)"
min_samples_leaf_100_class_weight_balanced_top_features_all,balanced,all,100,0.801297,0.014236,"[0.811304125438893, 0.8075879203077279, 0.7849...",14,"(0.7849986468532864, 0.811304125438893)"
min_samples_leaf_20_class_weight_balanced_subsample_top_features_20,balanced_subsample,20,20,0.801658,0.008609,"[0.7923035867726862, 0.8092496113601433, 0.803...",10,"(0.7923035867726863, 0.8092496113601433)"
min_samples_leaf_20_class_weight_balanced_subsample_top_features_all,balanced_subsample,all,20,0.800324,0.009481,"[0.8033689053307933, 0.8079081068996256, 0.789...",18,"(0.7896952031273484, 0.8079081068996256)"
min_samples_leaf_20_class_weight_balanced_top_features_20,balanced,20,20,0.800564,0.012437,"[0.8077261806863028, 0.8077623429208532, 0.786...",15,"(0.7862021247476508, 0.8077623429208532)"
min_samples_leaf_20_class_weight_balanced_top_features_all,balanced,all,20,0.801823,0.00627,"[0.8079223001047566, 0.8021522343350065, 0.795...",19,"(0.795394991585859, 0.8079223001047566)"
min_samples_leaf_50_class_weight_balanced_subsample_top_features_20,balanced_subsample,20,50,0.803595,0.007382,"[0.8121170640316089, 0.799482245693308, 0.7991...",20,"(0.799185076502973, 0.8121170640316088)"
min_samples_leaf_50_class_weight_balanced_subsample_top_features_all,balanced_subsample,all,50,0.803135,0.008288,"[0.81074408007565, 0.8043577337208752, 0.79430...",20,"(0.794303151111411, 0.8107440800756501)"


In [14]:
sweep_results = result_metric(["42wibgw1"], "LLMs", drop_HPT=False, epoch='best', dummy_group=False)
sweep_results

Total number of sweeps: 1
Download sweep_id='42wibgw1' data...


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean,std,values,epoch,CI
config_rt_model_file_name,config_rt_model_class_weight,config_rt_model_top_features,config_rt_model_min_samples_leaf,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
min_samples_leaf_100_class_weight_balanced_subsample_top_features_20,balanced_subsample,20,100,0.836768,0.001404,"[0.8351968767317809, 0.8379012449859079, 0.837...",21,"(0.8351968767317809, 0.8379012449859079)"
min_samples_leaf_100_class_weight_balanced_subsample_top_features_all,balanced_subsample,all,100,0.8369,0.000252,"[0.8371155752297419, 0.8366225382946252, 0.836...",9,"(0.8366225382946252, 0.8371155752297419)"
min_samples_leaf_100_class_weight_balanced_top_features_20,balanced,20,100,0.837315,0.001356,"[0.8385974426912752, 0.8374519736681949, 0.835...",18,"(0.8358950308736369, 0.8385974426912752)"
min_samples_leaf_100_class_weight_balanced_top_features_all,balanced,all,100,0.837462,0.002099,"[0.8398647309850867, 0.835986445817165, 0.8365...",18,"(0.835986445817165, 0.8398647309850867)"
min_samples_leaf_20_class_weight_balanced_subsample_top_features_20,balanced_subsample,20,20,0.836669,0.001892,"[0.8379861002281137, 0.834501391657148, 0.8375...",12,"(0.834501391657148, 0.8379861002281137)"
min_samples_leaf_20_class_weight_balanced_subsample_top_features_all,balanced_subsample,all,20,0.837695,0.001677,"[0.8377945331370256, 0.8359706140758668, 0.839...",11,"(0.8359706140758668, 0.8393197174370671)"
min_samples_leaf_20_class_weight_balanced_top_features_20,balanced,20,20,0.836614,0.001761,"[0.8356686748178921, 0.8355273771061232, 0.838...",17,"(0.8355273771061232, 0.8386460324534374)"
min_samples_leaf_20_class_weight_balanced_top_features_all,balanced,all,20,0.836802,0.00141,"[0.8379406353880007, 0.8352242637813103, 0.837...",15,"(0.8352242637813103, 0.8379406353880007)"
min_samples_leaf_50_class_weight_balanced_subsample_top_features_20,balanced_subsample,20,50,0.837531,0.002032,"[0.8383367194971337, 0.8352194818445661, 0.839...",13,"(0.8352194818445661, 0.8390366751829358)"
min_samples_leaf_50_class_weight_balanced_subsample_top_features_all,balanced_subsample,all,50,0.838501,0.000999,"[0.8373830504895878, 0.8388137564932842, 0.839...",9,"(0.8373830504895879, 0.8393070493026439)"


# Result for a specific epoch

In [8]:
sweep_results = result_metric(["ekvlnnzi"], "LLMs", drop_HPT=True, epoch='best',  dummy_group=False)
sweep_results

Total number of sweeps: 1
Download sweep_id='ekvlnnzi' data...


KeyError: 'config_LLM_SIM_SIZE'