In [211]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from util.util import generate_data_paths,generate_history_results_path, load_history_data, generate_concat_dataset,filter_history_information,load_experiment_data

In [212]:
def summarize_history(data_table, pick_cols, col_pick_model="val_loss", pick_min=True, max_epochs=100,min_epoch=10,
                      col_pick_max="test_acc", data_path_col='history_path'):
    """Summarize every run's history CSV into a "best possible" and a "picked" row.

    For each path found in ``data_table[pick_cols][data_path_col]`` the CSV is
    read with ``pd.read_csv`` and reduced to two rows:

    * ``"best_possible_epoch"`` - the row where ``col_pick_max`` is largest.
    * ``"picked_epoch"`` - the row where the model-selection column is
      smallest (``pick_min=True``) or largest (``pick_min=False``).

    Parameters
    ----------
    data_table : pandas.DataFrame
        One row per run; must contain ``data_path_col``.
    pick_cols : list of str
        Columns of ``data_table`` kept in the result. It has to include
        ``data_path_col`` because the paths are read from this subset.
    col_pick_model : str
        Column used to pick the model. When a history file lacks it, a message
        is printed and ``"val_loss"`` is used for that file instead.
    pick_min : bool
        Whether the picked row minimizes (True) or maximizes (False) the
        model-selection column.
    max_epochs : int
        Histories with more rows than this are cut to their first
        ``max_epochs`` rows.
    min_epoch : int
        When the history (before any ``max_epochs`` cut) has more rows than
        this, its first ``min_epoch`` rows are dropped and the index restarts
        at 0.
    col_pick_max : str
        Column whose maximum defines the "best possible" row.
    data_path_col : str
        Name of the column holding the history CSV paths.

    Returns
    -------
    pandas.DataFrame or None
        ``data_table[pick_cols]`` merged on ``data_path_col`` with the per-run
        summaries (columns ``model_choice``, ``epoch``, ``col_pick_max`` and
        the model-selection column). ``None`` if ``data_path_col`` is missing
        from ``data_table``.

    Notes
    -----
    The reported ``epoch`` is the row position inside the windowed history,
    i.e. it is counted after the first ``min_epoch`` rows were dropped.
    """
    if data_path_col not in data_table.columns:
        print("there are no history path to load history data")
        return
    run_info = data_table[pick_cols]
    per_run_summaries = []
    for path in run_info[data_path_col].values:
        history_data = pd.read_csv(path)

        # Fall back to val_loss for files that do not log the requested column.
        fix_col_pick_model = col_pick_model
        if col_pick_model not in history_data.columns:
            print("col {} isn't in the history data ".format(col_pick_model))
            print("use default val_loss as pick col")
            fix_col_pick_model = "val_loss"

        # Restrict the history to the epoch window. Both tests use the row
        # count of the untouched file.
        history_epoch = len(history_data)
        if history_epoch > max_epochs:
            history_data = history_data[:max_epochs]
        if history_epoch > min_epoch:
            history_data = history_data[min_epoch:]
            history_data = history_data.reset_index(drop=True)

        # Row chosen by the model-selection metric.
        if pick_min:
            pick_row_idx = history_data[fix_col_pick_model].argmin()
        else:
            pick_row_idx = history_data[fix_col_pick_model].argmax()

        # Row with the highest achievable value of col_pick_max.
        best_row_idx = history_data[col_pick_max].argmax()

        history_info_dict = {
            "model_choice": ["best_possible_epoch", "picked_epoch"],
            "epoch": [best_row_idx, pick_row_idx],
            col_pick_max: [history_data.loc[best_row_idx, col_pick_max],
                           history_data.loc[pick_row_idx, col_pick_max]],
            fix_col_pick_model: [history_data.loc[best_row_idx, fix_col_pick_model],
                                 history_data.loc[pick_row_idx, fix_col_pick_model]],
            # Keyed by data_path_col (not a fixed "history_path") so the merge
            # below also works for a non-default path column.
            data_path_col: [path, path],
        }
        per_run_summaries.append(pd.DataFrame(history_info_dict))

    history_information_table = pd.concat(per_run_summaries)
    merge_table = pd.merge(run_info, history_information_table, on=[data_path_col])
    return merge_table
def generate_history_results_path(row, full_result_path):
    """Build the path of the ``metrics.csv`` history file for one result row.

    The path is ``full_result_path`` followed by the row's ``test_fold``,
    ``shuffle_fold``, ``increment_fold`` and ``valid_fold`` values and then
    ``default/version_0/metrics.csv``.

    Parameters
    ----------
    row : mapping
        Anything indexable by the four fold keys (e.g. a DataFrame row).
    full_result_path : str
        Root directory of the run.

    Returns
    -------
    str
        The joined path, using the separator of the running platform.
    """
    # Every component is joined separately so the result is valid on any OS;
    # a literal backslash-separated tail only forms a real path on Windows.
    return os.path.join(
        full_result_path,
        row['test_fold'],
        row['shuffle_fold'],
        row['increment_fold'],
        row['valid_fold'],
        'default',
        'version_0',
        'metrics.csv',
    )
def load_history_data(data_table, pick_cols, data_path_col='history_path'):
    """Load the full per-epoch history of every run and attach the run columns.

    Each path in ``data_table[pick_cols][data_path_col]`` is read with
    ``pd.read_csv``; the rows of all files are stacked and merged with the
    selected run columns on ``data_path_col``.

    Returns the merged DataFrame, or ``None`` (after printing a message) when
    ``data_table`` has no ``data_path_col`` column.
    """
    if data_path_col not in data_table.columns:
        print("there are no history path to load history data")
        return None

    run_info = data_table[pick_cols]
    print("temp col : ", run_info.columns)

    def _read_tagged(csv_path):
        # Read one history file and remember which path it came from.
        frame = pd.read_csv(csv_path)
        frame[data_path_col] = [csv_path] * len(frame)
        return frame

    all_histories = pd.concat([_read_tagged(csv_path) for csv_path in run_info[data_path_col].values])
    return pd.merge(run_info, all_histories, on=[data_path_col])
def load_data(data_paths, result_folder, result_file_name, info_file_name, load_history=False):
    """Read the result spreadsheet of every experiment folder into one DataFrame.

    For each ``data_path`` the Excel file
    ``<data_path>/<result_folder>/<result_file_name>`` is read. The JSON file
    ``<data_path>/<result_folder>/<info_file_name>`` must exist as well: every
    entry of its ``"EXTRA_FIELDS"`` object becomes a constant column of that
    frame (an empty list is stored as ``None``). Folders missing either file
    are reported with a message and skipped.

    Parameters
    ----------
    data_paths : sequence of str
        Experiment folders to scan.
    result_folder, result_file_name, info_file_name : str
        Location of the result and info files below each folder.
    load_history : bool
        When True, add a ``history_path`` column computed per row by
        ``generate_history_results_path(row, data_path)``.

    Returns
    -------
    pandas.DataFrame
        All usable results stacked with a fresh 0..n-1 index, or an empty
        DataFrame when nothing could be loaded.
    """
    list_data = []
    if len(data_paths) ==0:
        print("no data path ")
    for data_path in data_paths:
        result_folder_path = os.path.join(data_path, result_folder)
        result_data_path = os.path.join(result_folder_path, result_file_name)
        if not os.path.exists(result_data_path):
            print("the current data path {} does not exist ".format(result_data_path))
            continue

        info_data_path = os.path.join(result_folder_path, info_file_name)
        if not os.path.exists(info_data_path):
            # Results without their info file are not included in the output.
            print("no data info for {} ".format(result_data_path))
            continue

        data = pd.read_excel(result_data_path)
        data_size = len(data)
        with open(info_data_path) as f:
            info_data = json.load(f)
        for field_name, field_value in info_data["EXTRA_FIELDS"].items():
            if field_value == []:
                field_value = None
            # Repeat the value explicitly so list-valued fields land in
            # every row instead of being spread across rows.
            data[field_name] = data_size * [field_value]

        if load_history:
            data['history_path'] = data.apply(lambda row: generate_history_results_path(row, data_path), axis=1)

        list_data.append(data)

    # pd.concat raises ValueError on an empty list; an empty frame lets the
    # caller continue after the messages printed above.
    if not list_data:
        return pd.DataFrame()
    final_data = pd.concat(list_data).reset_index(drop=True)
    return final_data
# prefix_lists=[augmentation_prefix,norm_prefix,model_prefix,dataset_prefix]
def load_experiment_data(common_path, prefix_lists=None,pick_cols=None,
                         col_pick_model=None,col_pick_model_min=True,
                         new_col_generate=None,load_history = False):
    """Load all results of one experiment and, optionally, their training histories.

    ``common_path`` and ``prefix_lists`` are handed to ``generate_data_paths``
    (presumably expanding the ``{}`` placeholders of ``common_path`` with every
    prefix combination - confirm against ``util.util``). The resulting folders
    are read with ``load_data`` from ``result_folder/model_result.xlsx`` and
    ``result_folder/model_info.json``.

    Parameters
    ----------
    common_path : str
        Path template of the experiment folders.
    prefix_lists : list of list of str, optional
        Prefix choices forwarded to ``generate_data_paths``.
    pick_cols : list of str, optional
        Run columns kept in the history tables. Defaults to the fold, dataset,
        normalization, augmentation, model, label-space and ``history_path``
        columns. Only used when ``load_history`` is True.
    col_pick_model : str, optional
        History column used to pick an epoch; defaults to ``'val_loss'``.
    col_pick_model_min : bool
        True to pick the epoch minimizing ``col_pick_model``, False to
        maximize it.
    new_col_generate : optional
        Currently ignored.
    load_history : bool
        Whether to also load and summarize the per-run history files.

    Returns
    -------
    pandas.DataFrame
        The result table, when ``load_history`` is False.
    tuple
        ``(data_result, history_data, summary)`` when ``load_history`` is True.
    """
    result_folder = 'result_folder'
    file_name = 'model_result.xlsx'
    info_file_name = 'model_info.json'

    list_full_path = generate_data_paths(common_path, prefix_lists, [])
    data_result = load_data(list_full_path, result_folder, file_name, info_file_name, load_history=load_history)

    # Only fall back to the default column set when the caller gave none; an
    # explicit pick_cols must not be overwritten.
    if pick_cols is None:
        pick_cols = ['test_fold', 'shuffle_fold', 'increment_fold',
                     'valid_fold', 'target_dataset', 'source_dataset', 'normalize', 'aug',
                     'model', 'source_label_space', 'target_label_space', 'history_path']

    if col_pick_model is None:
        col_pick_model = 'val_loss'
    pick_min = col_pick_model_min

    print("data result cols : ",data_result.columns)
    if load_history:
        # Forward the selection settings so they actually affect the summary.
        summary = summarize_history(data_result, pick_cols,
                                    col_pick_model=col_pick_model, pick_min=pick_min)
        history_data = load_history_data(data_result, pick_cols)
        return data_result,history_data,summary
    return data_result



def modify_col_info(data_result):
    """Replace the verbose labels of the fold/aug/normalize/model columns with short ones.

    The columns of ``data_result`` are overwritten in place; nothing is
    returned. Values that are not listed below are left unchanged.
    """
    # (column, labels to replace, replacement labels) - handled in this order.
    relabel_rules = (
        ('increment_fold',
         ['increment_fold_1', 'increment_fold_2', 'increment_fold_3'],
         ['1', '2', '3']),
        ('valid_fold',
         ['valid_fold_1', 'valid_fold_2', 'valid_fold_3'],
         ['1', '2', '3']),
        ('shuffle_fold',
         ['shuffle_fold_1', 'shuffle_fold_2', 'shuffle_fold_3', 'shuffle_fold_4'],
         ['1', '2', '3', '4']),
        ('test_fold',
         ['test_fold_1'],
         ['1']),
        ('aug',
         ['no_aug', 'temporal_aug'],
         ['no', 'temp']),
        ('normalize',
         ['chan_norm', 'no_norm'],
         ['chan', 'no']),
        ('model',
         ['ComponentAdaptation', 'BaseModel', 'MultiDatasetAdaptation'],
         ['component', 'base', 'adapt']),
    )
    for column, long_labels, short_labels in relabel_rules:
        data_result[column] = data_result[column].replace(long_labels, short_labels)

In [213]:
# Configuration of the comparison: one list of folder-name prefixes per
# experimental factor.
augmentation_list_prefix = ['no_aug', 'temp_aug']
norm_list_prefix = ['no_norm', 'chan_norm']
model_list_prefix = ['vanilla', 'adaptation', 'component_adapt']
target_dataset_list_prefix = ["BCI_IV", "Cho2017", "Physionet"]

# Factor order: augmentation, normalization, model, target dataset.
# Presumably this matches the order of the four {} placeholders below -
# verify against generate_data_paths.
prefix_list = [
    augmentation_list_prefix,
    norm_list_prefix,
    model_list_prefix,
    target_dataset_list_prefix,
]

# Path template of the experiment_1 result folders.
common_path = "C:\\wduong_folder\\Dassl.pytorch-master\\EEG_Dassl_Lightning\\NeurIPS_competition\\experiment_1\\{}\\{}\\{}\\{}\\model"


In [214]:
# Load the result table for whatever common_path / prefix_list currently hold
# in the kernel. Without load_history only the result DataFrame is returned.
data_result_1 = load_experiment_data(common_path,prefix_lists=prefix_list)

# Variant that also loads the per-epoch histories and their summary (disabled):
# data_result_1,progress_data_1,summary_1 = load_experiment_data(common_path,prefix_lists=prefix_list,load_history=True)
# Shorten the fold/aug/normalize/model labels; this mutates data_result_1 in place.
modify_col_info(data_result_1)
# modify_col_info(progress_data_1)
# modify_col_info(summary_1)

# summary_1.head()

data result cols :  Index(['test_acc', 'test_loss', 'test_fold', 'shuffle_fold', 'increment_fold',
       'valid_fold', 'target_dataset', 'source_dataset', 'normalize', 'aug',
       'model', 'source_label_space', 'target_label_space'],
      dtype='object')


In [206]:
# progress_data_1.head()
# conditions = [
#     ["target_dataset",["BCI_IV_2S"]]
# ]
# def modify_history_data(history_data):
#     model = history_data["model"]
# #only pick between epoch 10-30 for adapt model
# adapt_data = progress_data_1[progress_data_1['model'].isin(["adapt","component"])]
# adapt_data = adapt_data[adapt_data['epoch']>10]
# # filter_history_information(progress_data_1,)
# print(adapt_data.head())

In [196]:
# print("data result col ",data_result.columns)
# print("unique aug : ",np.unique(data_result['aug']))
# group_format = data_result_1.groupby(["normalize","aug","target_dataset","increment_fold","model"],as_index=False).mean()
# table = pd.pivot_table(group_format, values=['test_acc'], index=['target_dataset','normalize','aug','model'],columns=['increment_fold'])
# print(table)

# group_format = summary_1.groupby(["normalize","aug","target_dataset","increment_fold","model_choice","model"],as_index=False).mean()
# table = pd.pivot_table(group_format, values=['test_acc'], index=['target_dataset','normalize','aug',"model_choice",'model'],columns=['increment_fold'])
# print(table)

                                                             test_acc  \
increment_fold                                                      1   
target_dataset normalize aug  model_choice        model                 
BCI_IV         chan      no   best_possible_epoch adapt      0.713078   
                                                  base       0.634488   
                                                  component  0.635210   
                              picked_epoch        adapt      0.696782   
                                                  base       0.630466   
...                                                               ...   
physionet      no        temp best_possible_epoch base       0.409583   
                                                  component  0.412248   
                              picked_epoch        adapt      0.392486   
                                                  base       0.365903   
                                                  c

In [215]:
# Load the experiment_2 results with the same factor prefixes as experiment_1.
prefix_list = [augmentation_list_prefix,norm_list_prefix,model_list_prefix,target_dataset_list_prefix]
common_path = "C:\\wduong_folder\\Dassl.pytorch-master\\EEG_Dassl_Lightning\\NeurIPS_competition\\experiment_2\\{}\\{}\\{}\\{}\\model"

data_result_2 = load_experiment_data(common_path,prefix_lists=prefix_list)

# Shorten the fold/aug/normalize/model labels; this mutates data_result_2 in place.
modify_col_info(data_result_2)

print("data result col ",data_result_2.columns)
print("unique aug : ",np.unique(data_result_2['aug']))
# Average over the remaining folds. numeric_only=True is required because the
# frame still holds string columns (e.g. the fold labels), which make a plain
# mean() raise TypeError on pandas >= 2.0.
group_format = data_result_2.groupby(["normalize","aug","target_dataset","increment_fold","model"],as_index=False).mean(numeric_only=True)
# One row per dataset/normalize/aug/model, one test_acc column per increment fold.
table = pd.pivot_table(group_format, values=['test_acc'], index=['target_dataset','normalize','aug','model'],columns=['increment_fold'])
print(table)

data result cols :  Index(['test_acc', 'test_loss', 'test_fold', 'shuffle_fold', 'increment_fold',
       'valid_fold', 'target_dataset', 'source_dataset', 'normalize', 'aug',
       'model', 'source_label_space', 'target_label_space'],
      dtype='object')
data result col  Index(['test_acc', 'test_loss', 'test_fold', 'shuffle_fold', 'increment_fold',
       'valid_fold', 'target_dataset', 'source_dataset', 'normalize', 'aug',
       'model', 'source_label_space', 'target_label_space'],
      dtype='object')
unique aug :  ['no' 'temp']
                                         test_acc                    
increment_fold                                  1         2         3
target_dataset normalize aug  model                                  
BCI_IV         chan      no   adapt      0.713903  0.638047  0.637376
                              base       0.669864  0.642636  0.614171
                              component  0.578177  0.558684  0.455858
                         temp adapt  

In [216]:
# Export the mean test accuracy of both experiments as Excel pivot tables.
# os.path.join avoids the invalid "\d" escape of a hand-written Windows path
# and yields the right separator on every platform.
save_data_folder = os.path.join("NeurIPS", "data")
# save_graph_folder = "NeurIPS\graph"
# to_excel does not create missing directories.
os.makedirs(save_data_folder, exist_ok=True)

group_keys = ["normalize","aug","target_dataset","increment_fold","model"]
for experiment_name, experiment_result in (("experiment_1", data_result_1),
                                           ("experiment_2", data_result_2)):
    # numeric_only=True: the frames still contain string columns, which make a
    # plain mean() raise TypeError on pandas >= 2.0.
    group_format = experiment_result.groupby(group_keys,as_index=False).mean(numeric_only=True)
    # One row per dataset/normalize/aug/model, one test_acc column per increment fold.
    table = pd.pivot_table(group_format, values=['test_acc'], index=['target_dataset','normalize','aug','model'],columns=['increment_fold'])
    print(table)
    output_path = os.path.join(save_data_folder, experiment_name + '.xlsx')
    table.to_excel(output_path,float_format="%.2f")

                                         test_acc                    
increment_fold                                  1         2         3
target_dataset normalize aug  model                                  
BCI_IV         chan      no   adapt      0.697814  0.647896  0.625859
                              base       0.630466  0.618657  0.604407
                              component  0.630466  0.605404  0.574876
                         temp adapt      0.732364  0.673938  0.649821
                              base       0.728960  0.675485  0.656319
                              component  0.639645  0.640058  0.573398
               no        no   adapt      0.700804  0.623350  0.613139
                              base       0.637067  0.595245  0.574979
                              component  0.596947  0.570390  0.447951
                         temp adapt      0.695751  0.623814  0.589453
                              base       0.701733  0.656869  0.615890
                    