# In [1]:  (Jupyter cell marker left over from the notebook export; not code)
###
# https://docs.google.com/document/d/187FR1SXKftzRAX3xaxJo1KoFoTZ_Db4MmkJdrf9jdFc/edit
###

# AIIO Copyright (c) 2023, The Regents of the University of California,
# through Lawrence Berkeley National Laboratory (subject to receipt of
# any required approvals from the U.S. Dept. of Energy) and Ohio State
# University. All rights reserved.

import dill
import pandas as pd
import numpy as np
import glob
import pickle

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import os.path
import subprocess
import joblib
import shap
from scipy import stats
from numpy import loadtxt
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetRegressor


# Lift pandas' display truncation limits (rows, columns, column width) so that
# printed/displayed DataFrames are shown in full.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# /project/projectdirs/m2621/jeanbez/darshan/2019-01/1/Darshan_Total.csv
# file_tot="/project/projectdirs/m2621/jeanbez/darshan/2019-01/1/Darshan_Total.csv"

# Columns removed from a Darshan "total" CSV right after it is loaded; this is
# the default drop list of the multi-day / multi-month readers below.
tot_dop_cols = [
    'darshan_log_version',
    'exe',
    'uid',
    'start_time',
    'start_time_asci',
    'end_time',
    'end_time_asci',
    'runtime',
    'LUSTRE_OSTS',
    'LUSTRE_MDTS',
    'LUSTRE_STRIPE_OFFSET',
]

# Shorter variant of the drop list: the same entries without 'exe', 'uid',
# 'start_time_asci' and 'end_time_asci'.
tot_dop_cols_min = [
    'darshan_log_version',
    'start_time',
    'end_time',
    'runtime',
    'LUSTRE_OSTS',
    'LUSTRE_MDTS',
    'LUSTRE_STRIPE_OFFSET',
]

##top_dop_cols_fast_slow=['total_POSIX_FASTEST_RANK', 'total_POSIX_FASTEST_RANK_BYTES','total_POSIX_SLOWEST_RANK', 'total_POSIX_SLOWEST_RANK_BYTES', 'total_POSIX_F_VARIANCE_RANK_TIME', 'total_POSIX_F_VARIANCE_RANK_BYTES']
# file_raw="/project/projectdirs/m2621/jeanbez/darshan/2019-01/1/Darshan.csv"


##
##POSIX_PERF_MIBS, MPIIO_PERF_MIBS, STDIO_PERF_MIBS
##
def split_df_by_perf(df_p, print_header_p=True):
    """Partition jobs by which interface reported a positive throughput.

    Args:
        df_p: DataFrame with ``POSIX_PERF_MIBS`` and ``MPIIO_PERF_MIBS`` columns.
        print_header_p: When true, print the row count of every partition.

    Returns:
        Tuple ``(df_mpiio, df_posix, df_stdio, df_posix_all)``:
        rows with MPIIO > 0; rows with MPIIO <= 0 and POSIX > 0; rows with
        MPIIO <= 0 and POSIX <= 0; and every row with POSIX > 0 regardless of
        the MPIIO value.
    """
    mpiio_perf = df_p['MPIIO_PERF_MIBS']

    # All jobs with POSIX throughput, whether or not they also used MPI-IO.
    df_posix_all = df_p.loc[df_p['POSIX_PERF_MIBS'] > 0]

    # MPI-IO jobs are picked first; the remainder is divided by POSIX throughput.
    df_mpiio = df_p.loc[mpiio_perf > 0]
    non_mpiio = df_p.loc[mpiio_perf <= 0]
    non_mpiio_posix_perf = non_mpiio['POSIX_PERF_MIBS']
    df_posix = non_mpiio.loc[non_mpiio_posix_perf > 0]
    df_stdio = non_mpiio.loc[non_mpiio_posix_perf <= 0]

    if print_header_p:
        report = (
            ("\n Number of MPIIO job : ", df_mpiio),
            ("\n Number of POSIX job : ", df_posix),
            ("\n Number of STDIO job : ", df_stdio),
            ("\n Number of POSIX(all) job : ", df_posix_all),
        )
        for label, part in report:
            print(label, part.shape[0])
    return df_mpiio, df_posix, df_stdio, df_posix_all

# ## POSIX_GROUP       "['POSIX' 'LUSTRE' 'STDIO']",   "['POSIX' 'STDIO']",  "['POSIX']",
# ##
# ## STDIO_Group       "['LUSTRE' 'STDIO']",          "['STDIO']",
# ##
# ## MPIIO-Group        "['POSIX' 'MPI-IO' 'LUSTRE' 'STDIO']",
# ##                   "['POSIX' 'MPI-IO' 'PNETCDF' 'LUSTRE' 'STDIO']",
# ##                   "['POSIX' 'MPI-IO']"]
# def split_df(df_p, drop_module=True):
#     #display(df_p['module'].apply(lambda x: str(x)).unique())
#     df_mpiio=df_p[df_p['module'].str.contains('MPI-IO', regex=False) == True]
#     df_no_mpiio=df_p[df_p['module'].str.contains('MPI-IO', regex=False) == False]
#     df_posix=df_no_mpiio[df_no_mpiio['module'].str.contains('POSIX', regex=False) == True]
#     df_stdio=df_no_mpiio[df_no_mpiio['module'].str.contains('POSIX', regex=False) == False]
#     if drop_module:
#         df_mpiio=df_mpiio.drop('module', axis=1)
#         df_posix=df_posix.drop('module', axis=1)
#         df_stdio=df_stdio.drop('module', axis=1)
#     return df_mpiio, df_posix, df_stdio


def read_total(file_tot, tot_dop_cols_p, print_header_p=False, is_split_p=False):
    """Load one Darshan "total" CSV and clean it.

    The file is read with ``pd.read_csv``, the columns in ``tot_dop_cols_p``
    are dropped, ``jobid`` becomes the (sorted) index, negative numeric values
    are replaced by 0 and missing values are filled with 0.

    Args:
        file_tot: Path of the CSV file to read.
        tot_dop_cols_p: Column labels to drop; all of them must be present.
        print_header_p: When true, print the CSV header and the job count.
        is_split_p: When true, return the result of ``split_df_by_perf`` on
            the cleaned frame instead of the frame itself.

    Returns:
        The cleaned DataFrame, or the 4-tuple produced by
        ``split_df_by_perf`` when ``is_split_p`` is true.
    """
    df_tot = pd.read_csv(file_tot)
    if print_header_p:
        print("Header after Darshan_Total: ", list(df_tot))

    df_tot = df_tot.drop(tot_dop_cols_p, axis=1)
    df_tot = df_tot.set_index('jobid').sort_index()

    # Clamp negative counters to 0 with an explicit assignment instead of
    # writing through the private ``_get_numeric_data()`` view, which does not
    # propagate to ``df_tot`` when pandas copy-on-write is active.
    numeric_cols = df_tot.select_dtypes(include=np.number).columns
    if len(numeric_cols) > 0:
        df_tot[numeric_cols] = df_tot[numeric_cols].clip(lower=0)

    # ``fillna`` returns a new frame; the result has to be kept, otherwise the
    # missing values stay in place.
    df_tot = df_tot.fillna(0)

    if print_header_p:
        print("Total Jobs from ", file_tot, " :", df_tot.shape[0])
    if is_split_p:
        return split_df_by_perf(df_tot, print_header_p=print_header_p)
    return df_tot

#
# Both file_start_index and file_end_index are inclusive: (1, 3) reads files 1, 2 and 3.


def read_total_multiple_days(file_dir_p, file_start_index_p, file_end_index_p,  tot_dop_cols_p=tot_dop_cols, split_index=3, print_header_p=False, is_split_p=False):
    """Read ``<file_dir_p>/<i>.csv`` for every i in an inclusive range and stack them.

    Args:
        file_dir_p: Directory that holds the numbered CSV files.
        file_start_index_p: First file number (inclusive).
        file_end_index_p: Last file number (inclusive).
        tot_dop_cols_p: Columns dropped by ``read_total`` from every file.
        split_index: Which element of the ``read_total`` split tuple to keep.
            Only used when ``is_split_p`` is true.
        print_header_p: Forwarded to ``read_total``.
        is_split_p: Forwarded to ``read_total``. When false, ``read_total``
            returns a single DataFrame, which is used as is.

    Returns:
        One DataFrame with the rows of all files, in file order; an empty
        DataFrame when the range is empty.
    """
    print(tot_dop_cols_p)
    frames = []
    for file_index in range(file_start_index_p, file_end_index_p + 1):
        loaded = read_total(file_dir_p + '/' + str(file_index) + '.csv', tot_dop_cols_p,
                            print_header_p=print_header_p, is_split_p=is_split_p)
        # Only a split result is a tuple; subscripting the plain DataFrame
        # with ``split_index`` would be a column lookup and fail.
        frames.append(loaded[split_index] if is_split_p else loaded)
    # A single concat at the end avoids re-copying the accumulated rows on
    # every iteration.
    df_result = pd.concat(frames, sort=False) if frames else pd.DataFrame()
    print("read_total_multiple_days:  df_result's size  = ", df_result.shape)
    return df_result

#
# Both file_start_index and file_end_index are inclusive: (1, 3) reads month directories 1, 2 and 3.


def read_total_multiple_monthes(file_dir_p, file_start_index_p, file_end_index_p,  tot_dop_cols_p=tot_dop_cols, split_index=3, print_header_p=False, is_split_p=False, print_progress=False):
    """Read ``<file_dir_p>/<month>/<day>.csv`` for days 1..31 of each month and stack them.

    Day files that do not exist or are empty (size 0) are skipped.

    Args:
        file_dir_p: Directory that holds one sub-directory per month number.
        file_start_index_p: First month number (inclusive).
        file_end_index_p: Last month number (inclusive).
        tot_dop_cols_p: Columns dropped by ``read_total`` from every file.
        split_index: Which element of the ``read_total`` split tuple to keep.
            Only used when ``is_split_p`` is true.
        print_header_p: Forwarded to ``read_total``.
        is_split_p: Forwarded to ``read_total``. When false, ``read_total``
            returns a single DataFrame, which is used as is.
        print_progress: When true, print each month directory as it is read.

    Returns:
        One DataFrame with the rows of all files, in month/day order; an
        empty DataFrame when nothing was read.
    """
    print(tot_dop_cols_p)
    frames = []
    for month_index in range(file_start_index_p, file_end_index_p + 1):
        month_dir = file_dir_p + '/' + str(month_index)
        if print_progress:
            print(month_dir + '/...')
        for day_index in range(1, 32):
            day_file = month_dir + '/' + str(day_index) + '.csv'
            # Not every month has 31 day files; a missing file is treated
            # like an empty one instead of aborting the whole load.
            if not os.path.isfile(day_file) or os.path.getsize(day_file) == 0:
                continue
            loaded = read_total(day_file, tot_dop_cols_p,
                                print_header_p=print_header_p, is_split_p=is_split_p)
            # Only a split result is a tuple; the plain DataFrame is kept whole.
            frames.append(loaded[split_index] if is_split_p else loaded)
    # A single concat at the end avoids re-copying the accumulated rows on
    # every iteration.
    df_result = pd.concat(frames, sort=False) if frames else pd.DataFrame()
    print("read_total_multiple_monthes:  df_result's size  = ", df_result.shape)
    return df_result

#POSIX_PERF_MIBS, MPIIO_PERF_MIBS, STDIO_PERF_MIBS


def sort_performance_descending(df_p, is_normalize_by_proc_osts=False, add_tag=False, n_tag_groups=3, drop_perf_cols=False, only_posix=True, drop_low_performance_job_ratio=0.2, plot_hist_perf=False, use_performance_as_group=False, even_group=False, keep_performance_p=False, is_convert_to_gbs=False):
    """Sort jobs by performance (best first), optionally trimming and grouping them.

    The input frame is not modified; all work happens on a copy.

    Args:
        df_p: Job table with ``POSIX_PERF_MIBS`` (and ``MPIIO_PERF_MIBS`` /
            ``STDIO_PERF_MIBS`` when ``only_posix`` is false).
        is_normalize_by_proc_osts: Divide the performance by ``nprocs``.
        add_tag: Add a ``group`` column (see the three options below).
        n_tag_groups: Number of groups for range-based or even grouping.
        drop_perf_cols: Drop the original ``*_PERF_MIBS`` column(s).
        only_posix: Use ``POSIX_PERF_MIBS`` as performance; otherwise use the
            row-wise maximum of the three ``*_PERF_MIBS`` columns.
        drop_low_performance_job_ratio: Fraction of the slowest rows to drop.
        plot_hist_perf: Plot a histogram of the performance column.
        use_performance_as_group: ``group`` is the performance value itself.
        even_group: ``group`` splits the sorted rows into ``n_tag_groups``
            chunks by position (group 0 holds the fastest rows); otherwise
            the [min, max] performance range is cut into ``n_tag_groups``
            equal-width intervals (group 0 holds the slowest rows).
        keep_performance_p: Keep the helper ``performance`` column.
        is_convert_to_gbs: Divide the performance by 1024.

    Returns:
        A new DataFrame sorted by performance in descending order.

    Raises:
        ValueError: If range-based grouping is requested with
            ``n_tag_groups < 1``.
    """
    # Work on a real copy so the caller's frame is left untouched.
    df_pp = df_p.copy()
    if only_posix:
        df_pp['performance'] = df_pp['POSIX_PERF_MIBS']
    else:
        df_pp['performance'] = df_pp[['POSIX_PERF_MIBS',
                                      'MPIIO_PERF_MIBS', 'STDIO_PERF_MIBS']].max(axis=1)
    if is_normalize_by_proc_osts:
        df_pp['performance'] = df_pp['performance'] / df_pp['nprocs']
    if is_convert_to_gbs:
        df_pp['performance'] = df_pp['performance'] / 1024

    df_pp = df_pp.sort_values('performance', ascending=False)

    if drop_low_performance_job_ratio > 0:
        n_to_dop = int(df_pp.shape[0] * drop_low_performance_job_ratio)
        # Trim by position: dropping by index label would remove extra rows
        # whenever index values repeat.
        n_to_keep = max(df_pp.shape[0] - n_to_dop, 0)
        df_pp = df_pp.iloc[:n_to_keep].copy()

    if plot_hist_perf:
        df_pp.hist(column='performance')

    if add_tag:
        if use_performance_as_group:
            df_pp['group'] = df_pp['performance']
        elif even_group == False:
            if n_tag_groups < 1:
                raise ValueError("n_tag_groups must be at least 1")
            perf = df_pp['performance']
            max_perf = perf.max()
            min_perf = perf.min()
            print("max_perf=", max_perf)
            print("min_perf=", min_perf)
            group_length = (max_perf - min_perf) / n_tag_groups
            # Upper bound of every interval; the last one is exactly max_perf
            # so the fastest job always falls into the last group.
            group_bound = [min_perf + i * group_length
                           for i in range(1, n_tag_groups)]
            group_bound.append(max_perf)
            print(group_bound)
            # Interval i is (bound[i-1], bound[i]]; the first one is open below.
            conditions = [perf <= group_bound[0]]
            conditions.extend(
                (perf > group_bound[i - 1]) & (perf <= group_bound[i])
                for i in range(1, n_tag_groups))
            # Rows matching no interval (e.g. NaN performance) get np.select's
            # default value 0.
            df_pp['group'] = np.select(conditions, list(range(n_tag_groups)))
        else:
            df_pp['group'] = np.linspace(
                0, n_tag_groups, df_pp.shape[0], endpoint=False).astype(int)

    if drop_perf_cols:
        if only_posix:
            perf_cols = ['POSIX_PERF_MIBS']
        else:
            perf_cols = ['POSIX_PERF_MIBS', 'MPIIO_PERF_MIBS', 'STDIO_PERF_MIBS']
        df_pp = df_pp.drop(perf_cols, axis=1)

    if keep_performance_p == False:
        df_pp = df_pp.drop('performance', axis=1)
    return df_pp


def drop_cols(df_p, reg_pattern):
    """Return ``df_p`` without the columns whose name matches ``reg_pattern``.

    Matching follows ``DataFrame.filter(regex=...)``. A summary of the column
    counts before and after is printed.
    """
    matching = list(df_p.filter(regex=reg_pattern))
    survivors = df_p.columns.drop(matching)
    df_dropped = df_p[survivors]
    n_after = df_dropped.shape[1]
    n_before = df_p.shape[1]
    print("Have [", n_after, "] colums after droping",
          reg_pattern, " from orginal [", n_before,  "] cols")
    return df_dropped


def keep_cols(df_p, filter_str_to_keep):
    """Keep the columns whose name contains ``filter_str_to_keep``.

    The ``valueLUSTRE_OSTS`` and ``nprocs`` columns of ``df_p`` are always
    carried over as well (copied by position, not aligned by index).

    Returns:
        A new DataFrame; ``df_p`` is left unchanged.
    """
    kept = df_p.filter(like=filter_str_to_keep, axis=1).copy()
    for always_kept in ('valueLUSTRE_OSTS', 'nprocs'):
        kept[always_kept] = df_p[always_kept].values
    return kept


def normalize_cols(df_p, method_p="StandardScaler", cols=None, scaler_p=None):
    """Scale columns of ``df_p`` with a freshly fitted scikit-learn scaler.

    Args:
        df_p: Input table; it is converted to float before scaling.
        method_p: ``"StandardScaler"`` selects ``StandardScaler``; any other
            value selects ``MinMaxScaler``.
        cols: Columns to scale; all columns when ``None``.
        scaler_p: Accepted but not used. The fitted scaler is not handed back
            to the caller through this parameter.

    Returns:
        A new DataFrame with the scaled values of the selected columns and
        the index of ``df_p``.
    """
    scaler = StandardScaler() if method_p == "StandardScaler" else MinMaxScaler()
    target_cols = df_p.columns if cols is None else cols
    as_float = df_p.astype(float)
    scaled_values = scaler.fit_transform(as_float[target_cols].to_numpy())
    return pd.DataFrame(scaled_values, index=as_float.index, columns=target_cols)


def filter_data(df_p, drop_by_size_ratio=0.2, drop_non_luster_access=True, drop_single_core=True, drop_time_related_counter=True, drop_default_striping=False, is_drop_zero_col=True):
    """Apply the row/column filters used before model training.

    Steps, in order (each one prints a short summary):
      1. optionally drop jobs with ``nprocs <= 1``;
      2. drop the ``total_POSIX_MODE`` column if present;
      3. optionally drop columns that are 0 in every row;
      4. optionally drop the ``drop_by_size_ratio`` fraction of jobs with the
         smallest ``total_POSIX_BYTES_WRITTEN + total_POSIX_BYTES_READ``
         (the remaining rows end up sorted by that sum, largest first);
      5. optionally keep only jobs with ``LUSTRE_STRIPE_SIZE > 0`` and
         ``LUSTRE_STRIPE_WIDTH > 0``;
      6. optionally drop every ``*TIMESTAMP*`` column and a fixed list of
         time/rank related counters.

    Args:
        df_p: Job table; it is not modified.
        drop_by_size_ratio: Fraction of smallest jobs to drop; ``<= 0`` skips
            step 4.
        drop_non_luster_access: Enable step 5.
        drop_single_core: Enable step 1.
        drop_time_related_counter: Enable step 6.
        drop_default_striping: Currently unused (the corresponding filter is
            disabled).
        is_drop_zero_col: Enable step 3.

    Returns:
        The filtered DataFrame.
    """
    # Step 1: single-process jobs.
    if drop_single_core:
        original_n = df_p.shape[0]
        print("original_n =", original_n)
        df_p = df_p.loc[df_p['nprocs'] > 1].copy()
        print("Drop [", original_n - df_p.shape[0], "]  among  ",
              original_n, " Rows by drop_single_core")

    # Step 2. The copy also guarantees that nothing below touches the
    # caller's frame.
    df_p = df_p.drop(['total_POSIX_MODE'], axis=1, errors='ignore').copy()
    print('Drop [total_POSIX_MODE] ')

    # Step 3: all-zero columns.
    if is_drop_zero_col:
        orig_cols_list = list(df_p)
        df_p = df_p.loc[:, (df_p != 0).any(axis=0)].copy()
        new_cols_list = list(df_p)
        print("# of Cols after drop zeros = (", len(new_cols_list), "/ # of in orig = ",
              len(orig_cols_list),  "): ", list(set(orig_cols_list) - set(new_cols_list)))

    # Step 4: smallest jobs by total bytes moved.
    if drop_by_size_ratio > 0:
        df_p['total_POSIX_BYTES'] = df_p['total_POSIX_BYTES_WRITTEN'] + \
            df_p['total_POSIX_BYTES_READ']
        df_p = df_p.sort_values('total_POSIX_BYTES', ascending=False)
        n_to_dop = int(df_p.shape[0] * drop_by_size_ratio)
        print("Drop [", drop_by_size_ratio * 100, "%] = ", n_to_dop, "/",
              df_p.shape[0], " rows by total_POSIX_BYTES_WRITTEN+ total_POSIX_BYTES_READ")
        # Slice with an explicit end position: ``iloc[:-n]`` with n == 0 is
        # ``iloc[:0]`` and would throw away every row.
        n_to_keep = max(df_p.shape[0] - n_to_dop, 0)
        df_p = df_p.iloc[:n_to_keep].drop('total_POSIX_BYTES', axis=1)

    # Step 5: only jobs that report a Lustre striping configuration.
    if drop_non_luster_access:
        original_n = df_p.shape[0]
        df_p = df_p.loc[(df_p['LUSTRE_STRIPE_SIZE'] > 0) &
                        (df_p['LUSTRE_STRIPE_WIDTH'] > 0)].copy()
        print("Drop [", original_n - df_p.shape[0], "]  among  ",
              original_n, " Rows by drop_non_luster_access")

    # Step 6: timestamp columns plus the listed time/rank counters.
    if drop_time_related_counter:
        orig_cols_list = list(df_p)
        cols_without_TIMESTAMP = [
            x for x in orig_cols_list if 'TIMESTAMP' not in x]
        df_p = df_p[cols_without_TIMESTAMP]
        other_dop_time_cols = [
            'total_POSIX_F_READ_TIME', 'total_POSIX_F_WRITE_TIME',
            'total_POSIX_F_META_TIME', 'total_POSIX_F_MAX_READ_TIME',
            'total_POSIX_F_MAX_WRITE_TIME', 'total_POSIX_F_FASTEST_RANK_TIME',
            'total_POSIX_F_SLOWEST_RANK_TIME', 'total_POSIX_FASTEST_RANK',
            'total_POSIX_FASTEST_RANK_BYTES', 'total_POSIX_SLOWEST_RANK',
            'total_POSIX_SLOWEST_RANK_BYTES', 'total_POSIX_F_VARIANCE_RANK_TIME',
            'total_POSIX_MAX_BYTE_READ', 'total_POSIX_MAX_BYTE_WRITTEN',
            'total_POSIX_MAX_READ_TIME_SIZE', 'total_POSIX_MAX_WRITE_TIME_SIZE',
        ]
        df_p = df_p.drop(other_dop_time_cols, axis=1, errors='ignore').copy()
        new_cols_list = list(df_p)
        print("# of Cols after drop TIME (", len(new_cols_list), "/",
              len(orig_cols_list),  "): ", list(set(orig_cols_list) - set(new_cols_list)))

    print("After filter,  shape is : ", df_p.shape)
    return df_p


#POSIX_PERF_MIBS, MPIIO_PERF_MIBS, STDIO_PERF_MIBS
def add_tag(df_p, only_posix=False,  is_normalize_by_proc_osts=False,  is_convert_to_gbs=False, drop_low_performance_job_ratio=0, plot_hist_perf=False,  drop_orig_perf_cols=False, is_group_tag=False, n_groups=3, even_group=False, keep_performance_p=False, n_to_display=2):
    """Add a ``tag`` column (the prediction target) derived from job performance.

    The input frame is not modified; all work happens on a copy.

    Args:
        df_p: Job table with ``POSIX_PERF_MIBS`` (and ``MPIIO_PERF_MIBS`` /
            ``STDIO_PERF_MIBS`` when ``only_posix`` is false).
        only_posix: Use ``POSIX_PERF_MIBS`` as performance; otherwise use the
            row-wise maximum of the three ``*_PERF_MIBS`` columns.
        is_normalize_by_proc_osts: Divide the performance by ``nprocs``.
        is_convert_to_gbs: Divide the performance by 1024.
        drop_low_performance_job_ratio: When > 0, sort by performance
            (descending) and drop that fraction of the slowest rows.
        plot_hist_perf: Plot a histogram of the performance column.
        drop_orig_perf_cols: Drop the original ``*_PERF_MIBS`` column(s).
        is_group_tag: When false, ``tag`` is the performance value itself.
            When true, ``tag`` is a group number (see ``even_group``).
        n_groups: Number of groups for group tags.
        even_group: When true, rows are split into ``n_groups`` chunks by
            their current position; otherwise the [min, max] performance
            range is cut into ``n_groups`` equal-width intervals (tag 0 holds
            the slowest jobs).
        keep_performance_p: Keep the helper ``performance`` column.
        n_to_display: Currently unused.

    Returns:
        A new DataFrame with the ``tag`` column added.

    Raises:
        ValueError: If range-based grouping is requested with ``n_groups < 1``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df_pp = df_p.copy()
    if only_posix:
        df_pp['performance'] = df_pp['POSIX_PERF_MIBS']
    else:
        df_pp['performance'] = df_pp[['POSIX_PERF_MIBS',
                                      'MPIIO_PERF_MIBS', 'STDIO_PERF_MIBS']].max(axis=1)
    if is_normalize_by_proc_osts:
        df_pp['performance'] = df_pp['performance'] / df_pp['nprocs']
    if is_convert_to_gbs:
        df_pp['performance'] = df_pp['performance'] / 1024

    if drop_low_performance_job_ratio > 0:
        df_pp = df_pp.sort_values('performance', ascending=False)
        n_to_dop = int(df_pp.shape[0] * drop_low_performance_job_ratio)
        # Trim by position: dropping by index label would remove extra rows
        # whenever index values repeat.
        n_to_keep = max(df_pp.shape[0] - n_to_dop, 0)
        df_pp = df_pp.iloc[:n_to_keep].copy()

    if plot_hist_perf:
        df_pp.hist(column='performance')

    if is_group_tag == False:
        df_pp['tag'] = df_pp['performance']
    elif even_group == False:
        if n_groups < 1:
            raise ValueError("n_groups must be at least 1")
        perf = df_pp['performance']
        max_perf = perf.max()
        min_perf = perf.min()
        print("max_perf=", max_perf)
        print("min_perf=", min_perf)
        group_length = (max_perf - min_perf) / n_groups
        # Upper bound of every interval; the last one is exactly max_perf so
        # the fastest job always falls into the last group.
        group_bound = [min_perf + i * group_length for i in range(1, n_groups)]
        group_bound.append(max_perf)
        print(group_bound)
        # Interval i is (bound[i-1], bound[i]]; the first one is open below.
        conditions = [perf <= group_bound[0]]
        conditions.extend(
            (perf > group_bound[i - 1]) & (perf <= group_bound[i])
            for i in range(1, n_groups))
        # Rows matching no interval (e.g. NaN performance) get np.select's
        # default value 0.
        df_pp['tag'] = np.select(conditions, list(range(n_groups)))
    else:
        # NOTE(review): the chunks follow the current row order, which is only
        # sorted by performance when drop_low_performance_job_ratio > 0 -
        # confirm this is intended.
        df_pp['tag'] = np.linspace(
            0, n_groups, df_pp.shape[0], endpoint=False).astype(int)

    if drop_orig_perf_cols:
        if only_posix:
            perf_cols = ['POSIX_PERF_MIBS']
        else:
            perf_cols = ['POSIX_PERF_MIBS', 'MPIIO_PERF_MIBS', 'STDIO_PERF_MIBS']
        df_pp = df_pp.drop(perf_cols, axis=1)

    if keep_performance_p == False:
        df_pp = df_pp.drop('performance', axis=1)
    return df_pp


def transform_raw_data(df_p, file_to_save=None, n_to_display_p=2):
    """Reduce a cleaned job table to POSIX/Lustre counters plus a ``tag`` column.

    Columns matching ``MPIIO`` or ``STDIO`` are removed, ``filter_data`` is
    applied with the size, single-core and zero-column filters switched off,
    and ``add_tag`` turns ``POSIX_PERF_MIBS`` into ``tag`` (dropping the
    original performance column).

    Args:
        df_p: Job table to transform.
        file_to_save: Optional CSV path; when given, the result is written
            there with a header and without the index.
        n_to_display_p: Forwarded to ``add_tag`` as ``n_to_display``.

    Returns:
        The transformed DataFrame.
    """
    posix_only = df_p
    for module_pattern in ("MPIIO", "STDIO"):
        posix_only = drop_cols(posix_only, module_pattern)

    filtered = filter_data(posix_only,
                           drop_by_size_ratio=0,
                           drop_single_core=False,
                           is_drop_zero_col=False)
    tagged = add_tag(filtered,
                     only_posix=True,
                     drop_orig_perf_cols=True,
                     n_to_display=n_to_display_p)

    if file_to_save is not None:
        tagged.to_csv(file_to_save, header=True, index=False)
    return tagged


def parser_darshan(darshan_file_name):
    """Convert a Darshan log to CSV by running ``./parser.sh``.

    The script is invoked as
    ``./parser.sh <log> <log>.csv <log>-tdir`` relative to the current
    working directory. The temporary directory is not removed here.

    Args:
        darshan_file_name: Path of the Darshan log file.

    Returns:
        Path of the CSV file (``darshan_file_name + '.csv'``).

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
    """
    output_csv_file = darshan_file_name + '.csv'
    temp_directory = darshan_file_name + '-tdir'
    # check=True: without it a failed parse goes unnoticed and the caller
    # would read a missing or stale CSV.
    subprocess.run(["./parser.sh", darshan_file_name,
                    output_csv_file, temp_directory], check=True)
    return output_csv_file


def load_explain_model(explain_model_save_file_p, method="LIME"):
    """Load a saved explainer from disk.

    Args:
        explain_model_save_file_p: Path of the saved explainer.
        method: ``"LIME"`` loads the file with ``dill``; ``"SHAP"`` loads it
            with ``joblib``. Any other value prints a message and calls
            ``exit()``.

    Returns:
        The deserialized explainer object.
    """
    # NOTE: dill/joblib deserialization executes code stored in the file; only
    # load explainer files from a trusted source.
    if method == "SHAP":
        return joblib.load(filename=explain_model_save_file_p)
    if method != "LIME":
        print("Not supported exlain method now !\n")
        exit()
    with open(explain_model_save_file_p, 'rb') as model_file:
        return dill.load(model_file)


def load_predict_model(predict_model_save_file_p, isTabnet=False):
    """Load a saved prediction model.

    Args:
        predict_model_save_file_p: Path of the saved model.
        isTabnet: When ``False`` the file is loaded with ``joblib``;
            otherwise a ``TabNetRegressor(seed=42)`` is created and its
            ``load_model`` is called with the path.

    Returns:
        The loaded model object.
    """
    if isTabnet == False:
        return joblib.load(predict_model_save_file_p)

    tabnet_model = TabNetRegressor(seed=42)
    tabnet_model.load_model(predict_model_save_file_p)
    return tabnet_model


def explain_df_with_LIME(explain_model, predict_model, explain_X, explain_Y, header,  run_web=False):
    """Explain one sample with a LIME explainer.

    Args:
        explain_model: Object providing ``explain_instance``.
        predict_model: Model whose ``predict`` method is explained.
        explain_X: Feature vector of the sample; it must support ``len`` and
            ``reshape``.
        explain_Y: Observed target value; only printed for comparison.
        header: Not used by this function.
        run_web: When ``False`` the explanation is also rendered with
            ``show_in_notebook()``.

    Returns:
        ``explanation.as_list()``.
    """
    explanation = explain_model.explain_instance(
        explain_X, predict_model.predict, num_features=len(explain_X))

    # Print the model's prediction next to the measured value.
    predicted = predict_model.predict(explain_X.reshape(1, -1))
    print("Prediction : ",   predicted)
    print("Empirically estimated :     ", explain_Y)

    if run_web == False:
        explanation.show_in_notebook()
    return explanation.as_list()


def explain_df_with_SHAP(explain_model, predict_model, explain_X, explain_Y, header, run_web=False):
    """Explain one sample with a SHAP explainer.

    Args:
        explain_model: Callable explainer; it is called with a one-row
            DataFrame.
        predict_model: Not used by this function.
        explain_X: Feature vector of the sample.
        explain_Y: Not used by this function.
        header: Column names for the one-row DataFrame.
        run_web: When ``False`` a waterfall plot of the explanation is drawn.

    Returns:
        The first element of the explainer's result (``shap_values[0]``).
    """
    single_row = pd.DataFrame([explain_X], columns=header)
    shap_values = explain_model(single_row)

    if run_web == False:
        shap.plots.waterfall(shap_values[0])

    # Force plots / HTML export (shap.plots.force, shap.save_html) are not
    # produced here. Background:
    # https://stackoverflow.com/questions/65837159/how-to-get-the-shap-values-of-each-feature
    return shap_values[0]


def explain_io(explain_model_p,  predict_model_p, explain_X_p, explain_Y_p, header_p, explain_method="LIME", run_web_p=False):
    """Dispatch to the LIME or SHAP explanation routine.

    Args:
        explain_model_p: Explainer object.
        predict_model_p: Prediction model.
        explain_X_p: Feature vector of the sample to explain.
        explain_Y_p: Observed target value of the sample.
        header_p: Feature names.
        explain_method: ``"LIME"`` or ``"SHAP"``; any other value prints a
            message and calls ``exit()``.
        run_web_p: Forwarded as ``run_web``.

    Returns:
        Whatever the selected explanation routine returns.
    """
    if explain_method not in ("LIME", "SHAP"):
        print("Not supported exlain method now !\n")
        exit()

    explain = explain_df_with_LIME if explain_method == "LIME" else explain_df_with_SHAP
    return explain(explain_model=explain_model_p,
                   predict_model=predict_model_p,
                   header=header_p,
                   explain_X=explain_X_p,
                   explain_Y=explain_Y_p,
                   run_web=run_web_p)


# Process darshan log
def get_counters_from_a_darshan_file(darshan_file_to_explain_p, train_feature_name_list=None):
    """Turn one Darshan log into a feature vector and target for explanation.

    The log is parsed to CSV, loaded with ``read_total`` (dropping
    ``tot_dop_cols``), passed through ``transform_raw_data`` and
    ``feature_engineering``, and optionally restricted to
    ``train_feature_name_list``.

    Args:
        darshan_file_to_explain_p: Path of the Darshan log.
        train_feature_name_list: Optional column selection applied after
            feature engineering. The last selected column is treated as the
            target, so the list presumably has to end with ``'tag'``.

    Returns:
        Tuple ``(explain_X, explain_Y, headers, df_orig)`` for the first row
        only: feature values (all columns but the last), target value (last
        column), feature names, and the table as it was before feature
        engineering.
    """
    csv_path = parser_darshan(darshan_file_to_explain_p)
    raw_df = read_total(
        csv_path, print_header_p=False, is_split_p=False, tot_dop_cols_p=tot_dop_cols)

    # Kept for the caller: the transformed table before feature engineering.
    df_orig = transform_raw_data(df_p=raw_df, n_to_display_p=3)

    features_df = feature_engineering(df_orig)
    if train_feature_name_list is not None:
        features_df = features_df[train_feature_name_list]
    print(features_df.shape)

    # Column names are printed with and without the trailing target column.
    headers = list(features_df.columns)
    print(headers)
    headers.pop()
    print(headers)

    values = features_df.to_numpy()
    explain_X = values[:, :-1]
    explain_Y = values[:, -1]
    print(type(explain_X))
    return explain_X[0], explain_Y[0], headers, df_orig


def min_log_transfer(df_p, col):
    """Return ``log10`` of a column after replacing zeros with a small value.

    When the column contains no zeros it is log-transformed directly.
    Otherwise zeros are replaced by 1 if the smallest value above 0.1 is
    greater than 1, else by half of that smallest value.

    Args:
        df_p: Source DataFrame (not modified).
        col: Name of the column to transform.

    Returns:
        The transformed column as a Series.
    """
    series = df_p[col]
    if (series == 0).sum() == 0:
        return np.log10(series)

    smallest_positive = df_p.loc[series > 0.1, col].min()
    zero_stand_in = 1 if smallest_positive > 1 else smallest_positive / 2
    return np.log10(series.replace(to_replace=0, value=zero_stand_in))


def describe_col(df_p, col, try_log=False, try_pluy_one_log=False):
    """Show summary statistics and plots for one column.

    Draws a histogram of the column and a scatter plot of the column against
    ``tag``. With ``try_pluy_one_log`` the ``log10(x + 1)`` version of the
    column is stored in ``df_p`` as ``<col>_log`` and described and plotted
    as well.

    Args:
        df_p: DataFrame holding ``col`` and ``tag``; gains a ``<col>_log``
            column when ``try_pluy_one_log`` is set.
        col: Name of the column to inspect.
        try_log: Not used.
        try_pluy_one_log: Also inspect the plus-one log transform.
    """
    # ``display`` is not defined in this file; presumably the notebook builtin.
    fig, axs = plt.subplots(1, 3, figsize=(16, 4))
    hist_ax, scatter_ax, log_hist_ax = axs

    display(df_p[col].describe())
    df_p[col].hist(ax=hist_ax)
    df_p.plot.scatter(x=col, y='tag', c='DarkBlue', ax=scatter_ax)
    plt.plot()

    if not try_pluy_one_log:
        return

    n_zeros = (df_p[col] == 0).sum()
    print("# of zeros in df = ", n_zeros,
          " (", n_zeros/df_p.shape[0], ")")
    smallest_positive = df_p.loc[df_p[col] > 0.1, col].min()
    print("minValue = ", smallest_positive)

    log_col = col+'_log'
    df_p[log_col] = plus_one_log_transfer(df_p, col)
    display(df_p[log_col].describe())
    df_p[log_col].hist(ax=log_hist_ax)


def plus_one_log_transfer(df_p, col, log_flag=True):
    """Return ``log10(column + 1)``, or the column unchanged when ``log_flag`` is false.

    Args:
        df_p: Source DataFrame (not modified).
        col: Name of the column to transform.
        log_flag: Apply the transform when true.

    Returns:
        The resulting column as a Series.
    """
    column = df_p[col]
    if not log_flag:
        return column
    return np.log10(column + 1)


# To further delete:
## total_POSIX_MODE, total_POSIX_MAX_BYTE_READ, total_POSIX_MAX_BYTE_WRITTEN, total_POSIX_MAX_READ_TIME_SIZE, total_POSIX_MAX_WRITE_TIME_SIZE,
# Reference:  https://discuss.analyticsvidhya.com/t/methods-to-deal-with-zero-values-while-performing-log-transformation-of-variable/2431/9
def feature_engineering(df_p, file_to_save=None, log_transform_feature=True, log_transform_tag=True):
    """Build the model feature table (features followed by ``tag``).

    Every output column is taken from one input column. ``total_POSIX_X``
    becomes ``POSIX_X``, while ``nprocs``, ``LUSTRE_STRIPE_SIZE``,
    ``LUSTRE_STRIPE_WIDTH`` and ``tag`` keep their names. Five counters
    (DUPS, MMAPS, RENAME_SOURCES, RENAME_TARGETS, RENAMED_FROM) are copied
    as is; all other columns go through ``plus_one_log_transfer``.

    Args:
        df_p: Transformed job table containing all source columns and ``tag``.
        file_to_save: Optional CSV path; when given, the result is written
            there with a header and without the index.
        log_transform_feature: ``log_flag`` used for the feature columns.
        log_transform_tag: ``log_flag`` used for ``tag``.

    Returns:
        A new DataFrame with the feature columns in a fixed order and ``tag``
        as the last column.
    """
    size_bins = ('0_100', '100_1K', '1K_10K', '10K_100K', '100K_1M',
                 '1M_4M', '4M_10M', '10M_100M', '100M_1G', '1G_PLUS')
    slots = ('1', '2', '3', '4')

    # Each entry: (output column, source column, pass through the log helper?)
    plan = [
        ('nprocs', 'nprocs', True),
        ('POSIX_OPENS', 'total_POSIX_OPENS', True),
        ('LUSTRE_STRIPE_SIZE', 'LUSTRE_STRIPE_SIZE', True),
        ('LUSTRE_STRIPE_WIDTH', 'LUSTRE_STRIPE_WIDTH', True),
        ('POSIX_FILENOS', 'total_POSIX_FILENOS', True),
    ]

    # Counters that are copied without any transform.
    copied_counters = ('DUPS', 'MMAPS', 'RENAME_SOURCES',
                       'RENAME_TARGETS', 'RENAMED_FROM')
    plan.extend(('POSIX_' + name, 'total_POSIX_' + name, False)
                for name in copied_counters)

    # Counters that go through the plus-one log helper, in output order.
    logged_counters = [
        'MEM_ALIGNMENT', 'FILE_ALIGNMENT',
        'READS', 'WRITES', 'SEEKS', 'STATS',
        'BYTES_READ', 'BYTES_WRITTEN',
        'CONSEC_READS', 'CONSEC_WRITES',
        'SEQ_READS', 'SEQ_WRITES',
        'FSYNCS', 'FDSYNCS', 'RW_SWITCHES',
        'MEM_NOT_ALIGNED', 'FILE_NOT_ALIGNED',
    ]
    logged_counters += ['SIZE_READ_' + size_bin for size_bin in size_bins]
    logged_counters += ['SIZE_WRITE_' + size_bin for size_bin in size_bins]
    for template in ('STRIDE{}_STRIDE', 'STRIDE{}_COUNT',
                     'ACCESS{}_ACCESS', 'ACCESS{}_COUNT'):
        logged_counters += [template.format(slot) for slot in slots]
    logged_counters.append('F_VARIANCE_RANK_BYTES')
    plan.extend(('POSIX_' + name, 'total_POSIX_' + name, True)
                for name in logged_counters)

    df_result = pd.DataFrame()
    for out_name, src_name, use_helper in plan:
        if use_helper:
            df_result[out_name] = plus_one_log_transfer(
                df_p, src_name, log_flag=log_transform_feature)
        else:
            df_result[out_name] = df_p[src_name]

    # The target goes last and has its own transform switch.
    df_result['tag'] = plus_one_log_transfer(
        df_p, 'tag', log_flag=log_transform_tag)

    if file_to_save is not None:
        df_result.to_csv(file_to_save, header=True, index=False)
    return df_result


def drop_sparse_rc(df_p, ratio_of_zero=0.75):
    """Drop sparse rows, then sparse columns.

    A row is kept when its fraction of falsy (zero) cells is below
    ``ratio_of_zero``. Among the remaining rows, a column is kept when its
    fraction of cells equal to 0 is below ``ratio_of_zero``. The resulting
    shape is printed.

    References:
        https://stackoverflow.com/questions/37992585/how-to-remove-rows-from-a-dataframe-if-75-of-its-column-values-is-equal-to-0
        https://stackoverflow.com/questions/44250642/drop-columns-with-more-than-70-zeros
    """
    row_zero_share = (~df_p.astype('bool')).mean(axis=1)
    dense_rows = df_p[row_zero_share < ratio_of_zero]

    col_zero_share = (dense_rows == 0).mean()
    dense = dense_rows.loc[:, col_zero_share < ratio_of_zero]

    print("After drop_sparse_rc, shape =", dense.shape)
    return dense


# KeyboardInterrupt:  (cell output left over from the notebook export; not code)