In [None]:
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
import warnings
from sklearn.model_selection import cross_val_score, KFold
import pandas as pd
import itertools
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# progressbar is an optional dependency: when it is not installed the
# exhaustive search simply runs without a progress bar.  Only ImportError is
# caught so that unrelated failures (e.g. KeyboardInterrupt) are not swallowed.
try:
    import progressbar
    g_have_progressbar = True
except ImportError:
    g_have_progressbar = False

pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 60)
warnings.filterwarnings("ignore")

In [None]:
DATA_NAME = "ReCo"  # ReCo or Carbon8
REGRESSION_MODEL = "Linear"  # Linear, Ridge, RF

In [None]:
METADATA = {"outputdir": "image_executed", "prefix": "exhaustivesearch", 
              "dataname":DATA_NAME, "regtype":REGRESSION_MODEL}

In [None]:
def get_data(data_name="ReCo"):
    """Load a data set and report which columns are descriptors / target.

    Args:
        data_name (str, optional): data set name, "ReCo" or "Carbon8". Defaults to "ReCo".

    Raises:
        ValueError: unknown data_name

    Returns:
        tuple containing

        - pd.DataFrame: the table read from the CSV file
        - list[str]: descriptor (explanatory variable) column names
        - str: target column name
    """
    # Reject unknown names up front so that no file is touched for them.
    if data_name not in ("ReCo", "Carbon8"):
        raise ValueError("unknown data_name={}".format(data_name))

    if data_name == "ReCo":
        csv_path = "../data/TC_ReCo_detail_descriptor.csv"
        target_name = 'Tc'
        descriptor_names = ['C_R', 'C_T', 'vol_per_atom', 'Z', 'f4', 'd5',
                            'L4f', 'S4f', 'J4f', '(g-1)J4f', '(2-g)J4f']
    else:
        csv_path = "../data_calculated/Carbon8_cell_descriptor_Etot.csv"
        target_name = 'Etot'
        # Column names follow the pattern a<a>_rp<rp>, with a varying slowest.
        descriptor_names = ['a{}_rp{}'.format(a, rp)
                            for a in ('0.25', '0.5', '1.0')
                            for rp in ('1.0', '1.5', '2.0', '2.5', '3.0')]

    df = pd.read_csv(csv_path)
    return df, descriptor_names, target_name


g_df, g_descriptor_names, g_target_name = get_data(DATA_NAME)


In [None]:
g_Xraw = g_df[g_descriptor_names].values
g_y = g_df[g_target_name].values

In [None]:
from sklearn.preprocessing import StandardScaler
g_scaler = StandardScaler()
g_scaler.fit(g_Xraw)
g_X = g_scaler.transform(g_Xraw)


In [None]:
def all_combinations(n, m=None):
    """Iterate over every non-empty subset of descriptor indices up to size m.

    Subsets are produced in order of increasing size; within one size they
    follow the order of itertools.combinations.

    Args:
        n (int): the number of descriptors
        m (int, optional): the maximum number of descriptors in a subset.
            Defaults to None, which means n.

    Yields:
        tuple(int, ...): a set of descriptor indices
    """
    max_size = n if m is None else m
    indices = range(n)
    for size in range(1, max_size + 1):
        yield from itertools.combinations(indices, size)


In [None]:
from sklearn.metrics import make_scorer


def fit_cv_X(x, y, mode="Linear", nfold=5, nfold_model=3):
    """Cross-validate a regression model and refit it on all the data.

    Args:
        x (np.array): descriptor
        y (np.array): target values
        mode (str, optional): a type of regression. "Linear" or "Ridge" or "RF". Defaults to "Linear".
        nfold (int, optional): the number of folds used to compute the CV score. Defaults to 5.
        nfold_model (int, optional): the number of folds used inside RidgeCV. Defaults to 3.

    Raises:
        ValueError: unknown mode

    Returns:
        tuple containing

        - float: the mean value of the R2 score over the folds
        - float: the standard deviation of the R2 score over the folds
        - np.array: coefficients of the regression for linear models,
          feature importances for the random forest model
          (both taken from the model refitted on all of x, y).
    """
    # Build the regressor first so that an unknown mode fails before any work.
    # The `normalize` keyword is intentionally not passed: False was its
    # default and recent scikit-learn releases no longer accept it.
    if mode == "Linear":
        reg = LinearRegression(fit_intercept=True)
    elif mode == "Ridge":
        kf_model = KFold(n_splits=nfold_model, shuffle=True, random_state=6)
        reg = RidgeCV(cv=kf_model, fit_intercept=True)
    elif mode == "RF":
        # Seeded like the folds so that repeated runs give the same scores.
        reg = RandomForestRegressor(n_estimators=10, random_state=6)
    else:
        raise ValueError("unknown mode={}".format(mode))

    kf = KFold(n_splits=nfold, shuffle=True, random_state=6)
    scorelist = cross_val_score(
        reg, x, y, scoring=make_scorer(r2_score), cv=kf)

    mean = np.mean(scorelist)
    std = np.std(scorelist)

    # cross_val_score fits clones only; refit on all the data to get the
    # coefficients / importances that are reported.
    reg.fit(x, y)

    if mode == "RF":
        return mean, std, reg.feature_importances_
    return mean, std, reg.coef_

In [None]:
def fit_and_predict_combinations(x, y, regressionmodel, descriptor_names, have_progressbar=False, max_component=None):
    """Run the exhaustive search and accumulate the result of every model.

    Every descriptor subset produced by all_combinations() is evaluated with
    fit_cv_X().  The returned dict has the keys "combination", "score_mean",
    "score_std" and "coef".

    Args:
        x (np.array): descriptor
        y (np.array): target value
        regressionmodel (str): regression model name passed to fit_cv_X
        descriptor_names (list[str]): descriptor names, in the column order of x.
        have_progressbar (bool, optional): show a progress bar (requires the
            progressbar module). Defaults to False.
        max_component (int, optional): the maximum number of descriptors in a
            model. Defaults to None, which means all the columns of x.

    Returns:
        dict: results. "combination" is a list of index arrays, "score_mean"
            and "score_std" are np.array, "coef" is a list of 1-D arrays.
    """
    print_indicatorlabel = False  # debug switch: print each descriptor subset

    n = x.shape[1]
    if max_component is None:
        max_component = n
    names = np.array(descriptor_names)

    bar = None
    if have_progressbar:
        # The total is only needed for the progress bar, so the combinations
        # are counted only in that case (also safe when there are none).
        ntotal = sum(1 for _ in all_combinations(n, max_component))
        bar = progressbar.ProgressBar(max_value=ntotal)

    combi_list = []
    mean_list = []
    std_list = []
    coef_list = []

    for i, icombi in enumerate(all_combinations(n, max_component)):
        if bar is not None:
            bar.update(i+1)

        icombi = np.array(icombi)
        combi_list.append(icombi)
        if print_indicatorlabel:
            print("indicators", names[icombi])

        mean, std, coef = fit_cv_X(x[:, icombi], y, regressionmodel)
        mean_list.append(mean)
        std_list.append(std)
        # Flatten so that every entry is 1-D, one value per selected descriptor.
        coef_list.append(coef.ravel())

    return {"combination": combi_list, "score_mean": np.array(mean_list),
            "score_std": np.array(std_list), "coef": coef_list}


In [None]:
import pickle
import os

def save_df(df_result, descriptor_names,  savefile):
    """Add a readable "descriptor" column to df_result and pickle it.

    The "descriptor" column holds, for every row, the names of the descriptors
    listed in "combination" joined by "|".  The column is added to df_result
    itself (the caller's frame is modified).

    Args:
        df_result (pd.DataFrame): data with a "combination" column of index arrays
        descriptor_names (list[str]): descriptor names
        savefile (str): file name to save to
    """
    names = np.array(descriptor_names)
    df_result["descriptor"] = [
        "|".join(names[np.array(combi)])
        for combi in df_result["combination"].values
    ]
    with open(savefile, "wb") as fout:
        pickle.dump(df_result, fout)


# The exhaustive search is cached on disk.  The cache file name depends only
# on the regression model, the target name and the data name, so delete the
# file by hand to force a recomputation after changing anything else.
g_savefile = "ESresult_{}_{}_{}.pickle".format(
    REGRESSION_MODEL, g_target_name, DATA_NAME)
print("filename", g_savefile)
if not os.path.exists(g_savefile):
    # No cache yet: run the search and store the models sorted by the mean
    # CV score, best model first.
    g_result = fit_and_predict_combinations(g_X, g_y, REGRESSION_MODEL,
                                            g_descriptor_names,
                                            have_progressbar=g_have_progressbar)
    g_df_score = pd.DataFrame(g_result).sort_values(
        by="score_mean", ascending=False).reset_index(drop=True)

    save_df(g_df_score, g_descriptor_names,  g_savefile)

# Always read the result back from the cache file, even when it was just
# written.  NOTE(review): pickle.load can execute arbitrary code, so only
# load cache files that were produced locally by this notebook.
with open(g_savefile, "rb") as f:
    g_df_result = pickle.load(f)
    print("load", g_savefile)
# Keep only the search result columns (the "descriptor" column added by
# save_df is dropped here).
g_df_score = g_df_result[['combination', 'score_mean', 'score_std', 'coef']]


In [None]:
g_df_score.shape

In [None]:
def show_r2_hist(df, xlim=None, comment=None, metadata: dict=METADATA,
                 tickfontsize=15, titlefontsize=15, labelfontsize=15):
    """Plot the histogram (density of states) of the R2 scores and save it.

    The figure is saved as
    <prefix>_<dataname>_<regtype>_<comment>.png in metadata["outputdir"].

    Args:
        df (pd.DataFrame): data with a "score_mean" column
        xlim (tuple(float, float), optional): x range of the figure. Defaults to None.
        comment (str, optional): comment appended to the png file name. Defaults to None.
        metadata (dict): "outputdir", "prefix", "dataname" and "regtype" used
            to build the file name. Defaults to METADATA.
        tickfontsize (int, optional): ticks font size. Defaults to 15.
        titlefontsize (int, optional): title font size. Defaults to 15.
        labelfontsize (int, optional): axis label font size. Defaults to 15.
    """
    fig, ax = plt.subplots()
    df.hist("score_mean", bins=100, ax=ax)
    ax.set_title(ax.get_title(), fontsize=titlefontsize)
    # Axis labels use labelfontsize (it used to be accepted but ignored).
    ax.set_xlabel("R2", fontsize=labelfontsize)
    ax.set_ylabel("occurrence", fontsize=labelfontsize)
    if xlim is not None:
        ax.set_xlim(xlim)
    ax.tick_params(axis='x', labelsize=tickfontsize)
    ax.tick_params(axis='y', labelsize=tickfontsize)
    fig.tight_layout()
    filename = "_".join([metadata["prefix"], metadata["dataname"],
                         metadata["regtype"], str(comment)])+".png"
    # savefig does not create missing directories.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    fig.savefig(os.path.join(metadata["outputdir"], filename))
    print("saved to", filename)

show_r2_hist(g_df_score,comment="fullrange")


In [None]:
# 拡大
show_r2_hist(g_df_score, xlim=(-0.5, 1.0), comment='-05to10')
show_r2_hist(g_df_score, xlim=(0.4, 0.85), comment='04to085')

In [None]:
def calculate_coeffix(descriptor, combilist, coeflist):
    """Expand the coefficients of each model to the full descriptor length.

    For display purposes the coefficients are rebuilt so that descriptors not
    used by a model get the coefficient 0.

    Args:
        descriptor (list[str]): all the descriptor names
        combilist (list): a list of descriptor combinations (indices) of the models
        coeflist (np.array): a list of coefficients of the models

    Returns:
        list: a list of coefficients whose length is the same as the length of all the descriptors
    """
    width = len(descriptor)

    def _expand(combi, coef):
        # e.g. combi=[1,2], coef=[val1,val2], width=5 -> [0,val1,val2,0,0]
        padded = np.zeros((width))
        for pos, column in enumerate(combi):
            padded[column] = coef[pos]
        # Converted to a plain list for convenience of the caller.
        return list(padded)

    return [_expand(combi, coef) for combi, coef in zip(combilist, coeflist)]


g_coeffixlist = calculate_coeffix(g_descriptor_names,
                                  g_df_score["combination"].values, g_df_score["coef"].values)
g_df_coef = pd.DataFrame(g_coeffixlist, columns=g_descriptor_names)
g_df_result = pd.concat([g_df_score, g_df_coef], axis=1)


In [None]:
import seaborn as sns


def show_weight_diagram(df_result, descriptor_names, nmax=50,
                        comment='index_vs_abscoef',
                        metadata: dict=METADATA,
                        tickfontsize=15, figsize=(10, 5)):
    """Show the weight diagram: heatmap of log10(abs(coef)) per model.

    The figure is saved as
    <prefix>_<dataname>_<regtype>_<comment>.png in metadata["outputdir"].

    Args:
        df_result (pd.DataFrame): data
        descriptor_names (List[str]): descriptor column names
        nmax (int, optional): the maximum number of models (rows) to show. Defaults to 50.
        comment (str, optional): comment appended to the png file name. Defaults to 'index_vs_abscoef'.
        metadata (dict): "outputdir", "prefix", "dataname" and "regtype" used
            to build the file name. Defaults to METADATA.
        tickfontsize (int, optional): ticks font size. Defaults to 15.
        figsize (Tuple[float], optional): figure size. Defaults to (10,5).
    """
    # Positional slice: exactly the first nmax rows (a label slice with
    # .loc[:nmax] is inclusive and would show nmax+1 models).
    x = df_result.iloc[:nmax][descriptor_names].values
    # log10(0) = -inf for unused descriptors is expected here.
    with np.errstate(divide="ignore"):
        x = np.log10(np.abs(x))
    df_x = pd.DataFrame(x, columns=descriptor_names).replace(
        [-np.inf, np.inf], np.nan)
    # Unused descriptors (and any other NaN) are drawn with the value -3.
    df_weight_diagram = df_x.fillna(-3)
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(df_weight_diagram.T, ax=ax)
    # NOTE(review): presumably sets the y range explicitly so that the
    # outermost heatmap rows are not clipped and the row order is bottom-up
    # -- confirm.
    ax.set_ylim((-0.5, df_weight_diagram.shape[1]+0.5))
    ax.tick_params(axis='x', labelsize=tickfontsize)
    ax.tick_params(axis='y', labelsize=tickfontsize)
    fig.tight_layout()
    filename = "_".join([metadata["prefix"], metadata["dataname"],
                         metadata["regtype"], str(comment)])+".png"
    # savefig does not create missing directories.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    fig.savefig(os.path.join(metadata["outputdir"], filename))
    print(filename)

show_weight_diagram(g_df_result, g_descriptor_names)

In [None]:
def show_indicator_diagram(df_result, descriptor_names, nmax=50):
    """Show the indicator diagram: which descriptors each model uses.

    A descriptor counts as used by a model when its coefficient is non-zero.

    Args:
        df_result (pd.DataFrame): data
        descriptor_names (list[str]): descriptor column names
        nmax (int, optional): the maximum number of models (rows) to show. Defaults to 50.

    Returns:
        pd.DataFrame: boolean indicator diagram for ALL the rows of df_result
            (only the first nmax rows are drawn).
    """
    used = df_result[descriptor_names].values != 0
    df_indicator_diagram = pd.DataFrame(used, columns=descriptor_names)
    fig, ax = plt.subplots()
    # Positional slice: exactly the first nmax rows (a label slice with
    # .loc[:nmax] is inclusive and would show nmax+1 models).
    sns.heatmap(df_indicator_diagram.iloc[:nmax, :].T, ax=ax)
    ax.set_ylim((-0.5, df_indicator_diagram.shape[1]+0.5))
    fig.tight_layout()
    return df_indicator_diagram

g_df_indicator_diagram = show_indicator_diagram(
    g_df_result, g_descriptor_names)

In [None]:
def show_score_mean_std(df_result, comment: str="mean_std",
                        metadata: dict=METADATA,
                        tickfontsize=15, labelfontsize=15, legendfontsize=15,
                        nmax=50):
    """Show model index vs. mean R2 with the standard deviation as error bar.

    The figure is saved as
    <prefix>_<dataname>_<regtype>_<comment>.png in metadata["outputdir"].

    Args:
        df_result (pd.DataFrame): data with "score_mean" and "score_std" columns.
        comment (str): comment appended to the png file name. Defaults to "mean_std".
        metadata (dict): "outputdir", "prefix", "dataname" and "regtype" used
            to build the file name. Defaults to METADATA.
        tickfontsize (int, optional): ticks font size. Defaults to 15.
        labelfontsize (int, optional): label font size. Defaults to 15.
        legendfontsize (int, optional): legend font size. Defaults to 15.
        nmax (int, optional): the maximum number of models (rows) to show,
            taken from the top of df_result. Defaults to 50.
    """
    fig, ax = plt.subplots()
    # Positional slice: exactly the first nmax rows, whatever the index is.
    df_result.iloc[:nmax, :].plot(y="score_mean", yerr="score_std", ax=ax)
    ax.set_xlabel("index", fontsize=labelfontsize)
    ax.set_ylabel("$R^2$", fontsize=labelfontsize)
    ax.tick_params(axis='x', labelsize=tickfontsize)
    ax.tick_params(axis='y', labelsize=tickfontsize)
    ax.legend(fontsize=legendfontsize)
    fig.tight_layout()
    filename = "_".join([metadata["prefix"], metadata["dataname"],
                         metadata["regtype"], str(comment)])+".png"
    # savefig does not create missing directories.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    fig.savefig(os.path.join(metadata["outputdir"], filename))
    print(filename)
    
show_score_mean_std(g_df_result)

In [None]:
def make_counts(df_result, descriptor_names, sentense, ratio=False):
    """
    Count how many of the selected models use each descriptor.

    A descriptor counts as used by a model when its coefficient is non-zero.
    df_result itself is not modified.

    Args:
        df_result (pd.DataFrame): data
        descriptor_names (list[str]): descriptor names.
        sentense (str): query expression selecting the models (DataFrame.query)
        ratio (bool, optional): return counts (False) or the fraction of the
            selected models (True). Defaults to False.

    Returns:
        pd.Series: count or fraction per descriptor, indexed by descriptor name.
            With ratio=True and no selected model the values are NaN.
    """
    names = list(descriptor_names)
    df_indicator_diagram = df_result.copy()
    # Replace the coefficient columns as a whole by boolean columns.  Writing
    # booleans through .loc into the float columns would rely on silent dtype
    # upcasting, which pandas has deprecated.
    df_indicator_diagram[names] = df_result[names].values != 0

    dfq = df_indicator_diagram.query(sentense)
    print("all=", dfq.shape[0])
    counts = dfq[names].sum(axis=0)
    if ratio:
        return counts/dfq.shape[0]
    return counts


def make_block_weight_list(df_result, descriptor_names, querylist,
                           comment='block_weight_list',
                           metadata: dict = METADATA,
                           tickfontsize=15):
    """
    Compute, display and plot the block weight diagram of querylist.

    For every query the fraction of the selected models that use each
    descriptor is computed with make_counts().  The figure is saved as
    <prefix>_<dataname>_<regtype>_<comment>.png in metadata["outputdir"].

    Args:
        df_result (pd.DataFrame): data.
        descriptor_names (list[str]): descriptor names.
        querylist (list[str]): list of query expressions.
        comment (str): comment appended to the png file name. Defaults to 'block_weight_list'.
        metadata (dict): "outputdir", "prefix", "dataname" and "regtype" used
            to build the file name. Defaults to METADATA.
        tickfontsize (int, optional): ticks font size. Defaults to 15.
    Returns:
        pd.DataFrame: block weight diagram, one row per query and one column
            per descriptor (in reversed descriptor order).
    """
    # The descriptor order is reversed to match the previous figures.
    reversed_names = descriptor_names[::-1]
    result = [make_counts(df_result, reversed_names, sentense, ratio=True)
              for sentense in querylist]
    dfq = pd.DataFrame(result, index=querylist)
    display(dfq)

    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(dfq.T, ax=ax)  # transposed to match the previous figures
    ax.set_xticklabels(ax.get_xticklabels(), ha='right', rotation=45)
    ax.tick_params(axis='x', labelsize=tickfontsize)
    ax.tick_params(axis='y', labelsize=tickfontsize)
    fig.tight_layout()
    filename = "_".join([metadata["prefix"], metadata["dataname"],
                         metadata["regtype"], str(comment)])+".png"
    # savefig does not create missing directories.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    fig.savefig(os.path.join(metadata["outputdir"], filename))
    print(filename)
    # The docstring always promised this return value; it used to be missing.
    return dfq

if DATA_NAME == "ReCo":
    if REGRESSION_MODEL == "Linear":
        g_querylist = ["score_mean<0.15", "score_mean>0.15 and score_mean<0.5",
                       "score_mean>0.5 and score_mean<0.7", "score_mean>0.7"]
        make_block_weight_list(g_df_result, g_descriptor_names, g_querylist)
    if REGRESSION_MODEL == "RF":
        g_querylist = ["score_mean<0.0",
                       "0.6<score_mean<0.8", "score_mean>0.8", ]
        make_block_weight_list(g_df_result, g_descriptor_names, g_querylist)

In [None]:
def make_df_by_index(df_indicator_diagram, descriptor_names, index):
    """Summarise the rows selected by index into a single-row DataFrame.

    The returned frame has one column per descriptor (the number of selected
    rows in which the descriptor is non-zero) plus the column "N" (the total
    number of selected rows).

    Args:
        df_indicator_diagram (pd.DataFrame): indicator diagram data
        descriptor_names (list[str]): descriptor names
        index (list[int]): positional indices of the rows to select

    Returns:
        pd.DataFrame: single-row summary of the selected part
    """
    selected = df_indicator_diagram.iloc[index, :][descriptor_names]
    nrows = selected.shape[0]
    print("all=", nrows)

    # One row holding the per-descriptor counts.
    counts_row = selected.astype(int).sum(axis=0).to_frame().T
    total_row = pd.DataFrame({"N": [nrows]})
    return pd.concat([counts_row, total_row], axis=1)
    # print(np.sum(dfq[descriptor_names], axis=0))


def make_all_ind_by_index(df_indicator_diagram, descriptor_names, regionindex, regionsize):
    """Get, for each region, the fraction of models using each descriptor.

    Region i covers the row positions
        [i*regionsize, (i+1)*regionsize)
    for every i in regionindex.

    Args:
        df_indicator_diagram (pd.DataFrame): data
        descriptor_names (list[str]): descriptor names
        regionindex (list[int]): list of region indices
        regionsize (int): region size

    Returns:
        pd.DataFrame: one row per region; the descriptor columns hold the
            fraction of non-zero entries in the region and "N" holds the
            number of rows of the region.
    """
    df_ind_list = [
        make_df_by_index(df_indicator_diagram, descriptor_names,
                         list(range(i*regionsize, (i+1)*regionsize)))
        for i in regionindex
    ]
    _df = pd.concat(df_ind_list, axis=0).reset_index(drop=True)

    # Turn the counts into fractions of the region size; "N" is kept as is.
    names = [name for name in _df.columns if name != "N"]
    _df[names] = _df[names].div(_df["N"], axis=0)
    return _df


g_regions = [_i for _i in range(5)]
g_regionsize = 300
g_df_imp_by_index = make_all_ind_by_index(
    g_df_indicator_diagram, g_descriptor_names, g_regions, g_regionsize)

In [None]:
g_df_result

In [None]:
def show_r2_by_index(df_result, regions, regionsize):
    """Show the R2 of the representative model of each region.

    The representative of region i is the row at position i*regionsize:

        for i in regions:
            dfp = df_result.iloc[[regionsize*i], :]

    Args:
        df_result (pd.DataFrame): data of all the models
        regions (list[int]): list of regions
        regionsize (int): region size
    """
    region_ids = np.array(regions)
    fig, ax = plt.subplots()
    for region_id in region_ids:
        start = regionsize*region_id
        # NOTE(review): the original author marked this row selection as
        # possibly wrong -- confirm.
        representative = df_result.iloc[[start], :]
        representative.plot(y="score_mean", yerr="score_std", ax=ax,
                            label="yerr at {}th".format(start))
    # Line through the representative scores (selected by label here).
    df_line = df_result.loc[regionsize*region_ids, ["score_mean"]]
    df_line.plot(y="score_mean", ax=ax)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.set_ylabel("R2")
    ax.set_xlabel("Nst model")

show_r2_by_index(g_df_result, g_regions, g_regionsize)


In [None]:
from matplotlib.ticker import MaxNLocator

def show_df_imp_by_index(df_imp_by_index, descriptor_names, regions, regionsize,
                         comment: str = "importancebyindex",
                         metadata: dict = METADATA,
                         tickfontsize=15, labelfontsize=15, legendfontsize=15):
    """Plot the frequency of each descriptor in each region.

    The figure is saved as
    <prefix>_<dataname>_<regtype>_<comment>.png in metadata["outputdir"].

    Args:
        df_imp_by_index (pd.DataFrame): one row per region, one column per descriptor
        descriptor_names (list[str]): descriptor names
        regions (list[int]): list of regions
        regionsize (int): region size
        comment (str): comment appended to the png file name. Defaults to "importancebyindex".
        metadata (dict): "outputdir", "prefix", "dataname" and "regtype" used
            to build the file name. Defaults to METADATA.
        tickfontsize (int, optional): ticks font size. Defaults to 15.
        labelfontsize (int, optional): label font size. Defaults to 15.
        legendfontsize (int, optional): legend font size. Defaults to 15.
    """
    xticks_str = ["[{}:{}]".format(i*regionsize, (i+1)*regionsize)
                  for i in regions]
    fig, ax = plt.subplots()
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))

    marker_list = [".", "o", "v", "^", "<", ">",
                   "8", "s", "p", "*", "h", "H", "+", "x", "D", "d"]
    # Cycle through the markers so that no descriptor is silently dropped
    # when there are more descriptors than markers.
    for exp_name, marker in zip(descriptor_names, itertools.cycle(marker_list)):
        df_imp_by_index[exp_name].plot(marker=marker, ax=ax)

    ax.set_xticks(list(range(len(regions))))
    ax.set_xticklabels(xticks_str)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=legendfontsize)
    ax.set_ylabel("occurrence", fontsize=labelfontsize)
    ax.tick_params(axis='x', rotation=90, labelsize=tickfontsize)
    ax.tick_params(axis='y', labelsize=tickfontsize)
    fig.tight_layout()
    filename = "_".join([metadata["prefix"], metadata["dataname"],
                         metadata["regtype"], str(comment)])+".png"
    # savefig does not create missing directories.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    fig.savefig(os.path.join(metadata["outputdir"], filename))
    print(filename)

display(g_df_imp_by_index)
show_df_imp_by_index(g_df_imp_by_index, g_descriptor_names,
                     g_regions, g_regionsize)

In [None]:
def make_combination_R2(df, descriptor_names):
    """Find the best score for each number of descriptors.

    A column "ncombi" (the number of descriptors of each model) is added to
    df itself; later cells select rows with this column.

    Args:
        df (pd.DataFrame): exhaustive search result with the columns
            "combination", "score_mean" and "score_std".
        descriptor_names (list[str]): descriptor names; sizes from 1 to
            len(descriptor_names) are examined.

    Returns:
        pd.DataFrame: columns "n", "score_mean", "score_std"; one row per
            combination size that occurs in df, holding the model with the
            largest score_mean of that size.
    """
    df["ncombi"] = [len(combi) for combi in df["combination"]]

    nresult = []
    for n in range(1, len(descriptor_names)+1):
        subset = df[df["ncombi"] == n]
        print(subset.shape, n)
        if subset.empty:
            # e.g. the search was limited to fewer descriptors than n
            continue
        # Sort a new frame instead of sorting the filtered slice in place.
        best = subset.sort_values(by="score_mean", ascending=False).iloc[0]
        nresult.append([n, best["score_mean"], best["score_std"]])
    return pd.DataFrame(nresult, columns=["n", "score_mean", "score_std"])

g_df_combination_R2 = make_combination_R2(g_df_result, g_descriptor_names)

In [None]:
g_df_combination_R2

In [None]:
def plot_n_R2(df_combination_R2, df_result, nselect, comment=None, metadata=METADATA,
              labelfontsize=15, tickfontsize=15, legendfontsize=12):
    """Plot n vs. R2 (left) and index vs. R2 for models of nselect descriptors (right).

    The figure is saved as
    <prefix>_<dataname>_<regtype>_<comment>.png in metadata["outputdir"].

    Args:
        df_combination_R2 (pd.DataFrame): data with the columns "n",
            "score_mean" and "score_std".
        df_result (pd.DataFrame): exhaustive search result; must have the
            columns "ncombi", "score_mean" and "score_std".
        nselect (int): number of descriptors of the models shown on the right.
        comment (str): comment added to png filename.
        metadata (dict): "outputdir", "prefix", "dataname" and "regtype" used
            to build the file name. Defaults to METADATA.
        labelfontsize (int, optional): label font size. Defaults to 15.
        tickfontsize (int, optional): ticks font size. Defaults to 15.
        legendfontsize (int, optional): legend font size. Defaults to 12.
    """
    fig, axes = plt.subplots(1, 2)
    ax = axes[0]
    df_combination_R2.plot(x="n", y="score_mean", yerr="score_std", ax=ax)
    ax.set_ylabel("$R^2_{test}$", fontsize=labelfontsize)
    ax.set_xlabel(ax.get_xlabel(), fontsize=labelfontsize)
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.tick_params(axis='x', labelsize=tickfontsize)
    ax.tick_params(axis='y', labelsize=tickfontsize)
    ax.legend(fontsize=legendfontsize, loc='lower right')

    ax = axes[1]
    # Filter with the df_result argument itself (not with a notebook global),
    # so the mask always belongs to the frame being filtered.
    _df = df_result[df_result["ncombi"] == nselect].reset_index(drop=True)
    _df.head(10).plot(y="score_mean", yerr="score_std", ax=ax)
    ax.set_ylabel("$R^2_{test}$", fontsize=labelfontsize)
    ax.set_xlabel("index", fontsize=labelfontsize)
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    # Same y range as the left panel.
    ax.set_ylim(axes[0].get_ylim())
    ax.tick_params(axis='x', labelsize=tickfontsize)
    ax.tick_params(axis='y', labelsize=tickfontsize)
    ax.legend(fontsize=legendfontsize, loc='lower right')
    fig.tight_layout()

    filename = "_".join([metadata["prefix"], metadata["dataname"],
                         metadata["regtype"], str(comment)])+".png"
    print(filename)
    # savefig does not create missing directories.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    fig.savefig(os.path.join(metadata["outputdir"], filename))
    
NSELECT = 3
plot_n_R2(g_df_combination_R2, g_df_result, NSELECT, comment="n_R2_detail_3")

In [None]:
def plot_ncombi_heatmap(df_result, descriptor_names, nselect, ndata, comment, metadata=METADATA,
                        labelfontsize=15, tickfontsize=15):
    """Plot the coefficients of the top models of nselect descriptors as a heatmap.

    The figure is saved as
    <prefix>_<dataname>_<regtype>_<comment>_<nselect>_<ndata>.png in
    metadata["outputdir"].

    Args:
        df_result (pd.DataFrame): exhaustive search results; must have the
            column "ncombi" and the descriptor columns.
        descriptor_names (List[str]): descriptor names.
        nselect (int): number of descriptors of the models to show.
        ndata (int): number of models to show, taken from the top.
        comment (str): comment of png file.
        metadata (dict): "outputdir", "prefix", "dataname" and "regtype" used
            to build the file name. Defaults to METADATA.
        labelfontsize (int, optional): label font size. Defaults to 15.
        tickfontsize (int, optional): tick font size. Defaults to 15.
    """
    _df = df_result[df_result["ncombi"] == nselect].reset_index(drop=True)
    fig, ax = plt.subplots()
    sns.heatmap(_df.head(ndata)[descriptor_names].T, cmap='Greys', ax=ax)
    ax.set_xlabel("index", fontsize=labelfontsize)
    ax.tick_params(axis='x', labelsize=tickfontsize)
    ax.tick_params(axis='y', labelsize=tickfontsize)
    fig.tight_layout()

    filename = "_".join([metadata["prefix"], metadata["dataname"],
                         metadata["regtype"], str(comment), str(nselect), str(ndata)])+".png"
    print(filename)
    # savefig does not create missing directories.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    # Save this figure explicitly rather than whatever pyplot considers current.
    fig.savefig(os.path.join(metadata["outputdir"], filename))
    
NDATA= 6
plot_ncombi_heatmap(g_df_result, g_descriptor_names, NSELECT, NDATA, "ncombi_heatmap")

In [None]:
g_df_result[g_df_result["ncombi"]==NSELECT].reset_index(drop=True).head(NDATA)