In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pymatgen.core import Element
from copy import deepcopy
import seaborn as sns
import os
from progressbar import ProgressBar

# Raise pandas' display limit so that up to 1000 rows are shown when a DataFrame is rendered.
pd.set_option("display.max_rows", 1000)


In [None]:

def get_data():
    """Load the discretisation settings and the preprocessed data table.

    Reads ``data/hea4_phys_condition.json`` (for the ``NDIV`` entry) and
    ``data/hea4_phys.csv``. Both paths are relative to the current working
    directory. The value of ``NDIV`` is printed as a side effect.

    Returns:
        tuple containing

        - pd.DataFrame: the preprocessed data read from the CSV file.
        - list[str]: element column names (``element1`` .. ``element4``).
        - list[str]: target variable names.
        - list[str]: metadata column names.
        - the ``NDIV`` value of the condition file, i.e. the number of
          divisions used when digitizing the target variables.

    Raises:
        FileNotFoundError: if either input file does not exist.
        KeyError: if the condition file has no ``NDIV`` entry.
    """
    import json

    data_dir = "data"

    # Condition file written by the preprocessing step.
    cond_path = os.path.join(data_dir, "hea4_phys_condition.json")
    with open(cond_path, "r") as f:
        cond = json.load(f)
    ndiv = cond["NDIV"]  # number of divisions used for digitizing
    print("ndiv", ndiv)

    # Column names describing the preprocessed table.
    element_labels = ["element{}".format(i + 1) for i in range(4)]
    target_names = ['M', 'TC', 'R']
    meta_names = ['heakey']

    dfraw = pd.read_csv(os.path.join(data_dir, "hea4_phys.csv"))
    return dfraw, element_labels, target_names, meta_names, ndiv


# Load the data once into notebook-level (g_-prefixed) variables that the cells below reuse.
g_dfraw, g_element_labels, g_target_names, g_meta_names, g_ndiv = get_data()

In [None]:
# Shared settings: "dataname" is used by output_cytoscape to build the output file name.
# NOTE(review): "prefix" is not read anywhere in the visible code — confirm whether it is still needed.
METADATA = {"prefix": "image_executed", "dataname": "hea4_phys_condition"}


In [None]:
# Quick sanity check: shape and column names of the loaded table.
print(g_dfraw.shape,g_dfraw.columns)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import os
def apply_regression(df, descriptor_names, target_name,
                    labelfontsize=15, tickfontsize=15,
                    random_state=0, outdir="image_executed",
                    filename="itemset_R_group_row_randomforest.pdf"):
    """Fit a random forest on an 80/20 split and plot observed vs. predicted.

    The train and test R2 scores are printed, and a scatter plot of the test
    predictions against the observed values is saved and shown.

    Args:
        df (pd.DataFrame): data.
        descriptor_names (list[str]): column names of the explanatory variables.
        target_name (str): column name of the target variable.
        labelfontsize (int, optional): label font size. Defaults to 15.
        tickfontsize (int, optional): ticks font size. Defaults to 15.
        random_state (int or None, optional): seed used for both the
            train/test split and the random forest, so that a re-run gives
            the same scores. Defaults to 0.
        outdir (str, optional): directory the figure is written to; created
            if it does not exist. Defaults to "image_executed".
        filename (str, optional): file name of the saved figure.
            Defaults to "itemset_R_group_row_randomforest.pdf".
    """
    X = df[descriptor_names].values
    y = df[target_name].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)

    # Seed the forest as well; otherwise the scores change on every run
    # even though the split itself is fixed.
    reg = RandomForestRegressor(random_state=random_state)
    reg.fit(X_train, y_train)
    yp_train = reg.predict(X_train)
    yp_test = reg.predict(X_test)
    r2_train = r2_score(y_train, yp_train)
    r2_test = r2_score(y_test, yp_test)
    print("R2", r2_train, r2_test)

    # Same range on both axes so that the diagonal marks a perfect prediction.
    ylim = (y.min(), y.max())
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.scatter(y_test, yp_test, s=1, alpha=0.1)
    ax.set_xlim(ylim)
    ax.set_ylim(ylim)
    ax.set_xlabel("$y_{obs}^{test}$", fontsize=labelfontsize)
    ax.set_ylabel("$y_{pred}^{test}$", fontsize=labelfontsize)
    ax.plot(ylim, ylim, "--", c="red")
    ax.tick_params(axis='x', labelsize=tickfontsize)
    ax.tick_params(axis='y', labelsize=tickfontsize)
    print(filename)
    fig.tight_layout()
    # savefig does not create missing directories.
    os.makedirs(outdir, exist_ok=True)
    fig.savefig(os.path.join(outdir, filename))
    plt.show()
        
# Fit the random forest for target "R" using the group/row mean and std columns as descriptors.
apply_regression(g_dfraw, ['group_mean', 'group_std', 'row_mean', 'row_std'], "R")

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [None]:
# グラフ図示関数を定義しておく
from typing import List

def show_rules(df: pd.DataFrame, show_fig = True, filename=None, figsize=(5, 5)):
    """Build a directed graph from association rules and optionally draw it.

    Each rule contributes one edge for every (antecedent item, consequent
    item) pair, so rules with several items on either side are handled too.

    Args:
        df (pd.DataFrame): rules with "antecedents" and "consequents" columns
            whose entries are iterables of items (e.g. frozensets).
        show_fig (bool): show image or not. Defaults to True.
        filename (str): filename to output. Only used when ``show_fig`` is
            True. Defaults to None.
        figsize (tuple(float, float), optional): figure size. Defaults to (5, 5).

    Returns:
        networkx.MultiDiGraph: graph with one node per item and one edge per
        antecedent/consequent item pair.
    """
    import networkx as nx
    import matplotlib.pyplot as plt

    # The rule columns hold sets of items, which make poor node labels;
    # expand every rule into plain string pairs instead.
    edgelist = [
        [str(ante1), str(cons1)]
        for ante, cons in zip(df["antecedents"], df["consequents"])
        for ante1 in ante
        for cons1 in cons
    ]
    edges = pd.DataFrame(edgelist, columns=["antecedents", "consequents"])

    GA = nx.from_pandas_edgelist(edges,
                                 source='antecedents', target='consequents',
                                 create_using=nx.MultiDiGraph())
    if show_fig:
        plt.figure(figsize=figsize)
        nx.draw(GA, node_color="yellow", edge_color="lightblue",
                arrowsize=20, connectionstyle="arc3,rad=0.1",
                font_color="red", with_labels=True)
        # plt.tight_layout() is reported as incompatible with this figure,
        # so the margins are used to control the spacing.
        plt.margins(0.3)
        if filename is not None:
            print(filename)
            plt.savefig(filename)
        plt.show()

    return GA

In [None]:
def convert_to_transaction(df, element_labels,
                           feature_id_labels=["M_id", "TC_id", "R_id",
                                              "group_mean_id", "group_std_id",
                                              "row_mean_id", "row_std_id",
                                              'n_group3', 'n_group4', 'n_group5', 'n_group6', 'n_group7',
                                              'n_group8', 'n_group9', 'n_group10', 'n_group11', 'n_group12',
                                              'n_group13', 'n_group14', 'n_group15',
                                              ]):
    """Convert each row of ``df`` into a list of items (a transaction).

    Values of the ``element_labels`` columns are used as items unchanged.
    Values of the ``feature_id_labels`` columns become ``"<column>==<value>"``
    items, but only when they exceed a column-dependent lower bound:

    - ``R_id``: always kept.
    - ``M_id`` / ``TC_id``: kept when the value is greater than 1.
    - columns starting with ``group`` or ``row``: kept when greater than 0.
    - columns starting with ``n_``: kept when greater than 1.
    - any other column: always kept.

    Args:
        df (pd.DataFrame): data.
        element_labels (list[str]): element column names.
        feature_id_labels (list[str], optional): column names turned into
            ``"<column>==<value>"`` items. Defaults to the id columns of
            M, TC, R, the group/row statistics and ``n_group3`` .. ``n_group15``.

    Returns:
        list: one list of items per row; falsy entries (e.g. "") are removed.
    """

    def _lower_bound(column):
        # Exclusive lower bound a value must exceed to become an item;
        # None means the value is always kept.
        if column == "R_id":
            return None
        if column in ("M_id", "TC_id"):
            return 1
        if column.startswith("group") or column.startswith("row"):
            return 0
        if column.startswith("n_"):
            return 1
        return None

    items_by_column = {label: df[label].values.tolist() for label in element_labels}

    for column in feature_id_labels:
        bound = _lower_bound(column)
        items_by_column[column] = [
            "{}=={}".format(column, cell) if bound is None or cell > bound else ""
            for cell in df[column].values.tolist()
        ]

    item_table = pd.DataFrame(items_by_column)

    # Drop the empty placeholders so each transaction lists only real items.
    return [[item for item in row.tolist() if item] for row in item_table.values]


In [None]:
def make_rule(dfraw, element_labels, ndiv, min_support=0.3, min_threshold=0.8,
             n_antecedent=1, n_consequents=1):
    """Mine association rules separately for each ``R_id`` value.

    For every ``R_id`` in ``1 .. ndiv`` the matching rows are converted to
    transactions, frequent itemsets are mined with apriori, and rules are
    derived using confidence as the metric. The filtered rule table is
    displayed and its graph is saved as ``image_executed/target_<i>.png``,
    where ``<i>`` is the zero-based loop index (``R_id - 1``).

    Args:
        dfraw (pd.DataFrame): data; must contain an ``R_id`` column.
        element_labels (list[str]): element columns used as items.
        ndiv (int): number of ``R_id`` values to process (``1 .. ndiv``).
        min_support (float, optional): minimum support for apriori. Defaults to 0.3.
        min_threshold (float, optional): minimum confidence of a rule. Defaults to 0.8.
        n_antecedent (int, optional): maximum number of items allowed in the
            antecedents of a kept rule. Defaults to 1.
        n_consequents (int, optional): maximum number of items allowed in the
            consequents of a kept rule. Defaults to 1.

    Returns:
        list[pd.DataFrame]: one filtered rule table per ``R_id`` value.
    """
    outdir = "image_executed"
    # savefig does not create missing directories.
    os.makedirs(outdir, exist_ok=True)

    df_rules_list = []
    for i in range(ndiv):
        target_condition = "R_id=={}".format(i+1)
        # Mining all rows at once would be dominated by the majority:
        # support is taken into account, so mostly the largest R_id peak
        # would show up. Restrict to one R_id value at a time instead.
        dfq = dfraw.query(target_condition).reset_index(drop=True)
        transaction = convert_to_transaction(dfq, element_labels)

        te = TransactionEncoder()
        te_ary = te.fit(transaction).transform(transaction)
        df = pd.DataFrame(te_ary, columns=te.columns_)
        df_freq_items = apriori(
            df, min_support=min_support, max_len=10000, use_colnames=True, verbose=1)
        df_rules = association_rules(df_freq_items, metric="confidence",
                                     min_threshold=min_threshold)
        df_rules = df_rules.sort_values(
            by="support", ascending=False).reset_index(drop=True)

        # Add the number of items on each side and keep only the short rules.
        df_rules["antecedent_len"] = df_rules["antecedents"].apply(len)
        df_rules["consequents_len"] = df_rules["consequents"].apply(len)
        _df_rules = df_rules.query(
            f'antecedent_len<={n_antecedent} and consequents_len<={n_consequents}'
        ).reset_index(drop=True)
        display(_df_rules)
        df_rules_list.append(_df_rules.copy())

        # Visualize the kept rules as a graph.
        show_rules(_df_rules,
                   filename=os.path.join(outdir, "target_{}.png".format(i)),
                   figsize=(5, 5))
    return df_rules_list

# Mine the rules for every R_id value 1..g_ndiv; the result holds one rule table per value.
g_df_rules_list = make_rule(g_dfraw, g_element_labels, g_ndiv)

In [None]:
def output_cytoscape(df, comment="targetid", metadata=METADATA):
    """Write the rules as a Cytoscape JSON (``.cyjs``) graph file.

    The rule tables are concatenated and turned into a directed multigraph
    with one edge per (antecedent item, consequent item) pair. The file is
    written to ``<metadata["dataname"]>_<comment>.cyjs`` in the current
    working directory and its name is printed.

    Args:
        df (list[pd.DataFrame]): rule tables with "antecedents" and
            "consequents" columns holding iterables of items.
        comment (str): suffix of the output file name. Defaults to "targetid".
        metadata (dict): must contain "dataname", the prefix of the output
            file name. Defaults to METADATA.
    """
    import json
    import networkx as nx

    rules = pd.concat(df)

    # Expand every rule into item pairs. Taking only the first element of
    # each set would silently drop items of multi-item rules; for one-item
    # rules the resulting graph is the same.
    edgelist = [
        [ante1, cons1]
        for ante, cons in zip(rules["antecedents"], rules["consequents"])
        for ante1 in ante
        for cons1 in cons
    ]
    edges = pd.DataFrame(edgelist, columns=["antecedents", "consequents"])

    GA = nx.from_pandas_edgelist(edges,
                                 source='antecedents', target='consequents',
                                 create_using=nx.MultiDiGraph())
    filename = "_".join([metadata["dataname"], comment]) + ".cyjs"
    with open(filename, "w") as f:
        f.write(json.dumps(nx.cytoscape_data(GA)))
    print(filename)
# Export the mined rules of all R_id values as a single Cytoscape .cyjs file.
output_cytoscape(g_df_rules_list)

In [None]:
def make_support(dfraw, sentense, elm):
    """Compute the support of an element directly from the raw data.

    Prints the shape of the rows selected by ``sentense``, the shape of the
    subset whose "elements" string contains ``",<elm>,"``, and the ratio of
    the two row counts. When the query selects no rows the support is
    printed as ``nan`` instead of raising ``ZeroDivisionError``.

    Args:
        dfraw (pd.DataFrame): data with an "elements" string column.
        sentense (str): query expression passed to ``DataFrame.query``.
        elm (str): element name looked up in the "elements" column.
    """
    dfq = dfraw.query(sentense)
    print(dfq.shape)

    n_query = dfq.shape[0]
    if n_query == 0:
        # Nothing to filter.
        dfq2 = dfq
    else:
        # The element name is a literal, not a regular expression.
        pattern = ",{},".format(elm)
        dfq2 = dfq[dfq["elements"].str.contains(pattern, regex=False)]
    print(dfq2.shape)

    support = dfq2.shape[0] / n_query if n_query > 0 else float("nan")
    print("support=", support)


# Fraction of the rows with R_id == 10 whose "elements" string contains ",Sc,".
make_support(g_dfraw, "R_id==10", "Sc")


In [None]:
# 全元素名を得る．
def get_all_elm(dfraw, element_labels=("element1", "element2", "element3", "element4")):
    """Return the unique element names found in the element columns.

    Args:
        dfraw (pd.DataFrame): data.
        element_labels (sequence[str], optional): names of the columns that
            hold element names. Defaults to ``element1`` .. ``element4``.

    Returns:
        np.ndarray: sorted unique element names.
    """
    columns = [dfraw[label].values for label in element_labels]
    return np.unique(np.hstack(columns))


# Unique element names over the element1..element4 columns; reused by the t-test cells below.
g_uique_elms = get_all_elm(g_dfraw)
print(g_uique_elms)


In [None]:
import collections

def _normalized_id_hist(values, bin_ids):
    """Return the fraction of ``values`` equal to each id in ``bin_ids``."""
    counter = collections.Counter(values)
    counts = np.array([counter[bin_id] for bin_id in bin_ids], dtype=float)
    total = counts.sum()
    if total == 0:
        # No matching rows: keep all-zero bars instead of dividing by zero.
        return counts
    return counts / total


def plot_selected_hist(df, elements, ndiv, label="R_id", filename=None,
                      tickfontsize=15, labelfontsize=15, legendfontsize=15):
    """Compare the ``label`` histogram of selected rows with that of all rows.

    Rows are selected when their "elements" string contains every name in
    ``elements`` (substring match). Both histograms are normalized to sum
    to one and drawn as overlaid bars for the ids ``1 .. ndiv``.

    Args:
        df (pd.DataFrame): data with an "elements" string column.
        elements (list[str]): element names that must all be present.
        ndiv (int): number of ids of the ``label`` column (``1 .. ndiv``).
        label (str, optional): discretized column to plot. Defaults to "R_id".
        filename (str, optional): image filename. Defaults to None.
        tickfontsize (int, optional): ticks font size. Defaults to 15.
        labelfontsize (int, optional): label font size. Defaults to 15.
        legendfontsize (int, optional): legend font size. Defaults to 15.
    """
    dfselect = df
    for elm in elements:
        dfselect = dfselect[dfselect["elements"].str.contains(elm)]

    # The bars sit at 1..ndiv, so the same ids must be counted. Counting
    # 0..ndiv-1 would shift every bar by one and drop the last id.
    center = list(range(1, ndiv+1))
    hist_Rall = _normalized_id_hist(df[label].values, center)
    hist_Rselect = _normalized_id_hist(dfselect[label].values, center)

    # Visualization
    fig, ax = plt.subplots()
    width = 1
    ax.bar(center, hist_Rall, width=width,
           alpha=0.5, align='center', label="all")
    ax.bar(center, hist_Rselect, width=width, alpha=0.5,
           align='center', label=str(elements))
    ax.set_xlabel(label, fontsize=labelfontsize)
    ax.set_ylabel("normalized occurrence", fontsize=labelfontsize)
    ax.legend(fontsize=legendfontsize)
    ax.tick_params(axis='x', labelsize=tickfontsize)
    ax.tick_params(axis='y', labelsize=tickfontsize)
    fig.tight_layout()
    if filename is not None:
        fig.savefig(filename)
        print("save to", filename)
    plt.show()

# R_id histogram of the rows containing each selected element, one PDF per element.
for elm in ("Sc", "In", "Cd"):
    g_filename = "image_executed/R_distrib_{}.pdf".format(elm)
    plot_selected_hist(g_dfraw, elements=[elm], ndiv=g_ndiv, filename=g_filename)

# The distributions are skewed.

In [None]:
from scipy.stats import ttest_ind
from itertools import combinations
import random


def calc_tpvalues(df,  elms, ncombi=1, name="R", seed=None):
    """Compute t- and p-values for every combination of elements.

    For each combination of ``ncombi`` elements, the ``name`` values of the
    rows whose "elements" string contains ``",<element>,"`` for all its
    elements are compared by a two-sample t-test with an equally sized
    random sample (without replacement) of the ``name`` values of all rows.

    Args:
        df (pd.DataFrame): data with an "elements" string column.
        elms (list[str]): element names.
        ncombi (int, optional): number of elements per combination. Defaults to 1.
        name (str, optional): column to test. Defaults to "R".
        seed (int, optional): seed for the random sample. With the default
            ``None`` the global ``random`` module state is used.

    Returns:
        pd.DataFrame: columns "elmements", "tvalue" and "pvalue", sorted by
        "tvalue" in ascending order.
    """
    Rall = df[name].values
    n_all = Rall.shape[0]
    rng = random if seed is None else random.Random(seed)

    tplist = []
    comblist = list(combinations(elms, ncombi))

    pbar = ProgressBar(max_value=len(comblist))

    for i, elm2 in enumerate(comblist):
        if i % 5 == 0:
            pbar.update(i+1)
        dfs = df
        for elm1 in elm2:
            # The element name is a literal, not a regular expression.
            dfs = dfs[dfs["elements"].str.contains(","+elm1+",", regex=False)]
        Rselect = dfs[name].values

        # The whole population is known here, but following the
        # randomization approach a random sample of the same size as the
        # selection is drawn from it.
        id_ = np.array(rng.sample(range(n_all), Rselect.shape[0]), dtype=int)
        # The ids are positions, so index the value array directly; a
        # label lookup would be wrong for a non-default DataFrame index.
        Rrand = Rall[id_]

        t, p = ttest_ind(Rrand, Rselect)
        tplist.append([elm2, t, p])
    df_tp = pd.DataFrame(tplist, columns=["elmements", "tvalue", "pvalue"])
    return df_tp.sort_values(by="tvalue").reset_index(drop=True)


# t-test per single element: R of the rows containing it vs. an equally sized random sample of all rows.
g_df_tp1 = calc_tpvalues(g_dfraw,  g_uique_elms, ncombi=1)


In [None]:
# The table is sorted by t-value in ascending order, so head and tail show the two extremes.
display(g_df_tp1.head())
display(g_df_tp1.tail())


In [None]:
# R_id histograms for Sc and Rh; no filename is passed, so the figures are only shown, not saved.
plot_selected_hist(g_dfraw, ["Sc"], g_ndiv, )
plot_selected_hist(g_dfraw, ["Rh"], g_ndiv,)


In [None]:
# Same t-test for every pair of elements (ncombi=2).
g_df_tp2 = calc_tpvalues(g_dfraw,  g_uique_elms, ncombi=2)


In [None]:
# Element pairs with the lowest (head) and highest (tail) t-values.
display(g_df_tp2.head(15))
display(g_df_tp2.tail())


In [None]:
# R_id histograms of the rows containing both elements of each pair, one PDF per pair.
for g_elements in (["In", "Sc"], ["Ge", "Si"]):
    g_filename = "image_executed/R_distrib_{}.pdf".format("_".join(g_elements))
    plot_selected_hist(g_dfraw, g_elements, g_ndiv, filename=g_filename)

In [None]:
# Pairwise scatter plots between the group statistics and R, one figure per column pair.
for g_xcol, g_ycol in [("group_std", "R"),
                       ("group_mean", "R"),
                       ("group_std", "group_mean")]:
    g_dfraw.plot.scatter(x=g_xcol, y=g_ycol, s=1)
    plt.show()
