In [None]:
import pandas as pd
import numpy as np
import os
from pymatgen.core import Element

# Raise pandas' display limits so long rule tables and long rule strings are shown in full.
pd.set_option("display.max_rows", 1000)
# Use the full option name: abbreviated names are resolved by pattern matching,
# which can become ambiguous if pandas adds a similarly named option.
pd.set_option("display.max_colwidth", 100)

In [None]:
SELECT15 = False # True: keep only the first 15 rules (after sorting by support), False: use all rules

# Thresholds handed to the frequent-itemset mining and rule-generation calls.
MIN_SUPPORT = 0.1
MIN_CONFIDENCE = 0.8


In [None]:
import json

# Input transaction file; the commented line names an alternative input file.
FILENAME = "atom_transaction.json"
# FILENAME = "atom_transaction_additional.json"

# NOTE(review): ROOT is not used when building the path below -- confirm whether
# the data directory was meant to be resolved relative to it.
ROOT = ".."
filepath = os.path.join("data", FILENAME)
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding.
with open(filepath, "r", encoding="utf-8") as f:
    g_transaction = json.load(f)

In [None]:
# Strings used to build the names and location of the files written by show_rules.
METADATA = {
    "outputdir": "image_executed",
    "prefix": "freqmining",
    "dataname": FILENAME.replace(".", "_"),
    "support": str(MIN_SUPPORT),
    "confidence": str(MIN_CONFIDENCE),
}

In [None]:
# Display the transactions loaded from the JSON file.
g_transaction

In [None]:
# Print the number of items in each transaction; the counts differ from one transaction to another.
for name, value in g_transaction.items():
    print(name, len(value))

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder


In [None]:
def get_transaction_df(transaction):
    """Convert transactions into the encoded table format consumed by mlxtend.

    Args:
        transaction (list): list of transactions, each an iterable of items.

    Returns:
        pd.DataFrame: the encoder output, one row per transaction, with the
            encoder's ``columns_`` as column labels.
    """
    encoder = TransactionEncoder()
    encoder.fit(transaction)
    encoded = encoder.transform(transaction)
    return pd.DataFrame(encoded, columns=encoder.columns_)


# Encode every transaction (the values of the loaded dict) into one table.
g_df = get_transaction_df(list(g_transaction.values()))

In [None]:
# Preview the first 10 rows of the encoded transaction table.
g_df.head(10)

In [None]:
# Mine frequent itemsets with FP-growth, using MIN_SUPPORT as the minimum support;
# use_colnames=True reports itemsets by column name.
g_df_freq_items = fpgrowth(g_df, min_support=MIN_SUPPORT, use_colnames=True)
g_df_freq_items.head(10)


In [None]:
# Generate association rules, thresholding on confidence with MIN_CONFIDENCE.
g_df_rules = association_rules(
    g_df_freq_items,
    metric="confidence",
    min_threshold=MIN_CONFIDENCE,
)

# Add the number of items on each side of every rule.
g_df_rules["antecedent_len"] = g_df_rules["antecedents"].apply(len)
g_df_rules["consequents_len"] = g_df_rules["consequents"].apply(len)

# Reorder so that the rules with the largest support come first.
g_df_rules = (
    g_df_rules
    .sort_values(by="support", ascending=False)
    .reset_index(drop=True)
)
print(g_df_rules.shape)
g_df_rules.head(15)


In [None]:
import os
import uuid
def select_rules(df,  n_ante=1, n_cons=1):
    """Filter association rules by size and flatten them into an edge list.

    A rule is kept when its antecedent has at most ``n_ante`` items and its
    consequent has at most ``n_cons`` items. Every kept rule contributes one
    row per (antecedent item, consequent item) pair.

    Args:
        df (pd.DataFrame): rules with "antecedents" and "consequents" columns,
            each cell holding a collection of items.
        n_ante (int, optional): maximum antecedent size. Defaults to 1.
        n_cons (int, optional): maximum consequent size. Defaults to 1.

    Returns:
        pd.DataFrame: columns "antecedents" and "consequents", one pair of
            item strings per row.
    """
    edgelist = []
    for ante, cons in zip(df["antecedents"], df["consequents"]):
        # Set iteration order is not stable between interpreter runs (string
        # hashing is randomized), so sort the items to keep the edge order,
        # and therefore any graph built from it, reproducible.
        ante_items = sorted(str(item) for item in ante)
        cons_items = sorted(str(item) for item in cons)
        if len(ante_items) > n_ante or len(cons_items) > n_cons:
            continue
        edgelist.extend(
            [ante_item, cons_item]
            for ante_item in ante_items
            for cons_item in cons_items
        )
    return pd.DataFrame(edgelist, columns=["antecedents", "consequents"])

def show_rules(df, figsize=(8,5), seed=1, metadata=METADATA):
    """Draw the rule edge list as a directed graph and export it.

    The graph is drawn with a circular layout and saved as a PNG inside
    ``metadata["outputdir"]`` (created if missing). The same graph is also
    written as Cytoscape JSON (``.cyjs``). Both file names are printed.

    Args:
        df (pd.DataFrame): edge list with "antecedents" (source) and
            "consequents" (target) columns.
        figsize (tuple, optional): figure size. Defaults to (8, 5).
        seed (int, optional): seed intended for randomized layouts; the
            circular layout drawn here does not use it. Defaults to 1.
        metadata (dict, optional): supplies the "outputdir", "prefix",
            "dataname", "support" and "confidence" strings used to build
            the output file names. Defaults to METADATA.

    Returns:
        networkx.MultiDiGraph: the graph built from ``df``.
    """
    import json

    import networkx as nx
    import matplotlib.pyplot as plt

    GA = nx.from_pandas_edgelist(df,
                                 source='antecedents', target='consequents',
                                 create_using=nx.MultiDiGraph())
    n_edges = str(df.shape[0])

    fig, ax = plt.subplots(figsize=figsize)
    # Other networkx layouts (spring_layout, shell_layout, spiral_layout,
    # random_layout) can be passed as ``pos`` instead; see the networkx manual.
    nx.draw(GA, pos=nx.circular_layout(GA), node_color="yellow", with_labels=True, ax=ax)

    # Widen the x-range by half of its width on each side
    # (presumably so long node labels stay inside the frame).
    xmin, xmax = ax.get_xlim()
    scale_factor = 0.5
    margin = (xmax - xmin) * scale_factor
    ax.set_xlim([xmin - margin, xmax + margin])
    fig.tight_layout()

    filename = "_".join([metadata["prefix"], metadata["dataname"],
                         "nxdraw", "circular_layout",
                         "support", metadata["support"], "confidence", metadata["confidence"],
                         "shape", n_edges])+".png"
    print(filename)
    outputdir = metadata["outputdir"]
    if outputdir:
        # savefig does not create missing directories, so make sure it exists.
        os.makedirs(outputdir, exist_ok=True)
    fig.savefig(os.path.join(outputdir, filename))
    plt.show()

    # NOTE(review): the .cyjs file is written to the current working directory,
    # not to metadata["outputdir"] like the PNG -- confirm this is intended.
    filename = "_".join([metadata["prefix"], metadata["dataname"],
                         "support", metadata["support"], "confidence", metadata["confidence"],
                         "shape", n_edges])+".cyjs"
    with open(filename,"w") as f:
        f.write( json.dumps(nx.cytoscape_data(GA))  )
    print(filename)

    return GA


# g_df_tmp = g_df_rules[g_df_rules["antecedent_len"]==2]
# g_df_tmp = g_df_rules.iloc[:15, :]
# g_df_tmp = g_df_rules.iloc[:30, :]
# Build the edge list either from all rules or from the first 15 only, then draw it.
if not SELECT15:
    g_df_rules_selected = select_rules(g_df_rules, n_ante=2, n_cons=2)
else:
    g_df_rules_selected = select_rules(g_df_rules.iloc[:15, :])
    print(g_df_rules_selected.shape)
GA = show_rules(g_df_rules_selected)


In [None]:
# Load the element feature table; the first CSV column becomes the index.
g_df_atom = pd.read_csv("data/atomicprop.csv", index_col=[0])
g_df_atom.head(10)

In [None]:
def query_rule(df_atom, ante, cons):
    """Print the confidence of a rule evaluated directly on the data.

    Counts the rows matching ``ante`` and the rows matching both ``ante`` and
    ``cons``, then prints ``confidence <both>/<ante>=<ratio>``.

    Args:
        df_atom (pd.DataFrame): data to query.
        ante (str): antecedent, written as a ``DataFrame.query`` expression.
        cons (str): consequent, written as a ``DataFrame.query`` expression.
    """
    n_ante = df_atom.query(ante).shape[0]
    # Parenthesize both sides so that an ``or`` inside either expression
    # cannot regroup with the ``and`` that joins them.
    n_both = df_atom.query("({}) and ({})".format(ante, cons)).shape[0]
    # When no row satisfies the antecedent the ratio is undefined; report nan
    # instead of raising ZeroDivisionError.
    confidence = n_both / n_ante if n_ante else float("nan")
    print("confidence {}/{}={}".format(n_both, n_ante, confidence))


# Evaluate one rule (antecedent -> consequent) directly on the element table.
g_ante = "log10_electrical_resistivity<-6.90"
g_cons = "log10_thermal_conductivity>1.73"
query_rule(g_df_atom, g_ante, g_cons)


In [None]:
from pymatgen.core.periodic_table import Element
import math
import matplotlib.pyplot as plt
%matplotlib inline


def plot_xy_symbol(df, x, y, ante=None, cons=None, filename=None):
    """Scatter-plot ``y`` against ``x`` and label every point with its index value.

    All rows having both coordinates are drawn in green. Rows satisfying
    ``ante`` are overdrawn in blue, and rows satisfying both ``ante`` and
    ``cons`` are overdrawn in red.

    Args:
        df (pd.DataFrame): data; its index values are used as point labels.
        x (str): column plotted on the x axis.
        y (str): column plotted on the y axis.
        ante (str, optional): antecedent as a ``DataFrame.query`` expression.
            Defaults to None.
        cons (str, optional): consequent as a ``DataFrame.query`` expression;
            only used when ``ante`` is also given. Defaults to None.
        filename (str, optional): if given, the figure is saved to this path.
            Defaults to None.
    """
    fig, ax = plt.subplots()

    # Take the labels straight from the index so the function works whether or
    # not the index has a name. Rows missing either coordinate are dropped, so
    # no further NaN check is needed.
    dfq = df[[x, y]].dropna()
    for xval, yval, label in zip(dfq[x].values, dfq[y].values, dfq.index):
        ax.text(xval, yval, label)
    ax.scatter(dfq[x].values, dfq[y].values, c="green")
    ax.set_xlabel(x)
    ax.set_ylabel(y)

    if ante is not None:
        print("antecedents={}".format(ante))
        df_ante = df.query(ante)
        ax.scatter(df_ante[x].values, df_ante[y].values, c="blue")
        if cons is not None:
            print("consequents={}".format(cons))
            # Parenthesize both sides so that an ``or`` inside either
            # expression cannot regroup with the joining ``and``.
            df_cons = df.query("({}) and ({})".format(ante, cons))
            ax.scatter(df_cons[x].values, df_cons[y].values, c="r")

    if filename is not None:
        fig.savefig(filename)
    plt.show()


In [None]:
# Report the rank (Spearman) correlation together with the Pearson correlation.
from scipy.stats import spearmanr, pearsonr
def calc_correlation(df_atom,x,y):
    """Print the Pearson and Spearman correlations of two columns.

    Rows in which either column is NaN are dropped before computing.

    Args:
        df_atom (pd.DataFrame): data.
        x (str): x column name.
        y (str): y column name.
    """
    paired = df_atom[[x, y]].dropna()
    x_values = paired[x].values
    y_values = paired[y].values

    pearson_result = pearsonr(x_values, y_values)
    spearman_result = spearmanr(x_values, y_values)
    print("Pearson R", pearson_result)
    print(spearman_result)

In [None]:
# Plot the resistivity / thermal-conductivity pair using the current g_ante and g_cons,
# then print the correlations of the two columns.
plot_xy_symbol(g_df_atom, 'log10_electrical_resistivity', 'log10_thermal_conductivity',
               ante=g_ante, cons=g_cons)
calc_correlation(g_df_atom, 'log10_electrical_resistivity', 'log10_thermal_conductivity',)

In [None]:
# Rule: bulk_modulus>100.00 -> molar_volume<12.29 (rebinds g_ante and g_cons).
g_ante = "bulk_modulus>100.00"
g_cons = "molar_volume<12.29"
plot_xy_symbol(g_df_atom, 'bulk_modulus', 'molar_volume',
               ante=g_ante, cons=g_cons)
calc_correlation(g_df_atom , 'bulk_modulus', 'molar_volume')

In [None]:
# Rule: youngs_modulus>105.00 -> molar_volume<12.29 (rebinds g_ante and g_cons).
g_ante = "youngs_modulus>105.00"
g_cons = "molar_volume<12.29"
plot_xy_symbol(g_df_atom, 'youngs_modulus', 'molar_volume',
               ante=g_ante, cons=g_cons)
calc_correlation(g_df_atom,'youngs_modulus', 'molar_volume')

In [None]:
# Rule: youngs_modulus>105.00 -> bulk_modulus>100.00 (rebinds g_ante and g_cons).
g_ante = "youngs_modulus>105.00"
g_cons = "bulk_modulus>100.00"
plot_xy_symbol(g_df_atom, 'youngs_modulus', 'bulk_modulus',
               ante=g_ante, cons=g_cons)
calc_correlation(g_df_atom,'youngs_modulus', 'bulk_modulus',)

In [None]:
# Rule: bulk_modulus>100.00 -> youngs_modulus>105.00, the reverse direction of the
# youngs_modulus -> bulk_modulus rule (rebinds g_ante and g_cons).
g_ante = "bulk_modulus>100.00"
g_cons = "youngs_modulus>105.00"
plot_xy_symbol(g_df_atom, 'bulk_modulus', 'youngs_modulus',
               ante=g_ante, cons=g_cons)
calc_correlation(g_df_atom,'bulk_modulus', 'youngs_modulus',)