![](image_text/160_13.png)

![](image_text/160_14.png)


In [None]:
import numpy as np
import pandas as pd

# If a DataFrame is not displayed in full, raise the display limits below.
pd.set_option("display.max_rows", 20) 
pd.set_option("display.max_columns", 60)

In [None]:
NRANK = 10 # rank (number of dimensions) used for the low-rank approximation
THRESHOLD = 0.35 # cells where (low-rank approximation - original matrix) > threshold are displayed brightly

In [None]:
def get_data(root="..", filename="group131415_div1.csv"):
    """Load the data table from ``{root}/data/{filename}``.

    The shape of the loaded table is printed as a quick sanity check.

    Args:
        root (str, optional): directory that contains the ``data`` folder.
            Defaults to "..".
        filename (str, optional): name of the CSV file inside ``{root}/data``.
            Defaults to "group131415_div1.csv".

    Returns:
        pd.DataFrame: the CSV content, with the first CSV column as the index.
    """
    df = pd.read_csv(f"{root}/data/{filename}", index_col=[0])
    print(df.shape)
    return df

g_df = get_data() # load the data

In [None]:
from recomm_misc import plot_df_heatmap
plot_df_heatmap(g_df, dpi=100) # display the original data

In [None]:
from recomm_misc import plot_svd_sdiag
# NOTE(review): presumably plots the singular values of the data matrix;
# confirm against recomm_misc.plot_svd_sdiag.
plot_svd_sdiag(g_df.values)

In [None]:
def make_recom_svd(df, nrank):
    """Line up candidates by a rank-limited SVD reconstruction.

    The data matrix is decomposed as ``U @ diag(s) @ Vt`` and rebuilt from the
    ``nrank`` largest singular values only. If ``nrank`` is larger than the
    number of singular values, all of them are used, so the result equals the
    original matrix up to rounding.

    Args:
        df (pd.DataFrame): data
        nrank (int): the maximum rank to reconstruct data

    Returns:
        pd.DataFrame: reconstructed data with the same index and columns as df
    """
    X = df.values
    # The reduced SVD is enough: the discarded columns of U / rows of Vt would
    # only ever be multiplied by zeros.
    u, sdiag, vt = np.linalg.svd(X, full_matrices=False)
    # Scaling the kept columns of U by the singular values is the same as
    # multiplying by the diagonal matrix, without building it.
    recom_svd = (u[:, :nrank] * sdiag[:nrank]) @ vt[:nrank, :]
    return pd.DataFrame(recom_svd, index=df.index, columns=df.columns)

In [None]:
from sklearn.decomposition import NMF
def make_recom_nmf(df, nrank, verbose=False, diff_threshold=0.1):
    """Line up candidates by an NMF reconstruction.

    The data matrix X is factorised as ``W @ H`` with ``nrank`` components
    (``init='random'``, ``shuffle=True``, ``random_state=3``) and the product
    ``W @ H`` is returned as the reconstruction.

    Args:
        df (pd.DataFrame): data
        nrank (int): the maximum rank to reconstruct data
        verbose (bool, optional): if True, print ``row col difference`` for
            every cell where the reconstruction deviates from the data by more
            than ``diff_threshold``, to check how close the two are.
            Defaults to False.
        diff_threshold (float, optional): absolute deviation above which a
            cell is reported when ``verbose`` is True. Defaults to 0.1.

    Returns:
        pd.DataFrame: reconstructed data with the same index and columns as df
    """
    X = df.values
    model = NMF(n_components=nrank, init='random',
                shuffle=True, random_state=3)
    W = model.fit_transform(X)
    H = model.components_
    recom_nmf = W @ H

    if verbose:
        # Report how far the reconstruction is from the original data.
        diff = recom_nmf - X
        for i, j in np.argwhere(np.abs(diff) > diff_threshold):
            print(i, j, diff[i, j])

    return pd.DataFrame(recom_nmf, index=df.index, columns=df.columns)

In [None]:
def make_recom_correlation(df, nrank=None):
    """Line up candidates by correlation.

    With X indexed as [material, structuretype], (X.T @ X) is a
    [structuretype, structuretype] matrix that should express the correlation
    between structure types. Multiplying it by X again gives a
    [material, structuretype] matrix:

        recom = X[material, structuretype]
                @ (X.T @ X)[structuretype, structuretype]

    The product is of order X^3, so it is rescaled linearly to [0, 1]. If
    every entry of the product is identical the rescaling is undefined and an
    all-zero table is returned instead of NaN.

    Args:
        df (pd.DataFrame): data
        nrank (int): unused; accepted only so the signature matches the other
            ``make_recom_*`` functions.

    Returns:
        pd.DataFrame: reconstructed data with the same index and columns as df
    """
    X = df.values
    # Matrix products are associative, so this equals X @ (X.T @ X).
    recom = X @ X.T @ X

    vmin = recom.min()
    vmax = recom.max()
    span = vmax - vmin
    if span == 0:
        # Constant matrix: avoid 0/0 in the min-max normalisation.
        recom = np.zeros(recom.shape)
    else:
        recom = (recom - vmin) / span

    return pd.DataFrame(recom, index=df.index, columns=df.columns)

In [None]:
print("nrank=", NRANK)
g_df_recom = make_recom_svd(g_df, NRANK)
from recomm_misc import plot_2df
plot_2df(g_df, g_df_recom, NRANK, THRESHOLD)
# Show the low-rank approximation and the original matrix. Cells with difference > THRESHOLD are shown in white.

In [None]:
# Interactive visualization.
# This may cause problems on Intel Macs.
import plotly.express as px
import matplotlib.pyplot as plt
g_fig = px.imshow(g_df_recom-g_df)
g_fig.show()
# plt.show()

In [None]:
def print_existence(df, df_ref, threshold=0.35):
    """Collect the cells of ``df`` that reach ``threshold`` but are absent in ``df_ref``.

    A cell is selected when its value in ``df`` is at least ``threshold`` and
    the cell with the same row/column labels in ``df_ref`` is below 1.
    Despite its name, the function prints nothing; it returns the selection.

    Args:
        df (pd.DataFrame): data
        df_ref (pd.DataFrame): reference data, looked up by the row and
            column labels of ``df``
        threshold (float, optional): the threshold value. Defaults to 0.35.

    Returns:
        pd.DataFrame: one row per selected cell with the columns ``name1``
            (row label), ``name2`` (column label), ``recom-ref`` (value in
            ``df``) and ``not_exist_ref`` (always True for selected cells),
            sorted by ``recom-ref`` in descending order.
    """
    scores = df.to_numpy()
    # Align the reference to df by label so both arrays share positions.
    reference = df_ref.loc[df.index, df.columns].to_numpy()
    absent = reference < 1

    # np.nonzero walks the mask in row-major order, i.e. row by row.
    rows, cols = np.nonzero((scores >= threshold) & absent)

    dfresult = pd.DataFrame({
        "name1": df.index.to_numpy()[rows],
        "name2": df.columns.to_numpy()[cols],
        "recom-ref": scores[rows, cols],
        "not_exist_ref": absent[rows, cols],
    })
    return dfresult.sort_values(by="recom-ref", ascending=False)

print_existence(g_df_recom-g_df, g_df, threshold=THRESHOLD)
# Cells whose value exceeds THRESHOLD are treated as recommended to exist,
# and the list of materials recommended to exist is displayed.
# The same material can appear more than once.

![](image_text/160_21.png)

![](image_text/160_22.png)


