# Visualize Network1 : MDS

* MDS (multidimensional scaling): embed the herb co-occurrence network in 2-D and plot it

## Data

In [1]:
import json
import numpy as np
# Compact array printing: 2 decimals, 6 leading/trailing items per axis, 240-character lines.
np.set_printoptions( linewidth=240, edgeitems=6, precision=2 )

In [2]:
# Input files, given relative to the notebook's working directory.
data_paths = [ "../data/shanghan_formulas.json",  "../data/shanghan_herbs.json" ]

def _load_json( path ):
    """Parse one UTF-8 encoded JSON file and return the decoded object.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes instead of being left to the garbage collector.
    """
    with open( path, 'r', encoding='utf-8' ) as fp:
        return json.load( fp )

s_fmls_, s_herbs_ = [ _load_json( data_path ) for data_path in data_paths ]
# (key, record) pairs of the top-level 'formulas' mapping.
s_fmls = s_fmls_.get('formulas').items()
# One list per formula: the keys of that record's 'ingOrg' mapping.
fml_list = [ list( fml[1].get('ingOrg').keys() ) for fml in s_fmls ]
# Each list joined with spaces, i.e. one whitespace-tokenizable string per formula.
fml_corpus = [ " ".join(fml) for fml in fml_list ]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep only tokens that occur in at least `min_df` entries of fml_corpus.
min_df = 5
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of two
# or more word characters, so a one-character name in fml_corpus would be
# dropped silently — confirm that is acceptable for this data.
vectorizer = CountVectorizer( min_df=min_df )
X = vectorizer.fit_transform( fml_corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on releases that predate it.
if hasattr( vectorizer, "get_feature_names_out" ):
    herb_names = vectorizer.get_feature_names_out().tolist()
else:
    herb_names = vectorizer.get_feature_names()

# Column sums of the count matrix, flattened to a plain list (one total per feature).
herb_freq = np.asarray( X.sum( axis=0 ) ).ravel().tolist()
print( X.shape )
print( herb_names )
print( herb_freq )

(219, 34)
['감초', '갱미', '건강', '계지', '과루근', '귤피', '대조', '대황', '도인', '마황', '망초', '모려', '반하', '복령', '부자', '생강', '석고', '세신', '시호', '아교', '오미자', '용골', '인삼', '작약', '정력', '지실', '치자', '택사', '행인', '향시', '황금', '황기', '황련', '후박']
[106, 7, 31, 61, 5, 5, 59, 29, 7, 23, 8, 7, 36, 33, 27, 61, 12, 9, 7, 6, 6, 6, 31, 41, 5, 16, 10, 8, 16, 6, 20, 7, 13, 10]


In [4]:
# Feature-by-feature product of the count matrix with its own transpose:
# entry (i, j) sums count_i * count_j over all rows of X.
herb_co = X.T @ X

print( "# HERB NETWORK MATRIX according to co-occurrence", herb_co.shape )
print( herb_co.toarray() )

# HERB NETWORK MATRIX according to co-occurrence (34, 34)
[[106   6  19  46   3   1 ...  12   1  12   5   6   3]
 [  6   7   1   1   0   0 ...   0   0   0   0   0   0]
 [ 19   1  31   5   1   0 ...   2   0   6   0   5   0]
 [ 46   1   5  61   2   0 ...   6   0   4   5   1   3]
 [  3   0   1   2   5   0 ...   0   0   2   0   0   0]
 [  1   0   0   0   0   5 ...   0   0   0   0   0   0]
 ...
 [ 12   0   2   6   0   0 ...  16   0   0   0   0   2]
 [  1   0   0   0   0   0 ...   0   6   0   0   0   0]
 [ 12   0   6   4   2   0 ...   0   0  20   0   8   0]
 [  5   0   0   5   0   0 ...   0   0   0   7   0   0]
 [  6   0   5   1   0   0 ...   0   0   8   0  13   0]
 [  3   0   0   3   0   0 ...   2   0   0   0   0  10]]


## Libs

In [5]:
from sklearn.preprocessing import scale

def re_scale( arr, zoom=1, bottom=2):
    """Standardize `arr`, stretch it by `zoom`, and shift it so its minimum equals `bottom`.

    Parameters
    ----------
    arr : sequence of numbers
        Values to rescale; they are standardized with sklearn's `scale`
        (mean removed, divided by the standard deviation).
    zoom : number, default 1
        Multiplier applied to the standardized values.
    bottom : number, default 2
        Value the smallest element of the result is shifted to.

    Returns
    -------
    The zoomed values plus the constant offset `bottom - min(zoomed)`.
    """
    # Scale the argument itself: the earlier version read the notebook-level
    # `herb_freq` here, so whatever was passed as `arr` was silently ignored.
    arr_scaled = scale( np.asarray( arr, dtype=float ), axis=0, with_mean=True, with_std=True, copy=True )
    arr_zoomed = arr_scaled * zoom
    # Offset that moves the smallest zoomed value exactly onto `bottom`.
    add_mount = bottom - np.min( arr_zoomed )
    return arr_zoomed + add_mount

In [6]:
from bokeh.plotting import figure, show, output_notebook, ColumnDataSource

def scatter_plot( embedded, title="", size=4):
    """Build a 600x600 figure with one navy, half-transparent circle per row of `embedded`.

    `embedded` must provide `.tolist()` yielding (x, y) pairs; `size` is handed
    to the circle glyph unchanged. The figure is returned, not shown.
    """
    # Transpose the list of pairs into an x-tuple and a y-tuple.
    xs, ys = zip( *embedded.tolist() )

    canvas = figure( title=title, plot_width=600, plot_height=600 )
    canvas.circle( xs, ys, alpha=0.5, color="navy", size=size )
    return canvas

def scatter_plot_tooltip( embedded, size, labels="", title=""  ):
    """Build a 600x600 scatter figure whose points carry a hover tooltip.

    Parameters
    ----------
    embedded : object with `.tolist()` yielding (x, y) pairs
        Point coordinates, one pair per point.
    size : sequence of numbers, or a single int/float
        Marker size per point; a single number is repeated for every point.
    labels : sequence of str, or a single str (default "")
        Tooltip label per point; a single string is repeated for every point.
    title : str
        Figure title.

    Returns
    -------
    The figure (not shown); the tooltip lists label, point index, and (x, y, size).
    """
    x, y = zip( *embedded.tolist() )
    n_points = len( x )

    # Every ColumnDataSource column has to be a sequence with one entry per
    # point. The default `labels=""` (and a scalar `size`) used to be passed
    # through as-is, so broadcast such scalars to full-length columns first.
    if isinstance( labels, str ):
        labels = [ labels ] * n_points
    if isinstance( size, (int, float) ):
        size = [ size ] * n_points

    source = ColumnDataSource(data=dict( x=x, y=y, size=size, label=labels ))

    TOOLTIPS = [
        ("label", "@label"),
        ("index", "$index"),
        ("(x,y,size)", "($x, $y,@size)"),
    ]

    p = figure( plot_width=600, plot_height=600, title=title, tooltips=TOOLTIPS )
    # Column names are resolved against `source`, so size varies per point.
    p.circle('x', 'y', size='size', color="navy", alpha=0.5, source=source)

    return p


## MDS 

In [7]:
# Dense copy of the co-occurrence matrix.
C = herb_co.toarray()
n, _ = C.shape

# Zero vector used below to overwrite the diagonal.
dg = np.zeros( n )

# Turn counts into dissimilarities: the largest count maps to 0, a count of 0
# maps to that largest value.
dissimilarities = C.max() - C
# Every item is at distance 0 from itself.
np.fill_diagonal( dissimilarities, dg )
print( dissimilarities )

[[  0 100  87  60 103 105 ...  94 105  94 101 100 103]
 [100   0 105 105 106 106 ... 106 106 106 106 106 106]
 [ 87 105   0 101 105 106 ... 104 106 100 106 101 106]
 [ 60 105 101   0 104 106 ... 100 106 102 101 105 103]
 [103 106 105 104   0 106 ... 106 106 104 106 106 106]
 [105 106 106 106 106   0 ... 106 106 106 106 106 106]
 ...
 [ 94 106 104 100 106 106 ...   0 106 106 106 106 104]
 [105 106 106 106 106 106 ... 106   0 106 106 106 106]
 [ 94 106 100 102 104 106 ... 106 106   0 106  98 106]
 [101 106 106 101 106 106 ... 106 106 106   0 106 106]
 [100 106 101 105 106 106 ... 106 106  98 106   0 106]
 [103 106 106 103 106 106 ... 104 106 106 106 106   0]]


In [8]:
from sklearn.manifold import smacof

# Modern MDS via SMACOF. SMACOF starts from random configurations, so pin
# `random_state` to get the same embedding (and stress) on every run.
embedded_mds, stress = smacof( dissimilarities, random_state=0 )


In [9]:
# Route Bokeh output to the notebook, then draw the MDS embedding with
# marker sizes derived from herb_freq and herb names as tooltip labels.
output_notebook()
show(
    scatter_plot_tooltip(
        embedded_mds,
        labels=herb_names,
        size=re_scale( herb_freq, zoom=10, bottom=6 ),
        title="MDS",
    )
)