In [1]:
import json
import numpy as np
# Compact array display: 2 decimals, 6 items shown at each edge of a summarised axis, wide rows.
np.set_printoptions( linewidth=240, edgeitems=6, precision=2 )

In [2]:
# The two JSON inputs (UTF-8): the formula collection and the herb file.
data_paths = [ "../data/shanghan_formulas.json",  "../data/shanghan_herbs.json" ]


def _read_json( path ):
    """Parse one UTF-8 JSON file and close the file handle when done."""
    with open( path, 'r', encoding='utf-8' ) as fp:
        return json.load( fp )


s_fmls_, s_herbs_ = [ _read_json( data_path ) for data_path in data_paths ]

# (key, record) pairs stored under the top-level 'formulas' entry.
s_fmls = s_fmls_.get('formulas').items()
# One list per formula, holding the keys of that record's 'ingOrg' mapping.
fml_list = [ list( fml[1].get('ingOrg').keys() ) for fml in s_fmls ]
# One space-separated "document" per formula, the input format a text vectorizer expects.
fml_corpus = [ " ".join(fml) for fml in fml_list ]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep only terms that occur in at least `min_df` documents (formulas).
min_df = 5
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of two or
# more characters, so a single-character name would be dropped silently - confirm
# that this is intended for this corpus.
vectorizer = CountVectorizer( min_df=min_df )
# Document-term matrix: one row per formula, one column per retained term.
X = vectorizer.fit_transform( fml_corpus )

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back to the old accessor on releases that predate get_feature_names_out().
if hasattr( vectorizer, "get_feature_names_out" ):
    # .tolist() keeps herb_names a plain list of str, as the old accessor returned.
    herb_names = vectorizer.get_feature_names_out().tolist()
else:
    herb_names = vectorizer.get_feature_names()

print( X.shape )
print( herb_names )

(219, 34)
['감초', '갱미', '건강', '계지', '과루근', '귤피', '대조', '대황', '도인', '마황', '망초', '모려', '반하', '복령', '부자', '생강', '석고', '세신', '시호', '아교', '오미자', '용골', '인삼', '작약', '정력', '지실', '치자', '택사', '행인', '향시', '황금', '황기', '황련', '후박']


In [4]:
# Term-by-term co-occurrence: the matrix product of the transposed document-term
# matrix with itself, so entry (i, j) accumulates count_i * count_j over all documents.
herb_co = X.T @ X

print( "# HERB NETWORK MATRIX according to co-occurrence", herb_co.shape )
herb_co_dense = herb_co.toarray()
print( herb_co_dense )

# HERB NETWORK MATRIX according to co-occurrence (34, 34)
[[106   6  19  46   3   1 ...  12   1  12   5   6   3]
 [  6   7   1   1   0   0 ...   0   0   0   0   0   0]
 [ 19   1  31   5   1   0 ...   2   0   6   0   5   0]
 [ 46   1   5  61   2   0 ...   6   0   4   5   1   3]
 [  3   0   1   2   5   0 ...   0   0   2   0   0   0]
 [  1   0   0   0   0   5 ...   0   0   0   0   0   0]
 ...
 [ 12   0   2   6   0   0 ...  16   0   0   0   0   2]
 [  1   0   0   0   0   0 ...   0   6   0   0   0   0]
 [ 12   0   6   4   2   0 ...   0   0  20   0   8   0]
 [  5   0   0   5   0   0 ...   0   0   0   7   0   0]
 [  6   0   5   1   0   0 ...   0   0   8   0  13   0]
 [  3   0   0   3   0   0 ...   2   0   0   0   0  10]]


In [5]:
# Persist the dense co-occurrence matrix as tab-separated integers.  The herb names go
# into a single header line, which savetxt prefixes with its default '# ' comment marker.
output_file_path = "../data/shanghan_herb_matrix.tsv"
header = "\t".join( herb_names )
np.savetxt(
    output_file_path,
    herb_co.toarray(),
    fmt='%d',
    delimiter='\t',
    header=header,
)

### MDS 

In [6]:
from bokeh.plotting import figure, show, output_notebook

def scatter_plot( embedded, title ):
    """Build a 400x400 Bokeh scatter plot of a two-column embedding.

    Parameters
    ----------
    embedded : object exposing ``tolist()``
        Yields one ``(x, y)`` pair per point.
    title : str
        Title passed to the Bokeh figure.

    Returns
    -------
    The Bokeh figure.  It is not displayed here; the caller passes it to ``show()``.
    """
    lst = embedded.tolist()
    x, y = zip( *lst )

    # `width`/`height` replace `plot_width`/`plot_height`, which Bokeh 3.0 removed.
    p = figure( width=400, height=400, title=title )
    # add a circle renderer with a size, color, and alpha
    p.scatter( x, y, size=2, color="navy", alpha=0.5)
    return p


In [20]:
# Reload the co-occurrence counts written earlier; the '#'-prefixed header line is skipped.
# `int` replaces the `np.int` alias, which NumPy deprecated in 1.20 and removed in 1.24.
C = np.loadtxt( output_file_path, dtype=int, comments='#', delimiter="\t")

n, _ = C.shape

# Turn counts into dissimilarities: the larger the count, the smaller the value.
dissimilarities = np.max( C ) - C

# Every item is at distance 0 from itself.
dg = np.zeros( n )
np.fill_diagonal( dissimilarities, dg )
print( dissimilarities )

[[  0 100  87  60 103 105 ...  94 105  94 101 100 103]
 [100   0 105 105 106 106 ... 106 106 106 106 106 106]
 [ 87 105   0 101 105 106 ... 104 106 100 106 101 106]
 [ 60 105 101   0 104 106 ... 100 106 102 101 105 103]
 [103 106 105 104   0 106 ... 106 106 104 106 106 106]
 [105 106 106 106 106   0 ... 106 106 106 106 106 106]
 ...
 [ 94 106 104 100 106 106 ...   0 106 106 106 106 104]
 [105 106 106 106 106 106 ... 106   0 106 106 106 106]
 [ 94 106 100 102 104 106 ... 106 106   0 106  98 106]
 [101 106 106 101 106 106 ... 106 106 106   0 106 106]
 [100 106 101 105 106 106 ... 106 106  98 106   0 106]
 [103 106 106 103 106 106 ... 104 106 106 106 106   0]]


In [21]:
from sklearn.manifold import smacof

# Modern MDS via SMACOF.
# smacof() expects pairwise *dissimilarities*.  The raw co-occurrence counts in C are
# similarities, so embed the converted `dissimilarities` matrix instead.  A fixed
# random_state makes the random initialisations repeatable across runs.
embedded_mds, stress = smacof( dissimilarities, random_state=99 )


In [22]:
# Route Bokeh output to the notebook, then render the MDS embedding.
output_notebook()
show( scatter_plot( embedded_mds, title="MDS" ) )

In [90]:
# Reload the co-occurrence counts; the '#'-prefixed header line is skipped.
# `int` replaces the `np.int` alias, which NumPy deprecated in 1.20 and removed in 1.24.
C = np.loadtxt( output_file_path, dtype=int, comments='#', delimiter="\t")

n, _ = C.shape

dg = np.zeros( n )

# float32 copy of the counts with the diagonal zeroed; C itself is left untouched.
Co = C.astype(np.float32)
np.fill_diagonal( Co, dg )
print( Co )

[[ 0.  6. 19. 46.  3.  1. ... 12.  1. 12.  5.  6.  3.]
 [ 6.  0.  1.  1.  0.  0. ...  0.  0.  0.  0.  0.  0.]
 [19.  1.  0.  5.  1.  0. ...  2.  0.  6.  0.  5.  0.]
 [46.  1.  5.  0.  2.  0. ...  6.  0.  4.  5.  1.  3.]
 [ 3.  0.  1.  2.  0.  0. ...  0.  0.  2.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0. ...  0.  0.  0.  0.  0.  0.]
 ...
 [12.  0.  2.  6.  0.  0. ...  0.  0.  0.  0.  0.  2.]
 [ 1.  0.  0.  0.  0.  0. ...  0.  0.  0.  0.  0.  0.]
 [12.  0.  6.  4.  2.  0. ...  0.  0.  0.  0.  8.  0.]
 [ 5.  0.  0.  5.  0.  0. ...  0.  0.  0.  0.  0.  0.]
 [ 6.  0.  5.  1.  0.  0. ...  0.  0.  8.  0.  0.  0.]
 [ 3.  0.  0.  3.  0.  0. ...  2.  0.  0.  0.  0.  0.]]


In [101]:
import tensorflow as tf
# For reproducibility: fix TensorFlow's graph-level random seed before any random op
# (such as a tf.random_normal initialiser) is created.
# NOTE(review): repeated graph construction in the same kernel may still yield different
# draws per call - confirm if exact repeatability across calls matters.
tf.set_random_seed( 99 )
    
def pairwise_dist( X ):
    """Return the matrix of squared Euclidean distances between the rows of X.

    Relies on the expansion ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2, evaluated for
    all row pairs at once.
    """
    # Column vector of squared row norms.
    sq_norms = tf.reshape( tf.reduce_sum( tf.square( X ), 1 ), [-1, 1] )
    # Inner products between every pair of rows.
    gram = tf.matmul( X, X, transpose_b=True )
    return sq_norms - 2 * gram + tf.transpose( sq_norms )

def vos_mapping( arr_sim, learning_rate=1e-5, iter_n =2000, verbose=True ):
    """Embed n items in 2-D by gradient descent on a similarity-weighted distance cost.

    The minimised cost is ``sum_ij S[i, j] * ||x_i - x_j||^2``, where ``x_i`` is the
    2-D position of item ``i``.  The sum of the (unsquared) pairwise distances is
    evaluated alongside it purely as a diagnostic.

    Parameters
    ----------
    arr_sim : array with a two-element ``shape`` (n, n)
        Pairwise similarities, fed to a float32 placeholder.
    learning_rate : float
        Step size of the gradient-descent optimizer.
    iter_n : int
        Number of optimisation steps.
    verbose : bool
        When true, print the cost and the diagnostic about ten times during the run.

    Returns
    -------
    The value of the (n, 2) embedding variable after the last step.
    """
    n, _ = arr_sim.shape

    S = tf.placeholder( tf.float32, [n, n] )
    X = tf.Variable( tf.random_normal( [n, 2] ), name='embedded')

    # Squared pairwise distances, shared with the module-level helper.
    d_square = pairwise_dist( X )

    # Diagnostic only.  Round-off can leave tiny negative squared distances, so clamp
    # at zero before the square root to keep the sum free of NaN.
    c2 = tf.reduce_sum( tf.sqrt( tf.maximum( d_square, 0.0 ) ) )

    # NOTE(review): this cost has no repulsive term or distance constraint, so it is
    # minimised by collapsing every point onto the same location.  An earlier variant
    # subtracted 2 * sum( sqrt( d_square ) ) - confirm which objective is intended.
    cost = tf.reduce_sum( tf.multiply(S, d_square) )

    train = tf.train.GradientDescentOptimizer( learning_rate=learning_rate ).minimize( cost )

    # Report roughly ten times per run; never let the interval reach 0 (iter_n < 10),
    # which would make the modulo below raise ZeroDivisionError.
    report_every = max( 1, iter_n // 10 )

    # The context manager closes the session (and frees its resources) on exit.
    with tf.Session() as sess:
        sess.run( tf.global_variables_initializer() )

        for step in range( iter_n ):
            _, cost_val, c2_v = sess.run([ train, cost, c2 ], feed_dict={ S: arr_sim } )
            if verbose and step % report_every == 0:
                print( "Step {:05d}  ==> Cost: {} .. {}".format( step, cost_val, c2_v ) )

        embedded = sess.run( X )
    return embedded


In [107]:
# Run the VOS-style mapping on the zero-diagonal co-occurrence counts.
# 2001 steps (rather than 2000) so that the progress line for step 2000 is printed.
embedded = vos_mapping( Co, learning_rate=1e-5, iter_n=2001 )

output_notebook()
show( scatter_plot( embedded, title="VOSviewer" ) )

Step 00000  ==> Cost: 14570.087890625 .. 1944.89892578125
Step 00200  ==> Cost: 1968.107666015625 .. 1162.99658203125
Step 00400  ==> Cost: 811.176513671875 .. 862.2573852539062
Step 00600  ==> Cost: 424.9082946777344 .. 680.7232055664062
Step 00800  ==> Cost: 257.07232666015625 .. 557.9899291992188
Step 01000  ==> Cost: 169.66920471191406 .. 468.7416076660156
Step 01200  ==> Cost: 117.98001861572266 .. 400.43017578125
Step 01400  ==> Cost: 84.78981018066406 .. 346.26080322265625
Step 01600  ==> Cost: 62.355979919433594 .. 302.218017578125
Step 01800  ==> Cost: 46.68815231323242 .. 265.751220703125
Step 02000  ==> Cost: 35.49714660644531 .. 235.07086181640625


In [39]:
# Normalise the counts: S[i, j] = total * c[i, j] / ( row_sum[i] * col_sum[j] ).
# Work on a private float copy with the diagonal zeroed, so the result does not depend
# on whether an earlier cell happened to modify C in place.
C_offdiag = C.astype( np.float64 )
np.fill_diagonal( C_offdiag, 0 )

margin_x = np.sum( C_offdiag, 1 )
margin_y = np.sum( C_offdiag, 0 )
sum_all = np.sum( margin_x )

S_ = sum_all * C_offdiag
# base[i, j] = margin_x[i] * margin_y[j], built by broadcasting a column against a row.
base = margin_x[:, np.newaxis] * margin_y[np.newaxis, :]

# A row/column with no off-diagonal counts has a zero margin; give those entries
# similarity 0 instead of dividing by zero.
S = np.divide( S_, base, out=np.zeros_like( S_ ), where=base > 0 )
print( S )

[[0.   2.03 1.36 1.34 0.97 0.59 ... 1.37 0.71 0.87 1.42 1.01 0.63]
 [2.03 0.   1.32 0.54 0.   0.   ... 0.   0.   0.   0.   0.   0.  ]
 [1.36 1.32 0.   0.57 1.26 0.   ... 0.89 0.   1.69 0.   3.29 0.  ]
 [1.34 0.54 0.57 0.   1.02 0.   ... 1.09 0.   0.46 2.25 0.27 0.99]
 [0.97 0.   1.26 1.02 0.   0.   ... 0.   0.   2.54 0.   0.   0.  ]
 [0.59 0.   0.   0.   0.   0.   ... 0.   0.   0.   0.   0.   0.  ]
 ...
 [1.37 0.   0.89 1.09 0.   0.   ... 0.   0.   0.   0.   0.   2.6 ]
 [0.71 0.   0.   0.   0.   0.   ... 0.   0.   0.   0.   0.   0.  ]
 [0.87 0.   1.69 0.46 2.54 0.   ... 0.   0.   0.   0.   5.32 0.  ]
 [1.42 0.   0.   2.25 0.   0.   ... 0.   0.   0.   0.   0.   0.  ]
 [1.01 0.   3.29 0.27 0.   0.   ... 0.   0.   5.32 0.   0.   0.  ]
 [0.63 0.   0.   0.99 0.   0.   ... 2.6  0.   0.   0.   0.   0.  ]]


In [40]:
# Run the VOS-style mapping on the normalised similarity matrix S, using the
# function's default learning rate.
embedded = vos_mapping( S, iter_n=2001 )

output_notebook()
show( scatter_plot( embedded, title="VOSviewer" ) )

Step 00000  ==> Cost: 7377.55029296875 ( 7447.00390625 - 69.45358276367188 )
Step 00200  ==> Cost: 60.1114387512207 ( 72.92604064941406 - 12.81460189819336 )
Step 00400  ==> Cost: 4.727111339569092 ( 9.901121139526367 - 5.174009799957275 )
Step 00600  ==> Cost: -0.04565596580505371 ( 2.7823450565338135 - 2.828001022338867 )
Step 00800  ==> Cost: -0.6533598899841309 ( 1.3496294021606445 - 2.0029892921447754 )
Step 01000  ==> Cost: -0.7504798173904419 ( 0.9577059745788574 - 1.7081857919692993 )
Step 01200  ==> Cost: -0.7766924500465393 ( 0.833805501461029 - 1.6104979515075684 )
Step 01400  ==> Cost: -0.7921470403671265 ( 0.7960282564163208 - 1.5881752967834473 )
Step 01600  ==> Cost: -0.8052389621734619 ( 0.7894203662872314 - 1.5946593284606934 )
Step 01800  ==> Cost: -0.8169010877609253 ( 0.794540524482727 - 1.6114416122436523 )
Step 02000  ==> Cost: -0.8270618915557861 ( 0.8037921190261841 - 1.6308540105819702 )
