# Comparing

## Libs

In [11]:
import numpy as np
# Compact NumPy array printing for this notebook: 2 digits of precision,
# 6 items shown at each edge of a summarized array, lines up to 240 characters.
np.set_printoptions( precision=2, edgeitems=6, linewidth=240 )

In [12]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import ColumnDataSource, Label, LabelSet, Plot, LinearAxis, Grid, Slope, Span
from bokeh.models.glyphs import Text

## Data

In [13]:
# Read each corpus file completely into memory as UTF-8 text.

# corpusA: Dongui Bogam
with open( "../data/DYBG_tn.txt", mode='r', encoding='utf-8' ) as fl:
    corpusA = fl.read()

# corpusB: Uihak Ipmun
with open( "../data/YHYM_tn.txt", mode='r', encoding='utf-8' ) as fl:
    corpusB = fl.read()

# corpusC: Gyeongak Jeonseo
with open( "../data/GAZS_tn.txt", mode='r', encoding='utf-8' ) as fl:
    corpusC = fl.read()


## Table

In [14]:
import pandas as pd

# Single-character terms to count; the literal is split on the "ㆍ" separator.
terms = "肝ㆍ心ㆍ脾ㆍ肺ㆍ腎ㆍ風ㆍ寒ㆍ暑ㆍ濕ㆍ燥ㆍ火ㆍ精ㆍ氣ㆍ神ㆍ血ㆍ痰ㆍ鬱ㆍ陰ㆍ陽ㆍ表ㆍ裏ㆍ熱ㆍ虛ㆍ實".split("ㆍ")

# Corpus sizes in characters, ordered A, B, C so they line up with the
# freq_a / freq_b / freq_c columns when dividing below.
corpus_length = [ len(corpusA), len(corpusB), len(corpusC) ]

# Raw occurrence counts of every term in every corpus (str.count).
term_freq = {
    column: [ corpus.count(term) for term in terms ]
    for column, corpus in (
        ('freq_a', corpusA),
        ('freq_b', corpusB),
        ('freq_c', corpusC),
    )
}

term_freq_df = pd.DataFrame( term_freq, index=terms )

# Relative frequency: occurrences per 1,000 characters of the respective corpus.
# The list divisor is matched against the three columns in order.
term_rfreq_df = term_freq_df.div( corpus_length, axis='columns' ).mul( 1000 )
term_rfreq_df.columns = ['r_freq_a', 'r_freq_b', 'r_freq_c']

In [15]:
# One wide table per term: raw counts followed by per-1,000-character rates.
term_freqs_df = pd.concat( [term_freq_df, term_rfreq_df], axis='columns' )
term_freqs_df.index = terms
term_freqs_df.columns = [
    'freq_a', 'freq_b', 'freq_c',
    'r_freq_a', 'r_freq_b', 'r_freq_c',
]
term_freqs_df

Unnamed: 0,freq_a,freq_b,freq_c,r_freq_a,r_freq_b,r_freq_c
肝,1164,1100,1359,1.289778,1.580112,1.575758
心,3835,3418,2489,4.249397,4.90984,2.885991
脾,1322,1563,2565,1.464851,2.245196,2.974113
肺,1299,1298,1185,1.439365,1.864533,1.374005
腎,1523,1496,1587,1.68757,2.148953,1.840124
風,3805,4107,3159,4.216155,5.899565,3.662855
寒,3373,3850,6241,3.737475,5.530393,7.236429
暑,313,352,352,0.346822,0.505636,0.408143
濕,1265,1671,1448,1.401692,2.400334,1.678953
燥,519,701,598,0.575081,1.006963,0.69338


## Correlation Coefficient & Scatter plot

In [16]:
from scipy import stats

# Build Bokeh Plot

source = ColumnDataSource( term_rfreq_df )

# NOTE(review): plot_width/plot_height and render_mode are kept exactly as this
# notebook used them; newer Bokeh releases appear to have renamed/removed these
# arguments -- confirm against the installed Bokeh version before upgrading.

def _rfreq_scatter( x_col, y_col, title, x_label, y_label ):
    """Build one labelled scatter plot of two relative-frequency columns.

    Parameters
    ----------
    x_col, y_col : str
        Column names present in both ``source`` (plotted points, tooltips and
        labels) and ``term_freqs_df`` (least-squares fit).
    title, x_label, y_label : str
        Figure title and axis labels.

    Returns
    -------
    The Bokeh figure, with one point and one text label per term plus a dashed
    line drawn from the ``stats.linregress`` slope and intercept.
    """
    fig = figure( title=title,
                  plot_width=600, plot_height=600,
                  x_range=(0, 16), y_range=(0, 16),
                  tooltips=[ ("(x,y)", "(@{}, @{})".format( x_col, y_col )) ] )
    fig.scatter( x=x_col, y=y_col, size=6, color="gray", alpha=0.5, source=source )

    # 'index' is the column ColumnDataSource creates from the DataFrame index (the terms).
    term_labels = LabelSet( x=x_col, y=y_col, text='index', level='glyph',
                  x_offset=0, y_offset=0, source=source, render_mode='canvas')

    # Only slope and intercept are needed for the trend line.
    gradient, y_intercept = stats.linregress( term_freqs_df[x_col], term_freqs_df[y_col] )[:2]
    trend_line = Slope( gradient=gradient, y_intercept=y_intercept,
                  line_color='skyblue', line_dash='dashed', line_width=1.0)

    fig.add_layout( term_labels )
    fig.add_layout( trend_line )
    fig.xaxis.axis_label = x_label
    fig.yaxis.axis_label = y_label
    return fig

# plot1: corpus A vs corpus B

p1 = _rfreq_scatter( 'r_freq_a', 'r_freq_b',
                     "Term Distribution between 동의보감 & 의학입문",
                     "Relative Frequency of Terms within A",
                     "Relative Frequency of Terms within B" )

# plot2: corpus A vs corpus C

p2 = _rfreq_scatter( 'r_freq_a', 'r_freq_c',
                     "Term Distribution between 동의보감 & 경악전서",
                     "Relative Frequency of Terms within A",
                     "Relative Frequency of Terms within C" )

output_notebook()
show( gridplot([p1,p2], ncols=2, plot_width=450, plot_height=450 ) )

# Correlation Coefficient
# np.corrcoef treats each row of the transposed table as one variable, so the
# matrix rows/columns follow term_freqs_df.columns. Labelling the matrix lets
# the pairs be looked up by column name instead of by fragile positions.
ce = np.corrcoef( term_freqs_df.T )
ce_by_name = pd.DataFrame( ce, index=term_freqs_df.columns, columns=term_freqs_df.columns )
print( "# Correlation Coefficient")
print( "* between corpusA & corpusB :{}".format( ce_by_name.loc['r_freq_a', 'r_freq_b'] ) )
print( "* between corpusA & corpusC :{}".format( ce_by_name.loc['r_freq_a', 'r_freq_c'] ) )




# Correlation Coefficient
* between corpusA & corpusB :0.9825012849315475
* between corpusA & corpusC :0.9250016101820752


## Standardized Residual & Assoc plot

In [17]:
# https://stackoverflow.com/questions/20453729/what-is-the-equivalent-of-r-data-chisqresiduals-in-python

from scipy.stats.contingency import margins

def residuals(observed, expected):
    """Pearson residuals: (observed - expected) scaled by sqrt(expected).

    Works element-wise on whatever array-like / Series operands are passed in.
    """
    deviation = observed - expected
    scale = np.sqrt(expected)
    return deviation / scale

def stdres(observed, expected):
    """Standardized residuals of a two-dimensional contingency table.

    Parameters
    ----------
    observed : array-like
        Two-dimensional table of observed counts.
    expected : array-like
        Expected counts, same shape as ``observed``.

    Returns
    -------
    ``(observed - expected) / sqrt(v)`` where
    ``v = csum * rsum * (n - rsum) * (n - csum) / n**3``, with ``n`` the table
    total and ``rsum`` / ``csum`` the marginal sums returned by ``margins``.
    """
    # Work in float64: with integer counts, n**3 and the product of the
    # margins can exceed the int64 range and wrap around silently.
    table = np.asarray( observed, dtype=np.float64 )
    n = table.sum()
    rsum, csum = margins( table )
    v = csum * rsum * (n - rsum) * (n - csum) / n**3
    # This return was commented out, which made the function always yield None.
    return (observed - expected) / np.sqrt(v)

In [18]:
# from scipy.stats import chisquare
# chisquare( observed_v, expected_v )

# Keyword distribution of Dongui Bogam (corpus A) measured against
# Uihak Ipmun (corpus B) as the reference: o = counts in A, e = counts in B.
rs_ab = pd.DataFrame( {
    'o': term_freq_df['freq_a'],
    'e': term_freq_df['freq_b'],
} ).assign(
    sqrt_e=lambda tbl: np.sqrt( tbl['e'] ),
    rs=lambda tbl: residuals( tbl['o'], tbl['e'] ),
)

# Keyword distribution of Dongui Bogam (corpus A) measured against
# Gyeongak Jeonseo (corpus C) as the reference: o = counts in A, e = counts in C.
rs_ac = pd.DataFrame( {
    'o': term_freq_df['freq_a'],
    'e': term_freq_df['freq_c'],
} ).assign(
    sqrt_e=lambda tbl: np.sqrt( tbl['e'] ),
    rs=lambda tbl: residuals( tbl['o'], tbl['e'] ),
)

In [19]:
from IPython.display import display_html
def display_side_by_side(*args):
    """Render several tables next to each other in a single notebook output.

    Parameters
    ----------
    *args
        Objects exposing ``to_html()`` (e.g. pandas DataFrames). Their HTML is
        concatenated in the given order.

    The combined markup is passed to ``display_html(..., raw=True)``; nothing
    is returned.
    """
    html_str = ''.join( df.to_html() for df in args )
    # Rewrite opening tags only. Replacing every 'table' substring also turned
    # closing tags into '</table style=...>' and altered any cell text that
    # happened to contain the word "table".
    inline_html = html_str.replace( '<table', '<table style="display:inline"' )
    display_html( inline_html, raw=True )
    
# Show the A-vs-B and A-vs-C residual tables in one output for comparison.
display_side_by_side( rs_ab,rs_ac )

Unnamed: 0,o,e,sqrt_e,rs
肝,1164,1100,33.166248,1.929673
心,3835,3418,58.463664,7.132635
脾,1322,1563,39.534795,-6.095896
肺,1299,1298,36.027767,0.027756
腎,1523,1496,38.678159,0.698068
風,3805,4107,64.08588,-4.712427
寒,3373,3850,62.048368,-7.687551
暑,313,352,18.761663,-2.078707
濕,1265,1671,40.877867,-9.932025
燥,519,701,26.476405,-6.874045

Unnamed: 0,o,e,sqrt_e,rs
肝,1164,1359,36.864617,-5.289625
心,3835,2489,49.889879,26.97942
脾,1322,2565,50.645829,-24.542989
肺,1299,1185,34.423829,3.311659
腎,1523,1587,39.837169,-1.60654
風,3805,3159,56.204982,11.493643
寒,3373,6241,79.0,-36.303797
暑,313,352,18.761663,-2.078707
濕,1265,1448,38.052595,-4.809133
燥,519,598,24.454039,-3.23055


In [21]:
# Build Bokeh Plot

source1 = ColumnDataSource( rs_ab )
source2 = ColumnDataSource( rs_ac )

TOOLTIPS = [
    ("(x,y)", "(@sqrt_e, @rs)"),
    ("term", "@index")
]

# Shared symmetric y-limit for both plots so they stay directly comparable.
# The former fixed (-32, 32) window is kept as the minimum, but it is widened
# (with 5% headroom) when a residual falls outside it -- e.g. a residual of
# about -36.3 in rs_ac was previously drawn off-canvas.
_all_rs = np.concatenate( [ rs_ab['rs'].values, rs_ac['rs'].values ] ).astype( float )
_finite_rs = np.abs( _all_rs[ np.isfinite( _all_rs ) ] )
y_limit = max( 32, int( np.ceil( _finite_rs.max() * 1.05 ) ) ) if _finite_rs.size else 32

# NOTE(review): plot_width/plot_height and render_mode are kept exactly as this
# notebook used them; newer Bokeh releases appear to have renamed/removed these
# arguments -- confirm against the installed Bokeh version before upgrading.

def _residual_plot( source, title ):
    """Build one residual-vs-sqrt(expected) scatter plot.

    Parameters
    ----------
    source : ColumnDataSource
        Must provide the 'sqrt_e', 'rs' and 'index' columns.
    title : str
        Figure title.

    Returns
    -------
    The Bokeh figure with one labelled point per term and a dashed horizontal
    line at residual 0.
    """
    fig = figure( title=title,
                  plot_width=600, plot_height=600,
                  y_range=(-y_limit, y_limit),
                  tooltips=TOOLTIPS )
    fig.scatter( x='sqrt_e', y='rs', size=6, color="gray", alpha=0.5, source=source )

    term_labels = LabelSet( x='sqrt_e', y='rs', text='index', level='glyph',
                  x_offset=0, y_offset=0, source=source, render_mode='canvas')
    # A fresh Span per figure, as in the original cell.
    neutral_line = Span( location=0, dimension='width', line_color='green', line_dash='dashed', line_width=1)

    fig.add_layout( term_labels )
    fig.add_layout( neutral_line )
    fig.xaxis.axis_label = "Sqrt Expected Value"
    fig.yaxis.axis_label = "Residuals"
    return fig

# plot1: corpus A against corpus B (rs_ab)

p1 = _residual_plot( source1, "Term Distribution 동의보감 compared with 의학입문" )

# plot2: corpus A against corpus C (rs_ac). The title previously repeated
# plot1's "의학입문" although this figure plots the 경악전서 comparison.

p2 = _residual_plot( source2, "Term Distribution 동의보감 compared with 경악전서" )

output_notebook()
show( gridplot([p1,p2], ncols=2, plot_width=450, plot_height=450 ) )


## REFs 

* [오준호. 의학 사상의 유사성은 계량 분석 될 수 있는가. 대한한의학원전학회지. 2018;31(2):71-82](https://jkmc.jams.or.kr/po/volisse/sjPubsArtiPopView.kci?soceId=INS000001846&artiId=SJ0000001350&sereId=SER000000001&submCnt=2)
