# Data Embedding

* word embedding
* word vectorization

## Data

In [31]:
from collections import Counter
import numpy as np

# Keep printed arrays short: arrays with more than 100 elements are summarized,
# showing 10 items at each edge of every dimension.
np.set_printoptions( threshold=100, edgeitems=10  )

In [2]:
# Load the formula corpus. The file is split into a header and a body at a
# "***" separator; each body line is treated as one whitespace-tokenized entry.
corpus_path = "../data/formulas.txt"

# Read through a context manager so the file handle is always closed.
with open( corpus_path, 'r', encoding='utf-8' ) as corpus_file:
    corpus_ = corpus_file.read()

# Split on the first "***" only, so a later "***" inside the body cannot
# break the two-way unpacking.
header, corpus_raw = corpus_.split( "***", 1 )
corpus_raw = corpus_raw.strip()
corpus = [ line.strip() for line in corpus_raw.split("\n") ]
corpus_tokenized = [ line.split() for line in corpus ]

print( "# Corpus Description" )
print( header.strip() )
print()
print( "# Corpus Size: ", len(corpus) )

# Corpus Description
- 출처 : 한국전통지식포탈(www.koreantk.com) > 전통의료 > 처방
- 특징 : 본초 구성만 추출한 데이터
- 데이터 생성일 : 2016.01.16

# Corpus Size:  19162


## 1. 카운트 기반 방법(Counting-based word embedding)


### 1차 벡터 ( 1st order vector )

* TF vector
* TF-IDF vector

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from time import time

In [4]:
# Frequency cut-off shared across the notebook: passed as `min_df` to the
# CountVectorizer / TfidfVectorizer and as `min_count` to Word2Vec.
min_df = 6

In [5]:
# Build TF Matrix and TF-IDF Matrix

def _feature_names( vectorizer ):
    """Return the fitted vocabulary of `vectorizer` as a plain list.

    Uses `get_feature_names_out()` when the vectorizer provides it and falls
    back to the older `get_feature_names()` otherwise. A list is returned in
    both cases so that callers can keep using `list.index`.
    """
    getter = getattr( vectorizer, "get_feature_names_out", None )
    if getter is None:
        return list( vectorizer.get_feature_names() )
    return getter().tolist()

# NOTE(review): the vectorizers run with their default tokenizer, while
# `corpus_tokenized` is produced with plain `str.split()`; confirm that both
# give the same vocabulary for this corpus.
t0 = time()
tf_vectorizer = CountVectorizer( min_df=min_df )
tf = tf_vectorizer.fit_transform( corpus )
tf_feature_names = _feature_names( tf_vectorizer )
# `{:.3f}` prints three decimals; the previous `{:03f}` only set a minimum width.
print( "TF Matrix done in {:.3f}s.".format(time() - t0) )

t0 = time()
tfidf_vectorizer = TfidfVectorizer( min_df=min_df )
tfidf = tfidf_vectorizer.fit_transform( corpus )
tfidf_feature_names = _feature_names( tfidf_vectorizer )
print( "TF-IDF Matrix done in {:.3f}s.".format(time() - t0) )

# Transpose the (documents x terms) matrices so that each row is one term's vector.
term_vector_via_tf = tf.T
term_vector_via_tfidf = tfidf.T

TF Matrix done in 0.113003s.
TF-IDF Matrix done in 0.119001s.


In [6]:
# Save TF and TFIDF matrix : the matrices are sparse, so the resulting files are very large.
# np.savetxt("../../_data/formulas_tf.tsv", tf.toarray(), delimiter='\t', fmt="%10.3f" )
# np.savetxt("../../_data/formulas_tfidf.tsv", tfidf.toarray(), delimiter='\t', fmt="%10.3f" )

In [32]:
# Report the size of the TF matrix and preview both term-vector matrices.
doc_size, feature_size = tf.shape
print( "# 처방 개수: ", doc_size)
print( "# 본초 개수: ", feature_size, " ({}회 이상 사용된 본초)".format( min_df ) )
print()
print( "# Term Vector via TF")
print( "* Shape:", term_vector_via_tf.shape )
# Densify before formatting, as the TF-IDF preview below already does:
# np.array_str formats ndarrays, and a scipy sparse matrix is not one.
print( np.array_str( term_vector_via_tf.toarray(), precision=3, suppress_small=True ) )
print()
print( "# Term Vector via TFIDF")
print( "* Shape:", term_vector_via_tfidf.shape )
print( np.array_str( term_vector_via_tfidf.toarray(), precision=3, suppress_small=True ) )

# 처방 개수:  19162
# 본초 개수:  916  (6회 이상 사용된 본초)

# Term Vector via TF
* Shape: (916, 19162)
[[0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 

In [8]:
# Column sums of the TF matrix: one total per term, as a flat Python list.
freq_per_term = np.asarray( tf.sum( axis=0 ) ).ravel().tolist()
# Row sums of the TF matrix: one total per document, as a flat Python list.
size_per_doc = np.asarray( tf.sum( axis=1 ) ).ravel().tolist()

### 2차 벡터 ( 2nd order vector )

* co-word vector
* t-score vector


#### 1.2. co-word vector

In [9]:
# (terms x docs) times (docs x terms) gives a term-by-term matrix.
co_word = tf.T.dot( tf )
term_vector_via_coword = co_word
coword_rows, coword_cols = co_word.shape
print( "{}개 본초에 대해 {} 길이의 벡터 생성".format( coword_rows, coword_cols ) )

916개 본초에 대해 916 길이의 벡터 생성


#### 1.3. t-score vector

In [10]:
from scipy import sparse

# Observed Value: the co-word counts as a dense array.
observed_v = co_word.toarray()

# Expected Value: product of the two marginal term frequencies over the total.
margin_x = np.full( co_word.shape, freq_per_term )   # cell (i, j) holds the frequency of term j
margin_y = margin_x.T                                # cell (i, j) holds the frequency of term i
total_freq = sum( freq_per_term )
expected_v = ( margin_x * margin_y ) / total_freq

# T-score; the add-one smoothed counts are used under the square root only.
observed_v_add1 = observed_v + 1
t_score = ( observed_v - expected_v ) / np.sqrt( observed_v_add1 )

term_vector_via_tscore = sparse.csc_matrix( t_score )
tscore_shape = term_vector_via_tscore.shape
print( "{}개 본초에 대해 {} 길이의 벡터 생성".format( *tscore_shape ) )

916개 본초에 대해 916 길이의 벡터 생성


## 2. 예측 기반 방법(Prediction-based word embedding)

* word2vec
* GloVe

#### 2.1. word2vec

In [11]:
# !conda install gensim

In [12]:
import gensim

vec_size = 64
pochs = 32   # number of training epochs handed to w2v.train()
# Use the largest per-document term total from the TF matrix as the context window.
max_formula_length = max( size_per_doc )
print( "Window size: ", max_formula_length )

w2v = gensim.models.Word2Vec( size=vec_size, window=max_formula_length, min_count=min_df, workers=10 )
# build_vocab() fills the model in place and returns None, so its return value
# must not be kept as the feature-name list; read the vocabulary back from the
# trained-vocabulary index instead.
w2v.build_vocab( corpus_tokenized )
w2v_feature_names = list( w2v.wv.index2word )
w2v.train( corpus_tokenized, total_examples=len( corpus_tokenized ), epochs=pochs)




Window size:  55


(2816484, 4486336)

## 결과

### Vectors

In [13]:


def get_vector( matrix, term, feature_names ):
    """Return the row of `matrix` that belongs to `term` as a 1-D array.

    Parameters
    ----------
    matrix : scipy sparse matrix or 2-D array-like
        One row per entry of `feature_names`.
    term : str
        Feature whose row is requested.
    feature_names : sequence of str
        Row labels of `matrix`; may be a list, tuple or numpy array.

    Returns
    -------
    numpy.ndarray
        The selected row, flattened to one dimension.

    Raises
    ------
    ValueError
        If `term` is not present in `feature_names`.
    """
    # list(...) lets array-valued feature names (which lack .index) work too.
    i = list( feature_names ).index( term )
    row = matrix[i, :]
    # Sparse rows need densifying; dense rows are used as they are.
    if hasattr( row, "toarray" ):
        row = row.toarray()
    return np.asarray( row ).reshape( 1, -1 )[0]

def print_vector( vc ):
    """Print `vc` with 3-digit precision and small values suppressed."""
    rendered = np.array_str( vc, precision=3, suppress_small=True )
    print( rendered )

In [14]:
# Term whose vectors are printed in this and the following example cells.
ex_term = "인삼"

ex_method = "Count-based (1st order vector)TF "
heading = "# {} 방법으로 만들어진 '{}'의 벡터(Vector)".format( ex_method, ex_term )
print( heading )
ex_vector = get_vector( term_vector_via_tf, ex_term, tf_feature_names )
print_vector( ex_vector )

# Count-based (1st order vector)TF  방법으로 만들어진 '인삼'의 벡터(Vector)
[0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0]


In [15]:
# Show the TF-IDF row of the example term.
ex_method = "TFIDF (1st order vector)"
heading = "# {} 방법으로 만들어진 '{}'의 벡터(Vector)".format( ex_method, ex_term )
print( heading )
ex_vector = get_vector( term_vector_via_tfidf, ex_term, tfidf_feature_names )
print_vector( ex_vector )

# TFIDF (1st order vector) 방법으로 만들어진 '인삼'의 벡터(Vector)
[0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    ... 0.
 0.    0.265 0.    0.    0.    0.    0.    0.    0.   ]


In [16]:
# Show the co-word row of the example term.
ex_method = "Co-word (2nd order vector)"
heading = "# {} 방법으로 만들어진 '{}'의 벡터(Vector)".format( ex_method, ex_term )
print( heading )
ex_vector = get_vector( term_vector_via_coword, ex_term, tf_feature_names )
print_vector( ex_vector )

# Co-word (2nd order vector) 방법으로 만들어진 '인삼'의 벡터(Vector)
[ 3  3  0 59  5  7  8  0 95  1 ...  2  0  2  4  5  0  0  4  1  0]


In [17]:
# Show the t-score row of the example term.
ex_method = "t-score (2nd order vector)"
heading = "# {} 방법으로 만들어진 '{}'의 벡터(Vector)".format( ex_method, ex_term )
print( heading )
ex_vector = get_vector( term_vector_via_tscore, ex_term, tf_feature_names )
print_vector( ex_vector )

# t-score (2nd order vector) 방법으로 만들어진 '인삼'의 벡터(Vector)
[ 1.154  1.348 -0.166  6.985  1.612  2.006  2.593 -0.249  8.567  0.59  ...
  0.213 -1.189  1.043  0.664  1.77  -0.525 -0.636  0.565  0.492 -0.166]


In [18]:
# Show the word2vec embedding of the example term.
ex_method = "Word2vec"
heading = "# {} 방법으로 만들어진 '{}'의 벡터(Vector)".format( ex_method, ex_term )
print( heading )
ex_vector = w2v.wv[ ex_term ]
print_vector( ex_vector )

# Word2vec 방법으로 만들어진 '인삼'의 벡터(Vector)
[-0.92   0.913 -0.035  0.902 -0.678  3.247 -0.382 -0.723 -0.448  0.812
 -1.84  -0.327 -0.819  2.639  1.036  0.256 -0.718 -0.867  1.356 -0.751
  0.188  1.851 -0.298  0.734  1.664 -1.046  0.212  0.265 -1.843  0.509
  0.417 -0.018  0.803 -0.235  0.076 -0.047  1.691 -0.628 -1.006 -0.338
 -1.405 -0.281 -0.968  0.935 -2.108 -0.249  0.443  1.052 -1.15   0.44
  0.844 -0.557 -0.663 -0.065 -2.723  0.93  -0.34  -2.416  1.299 -0.081
  0.597 -1.956  0.802  2.053]


### Plot

In [19]:
from bokeh.io import output_notebook
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from sklearn.manifold import TSNE
# Configure Bokeh to display the plots shown by later `show(...)` calls inside the notebook.
output_notebook()

In [20]:
def tsne_model_fit_transform( matrix ):
    """Project the rows of `matrix` onto two dimensions with t-SNE.

    Parameters
    ----------
    matrix : array-like, numpy.matrix or scipy sparse matrix
        One sample per row.

    Returns
    -------
    The result of `TSNE.fit_transform`: one 2-D coordinate pair per input row.
    """
    # Hand scikit-learn a plain ndarray: callers pass `.todense()` results
    # (numpy.matrix), which recent scikit-learn releases refuse, and the
    # PCA initialization used below needs dense input.
    if hasattr( matrix, "toarray" ):
        samples = matrix.toarray()
    else:
        samples = np.asarray( matrix )
    # Fixed random_state so repeated runs use the same seed.
    tsne_model = TSNE( n_components=2, n_iter=2000, init='pca', random_state=23 )
    return tsne_model.fit_transform( samples )


In [21]:
def tsne_plot( transformed_2d, label, title="plot" ):
    """Build a Bokeh scatter figure from 2-D coordinates.

    `transformed_2d` is transposed and unpacked into x and y columns, `label`
    supplies the per-point hover label, and `title` is the figure title.
    The figure is returned, not shown.
    """
    hover_fields = [
        ("label", "@label"),
        ("(index, x, y)", "($index, $x, $y)"),
    ]
    toolbar = "hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"

    xs, ys = transformed_2d.T
    points = ColumnDataSource( data={ "x": xs, "y": ys, "label": label } )
    fig = figure( tools=toolbar, tooltips=hover_fields, plot_width=600, plot_height=600, title=title )
    fig.scatter( 'x', 'y', source=points, radius=1, fill_alpha=0.6, line_color=None )
    return fig

In [22]:
# Time the t-SNE projection of the TF term vectors.
t0 = time()
# .toarray() gives a plain ndarray; .todense() returns numpy.matrix, which
# recent scikit-learn releases no longer accept as estimator input.
transformed_tf = tsne_model_fit_transform( term_vector_via_tf.toarray() )
# `{:.3f}` prints three decimals; `{:03f}` only set a minimum field width.
print( "TF TSNE model done in {:.3f}s.".format( time() - t0 ) )

TF TSNE model done in 32.107294s.


In [23]:
# Render the TF projection, labelled with the TF feature names.
tf_figure = tsne_plot( transformed_tf, tf_feature_names, title="TF Model" )
show( tf_figure )

In [24]:
# Time the t-SNE projection of the TF-IDF term vectors.
t0 = time()
# .toarray() gives a plain ndarray; .todense() returns numpy.matrix, which
# recent scikit-learn releases no longer accept as estimator input.
transformed_tfidf = tsne_model_fit_transform( term_vector_via_tfidf.toarray() )
# `{:.3f}` prints three decimals; `{:03f}` only set a minimum field width.
print( "TFIDF TSNE model done in {:.3f}s.".format( time() - t0 ) )

TFIDF TSNE model done in 31.429655s.


In [25]:
# Render the TF-IDF projection, labelled with the TF-IDF feature names.
tfidf_figure = tsne_plot( transformed_tfidf, tfidf_feature_names, title="TFIDF Model" )
show( tfidf_figure )

In [26]:
# Time the t-SNE projection of the co-word term vectors.
t0 = time()
# .toarray() gives a plain ndarray; .todense() returns numpy.matrix, which
# recent scikit-learn releases no longer accept as estimator input.
transformed_coword = tsne_model_fit_transform( term_vector_via_coword.toarray() )
# `{:.3f}` prints three decimals; `{:03f}` only set a minimum field width.
print( "Co-Word TSNE model done in {:.3f}s.".format( time() - t0 ) )

Co-Word TSNE model done in 18.025024s.


In [27]:
# Render the co-word projection, labelled with the TF feature names.
coword_figure = tsne_plot( transformed_coword, tf_feature_names, title="Co-Word Model" )
show( coword_figure )

In [28]:
# Print the largest and then the smallest x coordinate of the co-word projection.
x, y = transformed_coword.T
for pick_extreme in ( max, min ):
    print( pick_extreme( x ) )

9611.131
-392.1179


In [29]:
# Time the t-SNE projection of the t-score term vectors.
t0 = time()
# .toarray() gives a plain ndarray; .todense() returns numpy.matrix, which
# recent scikit-learn releases no longer accept as estimator input.
transformed_tscore = tsne_model_fit_transform( term_vector_via_tscore.toarray() )
# `{:.3f}` prints three decimals; `{:03f}` only set a minimum field width.
print( "T-score TSNE model done in {:.3f}s.".format( time() - t0 ) )

T-score TSNE model done in 13.161997s.


In [30]:
# Render the t-score projection, labelled with the TF feature names.
tscore_figure = tsne_plot( transformed_tscore, tf_feature_names, title="t-score Model" )
show( tscore_figure )

## 결론
