# Document Representation
문서를 DocumentTermMatrix로 표현하는 방법을 다루며 Term weighting 방식으로 term frequency, tf-idf를 활용해봅니다. 실제 예제로 arXiv에서 scraping한 text mining 관련 논문의 초록들로 DocumentTermMatrix (tf, tf-idf)를 만들어봅니다. 이 떄 Term은 명사계열(NN?)만 활용하고 많이 출현한 100개 단어만 활용합니다. 또한 DocumentTermMatrix를 만들 때, tokenizer를 정의해서 활용합니다.
  
* _nltk와 sklearn의 sub-module인 feature-extraction.text를 활용합니다._
* nltk : http://www.nltk.org/book/
* gensim : https://radimrehurek.com/gensim/index.html

## Simple example 

### Load modules 

In [1]:
import os, sys
import nltk
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### DocumentTermMatrix with tf 

In [2]:
corpus = ['I want to enjoy things and have fun and live like every day is the last day',
         'When the Lore closes a door, somewher he opens a window',
         'Great power always comes with Great responsibility',
         'Luck favors the prepared']

In [3]:
# 3글자 이상의 영어단어만 각 문서에서 추출
corpus = ['//'.join(re.findall('[A-z]{3,}', doc)) for doc in corpus]

In [4]:
def string_tokenizer(doc):
    return doc.split('//')

In [5]:
# tokenizer option에 내가 정의한 tokenizer를 명시
# min_df로 최소 토큰의 빈도 설정 (Term의 list를 먼저 만들어두지 않았을 경우 활용할만한 옵션)
tf_dtm = CountVectorizer(tokenizer = string_tokenizer, min_df = 1).fit(corpus) 

In [6]:
tf_dtm.transform(corpus).toarray()

array([[0, 2, 0, 0, 2, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 1, 1, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0]], dtype=int64)

### DocumentTermMatrix with tf-idf 

In [7]:
tfidf_dtm= TfidfVectorizer(tokenizer = string_tokenizer).fit(corpus)

In [8]:
tfidf_dtm.transform(corpus).toarray()

array([[ 0.        ,  0.47936124,  0.        ,  0.        ,  0.47936124,
         0.        ,  0.23968062,  0.23968062,  0.        ,  0.23968062,
         0.        ,  0.23968062,  0.23968062,  0.23968062,  0.23968062,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.15298503,  0.23968062,  0.23968062,
         0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.36742339,  0.        ,  0.        ,
         0.36742339,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.36742339,  0.        ,  0.36742339,  0.        ,  0.        ,
         0.        ,  0.36742339,  0.23452159,  0.        ,  0.        ,
         0.36742339,  0.36742339,  0.        ],
       [ 0.33333333,  0.        ,  0.        ,  0.33333333,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.66666667,  0.    

## Practical example 

In [9]:
os.listdir()

['.ipynb_checkpoints',
 'Document Representation (term frequency, tf-idf).ipynb',
 'Scrapping text mining papers in arXiv.py',
 'Simple NLP for English.ipynb',
 'Simple NLP for Korean.ipynb',
 'text_mining_paper.csv']

### Load data

In [10]:
papers = pd.read_csv('./text_mining_paper.csv', encoding = 'cp949')
print(papers.shape)
papers.head()

(168, 5)


Unnamed: 0,abstract,author,meta,subject,title
0,"The complicated, evolving landscape of cancer ...","Rocco Piazza, Daniele Ramazzotti, Roberta Spin...","Thu, 9 Mar 2017 01:24:23 GMT (948kb)",Genomics (q-bio.GN),"OncoScore: a novel, Internet-based tool to ass..."
1,"Mining textual patterns in news, tweets, paper...","Meng Jiang, Jingbo Shang, Taylor Cassidy, Xian...","Mon, 13 Mar 2017 01:06:19 GMT (1150kb,D) [v2] ...",Computation and Language (cs.CL),MetaPAD: Meta Pattern Discovery from Massive T...
2,This paper is a tutorial on Formal Concept Ana...,Dmitry I. Ignatov,"Wed, 8 Mar 2017 12:53:21 GMT (3541kb,D)",Information Retrieval (cs.IR),Introduction to Formal Concept Analysis and It...
3,Topic models have been widely used in discover...,"Jarvan Law, Hankz Hankui Zhuo, Junhua He, Erhu...","Thu, 23 Feb 2017 07:16:03 GMT (96kb,D)",Computation and Language (cs.CL),LTSG: Latent Topical Skip-Gram for Mutually Le...
4,Entity extraction is fundamental to many text ...,"Zeyi Wen, Dong Deng, Rui Zhang, Kotagiri Ramam...","Sun, 12 Feb 2017 12:46:40 GMT (89kb)",Databases (cs.DB),A Technical Report: Entity Extraction using Bo...


### Custom tokenizer

In [11]:
corpus = list(papers['abstract'])
corpus[0:3]

['The complicated, evolving landscape of cancer mutations poses a formidable\r\nchallenge to identify cancer genes among the large lists of mutations typically\r\ngenerated in NGS experiments. The ability to prioritize these variants is\r\ntherefore of paramount importance. To address this issue we developed\r\nOncoScore, a text-mining tool that ranks genes according to their association\r\nwith cancer, based on available biomedical literature. Receiver operating\r\ncharacteristic curve and the area under the curve (AUC) metrics on manually\r\ncurated datasets confirmed the excellent discriminating capability of OncoScore\r\n(OncoScore cut-off threshold = 21.09; AUC = 90.3%, 95% CI: 88.1-92.5%),\r\nindicating that OncoScore provides useful results in cases where an efficient\r\nprioritization of cancer-associated genes is needed.\r\n',
 'Mining textual patterns in news, tweets, papers, and many other kinds of text\r\ncorpora has been an active theme in text mining and NLP research. Pre

In [12]:
# Custom tokenizer
# 3글자 이상의 영어단어중 명사계열만 tokenize하는 tokenizer
def my_tokenizer(doc):
    tmp = re.findall('[A-z]{3,}', doc)
    tmp = [token[0] for token in nltk.pos_tag(tmp) if token[1].find('NN') == 0]
    return tmp

In [13]:
# Term의 목록을 미리 설정
from collections import Counter
tmps = [re.findall('[A-z]{3,}', doc) for doc in corpus]
tmps = sum(tmps, [])
tmps = [token[0] for token in nltk.pos_tag(tmps) if token[1].find('NN') == 0]
word_count = Counter(tmps)
voca = [token[0] for token in word_count.most_common(100)]
len(voca)

100

### Generate DocumentTermMatrix with tf 

In [14]:
tf_dtm = CountVectorizer(tokenizer = my_tokenizer, vocabulary = voca).fit(corpus)

In [15]:
len(tf_dtm.vocabulary_)

100

In [16]:
my_tf_dtm= tf_dtm.transform(corpus).toarray()
my_tf_dtm

array([[0, 1, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 2, 3, ..., 0, 0, 0],
       ..., 
       [1, 3, 2, ..., 0, 0, 0],
       [1, 3, 0, ..., 0, 0, 0],
       [4, 2, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
my_tf_dtm.shape

(168, 100)

### Generate DocumentTermMatrix with tfidf

In [18]:
tfidf_dtm = TfidfVectorizer(tokenizer = my_tokenizer, vocabulary = voca).fit(corpus)

In [19]:
len(tfidf_dtm.vocabulary_)

100

In [20]:
my_tfidf_dtm = tfidf_dtm.transform(corpus).toarray()
my_tfidf_dtm

array([[ 0.        ,  0.18524405,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.04411306,  0.0397914 ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.10389214,  0.18742813,  0.39033973, ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.07425438,  0.20093954,  0.18599055, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.08228962,  0.22268367,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.55188593,  0.24890946,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [21]:
my_tf_dtm.shape

(168, 100)