<a href="https://colab.research.google.com/github/lyla-lee/TIL/blob/master/1_Document_Representation(TF_IDF).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2 TDM(Term-Document Matrix)

## 2.1 직접구현

In [None]:
# Toy corpus: three short Korean sentences, one document per list element.
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를',
]

### 1) 띄어쓰기 단위로 토큰화

In [None]:
# Whitespace tokenization: str.split() with no argument splits on runs of whitespace.
doc_ls = list(map(str.split, docs))
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [None]:
from collections import defaultdict

# On first lookup, an unseen token is assigned the next free index
# (the current size of the mapping), so indices follow first-appearance order.
word2idx = defaultdict(lambda: len(word2idx))
for doc in doc_ls:
  for token in doc:
    word2idx[token]  # the lookup itself registers the token
word2idx

defaultdict(<function __main__.<lambda>>,
            {'동물원에서': 1,
             '바나나를': 6,
             '봤어': 3,
             '오늘': 0,
             '원숭이를': 2,
             '원숭이에게': 5,
             '줬어': 7,
             '코끼리를': 4})

### 3) TDM 생성

In [None]:
import numpy as np

# Term-document matrix: one row per vocabulary term, one column per document.
tdm = np.zeros((len(word2idx), len(doc_ls)), dtype=int)
for i, doc in enumerate(doc_ls):
  for token in doc:
    # Single tuple index instead of chained [][] indexing.
    tdm[word2idx[token], i] += 1
tdm

array([[1, 1, 0],
       [1, 1, 1],
       [1, 0, 0],
       [1, 2, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 2],
       [0, 0, 1]])

In [None]:
import pandas as pd

# Column labels '문서0', '문서1', ... — one per document.
doc_names = [f'문서{n}' for n, _ in enumerate(doc_ls)]
# (index, token) pairs sorted by index, so the vocabulary lines up with the rows of tdm.
sorted_vocab = sorted(zip(word2idx.values(), word2idx.keys()))
vocab = [token for _, token in sorted_vocab]
df_TDM = pd.DataFrame(tdm, columns=doc_names)
df_TDM['단어'] = vocab
# Displayed with the token column as the row index; df_TDM itself is not reassigned.
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
오늘,1,1,0
동물원에서,1,1,1
원숭이를,1,0,0
봤어,1,2,0
코끼리를,0,1,0
원숭이에게,0,0,1
바나나를,0,0,2
줬어,0,0,1


## 2.2 sklearn 활용

In [None]:
# Same three-document toy corpus, redefined so this section runs on its own.
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를',
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer with default settings does its own tokenization and counting.
count_vect = CountVectorizer()
# fit_transform learns the vocabulary from docs and returns the count matrix
# with one row per document and one column per term (document-term layout).
DTM = count_vect.fit_transform(docs)
# Convert to a dense array for display.
DTM.toarray()

array([[1, 0, 1, 1, 1, 0, 0, 0],
       [1, 0, 2, 1, 0, 0, 0, 1],
       [1, 2, 0, 0, 0, 1, 1, 0]])

In [None]:
# TDM = np.transpose(DTM)
# TDM.toarray()

array([[1, 1, 1],
       [0, 0, 2],
       [1, 2, 0],
       [1, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0]])

In [None]:
import pandas as pd

# Take the document count from DTM itself (rows = documents) so this cell does
# not depend on doc_ls from the manual-implementation section.
doc_names = ['문서' + str(i) for i in range(DTM.shape[0])]
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on old releases that lack get_feature_names_out().
try:
  vocab = list(count_vect.get_feature_names_out())
except AttributeError:
  vocab = count_vect.get_feature_names()
# Transpose the document-term matrix so rows are terms and columns are documents.
df_TDM = pd.DataFrame(DTM.toarray().T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
동물원에서,1,1,1
바나나를,0,0,2
봤어,1,2,0
오늘,1,1,0
원숭이를,1,0,0
원숭이에게,0,0,1
줬어,0,0,1
코끼리를,0,1,0


## 2.3 gensim 활용

In [None]:
import gensim
from gensim import corpora

# Tokenize on whitespace.
doc_ls = list(map(str.split, docs))
# Dictionary assigns an integer id to every distinct token in the corpus.
id2word = corpora.Dictionary(doc_ls)
# One bag-of-words per document: a list of (token_id, count) pairs.
# NOTE: despite its name, TDM is document-major (one entry per document).
TDM = list(map(id2word.doc2bow, doc_ls))
TDM

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (1, 2), (2, 1), (4, 1)],
 [(0, 1), (5, 2), (6, 1), (7, 1)]]

In [None]:
import numpy as np
import pandas as pd
# sparse2full was never imported, which caused the NameError in this cell.
from gensim.matutils import sparse2full

doc_names = ['문서' + str(i) for i in range(len(doc_ls))]
# Order the vocabulary by token id so entry k labels column k of the dense vectors.
vocab = [id2word[i] for i in range(len(id2word))]
# Expand each sparse bag-of-words into a dense count vector of vocabulary length.
DTM_matrix = [sparse2full(doc, len(vocab)).tolist() for doc in TDM]

# DTM_matrix has one row per document; transpose so rows are terms.
df_TDM = pd.DataFrame(np.array(DTM_matrix, dtype=int).T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

NameError: ignored

# 3 TF-IDF (Term Frequency-Inverse Document Frequency)

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/10109d0e60cc9d50a1ea2f189bac0ac29a030a00" />



*  TF(단어 빈도, Term Frequency) : 단어가 문서 내에 등장하는 빈도
*  IDF(역문서 빈도, Inverse Document Frequency) : 단어가 등장한 문서 수(DF, 문서 빈도)의 역수에 로그를 취한 값으로, 여러 문서에 공통적으로 등장하는 단어일수록 값이 작아짐
*  한 문서 내에 자주 등장하고 다른 문서에 자주 등장하지 않는 단어를 주요 단어로 판별할 수 있음


https://en.wikipedia.org/wiki/Tf%E2%80%93idf

## 3.1 직접계산하기 1

weighting schema|weight
--|--
tf (term frequency)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />
idf(inverse document frequency) |<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />

In [None]:
# Same three-document toy corpus, redefined so the TF-IDF section runs on its own.
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를',
]

In [None]:
# Whitespace tokenization of every document.
doc_ls = list(map(str.split, docs))
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [None]:
from collections import defaultdict

# On first lookup, an unseen token is assigned the next free id
# (the current size of the mapping), so ids follow first-appearance order.
word2id = defaultdict(lambda: len(word2id))
for doc in doc_ls:
  for token in doc:
    word2id[token]  # the lookup itself registers the token
word2id

defaultdict(<function __main__.<lambda>>,
            {'동물원에서': 1,
             '바나나를': 6,
             '봤어': 3,
             '오늘': 0,
             '원숭이를': 2,
             '원숭이에게': 5,
             '줬어': 7,
             '코끼리를': 4})

### 3) DTM 생성

In [None]:
import numpy as np

# Document-term matrix: one row per document, one column per vocabulary term.
DTM = np.zeros((len(doc_ls), len(word2id)), dtype=int)
for i, doc in enumerate(doc_ls):
  for token in doc:
    # Single tuple index instead of chained [][] indexing.
    DTM[i, word2id[token]] += 1
DTM

array([[1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

### 4) TF 계산

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />

TF = 문서 내 토큰 빈도 / 문서 내 전체 토큰 개수

In [None]:
import numpy as np


def computeTF(DTM):
  """Return the term-frequency matrix of a document-term count matrix.

  Each count is divided by the total number of tokens in its document (row),
  so every non-empty row of the result sums to 1.

  Args:
    DTM: 2-D array-like of counts, one row per document and one column per term.

  Returns:
    A float ndarray with the same shape as DTM. A document with no tokens
    (an all-zero row) yields an all-zero row instead of NaN.
  """
  counts = np.asarray(DTM, dtype=float)
  # Row totals are computed once; keepdims lets them broadcast across columns.
  doc_totals = counts.sum(axis=1, keepdims=True)
  # where= skips rows whose total is zero, leaving the zeros from out= in place.
  return np.divide(counts, doc_totals,
                   out=np.zeros_like(counts), where=doc_totals != 0)


computeTF(DTM)

array([[0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ],
       [0.2 , 0.2 , 0.  , 0.4 , 0.2 , 0.  , 0.  , 0.  ],
       [0.  , 0.2 , 0.  , 0.  , 0.  , 0.2 , 0.4 , 0.2 ]])

### 5) IDF  계산

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />

IDF = log(총문서수/토큰이등장한문서수)

In [None]:
import numpy as np


def computeIDF(DTM):
  """Return the inverse document frequency of every term.

  IDF is log10(N / df), where N is the number of documents (rows of DTM) and
  df is the number of documents in which the term has a non-zero count.

  Args:
    DTM: 2-D array-like of counts, one row per document and one column per term.

  Returns:
    A 1-D float ndarray with one IDF value per term (column). A term that
    occurs in no document gets 0.0: its term frequency is 0 everywhere, so
    its TF-IDF weight is 0 regardless, and log of a division by zero is avoided.
  """
  counts = np.asarray(DTM)
  doc_len = counts.shape[0]
  # Document frequency: in how many rows each column is non-zero.
  doc_freq = np.count_nonzero(counts, axis=0)

  idf = np.zeros(doc_freq.shape, dtype=float)
  seen = doc_freq > 0
  # log10(N / df) rather than -log10(df / N): same value, but a term present
  # in every document gives 0.0 instead of -0.0.
  idf[seen] = np.log10(doc_len / doc_freq[seen])
  return idf
computeIDF(DTM)

array([ 0.17609126, -0.        ,  0.47712125,  0.17609126,  0.47712125,
        0.47712125,  0.47712125,  0.47712125])

In [None]:
def computeTFIDF(DTM):
  """Return the TF-IDF matrix for a document-term count matrix.

  Entry (i, j) is the term frequency of term j in document i multiplied by
  the inverse document frequency of term j.
  """
  term_freq = computeTF(DTM)
  inv_doc_freq = computeIDF(DTM)
  # Broadcasting scales column j of term_freq by inv_doc_freq[j].
  return term_freq * inv_doc_freq[np.newaxis, :]

computeTFIDF(DTM)

array([[ 0.04402281, -0.        ,  0.11928031,  0.04402281,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.03521825, -0.        ,  0.        ,  0.0704365 ,  0.09542425,
         0.        ,  0.        ,  0.        ],
       [ 0.        , -0.        ,  0.        ,  0.        ,  0.        ,
         0.09542425,  0.1908485 ,  0.09542425]])

In [None]:
import pandas as pd

# Label the columns from word2id, the mapping this section's DTM was built with
# (not word2idx, which belongs to the earlier TDM section).
# Sorting (id, token) pairs by id puts the tokens in DTM column order.
sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
tfidf = computeTFIDF(DTM)
pd.DataFrame(tfidf, columns=vocab)

Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,0.044023,-0.0,0.11928,0.044023,0.0,0.0,0.0,0.0
1,0.035218,-0.0,0.0,0.070437,0.095424,0.0,0.0,0.0
2,0.0,-0.0,0.0,0.0,0.0,0.095424,0.190849,0.095424
