# Scikit-learn, tf-idf 결과 확인 

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from typing import List
from collections import Counter
import re
import numpy as np 

In [25]:
# Four short sample documents; every later cell computes its statistics on this corpus.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [39]:
# Scikit-learn tf-idf
# Word-level tokenization (default)
vectorizer = TfidfVectorizer()
# X is the TF-IDF matrix the later cells print as the sklearn reference result.
X = vectorizer.fit_transform(corpus)

## Vocab 체크

In [30]:
def build_vocab(corpus:List[str]):
    """
    Build a vocabulary mapping each term to its index in ascending term order.

    Documents are lowercased and split into word tokens of two or more
    characters. That is what TfidfVectorizer's default token pattern keeps,
    so single-character tokens such as 'a' are dropped and the result lines
    up with ``vectorizer.vocabulary_`` on arbitrary text.

    Args:
        corpus: Raw documents.

    Returns:
        Dict of term -> column index; indices follow sorted term order.
    """
    # Same regex as TfidfVectorizer's default token_pattern.
    token_pattern = re.compile(r'(?u)\b\w\w+\b')
    terms = set()
    for doc in corpus:
        terms.update(token_pattern.findall(doc.lower()))
    return {term: idx for idx, term in enumerate(sorted(terms))}

# Show the hand-built vocabulary next to the fitted vectorizer's, both sorted by term.
custom_vocab = build_vocab(corpus)
print(f'custom : {custom_vocab}')
sklearn_vocab = dict(sorted(vectorizer.vocabulary_.items()))
print(f'sklearn : {sklearn_vocab}')
    

custom : {'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}
sklearn : {'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


## TF 생성

In [42]:
def build_tf(terms:List[str],corpus:List[str]):
    """
    Build the raw term-frequency matrix, one row per document.

    Each document is lowercased and split on runs of word characters; entry
    (d, t) is the number of times term t occurs among document d's tokens.

    Args:
        terms: Terms to count, in column order. Any iterable of strings
            works (a dict iterates its keys).
        corpus: Raw documents, in row order.

    Returns:
        Integer array of shape (number of documents, number of terms). The
        shape is kept two-dimensional even when the corpus is empty.
    """
    # Materialize once: the terms are re-used for every document, which a
    # one-shot iterable would not survive.
    terms = list(terms)
    rows = []
    for doc in corpus:
        # Tokenize once per document instead of once per (document, term) pair.
        counts = Counter(re.findall(r'\w+', doc.lower()))
        rows.append([counts[t] for t in terms])
    # reshape pins the 2-D shape for the empty-corpus case.
    return np.array(rows, dtype=int).reshape(len(rows), len(terms))
# Term-frequency matrix for the sample corpus; columns follow the vocab's term order.
vocab = build_vocab(corpus)
tf_matrix = build_tf(vocab, corpus)
print(tf_matrix)

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


## IDF 체크

In [32]:
def build_df(terms:List[str],corpus:List[str]):
    """
    Count, for each term, how many documents contain it at least once.

    Each document is lowercased and split on runs of word characters before
    the membership test.

    Args:
        terms: Terms to look up, in output order.
        corpus: Raw documents. Consumed exactly once, so a one-shot iterable
            is counted correctly for every term.

    Returns:
        1-D integer array with one document frequency per term.
    """
    # Tokenize each document once into a set: membership tests become O(1)
    # and the corpus is not re-scanned for every term.
    doc_tokens = [set(re.findall(r'\w+', doc.lower())) for doc in corpus]
    return np.array(
        [sum(t in tokens for tokens in doc_tokens) for t in terms],
        dtype=int,
    )


def build_idf(n:int,df:np.ndarray):
    """
    Smoothed inverse document frequency.

    idf(t) = log((1 + n) / (1 + df(t))) + 1, evaluated element-wise over df.
    """
    smoothed_ratio = (1 + n) / (1 + df)
    log_ratio = np.log(smoothed_ratio)
    return log_ratio + 1

# Compare the hand-computed IDF vector with the one learned by the fitted vectorizer.
vocab = build_vocab(corpus)
n = len(corpus)
df = build_df(vocab.keys(), corpus)

custom_idf = build_idf(n, df)
print(f'custom : {custom_idf}')
print(f'sklearn : {vectorizer.idf_}')

custom : [1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]
sklearn : [1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


## TF-IDF 체크

In [46]:
def build_tf_idf(tf:np.ndarray,idf:np.ndarray):
    """Multiply term frequencies by idf and L2-normalize the product with ``normalize``."""
    weighted = tf * idf
    tfidf = normalize(weighted, norm='l2')
    return tfidf

# Rebuild every ingredient, then compare the final TF-IDF matrix with sklearn's.
vocab = build_vocab(corpus)
n = len(corpus)
tf = build_tf(vocab, corpus)
df = build_df(vocab.keys(), corpus)
idf = build_idf(n, df)

custom_tfidf = build_tf_idf(tf, idf)
print(f'custom : {custom_tfidf}')
print(f'sklearn : {X.toarray()}')

custom : [[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
sklearn : [[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
