In [4]:
# BoW 모델 직접 구현 (TF-IDF) (wikipiedia)

In [5]:
# 구두점 제거 -> 소문자화 -> 토큰화 -> 개수

import re, string
import numpy as np
from collections import Counter

punc_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))

def strip_punc(corpus):
    return punc_regex.sub('', corpus)

doc = "Apples rule. Apples are the best. Truly, they are. Truly... Truly"

doc = strip_punc(doc)
counter = Counter(doc.lower().split())
descriptor = np.array([counter[word] for word in sorted(counter)], dtype=float)
print(sorted(counter))
print(descriptor)

['apples', 'are', 'best', 'rule', 'the', 'they', 'truly']
[2. 2. 1. 1. 1. 1. 3.]


In [6]:
def to_counter(doc):
    return Counter(strip_punc(doc).lower().split())

def to_vocab(counters):
    vocab = set()
    for counter in counters:
        vocab.update(counter)
    return sorted(vocab)

def to_tf(counter, vocab):
    return np.array([counter[word] for word in vocab], dtype=float)

In [7]:
doc_1 = "I am a dog."
doc_2 = "I am a cat!"
doc_3 = "I am not a dog"
doc_4 = "I am not a cat, am I!?!"

word_counts = [to_counter(doc) for doc in [doc_1, doc_2, doc_3, doc_4]]
bag = to_vocab(word_counts)
tfs = np.vstack([to_tf(counter, bag) for counter in word_counts])
print(bag)
print(tfs)

['a', 'am', 'cat', 'dog', 'i', 'not']
[[1. 1. 0. 1. 1. 0.]
 [1. 1. 1. 0. 1. 0.]
 [1. 1. 0. 1. 1. 1.]
 [1. 2. 1. 0. 2. 1.]]


In [13]:
# 상위 k개 추출
# 불용어 처리

def to_vocab(counters, k=None, stop_words=tuple()):
    vocab = Counter()
    for counter in counters:
        vocab.update(counter)
    
    for word in set(stop_words):
        if(word in counter.keys()):
            vocab.pop(word)
        
    return sorted([word for (word, cnt) in vocab.most_common(k)])

In [9]:
# wiki에서 상위 50개 빈출단어 출력

path = "../../dataset/wikipedia2text-extracted.txt"
with open(path, "rb") as f:
    wiki = f.read().decode()

wiki_count = to_counter(wiki)
wik = to_vocab([wiki_count], k=50)
print(wik)

['a', 'after', 'all', 'also', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'first', 'for', 'from', 'had', 'has', 'have', 'he', 'his', 'in', 'into', 'is', 'it', 'its', 'many', 'more', 'most', 'new', 'not', 'of', 'on', 'one', 'or', 'other', 'some', 'such', 'that', 'the', 'their', 'there', 'they', 'this', 'to', 'was', 'were', 'which', 'who', 'with']


In [11]:
with open("../../dataset/stopwords.txt", 'r') as r:
    stops = []
    for line in r:
        stops += [i.strip() for i in line.split('\t')]
        
print(stops[:5], stops[-5:])

['a', 'able', 'about', 'above', 'according'] ['your', 'yours', 'yourself', 'yourselves', 'zero']


In [14]:
wiki_count = to_counter(wiki)
wik2 = to_vocab([wiki_count], k=50, stop_words=stops)
print(wik2)

['american', 'area', 'began', 'british', 'called', 'century', 'city', 'country', 'due', 'early', 'form', 'found', 'french', 'government', 'great', 'high', 'i', 'important', 'include', 'including', 'international', 'large', 'largest', 'life', 'made', 'major', 'million', 'modern', 'music', 'national', 'north', 'number', 'part', 'people', 'political', 'population', 'power', 'public', 'river', 'south', 'state', 'states', 'system', 'time', 'united', 'war', 'work', 'world', 'year', 'years']


In [15]:
doc_1 = "I am a dog"
doc_2 = "I am a cat!"
doc_3 = "I am not a dog?"
doc_4 = "I am not a cat, am I!?!"

word_counts = [to_counter(doc) for doc in [doc_1, doc_2, doc_3, doc_4]]
vocab = to_vocab(word_counts, stop_words=stops)
tfs = np.vstack([to_tf(counter, vocab) for counter in word_counts])
tfs

array([[0., 1., 1.],
       [1., 0., 1.],
       [0., 1., 1.],
       [1., 0., 2.]])

In [16]:
def to_tf(counter, vocab):
    x = np.array([counter[word] for word in vocab], dtype=float)
    return x / x.sum()

In [17]:
doc_1 = "I am a dog"
doc_2 = "I am a cat!"
doc_3 = "I am not a dog?"
doc_4 = "I am not a cat, am I!?!"

word_counts = [to_counter(doc) for doc in [doc_1, doc_2, doc_3, doc_4]]
vocab = to_vocab(word_counts, stop_words=stops)
tfs = np.vstack([to_tf(counter, vocab) for counter in word_counts])
tfs

array([[0.        , 0.5       , 0.5       ],
       [0.5       , 0.        , 0.5       ],
       [0.        , 0.5       , 0.5       ],
       [0.33333333, 0.        , 0.66666667]])

In [21]:
def to_idf(vocab, counters):
    N = len(counters)
    nt = [sum(1 if t in counter else 0 for counter in counters) for t in vocab]
    nt = np.array(nt, dtype=float)
    return np.log10(N / nt)

In [23]:
doc_1 = "Apple cider is delicious."
doc_2 = "A recipe for apple cider, using apple."
doc_3 = "Donuts are delicious"
doc_4 = "Apple cider donuts, anyone? Donuts?"

word_counts = [to_counter(doc) for doc in [doc_1, doc_2, doc_3, doc_4]]
vocab = to_vocab(word_counts, stop_words=stops)
tfs = np.vstack([to_tf(counter, vocab) for counter in word_counts])
idf = to_idf(vocab, word_counts)
tf_idfs = tfs * idf

print(tfs)
print(idf)
print(tf_idfs)

[[0.         0.25       0.         0.25       0.25       0.
  0.         0.25       0.         0.        ]
 [0.14285714 0.28571429 0.         0.14285714 0.         0.
  0.14285714 0.         0.14285714 0.14285714]
 [0.         0.         0.33333333 0.         0.33333333 0.33333333
  0.         0.         0.         0.        ]
 [0.         0.25       0.         0.25       0.         0.5
  0.         0.         0.         0.        ]]
[0.60205999 0.12493874 0.60205999 0.12493874 0.30103    0.30103
 0.60205999 0.60205999 0.60205999 0.60205999]
[[0.         0.03123468 0.         0.03123468 0.0752575  0.
  0.         0.150515   0.         0.        ]
 [0.08600857 0.03569678 0.         0.01784839 0.         0.
  0.08600857 0.         0.08600857 0.08600857]
 [0.         0.         0.20068666 0.         0.10034333 0.10034333
  0.         0.         0.         0.        ]
 [0.         0.03123468 0.         0.03123468 0.         0.150515
  0.         0.         0.         0.        ]]
