<a href="https://colab.research.google.com/github/pdh93621/Deep-learning/blob/main/BoW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Bag of Words

In [35]:
doc1 = 'John likes to watch movies and Mary likes movies too'
doc2 = 'Mary also likes to watch football games'
doc3 = doc1 + ' ' + doc2
  

In [4]:
print(doc3)

John likes to watch movies. Mary likes movies too. Mary also likes to watch football games


## Keras Tokenizer를 활용한 BoW

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [8]:
def print_bow(sentence):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(sentence)
  bow = dict(tokenizer.word_counts)
  print('Bag of Words:',bow)
  print('단어장(vocabulary)의 크기:', len(tokenizer.word_counts))

In [12]:
sentence = []
sentence.append(doc3)

print_bow(sentence)

Bag of Words: {'john': 1, 'likes': 3, 'to': 2, 'watch': 2, 'movies': 2, 'mary': 2, 'too': 1, 'also': 1, 'football': 1, 'games': 1}
단어장(vocabulary)의 크기: 10


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

vector = CountVectorizer()
print('Bag of Words:', vector.fit_transform(sentence).toarray())
print('각 단어의 인덱스:', vector.vocabulary_)

Bag of Words: [[1 1 1 1 3 2 2 2 1 2]]
각 단어의 인덱스: {'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [14]:
print('단어장(vocabulary)의 크기:', len(vector.vocabulary_))

for i in vector.vocabulary_:
  


단어장(vocabulary)의 크기: 10


## DTM(Document-Term Matrix)

In [15]:
doc1 = 'I like dog'
doc2 = 'I like cat'
doc3 = doc1 + doc2

In [16]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

mat1 = np.array([0,1,1,1])
mat2 = np.array([1,0,1,1])
mat3 = np.array([2,0,2,2])

def cos_sim(A,B):
  return dot(A,B)/(norm(A)*norm(B))

In [18]:
print(cos_sim(mat1, mat2))
print(cos_sim(mat1, mat3))
print(cos_sim(mat2, mat3))

0.6666666666666667
0.6666666666666667
1.0000000000000002


## sckit-learn CountVectorizer 활용한 DTM 구현

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = []
corpus.append(doc1)
corpus.append(doc2)
corpus.append(doc3)

In [22]:
vector = CountVectorizer()
print(vector.fit_transform(corpus).toarray())
print(vector.vocabulary_)

[[0 0 0 1 2 1 2 1 1 1]
 [1 1 1 0 1 1 0 1 0 1]
 [1 1 1 1 3 2 2 2 1 2]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


## TF-IDF

In [24]:
from math import log
import pandas as pd

docs = corpus.copy()

In [25]:
vocab = list(set(w for doc in docs for w in doc.split()))

In [26]:
vocab.sort()
print('단어장의 크기:',len(vocab))
print(vocab)

단어장의 크기: 11
['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies', 'movies.', 'to', 'too.', 'watch']


In [27]:
N = len(docs)

In [28]:
def tf(t,d):
  return d.count(t)

def idf(t):
  df = 0
  for doc in docs:
    df += t in doc
  return log(N/(df+1))+1

def tfidf(t, d):
  return tf(t,d) * idf(t)

TF 함수를 사용하여 DTM을 만들어보자

In [32]:
result = []
for i in range(N):
  result.append([])
  d = docs[i]
  for j in range(len(vocab)):
    t = vocab[j]

    result[-1].append(tf(t,d))

tf_ = pd.DataFrame(result, columns = vocab)
print(tf_)

   John  Mary  also  football  games  likes  movies  movies.  to  too.  watch
0     1     1     0         0      0      2       2        1   2     1      1
1     0     1     1         1      1      1       0        0   1     0      1
2     1     2     1         1      1      3       2        1   3     1      2


In [33]:
result = []
for j in range(len(vocab)):
  t = vocab[j]
  result.append(idf(t))

idf_ = pd.DataFrame(result, index= vocab, columns= ['IDF'])
idf_

Unnamed: 0,IDF
John,1.0
Mary,0.712318
also,1.0
football,1.0
games,1.0
likes,0.712318
movies,1.0
movies.,1.0
to,0.712318
too.,1.0


TF-IDF 행렬을 출력 DTM에 있는 각 단어의 TF에 각 단어의 IDF를 곱해준 값

In [34]:
result = []
for i in range(N):
  result.append([])
  d = docs[i]
  for j in range(len(vocab)):
    t = vocab[j]

    result[-1].append(tfidf(t,d))

tfidf_ = pd.DataFrame(result, columns= vocab)
tfidf_

Unnamed: 0,John,Mary,also,football,games,likes,movies,movies.,to,too.,watch
0,1.0,0.712318,0.0,0.0,0.0,1.424636,2.0,1.0,1.424636,1.0,0.712318
1,0.0,0.712318,1.0,1.0,1.0,0.712318,0.0,0.0,0.712318,0.0,0.712318
2,1.0,1.424636,1.0,1.0,1.0,2.136954,2.0,1.0,2.136954,1.0,1.424636


## scikit-learn TFIDFVectorizer

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
          'John likes to watch movies and Mary likes movies too',
          'James likes to watch TV',
          'Mary also likes to watch football game'
]

tfidf = TfidfVectorizer().fit(corpus)
vocab = list(set(tfidf.vocabulary_.keys()))
vocab.sort()

tfdif_ = pd.DataFrame(tfidf.transform(corpus).toarray(), columns=vocab)
tfidf_

Unnamed: 0,John,Mary,also,football,games,likes,movies,movies.,to,too.,watch
0,1.0,0.712318,0.0,0.0,0.0,1.424636,2.0,1.0,1.424636,1.0,0.712318
1,0.0,0.712318,1.0,1.0,1.0,0.712318,0.0,0.0,0.712318,0.0,0.712318
2,1.0,1.424636,1.0,1.0,1.0,2.136954,2.0,1.0,2.136954,1.0,1.424636
