# 1. Integer Encoding & Padding

In [1]:
# Raw multi-sentence sample text; later cells strip its newlines, split it into sentences and lowercase it.
trn_raw_text = """
A barber is a person. a barber is good person. a barber is huge person. 
he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. 
a barber kept his word. His barber kept his secret. 
But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain.
"""

- <span style = 'font-size:1.2em;line-height:1.5em'>먼저, 다음과 같은 전처리들을 수행합니다.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>(1) 대문자 --> 소문자</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>(2) 불용어, 길이가 2이하인 단어 제거</span>

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [3]:
# Remove newline characters (\n) by replacing each one with a space
trn_raw_text = trn_raw_text.replace('\n', ' ')

# Build the set of English stopwords
stopwords_eng = set(stopwords.words('english'))

In [4]:
trn_raw_text

' A barber is a person. a barber is good person. a barber is huge person.  he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word.  a barber kept his word. His barber kept his secret.  But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain. '

In [5]:
# Sentence tokenization: split the raw text into a list of sentences
sentences = sent_tokenize(trn_raw_text)

In [6]:
sentences

[' A barber is a person.',
 'a barber is good person.',
 'a barber is huge person.',
 'he Knew A Secret!',
 'The Secret He Kept is huge secret.',
 'Huge secret.',
 'His barber kept his word.',
 'a barber kept his word.',
 'His barber kept his secret.',
 'But keeping and keeping such a huge secret to himself was driving the barber crazy.',
 'the barber went up a huge mountain.']

In [7]:
# Tokenize every sentence into lowercase words, keeping only words that are
# not stopwords and are longer than two characters.
all_tokens = []
for sentence in sentences:
    words = word_tokenize(sentence.lower())
    # Logical `and` (not bitwise `&`) so the length check short-circuits.
    sent_tokens = [
        word for word in words
        if word not in stopwords_eng and len(word) > 2
    ]
    all_tokens.append(sent_tokens)

In [8]:
all_tokens

[['barber', 'person'],
 ['barber', 'good', 'person'],
 ['barber', 'huge', 'person'],
 ['knew', 'secret'],
 ['secret', 'kept', 'huge', 'secret'],
 ['huge', 'secret'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'secret'],
 ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
 ['barber', 'went', 'huge', 'mountain']]

## (1) vocabulary set 만들기

### Method 1. Counter 사용

In [9]:
from collections import Counter

In [10]:
# Flatten the per-sentence token lists into one flat list of words.
all_words = [token for tokens in all_tokens for token in tokens]

In [12]:
# Count how many times each word occurs across all sentences
cnt = Counter(all_words)
print(cnt)

Counter({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1})


In [13]:
# Build a vocabulary from the 5 most frequent words
vocab = cnt.most_common(5)
print(vocab)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]


In [14]:
# Build a vocabulary from the words that occur at least 2 times
vocab = [(word, freq) for word, freq in cnt.items() if freq >= 2]
print("빈도수 기준 정렬 전: ", vocab)
# sorted() is stable, so words with equal frequency keep their first-seen order
vocab = sorted(vocab, key=lambda pair: pair[1], reverse=True)
print("빈도수 기준 정렬 후: ", vocab)

빈도수 기준 정렬 전:  [('barber', 8), ('person', 3), ('huge', 5), ('secret', 6), ('kept', 4), ('word', 2), ('keeping', 2)]
빈도수 기준 정렬 후:  [('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2)]


In [15]:
# Map each vocabulary word to its position in the frequency-sorted list: {word: index}
word_dict = {word: idx for idx, (word, _freq) in enumerate(vocab)}
print(word_dict)

{'barber': 0, 'secret': 1, 'huge': 2, 'kept': 3, 'person': 4, 'word': 5, 'keeping': 6}


### Method 2: nltk 사용

In [16]:
import nltk

In [17]:
# Count word frequencies with nltk's FreqDist instead of collections.Counter
cnt = nltk.FreqDist(all_words)

In [18]:
cnt

FreqDist({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, ...})

In [20]:
# Build a vocabulary from the 5 most frequent words
vocab = cnt.most_common(5)
print(vocab)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]


In [21]:
# Build a vocabulary from the words that occur at least 2 times
vocab = [(word, freq) for word, freq in cnt.items() if freq >= 2]
print("빈도수 기준 정렬 전: ", vocab)
# sorted() is stable, so words with equal frequency keep their first-seen order
vocab = sorted(vocab, key=lambda pair: pair[1], reverse=True)
print("빈도수 기준 정렬 후: ", vocab)

빈도수 기준 정렬 전:  [('barber', 8), ('person', 3), ('huge', 5), ('secret', 6), ('kept', 4), ('word', 2), ('keeping', 2)]
빈도수 기준 정렬 후:  [('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2)]


In [22]:
# Map each vocabulary word to its position in the frequency-sorted list: {word: index}
word_dict = {word: idx for idx, (word, _freq) in enumerate(vocab)}
print(word_dict)

{'barber': 0, 'secret': 1, 'huge': 2, 'kept': 3, 'person': 4, 'word': 5, 'keeping': 6}


#### Vocabulary set을 만들어 줄 때, 다음과 같은 추가 key, value를 입력해줘야 합니다.

- <span style = 'font-size:1.2em;line-height:1.5em'><b>\<PAD\>: 0</b></span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>문장의 길이를 맞춰줄때 필요합니다.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>일반적인 text mining할 때는 필요 없지만, Neural Network, 특히 RNN계열의 모델을 사용할 때 필요합니다.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>이렇게 길이를 맞춰주면 어느 정도 병렬 처리가 가능해진다는 장점이 있습니다.</span>
- <span style = 'font-size:1.2em;line-height:1.5em'><b>OOV: 1</b></span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>OOV는 Out-of-Vocabulary를 의미합니다. Vocabulary에 등록되지 않은 단어들을 OOV로 일괄처리 합니다.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>앞의 예제에서 went, mountain과 같은 단어들은 단어 사전에 빠져있는데, 이들을 OOV로 일괄처리 합니다.</span>


In [23]:
# Reserve index 0 for padding and index 1 for out-of-vocabulary words,
# then shift every existing vocabulary index up by 2.
new_word_dict = {'<PAD>': 0, 'OOV': 1}
new_word_dict.update((word, idx + 2) for word, idx in word_dict.items())

# Drop the unshifted mapping.
del word_dict

In [24]:
new_word_dict

{'<PAD>': 0,
 'OOV': 1,
 'barber': 2,
 'secret': 3,
 'huge': 4,
 'kept': 5,
 'person': 6,
 'word': 7,
 'keeping': 8}

## (2) Integer Encoding

In [25]:
all_tokens

[['barber', 'person'],
 ['barber', 'good', 'person'],
 ['barber', 'huge', 'person'],
 ['knew', 'secret'],
 ['secret', 'kept', 'huge', 'secret'],
 ['huge', 'secret'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'secret'],
 ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
 ['barber', 'went', 'huge', 'mountain']]

In [26]:
# Integer-encode every sentence: a word found in new_word_dict maps to its
# index there, any other word maps to the 'OOV' index.
all_tokens_enc = [
    [
        new_word_dict[word] if word in new_word_dict else new_word_dict['OOV']
        for word in sent
    ]
    for sent in all_tokens
]

In [27]:
all_tokens_enc

[[2, 6],
 [2, 1, 6],
 [2, 4, 6],
 [1, 3],
 [3, 5, 4, 3],
 [4, 3],
 [2, 5, 7],
 [2, 5, 7],
 [2, 5, 3],
 [8, 8, 4, 3, 1, 2, 1],
 [2, 1, 4, 1]]

## (3) Padding

- <span style = 'font-size:1.2em;line-height:1.5em'>패딩은 항상 사용하는게 아니고, 기계 번역처럼 neural network 계열의 모델링시, 특히 RNN계열의 모델을 활용할 때 필요하다고 생각하셔도 됩니다.</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>즉, 필요할 때가 있고, 필요하지 않을 때가 있는데 이를 잘 구분하셔야 합니다.</span>

In [28]:
# The padding target is the length of the longest encoded sentence.
max_len = max(map(len, all_tokens_enc))

In [29]:
max_len

7

In [30]:
# Right-pad every encoded sentence with 0 (the '<PAD>' index) up to max_len.
# A new list is built per sentence instead of extending it in place, so the
# lists stored in all_tokens_enc keep their unpadded encoding.
all_tokens_enc_pad = []
for sent_tokens_enc in all_tokens_enc:
    num_pad = max_len - len(sent_tokens_enc)  # 0 for the longest sentence
    all_tokens_enc_pad.append(sent_tokens_enc + [0] * num_pad)

In [31]:
all_tokens_enc_pad

[[2, 6, 0, 0, 0, 0, 0],
 [2, 1, 6, 0, 0, 0, 0],
 [2, 4, 6, 0, 0, 0, 0],
 [1, 3, 0, 0, 0, 0, 0],
 [3, 5, 4, 3, 0, 0, 0],
 [4, 3, 0, 0, 0, 0, 0],
 [2, 5, 7, 0, 0, 0, 0],
 [2, 5, 7, 0, 0, 0, 0],
 [2, 5, 3, 0, 0, 0, 0],
 [8, 8, 4, 3, 1, 2, 1],
 [2, 1, 4, 1, 0, 0, 0]]

# 2. Vector Representation

## (1) Word Representation

### one-hot encoding

- <span style = 'font-size:1.1em;line-height:1.5em'>단어 집합의 크기를 벡터 차원으로 하고, 표현하고 싶은 단어의 인덱스에 1의 값을 부여</span>
- <span style = 'font-size:1.1em;line-height:1.5em'>다른 인덱스는 0을 부여하는 단어의 벡터 표현 방식</span>

In [32]:
new_word_dict

{'<PAD>': 0,
 'OOV': 1,
 'barber': 2,
 'secret': 3,
 'huge': 4,
 'kept': 5,
 'person': 6,
 'word': 7,
 'keeping': 8}

In [33]:
# Work on a copy without the two special tokens, leaving new_word_dict intact.
word_dict = dict(new_word_dict)
for special in ('<PAD>', 'OOV'):
    del word_dict[special]

In [34]:
print(word_dict)

{'barber': 2, 'secret': 3, 'huge': 4, 'kept': 5, 'person': 6, 'word': 7, 'keeping': 8}


In [35]:
# Shift the indices back so they start at 0. Each value is derived from
# new_word_dict rather than decremented in place, so re-running this cell
# gives the same result instead of subtracting 2 a second time.
word_dict = {word: new_word_dict[word] - 2 for word in word_dict}
print(word_dict)

{'barber': 0, 'secret': 1, 'huge': 2, 'kept': 3, 'person': 4, 'word': 5, 'keeping': 6}


In [36]:
def one_hot_encoding(word, word_dict):
    """Return the one-hot vector of ``word`` under the ``word_dict`` index mapping.

    The vector has ``len(word_dict)`` entries, all 0 except a 1 at position
    ``word_dict[word]``. For illustration, the all-zero vector, the looked-up
    index and the final vector are printed along the way.

    Raises:
        KeyError: if ``word`` is not a key of ``word_dict``.
    """
    vec_size = len(word_dict)
    one_hot_vector = [0 for _ in range(vec_size)]
    print(one_hot_vector)

    idx = word_dict[word]
    print(idx)

    one_hot_vector[idx] = 1
    print(one_hot_vector)
    return one_hot_vector

In [37]:
one_hot_encoding('barber', word_dict)

[0, 0, 0, 0, 0, 0, 0]
0
[1, 0, 0, 0, 0, 0, 0]


[1, 0, 0, 0, 0, 0, 0]

## (2) Document Representation

- <span style = 'font-size:1.2em;line-height:1.5em'>One-hot Encoding은 <b>단어</b>의 벡터 표현 방식이었다면, 지금 부터는 <b>문서</b>의 벡터 표현 방식</span>

### Bag of words

In [38]:
# Four sample documents used for the document-level representations below
doc1 = "A barber is a person. a barber is good person. a barber is huge person. "
doc2 = "he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. "
doc3 = "a barber kept his word. His barber kept his secret. "
doc4 = "But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain. "

- <span style = 'font-size:1.2em;line-height:1.5em'>OOV 처리와 빈도수 기준 필터링 없이, 전처리(불용어 및 길이가 2 이하인 단어 제거) 후 남은 모든 단어로 단어 사전을 다시 만들어 볼게요.</span>

In [39]:
# Tokenize each document sentence by sentence, so that
# all_tokens[doc_index][sentence_index] is the list of kept words.
all_docs = [doc1, doc2, doc3, doc4]
all_tokens = []
for doc in all_docs:
    doc_tokens = []
    for sent in sent_tokenize(doc):
        words = word_tokenize(sent.lower())
        # Logical `and` (not bitwise `&`) so the length check short-circuits.
        sent_tokens = [
            word for word in words
            if word not in stopwords_eng and len(word) > 2
        ]
        doc_tokens.append(sent_tokens)
    all_tokens.append(doc_tokens)

In [40]:
all_tokens

[[['barber', 'person'],
  ['barber', 'good', 'person'],
  ['barber', 'huge', 'person']],
 [['knew', 'secret'],
  ['secret', 'kept', 'huge', 'secret'],
  ['huge', 'secret'],
  ['barber', 'kept', 'word']],
 [['barber', 'kept', 'word'], ['barber', 'kept', 'secret']],
 [['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
  ['barber', 'went', 'huge', 'mountain']]]

In [41]:
# Flatten documents -> sentences -> words into a single word list.
all_words = [
    word
    for doc_tokens in all_tokens
    for sent_tokens in doc_tokens
    for word in sent_tokens
]
print(all_words)

['barber', 'person', 'barber', 'good', 'person', 'barber', 'huge', 'person', 'knew', 'secret', 'secret', 'kept', 'huge', 'secret', 'huge', 'secret', 'barber', 'kept', 'word', 'barber', 'kept', 'word', 'barber', 'kept', 'secret', 'keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy', 'barber', 'went', 'huge', 'mountain']


In [42]:
# Count how many times each word occurs across all documents
cnt = Counter(all_words)

In [43]:
cnt

Counter({'barber': 8,
         'person': 3,
         'good': 1,
         'huge': 5,
         'knew': 1,
         'secret': 6,
         'kept': 4,
         'word': 2,
         'keeping': 2,
         'driving': 1,
         'crazy': 1,
         'went': 1,
         'mountain': 1})

In [44]:
# Re-order the counts from most to least frequent; the result is a plain dict.
cnt = dict(sorted(cnt.items(), key=lambda item: item[1], reverse=True))

In [45]:
cnt

{'barber': 8,
 'secret': 6,
 'huge': 5,
 'kept': 4,
 'person': 3,
 'word': 2,
 'keeping': 2,
 'good': 1,
 'knew': 1,
 'driving': 1,
 'crazy': 1,
 'went': 1,
 'mountain': 1}

In [46]:
# Build the {word: index} dictionary, numbering words in cnt's iteration order
word_dict = {word: idx for idx, word in enumerate(cnt)}
print(word_dict)

{'barber': 0, 'secret': 1, 'huge': 2, 'kept': 3, 'person': 4, 'word': 5, 'keeping': 6, 'good': 7, 'knew': 8, 'driving': 9, 'crazy': 10, 'went': 11, 'mountain': 12}


- <span style = 'font-size:1.2em;line-height:1.5em'>doc1에 대한 BoW를 나타내면 다음과 같습니다</span>

In [47]:
print(doc1)

A barber is a person. a barber is good person. a barber is huge person. 


In [48]:
def doc_to_bow(documents, word_dict):
    """Return the bag-of-words count vector for one tokenized document.

    Args:
        documents: The sentences of a single document, each an iterable of
            word tokens (e.g. ``all_tokens[0]``).
        word_dict: Mapping from word to its position in the output vector.

    Returns:
        A list of ``len(word_dict)`` ints in which position ``word_dict[w]``
        holds the number of times ``w`` occurs in the document. Words that
        are not keys of ``word_dict`` are ignored.
    """
    bow = [0] * len(word_dict)
    counts = Counter(word for sent in documents for word in sent)
    for word, word_cnt in counts.items():
        idx = word_dict.get(word)
        # Skip out-of-vocabulary words instead of raising KeyError.
        if idx is not None:
            bow[idx] = word_cnt
    return bow

In [49]:
all_tokens

[[['barber', 'person'],
  ['barber', 'good', 'person'],
  ['barber', 'huge', 'person']],
 [['knew', 'secret'],
  ['secret', 'kept', 'huge', 'secret'],
  ['huge', 'secret'],
  ['barber', 'kept', 'word']],
 [['barber', 'kept', 'word'], ['barber', 'kept', 'secret']],
 [['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
  ['barber', 'went', 'huge', 'mountain']]]

In [50]:
doc_to_bow(all_tokens[0], word_dict)

[3, 0, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0]

In [51]:
all_tokens[0]

[['barber', 'person'],
 ['barber', 'good', 'person'],
 ['barber', 'huge', 'person']]

In [52]:
word_dict

{'barber': 0,
 'secret': 1,
 'huge': 2,
 'kept': 3,
 'person': 4,
 'word': 5,
 'keeping': 6,
 'good': 7,
 'knew': 8,
 'driving': 9,
 'crazy': 10,
 'went': 11,
 'mountain': 12}

- <span style = 'font-size:1.2em;line-height:1.5em'>Scikit-learn의 CountVectorizer를 사용하면 쉽게 BoW를 만들 수도 있습니다.</span>

In [53]:
# Collect the four raw (un-tokenized) document strings into one list
corpus = [doc1, doc2, doc3, doc4]

In [54]:
corpus

['A barber is a person. a barber is good person. a barber is huge person. ',
 'he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. ',
 'a barber kept his word. His barber kept his secret. ',
 'But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain. ']

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
# Pass word_dict as a fixed vocabulary so the vectorizer uses its word -> index mapping
vector = CountVectorizer(vocabulary = word_dict)

# Count vector for the raw text of doc1
print('bag of words vector :', vector.fit_transform([doc1]).toarray())

# Vocabulary held by the vectorizer
print('vocabulary :',vector.vocabulary_)

bag of words vector : [[3 0 1 0 3 0 0 1 0 0 0 0 0]]
vocabulary : {'barber': 0, 'secret': 1, 'huge': 2, 'kept': 3, 'person': 4, 'word': 5, 'keeping': 6, 'good': 7, 'knew': 8, 'driving': 9, 'crazy': 10, 'went': 11, 'mountain': 12}


- <span style = 'font-size:1.2em;line-height:1.5em'>그러나, 이렇게 접근하면 text를 전처리 한 결과를 BoW로 만들 수 없습니다.</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>그러기 위해서는 전처리한 후의 token들을 ' '.join(tokens)와 같이 문장 형태로 join한 뒤에 사용해야 합니다.</span>

In [57]:
vector.transform([doc1]).toarray()

array([[3, 0, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0]], dtype=int64)

### DTM (Document Term Matrix)

In [58]:
import numpy as np

In [59]:
# Document-term matrix: vectorize all four documents at once with the fixed vocabulary
documents = [doc1, doc2, doc3, doc4]
vector = CountVectorizer(vocabulary=word_dict)
results = vector.fit_transform(documents).toarray()

In [60]:
results

array([[3, 0, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 4, 2, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [2, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [2, 1, 2, 0, 0, 0, 2, 0, 0, 1, 1, 1, 1]], dtype=int64)

### TF-IDF

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
# Raw term counts (DTM) for the same documents, displayed next to the TF-IDF matrix below
documents = [doc1, doc2, doc3, doc4]
vector = CountVectorizer(vocabulary=word_dict)
dtm = vector.fit_transform(documents).toarray()

In [66]:
documents

['A barber is a person. a barber is good person. a barber is huge person. ',
 'he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. ',
 'a barber kept his word. His barber kept his secret. ',
 'But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain. ']

In [63]:
# TF-IDF matrix over the same documents and the same fixed vocabulary
vector = TfidfVectorizer(vocabulary=word_dict)
tfidf = vector.fit_transform(documents).toarray()

In [64]:
dtm

array([[3, 0, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 4, 2, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [2, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [2, 1, 2, 0, 0, 0, 2, 0, 0, 1, 1, 1, 1]], dtype=int64)

In [65]:
tfidf

array([[0.43658465, 0.        , 0.1780019 , 0.        , 0.83662312,
        0.        , 0.        , 0.27887437, 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.14743115, 0.72131784, 0.36065892, 0.44548552, 0.        ,
        0.22274276, 0.        , 0.        , 0.28252095, 0.        ,
        0.        , 0.        , 0.        ],
       [0.48637404, 0.29745263, 0.        , 0.73482636, 0.        ,
        0.36741318, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.31289074, 0.19135514, 0.38271029, 0.        , 0.        ,
        0.        , 0.59958962, 0.        , 0.        , 0.29979481,
        0.29979481, 0.29979481, 0.29979481]])

In [None]:
# Compare the shapes of the count matrix and the TF-IDF matrix
print(dtm.shape)
print(tfidf.shape)