## Integer Encoding

In [1]:
text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."

In [2]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
stopword = stopwords.words('english')

In [5]:
text = sent_tokenize(text)

In [8]:
text

['A barber is a person.',
 'a barber is good person.',
 'a barber is huge person.',
 'he Knew A Secret!',
 'The Secret He Kept is huge secret.',
 'Huge secret.',
 'His barber kept his word.',
 'a barber kept his word.',
 'His barber kept his secret.',
 'But keeping and keeping such a huge secret to himself was driving the barber crazy.',
 'the barber went up a huge mountain.']

In [21]:
vocab = {}
sentences=[]

for i in text:
    sentence = word_tokenize(i)
    result = []
    
    for word in sentence:
        word = word.lower()
        
        if word not in stopword:
            if len(word) > 2:
                result.append(word)
            
                if word not in vocab:
                    vocab[word] = 0
            
                vocab[word] += 1
            
    sentences.append(result)

In [22]:
vocab

{'barber': 8,
 'person': 3,
 'good': 1,
 'huge': 5,
 'knew': 1,
 'secret': 6,
 'kept': 4,
 'word': 2,
 'keeping': 2,
 'driving': 1,
 'crazy': 1,
 'went': 1,
 'mountain': 1}

In [23]:
sentences

[['barber', 'person'],
 ['barber', 'good', 'person'],
 ['barber', 'huge', 'person'],
 ['knew', 'secret'],
 ['secret', 'kept', 'huge', 'secret'],
 ['huge', 'secret'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'secret'],
 ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
 ['barber', 'went', 'huge', 'mountain']]

In [24]:
vocab_sorted = sorted(vocab.items(), key = lambda x : x[1], reverse=True)
vocab_sorted

[('barber', 8),
 ('secret', 6),
 ('huge', 5),
 ('kept', 4),
 ('person', 3),
 ('word', 2),
 ('keeping', 2),
 ('good', 1),
 ('knew', 1),
 ('driving', 1),
 ('crazy', 1),
 ('went', 1),
 ('mountain', 1)]

### 빈도수가 높은 단어에 낮은 정수 매핑

In [25]:
word_to_index = {}
i=0

for word, freq in vocab_sorted:
    if freq > 1:
        
        i += 1
        
        word_to_index[word] = i
        
        

In [26]:
word_to_index

{'barber': 1,
 'secret': 2,
 'huge': 3,
 'kept': 4,
 'person': 5,
 'word': 6,
 'keeping': 7}

In [31]:
vocab_size = 5 ## 상위 5개의 단어들만 사용

words_frequency = [w for w,c in word_to_index.items() if c >= vocab_size + 1]

for w in words_frequency:
    del word_to_index[w]

In [32]:
word_to_index

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}

In [33]:
word_to_index['OOV'] = len(word_to_index) + 1 

In [37]:
encoded = []

for i in sentences:
    a = []
    for k in i:
        try:
            a.append(word_to_index[k])
        except KeyError:
            a.append(word_to_index['OOV'])
            
    encoded.append(a)

In [38]:
encoded

[[1, 5],
 [1, 6, 5],
 [1, 3, 5],
 [6, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [6, 6, 3, 2, 6, 1, 6],
 [1, 6, 3, 6]]

## Counter 사용

In [39]:
from collections import Counter

In [43]:
words = sum(sentences, [])

In [44]:
words

['barber',
 'person',
 'barber',
 'good',
 'person',
 'barber',
 'huge',
 'person',
 'knew',
 'secret',
 'secret',
 'kept',
 'huge',
 'secret',
 'huge',
 'secret',
 'barber',
 'kept',
 'word',
 'barber',
 'kept',
 'word',
 'barber',
 'kept',
 'secret',
 'keeping',
 'keeping',
 'huge',
 'secret',
 'driving',
 'barber',
 'crazy',
 'barber',
 'went',
 'huge',
 'mountain']

In [45]:
vocab = Counter(words)
vocab

Counter({'barber': 8,
         'person': 3,
         'good': 1,
         'huge': 5,
         'knew': 1,
         'secret': 6,
         'kept': 4,
         'word': 2,
         'keeping': 2,
         'driving': 1,
         'crazy': 1,
         'went': 1,
         'mountain': 1})

In [49]:
vocab = vocab.most_common(5)
vocab

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [50]:
word_to_index={}
i = 0

for w,f in vocab:
    i=i+1
    word_to_index[w] =i 
    
word_to_index

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}

In [77]:
word_to_index['OOV'] = len(word_to_index) + 1

In [79]:
encoding = []

for i in sentences:
    a=[]
    for k in i:
        try:
            a.append(word_to_index[k])
        except KeyError:
            a.append(word_to_index['OOV'])
            
    encoding.append(a)

In [80]:
encoding

[[1, 5],
 [1, 6, 5],
 [1, 3, 5],
 [6, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [6, 6, 3, 2, 6, 1, 6],
 [1, 6, 3, 6]]

### NLTK의 FreqDist

In [51]:
from nltk import FreqDist
import numpy as np

In [53]:
vocab = FreqDist(np.hstack(sentences))

In [57]:
vocab = vocab.most_common(5)

In [60]:
word_to_index = {word[0] : idx + 1 for idx, word in enumerate(vocab)}

In [82]:
word_to_index['OOV'] = len(word_to_index) + 1
word_to_index

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'OOV': 7}

In [83]:
encoding = [] 

for i in sentences:
    a=[]
    for k in i:
        try:
            a.append(word_to_index[k])
        except KeyError:
            a.append(word_to_index['OOV'])
    
    encoding.append(a)

In [84]:
encoding

[[1, 5],
 [1, 7, 5],
 [1, 3, 5],
 [7, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 7],
 [1, 4, 7],
 [1, 4, 2],
 [7, 7, 3, 2, 7, 1, 7],
 [1, 7, 3, 7]]

# Keras

In [63]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [65]:
tokenizer = Tokenizer() ## 사용할 단어의 개수를 지정하지 않았다.

In [66]:
tokenizer.fit_on_texts(sentences) ## 빈도수를 기준으로 단어집합 생성

In [67]:
tokenizer.word_index

{'barber': 1,
 'secret': 2,
 'huge': 3,
 'kept': 4,
 'person': 5,
 'word': 6,
 'keeping': 7,
 'good': 8,
 'knew': 9,
 'driving': 10,
 'crazy': 11,
 'went': 12,
 'mountain': 13}

In [69]:
tokenizer.word_counts

OrderedDict([('barber', 8),
             ('person', 3),
             ('good', 1),
             ('huge', 5),
             ('knew', 1),
             ('secret', 6),
             ('kept', 4),
             ('word', 2),
             ('keeping', 2),
             ('driving', 1),
             ('crazy', 1),
             ('went', 1),
             ('mountain', 1)])

In [73]:
tokenizer.texts_to_sequences(sentences)

[[1, 5],
 [1, 8, 5],
 [1, 3, 5],
 [9, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [7, 7, 3, 2, 10, 1, 11],
 [1, 12, 3, 13]]

In [74]:
vocab_size = 5 ## oov를 사용하여 상위 5개의 단어만 사용
tokenizer = Tokenizer(num_words = vocab_size+2, oov_token='OOV')
tokenizer.fit_on_texts(sentences)

In [76]:
tokenizer.texts_to_sequences(sentences)

[[2, 6],
 [2, 1, 6],
 [2, 4, 6],
 [1, 3],
 [3, 5, 4, 3],
 [4, 3],
 [2, 5, 1],
 [2, 5, 1],
 [2, 5, 3],
 [1, 1, 4, 3, 1, 2, 1],
 [2, 1, 4, 1]]