<a href="https://colab.research.google.com/github/nazzang49/boost-camp-projects/blob/main/assignments/P03_Sentence_Encoding_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Practice Converting Raw Sentences into Integer-Encoded Sentences
- (URL) https://wikidocs.net/31766

In [27]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
# import nltk
# nltk.download('stopwords')

In [28]:
# Sample corpus: several short sentences with mixed capitalisation and
# repeated words, used as input for the encoding steps below.
text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."

In [60]:
class Encoder():
    '''
    Stateless helpers for converting raw text into integer-encoded sentences.

    Every step is a static method, so the class is used as a namespace:
    split text into sentences, build a frequency vocabulary, sort it,
    assign integer indices, trim the index table, and finally encode the
    tokenized sentences.
    '''
    def __init__(self):
        '''No instance state is kept; all functionality is in static methods.'''
        pass

    @staticmethod
    def get_tokenized_sentences(raw_text: str):
        '''Split ``raw_text`` into a list of sentence strings (NLTK ``sent_tokenize``).'''
        return sent_tokenize(raw_text)

    @staticmethod
    def get_vocab(tokenized_sentences: list):
        '''
        Tokenize each sentence into words and count word frequencies.

        Words are lower-cased; English stop words and words of two
        characters or fewer are dropped.

        Args:
            tokenized_sentences: list of sentence strings.

        Returns:
            A ``(sentences, vocab)`` tuple: ``sentences`` is a list of lists
            of the kept words (one inner list per input sentence, possibly
            empty), and ``vocab`` is a ``defaultdict(int)`` mapping each
            kept word to its total count.
        '''
        vocab = defaultdict(int)
        sentences = []
        # Build the stop-word set once; membership tests run per word.
        stop_words = set(stopwords.words('english'))

        for sentence in tokenized_sentences:
            lowered = (token.lower() for token in word_tokenize(sentence))
            kept = [
                word for word in lowered
                if word not in stop_words and len(word) > 2
            ]
            for word in kept:
                vocab[word] += 1
            sentences.append(kept)

        return sentences, vocab

    @staticmethod
    def get_sorted_vocab(vocab: defaultdict):
        '''
        Return ``(word, count)`` pairs ordered by descending count.

        The sort is stable, so words with equal counts keep the order in
        which they appear in ``vocab``.
        '''
        return sorted(vocab.items(), key=lambda item: item[1], reverse=True)

    @staticmethod
    def get_word2idx(sorted_vocab: list, min_freq: int = 2):
        '''
        Assign consecutive integer indices (starting at 1) to frequent words.

        Args:
            sorted_vocab: iterable of ``(word, count)`` pairs, in the order
                in which indices should be handed out.
            min_freq: words with a count below this value get no index.

        Returns:
            dict mapping word -> index.
        '''
        word2idx = dict()
        idx = 0
        for word, freq in sorted_vocab:
            if freq >= min_freq:
                idx += 1
                word2idx[word] = idx
        return word2idx

    @staticmethod
    def get_removed_word2idx(word2idx: dict, vocab_size: int = 5):
        '''
        Drop every word whose index is greater than ``vocab_size``.

        The values of ``word2idx`` are indices (not frequencies), so this
        keeps only entries with index 1..``vocab_size``.

        Note:
            ``word2idx`` is modified in place and the same dict object is
            returned.
        '''
        # Collect the keys first: deleting while iterating a dict raises.
        to_remove = [word for word, idx in word2idx.items() if idx > vocab_size]
        for word in to_remove:
            del word2idx[word]
        return word2idx

    @staticmethod
    def get_encoded_sentences(word2idx: dict, tokenized_sentences: list):
        '''
        Replace every word with its integer index.

        Words missing from ``word2idx`` are reported on stdout and encoded
        with the out-of-vocabulary index. That index is ``word2idx['OOV']``
        when present; otherwise the next free index (largest existing
        index + 1) is used, so a missing ``'OOV'`` entry no longer raises
        a KeyError from inside the error handler. ``word2idx`` itself is
        never modified.

        Args:
            word2idx: dict mapping word -> integer index.
            tokenized_sentences: list of lists of words.

        Returns:
            list of lists of integer indices, parallel to the input.
        '''
        if 'OOV' in word2idx:
            oov_idx = word2idx['OOV']
        else:
            oov_idx = max(word2idx.values(), default=0) + 1

        encoded = []
        for sentence in tokenized_sentences:
            indices = []
            for word in sentence:
                if word in word2idx:
                    indices.append(word2idx[word])
                else:
                    print(f'[ Key Error ] {word}은/는 사전에 없는 단어입니다. OOV로 대체합니다.')
                    indices.append(oov_idx)
            encoded.append(indices)
        return encoded

In [30]:
# Step 1: split the raw text into a list of sentence strings.
tokenized_sentences = Encoder.get_tokenized_sentences(text)
tokenized_sentences

['A barber is a person.',
 'a barber is good person.',
 'a barber is huge person.',
 'he Knew A Secret!',
 'The Secret He Kept is huge secret.',
 'Huge secret.',
 'His barber kept his word.',
 'a barber kept his word.',
 'His barber kept his secret.',
 'But keeping and keeping such a huge secret to himself was driving the barber crazy.',
 'the barber went up a huge mountain.']

In [33]:
# Step 2: tokenize each sentence into filtered, lower-cased words and count
# word frequencies. NOTE: tokenized_sentences is rebound here from a list of
# sentence strings to a list of word lists.
tokenized_sentences, vocab = Encoder.get_vocab(tokenized_sentences)
print(tokenized_sentences)
print(vocab)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]
defaultdict(<class 'int'>, {'barber': 8, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1})


In [38]:
# Step 3: order the (word, count) pairs by descending count.
sorted_vocab = Encoder.get_sorted_vocab(vocab)
sorted_vocab

[('barber', 8),
 ('secret', 6),
 ('huge', 5),
 ('kept', 4),
 ('person', 3),
 ('word', 2),
 ('keeping', 2),
 ('good', 1),
 ('knew', 1),
 ('driving', 1),
 ('crazy', 1),
 ('went', 1),
 ('mountain', 1)]

In [54]:
# Step 4: give indices 1, 2, 3, ... (in sorted order) to words that occur at
# least twice.
word2idx = Encoder.get_word2idx(sorted_vocab)
word2idx

{'barber': 1,
 'huge': 3,
 'keeping': 7,
 'kept': 4,
 'person': 5,
 'secret': 2,
 'word': 6}

In [55]:
# Step 5: keep only the words with index 1-5; entries with index 6 or higher
# are deleted from the dict in place.
word2idx = Encoder.get_removed_word2idx(word2idx)
word2idx

{'barber': 1, 'huge': 3, 'kept': 4, 'person': 5, 'secret': 2}

In [56]:
# Step 6: reserve the next free index for out-of-vocabulary words.
word2idx['OOV'] = len(word2idx) + 1

In [61]:
# Step 7: map every word to its index; words absent from word2idx are
# reported on stdout and encoded with the 'OOV' index.
encoded_sentences = Encoder.get_encoded_sentences(word2idx, tokenized_sentences)
encoded_sentences

[ Key Error ] good은/는 사전에 없는 단어입니다. OOV로 대체합니다.
[ Key Error ] knew은/는 사전에 없는 단어입니다. OOV로 대체합니다.
[ Key Error ] word은/는 사전에 없는 단어입니다. OOV로 대체합니다.
[ Key Error ] word은/는 사전에 없는 단어입니다. OOV로 대체합니다.
[ Key Error ] keeping은/는 사전에 없는 단어입니다. OOV로 대체합니다.
[ Key Error ] keeping은/는 사전에 없는 단어입니다. OOV로 대체합니다.
[ Key Error ] driving은/는 사전에 없는 단어입니다. OOV로 대체합니다.
[ Key Error ] crazy은/는 사전에 없는 단어입니다. OOV로 대체합니다.
[ Key Error ] went은/는 사전에 없는 단어입니다. OOV로 대체합니다.
[ Key Error ] mountain은/는 사전에 없는 단어입니다. OOV로 대체합니다.


[[1, 5],
 [1, 6, 5],
 [1, 3, 5],
 [6, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [6, 6, 3, 2, 6, 1, 6],
 [1, 6, 3, 6]]