# 1. 문자열을 사용한 작업
## 3) 토큰의 대체 및 수정

토큰의 대체 및 수정

In [17]:
# replacers.py
import re
# Ordered (regex, replacement) pairs used to expand English contractions.
# The irregular forms (won't, can't, i'm, ain't) come first so they are
# rewritten before the generic suffix rules below get a chance to match them.
# Replacement templates that contain a group reference are raw strings so
# that "\g<1>" reaches the re module verbatim instead of being parsed as an
# invalid string escape sequence.
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
]

class RegexpReplacer(object):
    """Rewrite text by applying a sequence of regex substitutions in order."""

    def __init__(self, patterns=replacement_patterns):
        """Compile each (regex, replacement) pair from *patterns* once up front."""
        compiled = []
        for regex, repl in patterns:
            compiled.append((re.compile(regex), repl))
        self.patterns = compiled

    def replace(self, text):
        """Return *text* after running every stored substitution over it.

        The substitutions are applied one after another, each one operating
        on the output of the previous one.
        """
        result = text
        for compiled_regex, substitution in self.patterns:
            result = compiled_regex.sub(substitution, result)
        return result


텍스트를 다른 텍스트로 대체

In [1]:
import nltk

In [2]:
from replacers import RegexpReplacer

In [3]:
replacer = RegexpReplacer()

In [4]:
replacer.replace("Don't hesitate to ask questions")

'Do not hesitate to ask questions'

In [5]:
replacer.replace("She must've gone to the market but she didn't go")

'She must have gone to the market but she did not go'

토큰화 전에 대체 수행

In [6]:
import nltk

In [7]:
from nltk.tokenize import word_tokenize

In [8]:
from replacers import RegexpReplacer

In [9]:
replacer = RegexpReplacer()

In [10]:
word_tokenize("Don't hesitate to ask questions")

['Do', "n't", 'hesitate', 'to', 'ask', 'questions']

In [11]:
word_tokenize(replacer.replace("Don't hesitate to ask questions"))

['Do', 'not', 'hesitate', 'to', 'ask', 'questions']

반복되는 문자 처리

In [30]:
# replacers.py
class RepeatReplacer(object):
    """Collapse doubled characters in a word until none remain."""

    def __init__(self):
        # Matches any character immediately followed by itself, capturing the
        # text before and after so the word can be rebuilt with one copy less.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return *word* with every run of a repeated character reduced to one.

        Each substitution pass drops a single duplicate per matched word, so
        the pass is repeated until the text stops changing. A loop is used
        instead of recursion so that very long runs cannot exhaust the
        interpreter's recursion limit.
        """
        while True:
            repl_word = self.repeat_regexp.sub(self.repl, word)
            if repl_word == word:
                return repl_word
            word = repl_word

In [42]:
%reload_ext autoreload
%autoreload 2

In [23]:
import nltk

In [24]:
from replacers import RepeatReplacer

In [25]:
replacer = RepeatReplacer()

In [28]:
replacer.replace('lottttt')

'lot'

In [29]:
replacer.replace('ohhhhhhh')

'oh'

happy가 hapy로 잘못 변환되는 문제를 해결하기 위해 wordnet 조회를 추가

In [33]:
# replacers.py
import re
from nltk.corpus import wordnet

class RepeatReplacer(object):
    """Collapse doubled characters in a word, stopping at a WordNet hit."""

    def __init__(self):
        # Matches any character immediately followed by itself, capturing the
        # text before and after so the word can be rebuilt with one copy less.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return *word* with repeated characters removed where needed.

        Before every pass the current candidate is looked up with
        ``wordnet.synsets``; as soon as the lookup returns a non-empty
        result the candidate is returned unchanged (this is what keeps
        'happy' from becoming 'hapy'). Otherwise one duplicate character is
        dropped and the check is repeated, until the text stops changing.
        A loop is used instead of recursion so that very long runs cannot
        exhaust the interpreter's recursion limit.
        """
        while True:
            if wordnet.synsets(word):
                return word
            repl_word = self.repeat_regexp.sub(self.repl, word)
            if repl_word == word:
                return repl_word
            word = repl_word

In [37]:
import nltk

In [38]:
from replacers import RepeatReplacer

In [39]:
replacer = RepeatReplacer()

In [40]:
replacer.replace('happy')

'happy'

단어를 동의어로 대체

In [41]:
# replacers.py

class WordReplacer(object):
    """Substitute words using a caller-supplied lookup table."""

    def __init__(self, word_map):
        """Store *word_map*, the mapping consulted by :meth:`replace`."""
        # Kept by reference, not copied.
        self.word_map = word_map

    def replace(self, word):
        """Return the value mapped to *word*, or *word* itself if unmapped."""
        lookup = self.word_map.get
        return lookup(word, word)

In [43]:
import nltk

In [44]:
from replacers import WordReplacer

In [45]:
replacer = WordReplacer({'congrats' : 'congratulations'})

In [46]:
replacer.replace('congrats')

'congratulations'

In [47]:
replacer.replace('mahts')

'mahts'