# 1. 문자열을 사용한 작업
## 1) 토큰화

텍스트를 문장으로 토큰화

In [1]:
import nltk

In [2]:
text = "Welcome readers. I hope you find it interesting. Please do reply"

In [3]:
from nltk.tokenize import sent_tokenize

In [4]:
# Split the paragraph held in `text` into a list of sentence strings.
sent_tokenize(text)

['Welcome readers.', 'I hope you find it interesting.', 'Please do reply']

여러 문장 토큰화

In [5]:
import nltk

In [8]:
# Load the pre-trained English Punkt sentence tokenizer. NLTK keeps these
# models under the "tokenizers/" resource directory (plural); the previous
# "tokenizer/" path cannot be resolved by nltk.data.load.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
text = "Hello everyone. Hope all are fine and doing well. Hope you find the book interesting"

In [10]:
tokenizer.tokenize(text)

['Hello everyone.',
 'Hope all are fine and doing well.',
 'Hope you find the book interesting']

다양한 언어의 텍스트 토큰화

In [11]:
import nltk

In [12]:
# Load the pre-trained French Punkt sentence tokenizer. The resource lives
# under "tokenizers/punkt/" (plural "tokenizers"), not "tokenizer/punkt/".
french_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')

In [13]:
french_tokenizer.tokenize('Deux agressions en quelques jours.')

['Deux agressions en quelques jours.']

문장을 단어로 토큰화

In [14]:
# Use word_tokenize
import nltk

In [15]:
text = nltk.word_tokenize("PierreVinken , 59 years old , will join as a nonexecutive director on Nov. 29 .")

In [16]:
print(text)

['PierreVinken', ',', '59', 'years', 'old', ',', 'will', 'join', 'as', 'a', 'nonexecutive', 'director', 'on', 'Nov.', '29', '.']


TreebankWordTokenizer를 사용한 토큰화

In [17]:
import nltk

In [18]:
from nltk.tokenize import TreebankWordTokenizer

In [19]:
tokenizer = TreebankWordTokenizer()

In [20]:
# Tokenize with the Treebank tokenizer; in the recorded output the
# sentence-internal period stays attached to its word ('day.').
tokenizer.tokenize("Hava a nice day. I hope you find the book interesting")

['Hava',
 'a',
 'nice',
 'day.',
 'I',
 'hope',
 'you',
 'find',
 'the',
 'book',
 'interesting']

축약형(contraction)을 분리하여 토큰화

In [21]:
import nltk

In [23]:
# word_tokenize splits the contraction "Don't" into two tokens: 'Do' and "n't".
text = nltk.word_tokenize("Don't hesitate to ask questions")

In [25]:
print(text)

['Do', "n't", 'hesitate', 'to', 'ask', 'questions']


WordPunctTokenizer : 문장 부호를 완전히 새로운 토큰으로 분할하여 제공

In [26]:
from nltk.tokenize import WordPunctTokenizer

In [27]:
tokenizer = WordPunctTokenizer()

In [28]:
# WordPunctTokenizer emits the apostrophe as its own token,
# so "Don't" becomes 'Don', "'", 't'.
tokenizer.tokenize("Don't hesitate to ask questions")

['Don', "'", 't', 'hesitate', 'to', 'ask', 'questions']

정규 표현식을 사용한 토큰화

RegexpTokenizer 클래스의 인스턴스 사용

In [29]:
import nltk

In [30]:
from nltk.tokenize import RegexpTokenizer

In [37]:
# Tokens are runs of word characters and apostrophes, so "Don't" stays whole.
# The pattern is a raw string so that \w is passed to the regex engine as-is
# instead of being treated as an (invalid) string escape sequence.
tokenizer = RegexpTokenizer(r"[\w']+")

In [38]:
tokenizer.tokenize("Don't hesitate to ask questions")

["Don't", 'hesitate', 'to', 'ask', 'questions']

함수 사용

In [39]:
import nltk

In [40]:
from nltk.tokenize import regexp_tokenize

In [41]:
sent = "Don't hesitate to ask questions"

In [42]:
# Alternatives are tried left to right: a run of word characters, a
# dollar amount, or any run of non-whitespace characters.
# Raw string: \w, \$, \d, \. and \S are regex escapes, not string escapes.
print(regexp_tokenize(sent, pattern=r'\w+|\$[\d\.]+|\S+'))

['Don', "'t", 'hesitate', 'to', 'ask', 'questions']


화이트스페이스를 사용한 토큰화

In [43]:
import nltk

In [44]:
from nltk.tokenize import RegexpTokenizer

In [45]:
# With gaps=True the pattern describes the separators between tokens
# (runs of whitespace) rather than the tokens themselves.
# Raw string keeps \s intact for the regex engine.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)

In [46]:
tokenizer.tokenize("Don't hesitate to ask questions")

["Don't", 'hesitate', 'to', 'ask', 'questions']

대문자로 시작하는 단어 선택

In [47]:
import nltk

In [48]:
from nltk.tokenize import RegexpTokenizer

In [49]:
sent = "She secured 90.56% in class X. She is a meritorious student"

In [50]:
# Select words that start with an uppercase ASCII letter followed by at least
# one more word character (so a lone capital such as "X" is not matched).
# Raw string keeps \w intact for the regex engine.
capt = RegexpTokenizer(r'[A-Z]\w+')

In [51]:
capt.tokenize(sent)

['She', 'She']

RegexpTokenizer의 서브클래스에서 미리 정의된 정규 표현식을 사용

In [1]:
import nltk

In [2]:
sent = "She secured 90.56 % in class X. She is a meritorious student"

In [3]:
from nltk.tokenize import BlanklineTokenizer

In [4]:
# `sent` contains no blank line, so the whole string comes back as one token.
BlanklineTokenizer().tokenize(sent)

['She secured 90.56 % in class X. She is a meritorious student']

화이트스페이스(탭, 스페이스, 줄 바꿈)를 기준으로 문자열 토큰화

In [5]:
import nltk

In [6]:
sent = "She secured 90.56 % in class X. She is a meritorious student"

In [7]:
from nltk.tokenize import WhitespaceTokenizer

In [8]:
WhitespaceTokenizer().tokenize(sent)

['She',
 'secured',
 '90.56',
 '%',
 'in',
 'class',
 'X.',
 'She',
 'is',
 'a',
 'meritorious',
 'student']

split() 메소드를 사용한 토큰화

In [9]:
import nltk

In [10]:
sent = "She secured 90.56 % in class X. She is a meritorious student"

In [11]:
# str.split() with no arguments splits on runs of whitespace.
sent.split()

['She',
 'secured',
 '90.56',
 '%',
 'in',
 'class',
 'X.',
 'She',
 'is',
 'a',
 'meritorious',
 'student']

nltk.tokenize.util 모듈 : 문장에서 각 토큰의 오프셋(시작, 끝)을 나타내는 튜플의 시퀀스를 반환

In [12]:
import nltk

In [13]:
from nltk.tokenize import WhitespaceTokenizer

In [16]:
sent = " She secured 90.56 % in class X \n. She is a meritorious student"

In [17]:
# span_tokenize yields (start, end) character offsets into `sent` for each
# token instead of the token strings; list() materializes them for display.
list(WhitespaceTokenizer().span_tokenize(sent))

[(1, 4),
 (5, 12),
 (13, 18),
 (19, 20),
 (21, 23),
 (24, 29),
 (30, 31),
 (33, 34),
 (35, 38),
 (39, 41),
 (42, 43),
 (44, 55),
 (56, 63)]