# 영어 tokenizer 실습 - nltk

In [1]:
!pip install nltk



In [2]:
import nltk

In [3]:
# Fetch the Punkt tokenizer data that word_tokenize / sent_tokenize rely on.
# 'punkt' is the legacy pickled model; NLTK 3.8.2 and later load 'punkt_tab'
# instead. The install cell does not pin a version, so request both resources
# to keep the notebook runnable on a fresh kernel with either NLTK generation.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## nltk - 문장을 토큰화

In [4]:
# Word-level tokenization of a plain English sentence with NLTK.
from nltk import word_tokenize

# In the recorded output the comma and the final period come back as their own tokens.
sentence = "The Matrix is everywhere its all around us, here even in this room."
words = word_tokenize(sentence)
print(words)

['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


In [5]:
# Sentence with a contraction, a possessive and an abbreviation. In the recorded
# output "Don't" becomes 'Do' + "n't", "Jone's" becomes 'Jone' + "'s", and 'Mr.'
# stays a single token.
sentence = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
words = word_tokenize(sentence)
print(words)

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [None]:
# Exercise cell (not executed in the saved notebook): replace the placeholder
# `Your_sentence` with your own string before running. The name is not defined
# anywhere in the visible notebook, so running the cell as-is raises NameError.
sentence = Your_sentence
words = word_tokenize(sentence)
print(words)

## nltk - 문장 분리

In [17]:
document = 'The Matrix is everywhere its all around us, here even in this room. You can see it out your window or on your television. You feel it when you go to work, or go to church or pay your taxes.'

In [20]:
def custom_sent_tokenizer(document):
    """Naively split ``document`` into sentences on every '.' character.

    This is a deliberately simple baseline: it treats *every* period as a
    sentence boundary, so abbreviations such as "Dr." are split off as their
    own "sentence". Each returned piece is stripped of surrounding whitespace
    and has a single '.' appended (including a final piece that had no period
    in the input).

    Args:
        document: Text to split.

    Returns:
        A list of sentence strings, each ending in '.'. Segments that are
        empty or whitespace-only are dropped, so trailing whitespace or
        repeated periods never produce a bare "." entry.
    """
    # Strip first, then filter: testing the raw segment would let a
    # whitespace-only piece (e.g. the " " after a final period) through and
    # emit a meaningless "." sentence.
    segments = (segment.strip() for segment in document.split('.'))
    return [f"{segment}." for segment in segments if segment]

In [21]:
custom_sent_tokenizer(document)

['The Matrix is everywhere its all around us, here even in this room.',
 'You can see it out your window or on your television.',
 'You feel it when you go to work, or go to church or pay your taxes.']

In [25]:
document = "Dr. Mah is a famous reviewer. She eats various desserts and criticizes the taste."

In [26]:
custom_sent_tokenizer(document)

['Dr.',
 'Mah is a famous reviewer.',
 'She eats various desserts and criticizes the taste.']

#### 단순히 구두점으로만 분리할 경우 Mr.나 Dr. 등의 표현도 분리해버리기 때문에 문장분리기를 사용할 필요가 있음!

In [15]:
# Sentence splitting with NLTK's sent_tokenize, applied to whatever `document`
# currently holds.
# NOTE(review): the recorded output below this cell is the three "Matrix"
# sentences, and this cell's execution count (15) is lower than that of the
# cells defining `document` above (17 and 25). The saved output therefore does
# not come from the "Dr. Mah" document assigned just before this cell; re-run
# top-to-bottom to refresh it.
from nltk import sent_tokenize

tokenized_sentences = sent_tokenize(document)
print(tokenized_sentences)

['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


In [11]:
# Edge case: abbreviations containing periods ("Ph.D." / "Ph.D"). In the recorded
# output the text is split only after "students.", and both abbreviations stay
# inside their sentences.
document = 'I am actively looking for Ph.D. students. and you are a Ph.D student.'
tokenized_sentences = sent_tokenize(document)
print(tokenized_sentences)

['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']


# 한국어 tokenizer 실습 - KoNLPy

In [1]:
!pip install konlpy



In [2]:
!pip install python-mecab-ko



In [15]:
from konlpy.tag import Kkma, Komoran, Okt, Hannanum

In [4]:
from mecab import MeCab

In [16]:
# Create one analyzer instance per backend so the cells below can compare their
# output on the same sentence. The first four classes come from konlpy.tag;
# MeCab comes from the separately installed `mecab` package.
kkma = Kkma()
komoran = Komoran()
okt = Okt()
hannanum = Hannanum()
mecab = MeCab()

In [6]:
sentence = "안녕하세요! 반가워요ㅋㅋㅋ 제 이름은 고원희입니다:)"

In [7]:
kkma.morphs(sentence)

['안녕',
 '하',
 '세요',
 '!',
 '반갑',
 '어요',
 'ㅋㅋㅋ',
 '저',
 '의',
 '이름',
 '은',
 '고원희입니',
 '다',
 ':',
 ')']

In [8]:
komoran.morphs(sentence)

['안녕하세요', '!', '반가워욬ㅋㅋ', '제', '이름', '은', '고원희', '이', 'ㅂ니다', ':', ')']

In [9]:
okt.morphs(sentence)

['안녕하세요', '!', '반가워요', 'ㅋㅋㅋ', '제', '이름', '은', '고', '원희', '입니다', ':)']

In [22]:
hannanum.morphs(sentence)

['안녕',
 '하',
 '세',
 '요',
 '!',
 '반가워욬ㅋㅋ',
 '저',
 '의',
 '이름',
 '은',
 '고원희',
 '이',
 'ㅂ니다',
 ':)']

In [10]:
mecab.morphs(sentence)

['안녕', '하', '세요', '!', '반가워요', 'ㅋㅋㅋ', '제', '이름', '은', '고원희', '입니다', ':', ')']

#### 한국어 형태소 분석 (품사 태깅)

In [11]:
kkma.pos(sentence)

[('안녕', 'NNG'),
 ('하', 'XSV'),
 ('세요', 'EFN'),
 ('!', 'SF'),
 ('반갑', 'VV'),
 ('어요', 'EFN'),
 ('ㅋㅋㅋ', 'EMO'),
 ('저', 'NP'),
 ('의', 'JKG'),
 ('이름', 'NNG'),
 ('은', 'JX'),
 ('고원희입니', 'UN'),
 ('다', 'JC'),
 (':', 'SP'),
 (')', 'SS')]

In [12]:
komoran.pos(sentence)

[('안녕하세요', 'NNP'),
 ('!', 'SF'),
 ('반가워욬ㅋㅋ', 'NA'),
 ('제', 'XPN'),
 ('이름', 'NNG'),
 ('은', 'JX'),
 ('고원희', 'NNP'),
 ('이', 'VCP'),
 ('ㅂ니다', 'EC'),
 (':', 'SP'),
 (')', 'SS')]

In [13]:
okt.pos(sentence)

[('안녕하세요', 'Adjective'),
 ('!', 'Punctuation'),
 ('반가워요', 'Adjective'),
 ('ㅋㅋㅋ', 'KoreanParticle'),
 ('제', 'Noun'),
 ('이름', 'Noun'),
 ('은', 'Josa'),
 ('고', 'Modifier'),
 ('원희', 'Noun'),
 ('입니다', 'Adjective'),
 (':)', 'Punctuation')]

In [14]:
# NOTE(review): the saved run of this cell raised TypeError and the traceback was
# not kept, so the cause is not visible here. Execution counts in this section
# are out of order, so the `mecab` object may differ from the one created above.
# Re-run on a fresh kernel to confirm before relying on this cell.
mecab.pos(sentence)

TypeError: ignored

## 한국어 문장 분리

In [1]:
!pip install kss

Collecting kss
  Downloading kss-4.5.4.tar.gz (79 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 79.1/79.1 kB 3.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting emoji==1.2.0 (from kss)
  Downloading emoji-1.2.0-py3-none-any.whl (131 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.3/131.3 kB 10.4 MB/s eta 0:00:00
Collecting pecab (from kss)
  Downloading pecab-1.0.8.tar.gz (26.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.4/26.4 MB 33.6 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: kss, pecab
  Building wheel for kss (setup.py) ... done
  Created wheel for kss: filename=kss-4.5.4-py3-none-any.whl size=54467 sha256=fa3f860ba73d6e620f17becfbd4746077fa9ac789bf706483a9d425f3522e1d4
  Stored in directory: /root/.cache/pip/wheels/61/7b/ba/

In [7]:
from kss import split_sentences

In [8]:
document = "카페를 왔는데 원두 종류도 여러가지로 너무 맛있었다. 8시가 되면 불이 꺼지는데 은은하게 분위기도 있다. 다음에 또 와봐야지 ㅋㅋ"

In [9]:
split_sentences(document)

['카페를 왔는데 원두 종류도 여러가지로 너무 맛있었다.',
 '8시가 되면 불이 꺼지는데 은은하게 분위기도 있다.',
 '다음에 또 와봐야지 ㅋㅋ']