# NLP Chapter 2


기초적인 용어들
- corpus(말뭉치): 자연언어 연구를 위해 특정한 목적을 가지고 언어의 표본을 추출한 집합   
- token: 텍스트를 나누는 단위  
- tokenization(토큰화): 텍스트를 토큰으로 나누는 과정


In [1]:
!pip install spacy
!pip install nltk



In [2]:
import spacy
# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is reused by later cells.
nlp = spacy.load('en_core_web_sm')
text = "Mary, don't slap the green witch."
# Lowercase the sentence, tokenize it, and print each token as a plain string.
# Per the recorded output, "don't" is split into 'do' and "n't", and
# punctuation marks become separate tokens.
print([str(token) for token in nlp(text.lower())])

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch', '.']


In [3]:
from nltk.tokenize import TweetTokenizer
tweet = "Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
tokenizer = TweetTokenizer()
# Tokenize the lowercased tweet. Per the recorded output, the hashtag, the
# @mention and the emoticon each come out as a single token.
print(tokenizer.tokenize(tweet.lower()))

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


# n-gram

n-gram: 텍스트에서 연속된 n개의 토큰으로 이루어진 시퀀스  
unigram: 토큰 1개 (n=1)  
bigram: 토큰 2개 (n=2)  

In [4]:
def n_grams(text, n: int) -> list:
    """Return every contiguous n-gram of ``text``.

    Args:
        text: Any sliceable sequence that supports ``len()`` -- for example a
            list of tokens, or a plain string (which yields character
            n-grams).
        n: Number of items in each n-gram; must be at least 1.

    Returns:
        A list of slices of ``text``, each of length ``n``, in order of their
        starting position. The list is empty when ``text`` has fewer than
        ``n`` items.

    Raises:
        ValueError: If ``n`` is less than 1. Without this check, ``n == 0``
            would return ``len(text) + 1`` empty slices and a negative ``n``
            would return meaningless end-truncated slices.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    # range() is empty when len(text) < n, so short inputs yield [].
    return [text[start:start + n] for start in range(len(text) - n + 1)]

In [5]:
# Materialize the spaCy tokens as a list so that n_grams can slice it.
# Relies on `nlp` and `text` defined in an earlier cell.
tokenized = list(nlp(text.lower()))
print(tokenized)
# Print the unigrams, bigrams and trigrams of the token list.
print(n_grams(tokenized, 1))
print(n_grams(tokenized, 2))
print(n_grams(tokenized, 3))

[mary, ,, do, n't, slap, the, green, witch, .]
[[mary], [,], [do], [n't], [slap], [the], [green], [witch], [.]]
[[mary, ,], [,, do], [do, n't], [n't, slap], [slap, the], [the, green], [green, witch], [witch, .]]
[[mary, ,, do], [,, do, n't], [do, n't, slap], [n't, slap, the], [slap, the, green], [the, green, witch], [green, witch, .]]


# 표제어(lemma): 단어의 기본형

In [6]:
doc = nlp("he was running late")
# Print each token next to its lemma (base form), e.g. "was" -> "be" and
# "running" -> "run" in the recorded output.
for token in doc:
    print(f"{token} ---> {token.lemma_}")

he ---> he
was ---> be
running ---> run
late ---> late


# 단어 분류하기: 품사 태깅(POS tagging)

In [7]:
doc = nlp("Mary slapped the green witch.")
# Print each token next to its part-of-speech tag (PROPN, VERB, DET, ... in
# the recorded output).
for token in doc:
    print(f"{token} ---> {token.pos_}")

Mary ---> PROPN
slapped ---> VERB
the ---> DET
green ---> ADJ
witch ---> NOUN
. ---> PUNCT


# chunking, shallow parsing

In [8]:
doc = nlp("Mary slapped the green witch.")
# Iterate over the noun chunks of the sentence and print each chunk with its
# label; both chunks are labelled NP in the recorded output.
for chunk in doc.noun_chunks:
    print(f"{chunk} - {chunk.label_}")

Mary - NP
the green witch - NP


# 한국어

https://iostream.tistory.com/144

In [9]:
!pip install python-mecab-ko

Collecting python-mecab-ko
  Downloading python_mecab_ko-1.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (573 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 573.9/573.9 kB 11.3 MB/s eta 0:00:00
Collecting python-mecab-ko-dic (from python-mecab-ko)
  Downloading python_mecab_ko_dic-2.1.1.post2-py3-none-any.whl (34.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 34.5/34.5 MB 55.8 MB/s eta 0:00:00
Installing collected packages: python-mecab-ko-dic, python-mecab-ko
Successfully installed python-mecab-ko-1.3.3 python-mecab-ko-dic-2.1.1.post2


In [10]:
from mecab import MeCab
# Create a MeCab analyzer with default arguments; `mecab` is reused by the
# cells below.
mecab = MeCab()

In [11]:
# Split the sentence into morphemes; the recorded output is a list of strings.
mecab.morphs('영등포구청역에 있는 맛집 좀 알려주세요.')

['영등포구청역', '에', '있', '는', '맛집', '좀', '알려', '주', '세요', '.']

In [12]:
# Extract only the nouns from the sentence; the recorded output is a list of strings.
mecab.nouns('우리나라에는 무릎 치료를 잘하는 정형외과가 없는가!')

['우리', '나라', '무릎', '치료', '정형', '외과']

In [13]:
# POS-tag the sentence; the recorded output is a list of (morpheme, tag) tuples.
mecab.pos('자연주의 쇼핑몰은 어떤 곳인가?')

[('자연주의', 'NNG'),
 ('쇼핑몰', 'NNG'),
 ('은', 'JX'),
 ('어떤', 'MM'),
 ('곳', 'NNG'),
 ('인가', 'VCP+EF'),
 ('?', 'SF')]

In [14]:
# Full analysis: the recorded output is a list of Morpheme objects, each with
# a character span, the surface form and a Feature record.
mecab.parse('즐거운 하루 보내세요!')

[Morpheme(span=Span(start=0, end=3), surface='즐거운', feature=Feature(pos='VA+ETM', semantic=None, has_jongseong=True, reading='즐거운', type='Inflect', start_pos='VA', end_pos='ETM', expression='즐겁/VA/*+ᆫ/ETM/*')),
 Morpheme(span=Span(start=4, end=6), surface='하루', feature=Feature(pos='NNG', semantic=None, has_jongseong=False, reading='하루', type=None, start_pos=None, end_pos=None, expression=None)),
 Morpheme(span=Span(start=7, end=9), surface='보내', feature=Feature(pos='VV', semantic=None, has_jongseong=False, reading='보내', type=None, start_pos=None, end_pos=None, expression=None)),
 Morpheme(span=Span(start=9, end=11), surface='세요', feature=Feature(pos='EP+EF', semantic=None, has_jongseong=False, reading='세요', type='Inflect', start_pos='EP', end_pos='EF', expression='시/EP/*+어요/EF/*')),
 Morpheme(span=Span(start=11, end=12), surface='!', feature=Feature(pos='SF', semantic=None, has_jongseong=None, reading=None, type=None, start_pos=None, end_pos=None, expression=None))]

In [15]:
# Same analysis on a sentence written without any spaces; the recorded output
# still segments it into separate morphemes ('아버지', '가', '방', '에', ...).
mecab.parse('아버지가방에들어가신다.')

[Morpheme(span=Span(start=0, end=3), surface='아버지', feature=Feature(pos='NNG', semantic=None, has_jongseong=False, reading='아버지', type=None, start_pos=None, end_pos=None, expression=None)),
 Morpheme(span=Span(start=3, end=4), surface='가', feature=Feature(pos='JKS', semantic=None, has_jongseong=False, reading='가', type=None, start_pos=None, end_pos=None, expression=None)),
 Morpheme(span=Span(start=4, end=5), surface='방', feature=Feature(pos='NNG', semantic='장소', has_jongseong=True, reading='방', type=None, start_pos=None, end_pos=None, expression=None)),
 Morpheme(span=Span(start=5, end=6), surface='에', feature=Feature(pos='JKB', semantic=None, has_jongseong=False, reading='에', type=None, start_pos=None, end_pos=None, expression=None)),
 Morpheme(span=Span(start=6, end=9), surface='들어가', feature=Feature(pos='VV', semantic=None, has_jongseong=False, reading='들어가', type=None, start_pos=None, end_pos=None, expression=None)),
 Morpheme(span=Span(start=9, end=11), surface='신다', feature=Feat