# 1. 토큰화

In [1]:
import nltk

In [2]:
# Fetch the NLTK data packages needed by this notebook. Each call logs its
# progress and returns True; the final call's return value is what the cell
# displays. Packages that are already installed are reported as up-to-date.
nltk.download('punkt')
nltk.download('webtext')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/mkw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package webtext to /Users/mkw/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mkw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mkw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mkw/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## 문장 토큰화

In [2]:
from nltk.tokenize import sent_tokenize

# Sample paragraph; the word-tokenization cells further down reuse `para`.
para = "Hello everyone. It's good to see you. Let's start our text mining class!"

# Split the paragraph into a list of sentence strings and show it.
sentences = sent_tokenize(para)
print(sentences)

['Hello everyone.', "It's good to see you.", "Let's start our text mining class!"]


## 단어 토큰화

In [3]:
from nltk.tokenize import word_tokenize

# Split the paragraph into word tokens; punctuation becomes separate tokens
# and contractions keep the apostrophe with the suffix (e.g. "'s").
word_tokens = word_tokenize(para)
print(word_tokens)

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']


In [4]:
from nltk.tokenize import WordPunctTokenizer

# Tokenize the same paragraph with WordPunctTokenizer for comparison: here
# the apostrophe is split off as its own token ("'", "s").
punct_tokenizer = WordPunctTokenizer()
print(punct_tokenizer.tokenize(para))

['Hello', 'everyone', '.', 'It', "'", 's', 'good', 'to', 'see', 'you', '.', 'Let', "'", 's', 'start', 'our', 'text', 'mining', 'class', '!']


## 정규 표현식

In [5]:
import re

In [6]:
# Character class [abc]: collect every single 'a', 'b' or 'c' in the text.
abc_pattern = re.compile('[abc]')
abc_pattern.findall('How are you, boy?')

['a', 'b']

In [7]:
# A character class listing all ten digits: returns each digit found, in order.
digit_pattern = re.compile('[0123456789]')
digit_pattern.findall('3a7b5c9d')

['3', '7', '5', '9']

In [8]:
# Raw string: '\w' in a normal string literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on Python 3.12+).
# \w matches one "word" character: letters, digits and underscore. For str
# patterns it is Unicode-aware; it equals [a-zA-Z0-9_] only with re.ASCII.
# Spaces and punctuation such as ' . ^ & are skipped.
re.findall(r'[\w]', "3a 7b_ '.^&5c9d")

['3', 'a', '7', 'b', '_', '5', 'c', '9', 'd']

In [9]:
# Raw string avoids the invalid '\w' escape warning in a normal literal.
# [\w]+ matches runs of one or more word characters, i.e. whole words;
# the comma, spaces and question mark act as separators.
re.findall(r'[\w]+', 'How are you, boy?')

['How', 'are', 'you', 'boy']

In [10]:
from nltk.tokenize import RegexpTokenizer

# Tokens are runs of word characters or apostrophes, so contractions such as
# "can't" stay in one piece while other punctuation is dropped.
# The pattern is a raw string: '\w' in a normal literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize("Sorry, I can't go there."))

['Sorry', 'I', "can't", 'go', 'there']


In [11]:
text1 = "Sorry, I can't go there."
# {3,} keeps only runs of three or more characters, so short tokens such as
# "i" and "go" are discarded. Raw string avoids the invalid '\w' escape warning.
tokenizer = RegexpTokenizer(r"[\w']{3,}")

# Lower-case the text before tokenizing so the tokens are case-normalized.
print(tokenizer.tokenize(text1.lower()))

['sorry', "can't", 'there']


## 노이즈와 불용어 제거

In [12]:
from nltk.corpus import stopwords

# A set gives constant-time membership tests in the filter below.
english_stops = set(stopwords.words('english'))

text1 = "Sorry, I couldn't go to movie yesterday."

# Keep apostrophes inside tokens so contractions like "couldn't" match the
# stop-word list. Raw string avoids the invalid '\w' escape warning that a
# normal string literal produces on recent Python versions.
tokenizer = RegexpTokenizer(r"[\w']+")
tokens = tokenizer.tokenize(text1.lower())

# Drop every token that appears in the English stop-word set.
result = [word for word in tokens if word not in english_stops]

print(result)

['sorry', 'go', 'movie', 'yesterday']


In [13]:
# Show the stop-word set built above (a set, so the order is arbitrary).
print(english_stops)

{'own', 'was', "didn't", 'm', 'and', 'before', 'it', 'me', 'why', 'same', 'above', 'any', 'themselves', 'ourselves', 'yourselves', 'didn', 'of', 't', 'over', 'few', 'he', 'we', 'so', 'himself', 'hers', 'all', 'should', 'further', 'll', 'again', 'myself', 'your', "couldn't", 'as', "you're", "doesn't", 'here', 'against', 'these', 'such', 'who', 'no', 'they', 'our', 'but', 'on', 'are', 'being', 'doing', 'its', 'to', 'most', 'you', 'each', 'isn', "should've", 'yourself', 'an', 'do', 'below', 'did', 'how', 'a', 'mustn', 'needn', 'couldn', "haven't", "needn't", 'him', "you'd", 'too', 'into', 'now', 'than', 'am', 'very', 'd', 'just', 'when', 'under', 'about', 'his', "shan't", 'because', 'been', 'at', 'is', 'mightn', 'there', 'hasn', 'ma', "mightn't", 'itself', 'by', 'those', "weren't", 'theirs', 'after', 'more', 'ain', 'off', 'which', 're', 'their', 'nor', 'for', 'can', 'has', 'she', 'with', 'then', "hadn't", "that'll", "hasn't", 'haven', 'through', 'this', 'wasn', 'them', "she's", 'wouldn', 

# 2. 정규화

## 어간 추출

- 포터 스테머
    - 모든 단어가 같은 규칙에 따라 변환
    - 영어 분야에서 사실상의 표준

In [14]:
from nltk.stem import PorterStemmer

# Stem three related words with the Porter stemmer and print the stems on
# one line, separated by spaces. The result need not be a dictionary word.
stemmer = PorterStemmer()
print(*(stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks')))

cook cookeri cookbook


In [15]:
from nltk.tokenize import word_tokenize

# Tokenize the sample paragraph and show the raw tokens.
para = "Hello everyone. It's good to see you. Let's start our text mining class!"
tokens = word_tokenize(para)
print(tokens)

# Run every token through the stemmer currently bound to `stemmer`,
# keeping the original token order.
result = list(map(stemmer.stem, tokens))
print(result)

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']
['hello', 'everyon', '.', 'it', "'s", 'good', 'to', 'see', 'you', '.', 'let', "'s", 'start', 'our', 'text', 'mine', 'class', '!']


- 랭카스터 스테머

In [16]:
from nltk.stem import LancasterStemmer

# Rebind `stemmer` to a Lancaster stemmer and stem the same three words,
# printing the stems space-separated on one line.
stemmer = LancasterStemmer()
print(*map(stemmer.stem, ('cooking', 'cookery', 'cookbooks')))

cook cookery cookbook


## 표제어 추출

In [18]:
# Fetch the 'omw-1.4' data package; returns True, which the cell displays.
# NOTE(review): presumably needed by the WordNet lemmatizer below - confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/mkw/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [17]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize a few words, one result per line. 'cooking' is looked up twice:
# once with no part of speech given and once explicitly as a verb
# (pos='v'), which yields different lemmas.
print(
    lemmatizer.lemmatize('cooking'),
    lemmatizer.lemmatize('cooking', pos='v'),
    lemmatizer.lemmatize('cookery'),
    lemmatizer.lemmatize('cookbooks'),
    sep='\n',
)

cooking
cook
cookery
cookbook


In [18]:
# Compare lemmatizing with stemming on the same word.

stemmer = PorterStemmer()
stemmed = stemmer.stem('believes')
# Lemmatize once without a part of speech and once explicitly as a verb.
lemma_default = lemmatizer.lemmatize('believes')
lemma_as_verb = lemmatizer.lemmatize('believes', pos='v')

print('stemming result: ', stemmed)
print('lemmatizing result: ', lemma_default)
print('lemmatizing result: ', lemma_as_verb)

stemming result:  believ
lemmatizing result:  belief
lemmatizing result:  believe


# 3. 품사 태깅

In [19]:
import nltk
from nltk.tokenize import word_tokenize

In [20]:
# Tokenize the sample text, then tag each token with its part of speech and
# print the resulting list of (token, tag) pairs.
sample_text = "Hello everyone. It's good to see you. Let's start our text mining class!"
tokens = word_tokenize(sample_text)
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('Hello', 'NNP'), ('everyone', 'NN'), ('.', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('good', 'JJ'), ('to', 'TO'), ('see', 'VB'), ('you', 'PRP'), ('.', '.'), ('Let', 'VB'), ("'s", 'POS'), ('start', 'VB'), ('our', 'PRP$'), ('text', 'NN'), ('mining', 'NN'), ('class', 'NN'), ('!', '.')]


In [26]:
# Fetch the 'tagsets' data package; returns True, which the cell displays.
# NOTE(review): presumably required by nltk.help.upenn_tagset - confirm.
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to /Users/mkw/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


True

In [21]:
# Print the description and example words for the 'CC' tag.
nltk.help.upenn_tagset('CC')

CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet


In [22]:
# Keep only tokens whose tag is exactly one of these (noun, base-form verb,
# adjective). Matching is exact, so e.g. 'NNP' or 'VBZ' tokens are dropped.
my_tag_set = ['NN', 'VB', 'JJ']
tagged_tokens = nltk.pos_tag(tokens)
my_words = [token for token, pos in tagged_tokens if pos in my_tag_set]
print(my_words)

['everyone', 'good', 'see', 'Let', 'start', 'text', 'mining', 'class']


In [23]:
# Render every (token, tag) pair as a single "token/TAG" string.
words_with_tag = [f'{token}/{pos}' for token, pos in nltk.pos_tag(tokens)]
print(words_with_tag)

['Hello/NNP', 'everyone/NN', './.', 'It/PRP', "'s/VBZ", 'good/JJ', 'to/TO', 'see/VB', 'you/PRP', './.', 'Let/VB', "'s/POS", 'start/VB', 'our/PRP$', 'text/NN', 'mining/NN', 'class/NN', '!/.']


In [24]:
# Multi-line Korean sample text (a short poem followed by an attribution
# line) used to try the NLTK tokenizer and tagger on non-English input.
sentence = '''절망의 반대가 희망은 아니다.
어두운 밤하늘에 별이 빛나듯
희망은 절망 속에 싹트는 거지
만약에 우리가 희망함이 적다면
그 누가 세상을 비출어줄까.
정희성, 희망 공부'''

In [25]:
# Run the same tokenizer and tagger on the Korean text. The recorded output
# shows the text is only split on whitespace and punctuation, and almost
# every token is tagged 'NNP', so these tags are not meaningful for Korean.
tokens = word_tokenize(sentence)
print(tokens)
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

['절망의', '반대가', '희망은', '아니다', '.', '어두운', '밤하늘에', '별이', '빛나듯', '희망은', '절망', '속에', '싹트는', '거지', '만약에', '우리가', '희망함이', '적다면', '그', '누가', '세상을', '비출어줄까', '.', '정희성', ',', '희망', '공부']
[('절망의', 'JJ'), ('반대가', 'NNP'), ('희망은', 'NNP'), ('아니다', 'NNP'), ('.', '.'), ('어두운', 'VB'), ('밤하늘에', 'JJ'), ('별이', 'NNP'), ('빛나듯', 'NNP'), ('희망은', 'NNP'), ('절망', 'NNP'), ('속에', 'NNP'), ('싹트는', 'NNP'), ('거지', 'NNP'), ('만약에', 'NNP'), ('우리가', 'NNP'), ('희망함이', 'NNP'), ('적다면', 'NNP'), ('그', 'NNP'), ('누가', 'NNP'), ('세상을', 'NNP'), ('비출어줄까', 'NNP'), ('.', '.'), ('정희성', 'NN'), (',', ','), ('희망', 'NNP'), ('공부', 'NNP')]
