# 1. 토큰화

In [2]:
import nltk

In [3]:
# Download the NLTK data packages for this notebook.
nltk.download('punkt')  # tokenizer models (unzipped under tokenizers/)
nltk.download('webtext')  # corpus (unzipped under corpora/); not used in the cells shown here
nltk.download('wordnet')  # WordNet data; presumably what WordNetLemmatizer reads - confirm
nltk.download('stopwords')  # stop-word lists, read later through nltk.corpus.stopwords
nltk.download('averaged_perceptron_tagger')  # tagger model (unzipped under taggers/); not used in the cells shown here

[nltk_data] Downloading package punkt to /Users/mkw/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package webtext to /Users/mkw/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package wordnet to /Users/mkw/nltk_data...
[nltk_data] Downloading package stopwords to /Users/mkw/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mkw/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## 문장 토큰화

In [4]:
from nltk.tokenize import sent_tokenize

# Sample paragraph; the word-tokenization cells further down reuse `para`.
para = "Hello everyone. It's good to see you. Let's start our text mining class!"

# Split the paragraph into a list of sentences and show it.
sentences = sent_tokenize(para)
print(sentences)

['Hello everyone.', "It's good to see you.", "Let's start our text mining class!"]


## 단어 토큰화

In [5]:
from nltk.tokenize import word_tokenize

# Break the sample paragraph into word-level tokens and show the list.
word_tokens = word_tokenize(para)
print(word_tokens)

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']


In [6]:
from nltk.tokenize import WordPunctTokenizer

# Run the same paragraph through WordPunctTokenizer to compare with word_tokenize.
punct_tokenizer = WordPunctTokenizer()
print(punct_tokenizer.tokenize(para))

['Hello', 'everyone', '.', 'It', "'", 's', 'good', 'to', 'see', 'you', '.', 'Let', "'", 's', 'start', 'our', 'text', 'mining', 'class', '!']


## 정규 표현식

In [7]:
import re

In [8]:
# [abc] is a character class: every match is a single 'a', 'b' or 'c'.
re.findall('[abc]', 'How are you, boy?')

['a', 'b']

In [9]:
# Character class listing every decimal digit; it could equally be written [0-9].
re.findall('[0123456789]', '3a7b5c9d')

['3', '7', '5', '9']

In [10]:
# Raw string: \w is a regex escape, not a valid Python string escape. In a
# plain literal it triggers DeprecationWarning (3.6+) / SyntaxWarning (3.12+).
# For str patterns \w matches any Unicode word character; it is equivalent to
# [a-zA-Z0-9_] only for ASCII text or when the re.ASCII flag is given.
re.findall(r'[\w]', "3a 7b_ '.^&5c9d")

['3', 'a', '7', 'b', '_', '5', 'c', '9', 'd']

In [12]:
# '+' matches runs of one or more word characters, so whole words come back.
# Raw string avoids the invalid '\w' string-escape warning.
re.findall(r'[\w]+', 'How are you, boy?')

['How', 'are', 'you', 'boy']

In [14]:
from nltk.tokenize import RegexpTokenizer

# Raw string: \w is a regex escape, not a valid Python string escape
# (a plain literal raises a SyntaxWarning on Python 3.12+).
# The apostrophe inside the class keeps contractions such as "can't" in one token.
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize("Sorry, I can't go there."))

['Sorry', 'I', "can't", 'go', 'there']


In [15]:
text1 = "Sorry, I can't go there."
# {3,} keeps only runs of three or more word characters/apostrophes, which
# drops short tokens such as "i" and "go". Raw string avoids the invalid
# '\w' string-escape warning.
tokenizer = RegexpTokenizer(r"[\w']{3,}")

# Lower-case before tokenizing so the tokens are case-normalized.
print(tokenizer.tokenize(text1.lower()))

['sorry', "can't", 'there']


## 노이즈와 불용어 제거

In [17]:
from nltk.corpus import stopwords

# A set gives constant-time membership tests in the filter below.
english_stops = set(stopwords.words('english'))

text1 = "Sorry, I couldn't go to movie yesterday."

# Raw string: \w is a regex escape, not a valid Python string escape
# (a plain literal raises a SyntaxWarning on Python 3.12+).
tokenizer = RegexpTokenizer(r"[\w']+")
# The text is lower-cased before tokenizing, ahead of the stop-word lookup.
tokens = tokenizer.tokenize(text1.lower())

# Keep only the tokens that are not stop words.
result = [word for word in tokens if word not in english_stops]

print(result)

['sorry', 'go', 'movie', 'yesterday']


In [18]:
# english_stops is a set, so the order of the printed words is arbitrary.
print(english_stops)

{"you're", 'haven', 'it', 'about', 'just', "hadn't", 'and', 'his', 'is', "shan't", 'into', 'll', 'yours', 'over', 'of', 'off', "didn't", "won't", 'him', 'out', 'further', 'ain', 'theirs', 'why', 'isn', 'where', 'than', 'few', 'above', 'can', 'or', 'to', "it's", 'an', 'hasn', "you'll", 'that', "mightn't", 'you', 'same', 'from', "don't", 'should', "she's", "you've", "isn't", 'again', 'only', 'itself', 'this', 'hers', 'will', "you'd", 'he', 'her', 'other', "shouldn't", 'before', 'didn', 'if', 'do', 'be', 'me', "doesn't", 'shouldn', 'aren', 'more', 'weren', 'at', 'what', 'she', 'between', 'd', 'needn', 'wouldn', "mustn't", 'did', 'the', 'during', 'who', "hasn't", 'does', 'its', 'your', 'then', 'a', 'some', 'these', "aren't", 'our', 'now', 'herself', 'don', 'their', 'y', 'ma', 'under', 'down', 'mustn', "weren't", 'after', "needn't", 'hadn', 'most', 'as', 'has', 'wasn', 'yourself', 'being', 'in', 'doesn', 'doing', 'each', 'ourselves', 't', 'won', 'very', 'when', 'nor', 'while', 'mightn', 'th

# 2. 정규화

## 어간 추출

- 포터 스테머
    - 모든 단어가 같은 규칙에 따라 변환
    - 영어 분야에서 사실상의 표준

In [19]:
from nltk.stem import PorterStemmer

# Stem three related words with the Porter stemmer and print them on one line.
stemmer = PorterStemmer()
stems = [stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks')]
print(*stems)

cook cookeri cookbook


In [20]:
from nltk.tokenize import word_tokenize

para = "Hello everyone. It's good to see you. Let's start our text mining class!"

# Tokenize first, then stem every token (punctuation tokens included)
# with the stemmer created in the previous cell.
tokens = word_tokenize(para)
result = list(map(stemmer.stem, tokens))

print(tokens)
print(result)

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']
['hello', 'everyon', '.', 'it', "'s", 'good', 'to', 'see', 'you', '.', 'let', "'s", 'start', 'our', 'text', 'mine', 'class', '!']


- 랭카스터 스테머

In [21]:
from nltk.stem import LancasterStemmer

# Same three words as the Porter example, stemmed with the Lancaster stemmer.
stemmer = LancasterStemmer()
stems = [stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks')]
print(*stems)

cook cookery cookbook


## 표제어 추출

In [26]:
# Download the 'omw-1.4' data package ahead of the lemmatizer cells below.
# NOTE(review): presumably the Open Multilingual Wordnet data that some NLTK
# versions require for WordNet lookups - confirm against the installed version.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/mkw/nltk_data...


True

In [28]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Collect the lemmas first, then print one per line.
# 'cooking' is looked up twice: once with no pos argument, once as a verb.
lemmas = [
    lemmatizer.lemmatize('cooking'),
    lemmatizer.lemmatize('cooking', pos='v'),
    lemmatizer.lemmatize('cookery'),
    lemmatizer.lemmatize('cookbooks'),
]
print(*lemmas, sep='\n')

cooking
cook
cookery
cookbook


In [31]:
# Compare lemmatizing with stemming on the same word.

stemmer = PorterStemmer()
word = 'believes'

stemmed = stemmer.stem(word)
lemma_default = lemmatizer.lemmatize(word)
lemma_as_verb = lemmatizer.lemmatize(word, pos='v')

print('stemming result: ', stemmed)
print('lemmatizing result: ', lemma_default)
print('lemmatizing result: ', lemma_as_verb)

stemming result:  believ
lemmatizing result:  belief
lemmatizing result:  believe
