In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from konlpy.tag import Okt
import re



In [2]:
# Sample English paragraph; later cells reuse `para` for word-level tokenization.
para = "Hello everyone. It's good to see you. Let's start our text mining class!"
# Split the paragraph into a list of sentence strings.
sent_tokenize(para)

['Hello everyone.',
 "It's good to see you.",
 "Let's start our text mining class!"]

In [3]:
# Korean sample paragraph used to try the same tokenizers on non-English text.
para_kor = "안녕하세요, 여러분. 만나서 반갑습니다. 이제 텍스트마이닝 클래스를 시작해봅시다!"
# Called without a language argument; in the recorded output the text is
# still split after each '.' and '!'.
sent_tokenize(para_kor)

['안녕하세요, 여러분.', '만나서 반갑습니다.', '이제 텍스트마이닝 클래스를 시작해봅시다!']

In [6]:
# Tokenize the same paragraph with two tokenizers so the results can be
# compared side by side.
word_tokens = word_tokenize(para)
punct_tokens = WordPunctTokenizer().tokenize(para)
# In the recorded output word_tokenize keeps "'s" as one token, whereas
# WordPunctTokenizer emits "'" and "s" separately.
print(word_tokens, punct_tokens)

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!'] ['Hello', 'everyone', '.', 'It', "'", 's', 'good', 'to', 'see', 'you', '.', 'Let', "'", 's', 'start', 'our', 'text', 'mining', 'class', '!']


In [7]:
# Run both word-level tokenizers on the Korean paragraph.
word_tokens_kor = word_tokenize(para_kor)
punct_tokens_kor = WordPunctTokenizer().tokenize(para_kor)
# For this text the two recorded token lists are identical: both split only
# on whitespace and punctuation.
print(word_tokens_kor, punct_tokens_kor)

['안녕하세요', ',', '여러분', '.', '만나서', '반갑습니다', '.', '이제', '텍스트마이닝', '클래스를', '시작해봅시다', '!'] ['안녕하세요', ',', '여러분', '.', '만나서', '반갑습니다', '.', '이제', '텍스트마이닝', '클래스를', '시작해봅시다', '!']


In [9]:
# Raw string so that \w reaches the regex engine untouched; in a plain string
# literal "\w" is an invalid escape sequence and triggers a warning on newer
# Python versions.
# The character class matches exactly one word character (letter, digit or
# underscore) or an apostrophe, so every match is a single character.
word_chars = re.findall(r"[\w']", "3a 7b_ '.^&5c9d")
word_chars

['3', 'a', '7', 'b', '_', "'", '5', 'c', '9', 'd']

In [10]:
# Raw string so that \w is passed to the regex engine as written; the plain
# literal "\w" is an invalid string escape and warns on newer Python versions.
# The trailing + groups consecutive word characters/apostrophes into one
# match, so punctuation and spaces act as separators.
matched_words = re.findall(r"[\w']+", "How are you, boy?")
matched_words

['How', 'are', 'you', 'boy']

In [12]:
# Raw string keeps \w as a regex token; the plain literal "\w" is an invalid
# string escape and warns on newer Python versions.
# {3,} requires at least three consecutive word characters/apostrophes, so
# shorter runs such as "i" and "go" are not emitted as tokens.
tokenizer = RegexpTokenizer(r"[\w']{3,}")
# Lower-case the sentence before tokenizing so tokens are case-normalized.
tokenizer.tokenize("Sorry, I can't go there.".lower())

['sorry', "can't", 'there']

In [14]:
# Load NLTK's English stopword list into a set so the membership checks in
# the filtering cell are cheap.
eng_stops = set(stopwords.words('english'))
# Number of distinct stopwords (179 in the recorded run).
len(eng_stops)

179

In [18]:
# Example sentence for the stopword-removal cell that follows.
text1 = "Sorry, I couldn't go to movie yesterday."

In [19]:
# Raw string keeps \w as a regex token; the plain literal "\w" is an invalid
# string escape and warns on newer Python versions.
# Apostrophes are part of the character class, so a contraction such as
# "couldn't" stays a single token.
tokenizer = RegexpTokenizer(r"[\w']+")
# Lower-case first so tokens like "I" can match the stopword entries.
tokens = tokenizer.tokenize(text1.lower())
# Keep only the tokens that are absent from the stopword set.
result = [word for word in tokens if word not in eng_stops]
result

['sorry', 'go', 'movie', 'yesterday']

In [21]:
stemmer = PorterStemmer()
# Stem three words built on "cook" to see how the Porter stemmer treats each
# suffix; the cell displays the stems as a tuple.
tuple(stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks'))

('cook', 'cookeri', 'cookbook')

In [22]:
# Rebinds the notebook-level name `stemmer`, replacing the Porter instance
# created in the previous cell.
stemmer = LancasterStemmer()
# Same three words as in the Porter cell, for a direct comparison of stems.
tuple(stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks'))

('cook', 'cookery', 'cookbook')

In [23]:
para = "Hello everyone. It's good to see you. Let's start our text mining class!"
tokens = word_tokenize(para)
# Create the stemmer inside this cell instead of relying on whichever earlier
# cell last assigned the shared name `stemmer`. The name is bound to a
# PorterStemmer in one cell and a LancasterStemmer in the next, so the result
# used to depend on execution order. In a top-to-bottom run the Lancaster
# stemmer was the active one, so that is what is pinned here.
lancaster_stemmer = LancasterStemmer()
# Stem every token, punctuation included, and print the resulting list.
result = [lancaster_stemmer.stem(word) for word in tokens]
print(result)

['hello', 'everyon', '.', 'it', "'s", 'good', 'to', 'see', 'you', '.', 'let', "'s", 'start', 'our', 'text', 'min', 'class', '!']


In [24]:
lemma = WordNetLemmatizer()
# Lemmatize each entry of `tokens` (assigned in an earlier cell); no
# part-of-speech hint is passed to the lemmatizer.
result = list(map(lemma.lemmatize, tokens))
print(result)

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']
