In [32]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
import re

In [2]:
# Sample English paragraph (three sentences) used as input for the tokenizer demos.
para = "Hello everyone. It's good to see you. Let's start our text mining class!"

In [5]:
# Sentence-level tokenization of the English sample paragraph.
para_sentences = sent_tokenize(para)
para_sentences

['Hello everyone.',
 "It's good to see you.",
 "Let's start our text mining class!"]

In [6]:
# Korean counterpart of the sample paragraph, used to try the sentence tokenizer on non-English text.
para_kor = "안녕하세요, 여러분. 만나서 반갑습니다. 이제 텍스트마이닝 클래스를 시작해봅시다!"

In [7]:
# Apply the same sentence splitter to the Korean paragraph.
para_kor_sentences = sent_tokenize(para_kor)
para_kor_sentences

['안녕하세요, 여러분.', '만나서 반갑습니다.', '이제 텍스트마이닝 클래스를 시작해봅시다!']

In [9]:
# Word-level tokenization: punctuation becomes its own token and
# contractions are split (e.g. "It's" -> "It", "'s").
para_words = word_tokenize(para)
para_words

['Hello',
 'everyone',
 '.',
 'It',
 "'s",
 'good',
 'to',
 'see',
 'you',
 '.',
 'Let',
 "'s",
 'start',
 'our',
 'text',
 'mining',
 'class',
 '!']

In [11]:
# WordPunctTokenizer separates punctuation from word characters, so the
# apostrophe in "It's" ends up as a token of its own ("It", "'", "s").
punct_tokenizer = WordPunctTokenizer()
punct_tokenizer.tokenize(para)

['Hello',
 'everyone',
 '.',
 'It',
 "'",
 's',
 'good',
 'to',
 'see',
 'you',
 '.',
 'Let',
 "'",
 's',
 'start',
 'our',
 'text',
 'mining',
 'class',
 '!']

In [13]:
# Character class: collect every single 'a', 'b' or 'c' found in the text.
abc_pattern = re.compile('''[abc]''')
abc_pattern.findall('''How are you, boy?''')

['a', 'b']

In [14]:
# Character class listing every decimal digit explicitly; each digit in the
# text is returned as a separate match.
digit_pattern = re.compile('''[0123456789]''')
digit_pattern.findall('''3a7b5c9d''')

['3', '7', '5', '9']

In [15]:
# \w matches one "word" character (letter, digit or underscore); spaces and
# punctuation such as ' . ^ & are skipped.
# The pattern is a raw string so the backslash reaches the regex engine as-is
# instead of being an invalid string escape (a SyntaxWarning on Python 3.12+).
re.findall(r'''[\w]''', '''3a 7b_ '.^&5c9d''')

['3', 'a', '7', 'b', '_', '5', 'c', '9', 'd']

In [16]:
# "+" means one or more repetitions: each run of consecutive underscores is
# returned as a single match.
underscore_runs = re.compile('''[_]+''')
underscore_runs.findall('''a_b, c__d, e___f''')

['_', '__', '___']

In [17]:
# One or more consecutive word characters, i.e. whole words; the comma,
# spaces and question mark act as separators and are dropped.
# Raw string: keeps "\w" a regex escape rather than an invalid string escape
# (which triggers a SyntaxWarning on Python 3.12+).
re.findall(r'''[\w]+''', '''How are you, boy?''')

['How', 'are', 'you', 'boy']

In [18]:
# {2,4} matches between two and four consecutive 'o' characters. Matching is
# greedy, so the run of seven 'o's is consumed as 'oooo' followed by 'ooo',
# and the single 'o' in "oh" is not matched at all.
repeated_o = re.compile('''[o]{2,4}''')
repeated_o.findall('''oh, hoow are yoooou, boooooooy?''')

['oo', 'oooo', 'oooo', 'ooo']

In [21]:
# Tokenizer whose tokens are runs of word characters and apostrophes, so a
# contraction such as "can't" stays in one piece while other punctuation is
# discarded.
# The pattern is a raw string so "\w" is passed through to the regex engine
# instead of being an invalid string escape (a SyntaxWarning on Python 3.12+).
tokenizer = RegexpTokenizer(r'''[\w']+''')
tokenizer.tokenize("Sorry, I can't go there.")

['Sorry', 'I', "can't", 'go', 'there']

In [22]:
# Normalise case before tokenizing so that "Sorry" and "sorry" yield the same token.
text1 = "Sorry, I can't go there."
lowercase_text = text1.lower()
tokenizer.tokenize(lowercase_text)

['sorry', 'i', "can't", 'go', 'there']

In [24]:
# English stopword list collected into a set, so the membership tests used
# for filtering are hash lookups rather than list scans.
english_stops = {*stopwords.words('english')}

In [25]:
# Stopword removal: lower-case the sentence, tokenize it, then keep only the
# tokens that are not in english_stops.
text1 = "Sorry, I couldn't go to movie yesterday."
# Raw string so "\w" is a regex escape rather than an invalid string escape
# (a SyntaxWarning on Python 3.12+). Apostrophes are part of the token class,
# which keeps "couldn't" whole so it can be compared against the stopword set.
tokenizer = RegexpTokenizer(r"[\w']+")
tokens = tokenizer.tokenize(text1.lower())
result = [word for word in tokens if word not in english_stops]
result

['sorry', 'go', 'movie', 'yesterday']

In [27]:
# Porter stemmer applied to three related words. Stems are not guaranteed to
# be dictionary words (see 'cookery' in the output).
stemmer = PorterStemmer()
tuple(stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks'))

('cook', 'cookeri', 'cookbook')

In [29]:
# Stem every token of the sample paragraph with the `stemmer` object that is
# currently bound in the kernel.
para = "Hello everyone. It's good to see you. Let's start our text mining class!"
tokens = word_tokenize(para)
result = list(map(stemmer.stem, tokens))
result

['hello',
 'everyon',
 '.',
 'it',
 "'s",
 'good',
 'to',
 'see',
 'you',
 '.',
 'let',
 "'s",
 'start',
 'our',
 'text',
 'mine',
 'class',
 '!']

In [31]:
# Same three words run through the Lancaster stemmer for comparison.
# Note: this rebinds `stemmer`, replacing whatever stemmer was bound before.
stemmer = LancasterStemmer()
tuple(stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks'))

('cook', 'cookery', 'cookbook')

In [34]:
# Lemmatization maps a word to its dictionary form. The result depends on the
# part of speech: 'cooking' is returned unchanged when no pos is given, but
# becomes 'cook' when it is marked as a verb (pos='v').
lemmatizer = WordNetLemmatizer()
(
    lemmatizer.lemmatize('cooking'),
    lemmatizer.lemmatize('cooking', pos='v'),
    lemmatizer.lemmatize('cookery'),
    lemmatizer.lemmatize('cookbooks'),
)

('cooking', 'cook', 'cookery', 'cookbook')