In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from konlpy.tag import Okt
import re

In [3]:
# Sample English paragraph (three sentences) reused by the tokenization cells below.
para = "Hello everyone. It's good to see you. Let's start our text mining class!"

In [4]:
# Split the English paragraph into sentences; "english" is NLTK's default
# language for sent_tokenize and is spelled out here for clarity.
sent_tokenize(para, language="english")

['Hello everyone.',
 "It's good to see you.",
 "Let's start our text mining class!"]

In [5]:
# Korean counterpart of the sample paragraph (three sentences ending in '.' or '!').
para_kor = "안녕하세요, 여러분. 만나서 반갑습니다. 이제 텍스트마이닝 클래스를 시작해봅시다!"

In [6]:
# sent_tokenize defaults to language="english"; the default is made explicit
# here because the input text is Korean.
sent_tokenize(para_kor, language="english")

['안녕하세요, 여러분.', '만나서 반갑습니다.', '이제 텍스트마이닝 클래스를 시작해봅시다!']

In [7]:
# Word-level tokenization; contractions are split as "It" + "'s" (see output).
word_tokenize(para, language="english")

['Hello',
 'everyone',
 '.',
 'It',
 "'s",
 'good',
 'to',
 'see',
 'you',
 '.',
 'Let',
 "'s",
 'start',
 'our',
 'text',
 'mining',
 'class',
 '!']

In [8]:
# Compare with word_tokenize: here the apostrophe becomes its own token,
# so "It's" is emitted as "It", "'", "s" (see output).
WordPunctTokenizer().tokenize(para)

['Hello',
 'everyone',
 '.',
 'It',
 "'",
 's',
 'good',
 'to',
 'see',
 'you',
 '.',
 'Let',
 "'",
 's',
 'start',
 'our',
 'text',
 'mining',
 'class',
 '!']

In [9]:
# Character class: each match is a single 'a', 'b' or 'c'.
re.findall('[abc]', 'How are you, boy?')

['a', 'b']

In [10]:
# Character class listing every decimal digit explicitly.
re.findall('[0123456789]', '3a7b5c9d')

['3', '7', '5', '9']

In [11]:
# \w matches a single "word" character (letter, digit or underscore).
# The pattern is a raw string so the backslash reaches the regex engine
# unchanged; in a plain literal "\w" is an invalid escape sequence and
# triggers a warning on recent Python versions.
re.findall(r'[\w]', "3a 7b_ '.^&5c9d")

['3', 'a', '7', 'b', '_', '5', 'c', '9', 'd']

In [12]:
# '+' repeats the class one or more times, so each run of underscores is one match.
re.findall('[_]+', 'a_b, c__d, e___f')

['_', '__', '___']

In [13]:
# One or more word characters in a row, i.e. whole words without punctuation.
# Raw string avoids the invalid "\w" escape of a plain string literal.
re.findall(r'[\w]+', 'How are you, boy?')

['How', 'are', 'you', 'boy']

In [14]:
# {2,4} matches runs of two to four 'o'; longer runs are split into several matches.
re.findall('[o]{2,4}', 'oh, hoow are yoooou, boooooooy?')

['oo', 'oooo', 'oooo', 'ooo']

In [15]:
# Tokens are runs of word characters and apostrophes, so contractions such as
# "can't" stay in one piece and punctuation is dropped.
# Raw string: "\w" in a plain literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize("Sorry, I can't go there.")

['Sorry', 'I', "can't", 'go', 'there']

In [16]:
# Lower-case the text before tokenizing so the tokens are case-normalized.
text1 = "Sorry, I can't go there."
tokenizer.tokenize(str.lower(text1))

['sorry', 'i', "can't", 'go', 'there']

In [17]:
# English stopword list from the NLTK corpus, held as a set for membership tests.
english_stops = {*stopwords.words('english')}

In [18]:
# Tokenize a lower-cased sentence and drop every token found in english_stops.
text1 = "Sorry, I couldn't go to movie yesterday."
# Raw string: "\w" in a plain literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r"[\w']+")
tokens = tokenizer.tokenize(text1.lower())
result = [word for word in tokens if word not in english_stops]
result

['sorry', 'go', 'movie', 'yesterday']

In [19]:
# Porter stemming of three related words, shown as a tuple.
stemmer = PorterStemmer()
tuple(stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks'))

('cook', 'cookeri', 'cookbook')

In [20]:
# Stem every token of the sample paragraph with whatever `stemmer` is bound
# when this cell runs (PorterStemmer in top-to-bottom execution order).
para = "Hello everyone. It's good to see you. Let's start our text mining class!"
tokens = word_tokenize(para)
result = list(map(stemmer.stem, tokens))
result

['hello',
 'everyon',
 '.',
 'it',
 "'s",
 'good',
 'to',
 'see',
 'you',
 '.',
 'let',
 "'s",
 'start',
 'our',
 'text',
 'mine',
 'class',
 '!']

In [21]:
# Same three words with the Lancaster stemmer for comparison.
# NOTE: this rebinds `stemmer`; re-running the paragraph-stemming cell after
# this one would use Lancaster instead of Porter.
stemmer = LancasterStemmer()
tuple(stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks'))

('cook', 'cookery', 'cookbook')

In [22]:
# WordNet lemmatization. Without a `pos` argument 'cooking' is returned
# unchanged; with pos='v' it is reduced to 'cook' (see output).
lemmatizer = WordNetLemmatizer()
(
    lemmatizer.lemmatize('cooking'),
    lemmatizer.lemmatize('cooking', pos='v'),
    lemmatizer.lemmatize('cookery'),
    lemmatizer.lemmatize('cookbooks'),
)

('cooking', 'cook', 'cookery', 'cookbook')

In [23]:
# Tokenize the sample paragraph, then attach a part-of-speech tag to each token.
tokens = word_tokenize(
    "Hello everyone. It's good to see you. Let's start our text mining class!"
)
nltk.pos_tag(tokens)

[('Hello', 'NNP'),
 ('everyone', 'NN'),
 ('.', '.'),
 ('It', 'PRP'),
 ("'s", 'VBZ'),
 ('good', 'JJ'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('you', 'PRP'),
 ('.', '.'),
 ('Let', 'VB'),
 ("'s", 'POS'),
 ('start', 'VB'),
 ('our', 'PRP$'),
 ('text', 'NN'),
 ('mining', 'NN'),
 ('class', 'NN'),
 ('!', '.')]

In [24]:
# Keep only tokens whose tag is exactly one of these. Matching is by exact
# equality, so tags such as 'NNP' or 'VBZ' are not selected.
my_tag_set = ['NN', 'VB', 'JJ']
my_words = [
    token
    for token, pos_label in nltk.pos_tag(tokens)
    if pos_label in my_tag_set
]
my_words

['everyone', 'good', 'see', 'Let', 'start', 'text', 'mining', 'class']

In [25]:
# KoNLPy Okt analyzer instance; the cells below call t.morphs() and t.pos() on it.
t = Okt()

In [26]:
# Multi-line Korean sample text (a short poem followed by an attribution line);
# the embedded newlines show up as '\n' tokens in the analyzer output.
# NOTE(review): '비출어줄까' looks like a typo for '비추어줄까' — left unchanged
# because the recorded outputs below depend on the exact text; confirm.
sentence = '''절망의 반대가 희망은 아니다.
어두운 밤하늘에 별이 빛나듯
희망은 절망 속에 싹트는 거지
만약에 우리가 희망함이 적다면
그 누가 세상을 비출어줄까.
정희성, 희망 공부'''

In [27]:
# Morpheme segmentation with Okt's default settings spelled out
# (no normalization, no stemming).
t.morphs(sentence, norm=False, stem=False)

['절망',
 '의',
 '반대',
 '가',
 '희망',
 '은',
 '아니다',
 '.',
 '\n',
 '어',
 '두운',
 '밤하늘',
 '에',
 '별',
 '이',
 '빛나듯',
 '\n',
 '희망',
 '은',
 '절망',
 '속',
 '에',
 '싹트는',
 '거지',
 '\n',
 '만약',
 '에',
 '우리',
 '가',
 '희망',
 '함',
 '이',
 '적다면',
 '\n',
 '그',
 '누가',
 '세상',
 '을',
 '비출어줄까',
 '.',
 '\n',
 '정희성',
 ',',
 '희망',
 '공부']

In [28]:
# (morpheme, tag) pairs with Okt's default settings spelled out
# (no normalization, no stemming, tags not joined onto the surface form).
t.pos(sentence, norm=False, stem=False, join=False)

[('절망', 'Noun'),
 ('의', 'Josa'),
 ('반대', 'Noun'),
 ('가', 'Josa'),
 ('희망', 'Noun'),
 ('은', 'Josa'),
 ('아니다', 'Adjective'),
 ('.', 'Punctuation'),
 ('\n', 'Foreign'),
 ('어', 'Noun'),
 ('두운', 'Noun'),
 ('밤하늘', 'Noun'),
 ('에', 'Josa'),
 ('별', 'Noun'),
 ('이', 'Josa'),
 ('빛나듯', 'Verb'),
 ('\n', 'Foreign'),
 ('희망', 'Noun'),
 ('은', 'Josa'),
 ('절망', 'Noun'),
 ('속', 'Noun'),
 ('에', 'Josa'),
 ('싹트는', 'Verb'),
 ('거지', 'Noun'),
 ('\n', 'Foreign'),
 ('만약', 'Noun'),
 ('에', 'Josa'),
 ('우리', 'Noun'),
 ('가', 'Josa'),
 ('희망', 'Noun'),
 ('함', 'Noun'),
 ('이', 'Josa'),
 ('적다면', 'Verb'),
 ('\n', 'Foreign'),
 ('그', 'Noun'),
 ('누가', 'Noun'),
 ('세상', 'Noun'),
 ('을', 'Josa'),
 ('비출어줄까', 'Verb'),
 ('.', 'Punctuation'),
 ('\n', 'Foreign'),
 ('정희성', 'Noun'),
 (',', 'Punctuation'),
 ('희망', 'Noun'),
 ('공부', 'Noun')]