In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from konlpy.tag import Okt
import re

In [2]:
# English sample paragraph; the word-tokenizer cells further down reuse `para`.
para = "Hello everyone. It's good to see you. Let's start our text mining class!"
# Split the paragraph into sentences and print the resulting list.
para_sentences = sent_tokenize(para)
print(para_sentences)

['Hello everyone.', "It's good to see you.", "Let's start our text mining class!"]


In [3]:
# Korean sample paragraph, passed through the same NLTK sentence splitter.
para_kor = "안녕하세요, 여러분. 만나서 반갑습니다. 이제 텍스트마이닝 클래스를 시작해봅시다!"
para_kor_sentences = sent_tokenize(para_kor)
print(para_kor_sentences)

['안녕하세요, 여러분.', '만나서 반갑습니다.', '이제 텍스트마이닝 클래스를 시작해봅시다!']


In [4]:
# Word-level tokenization of the English paragraph; the bare last expression
# is displayed as the cell result.
para_words = word_tokenize(para)
para_words

['Hello',
 'everyone',
 '.',
 'It',
 "'s",
 'good',
 'to',
 'see',
 'you',
 '.',
 'Let',
 "'s",
 'start',
 'our',
 'text',
 'mining',
 'class',
 '!']

In [5]:
# WordPunctTokenizer separates punctuation from word characters, so the
# apostrophe in "It's" becomes its own token.
punct_tokenizer = WordPunctTokenizer()
print(punct_tokenizer.tokenize(para))

['Hello', 'everyone', '.', 'It', "'", 's', 'good', 'to', 'see', 'you', '.', 'Let', "'", 's', 'start', 'our', 'text', 'mining', 'class', '!']


In [7]:
# Match runs of at least two word characters/apostrophes: contractions such
# as "can't" stay whole, while one-character tokens (e.g. "i") are dropped.
tokenizer = RegexpTokenizer(r"[\w']{2,}")
lowered_sample = "Sorry, I can't go there.".lower()
tokenizer.tokenize(lowered_sample)

['sorry', "can't", 'go', 'there']

In [8]:
# Load NLTK's English stopword list and keep it as a set for membership tests.
english_stopword_list = stopwords.words('english')
eng_stops = set(english_stopword_list)
# Display how many distinct stopwords were loaded.
len(eng_stops)

179

In [9]:
# Stopword removal demo: lowercase, regex-tokenize, then drop every token
# that appears in `eng_stops`.
text1 = "Sorry, I couldn't go to movie yesterday."
tokenizer = RegexpTokenizer(r"[\w']{2,}")
tokens = tokenizer.tokenize(text1.lower())
result = []
for candidate in tokens:
    if candidate in eng_stops:
        continue
    result.append(candidate)
result

['sorry', 'go', 'movie', 'yesterday']

In [10]:
# Stem three related words with the Porter algorithm and show them as a tuple.
stemmer = PorterStemmer()
tuple(stemmer.stem(sample) for sample in ('cooking', 'cookery', 'cookbooks'))

('cook', 'cookeri', 'cookbook')

In [11]:
# Same three words, this time with the Lancaster stemmer, for comparison.
stemmer = LancasterStemmer()
tuple(stemmer.stem(sample) for sample in ('cooking', 'cookery', 'cookbooks'))

('cook', 'cookery', 'cookbook')

In [13]:
# Preprocessing pipeline: strip punctuation -> tokenize -> drop stopwords -> stem.
para = "Hello everyone. It's good to see you. Let's start our text mining class!"
# Remove every character that is not a word character, whitespace or apostrophe.
para = re.sub(r"[^\w\s']", "", para)
tokens = word_tokenize(para)
stemmer = PorterStemmer()
eng_stops = set(stopwords.words('english'))
# NLTK's stopword list is all lowercase, so the membership test must use the
# lowercased token; otherwise capitalized stopwords such as "It" slip through
# the filter and only get lowercased afterwards by the stemmer.
result = [stemmer.stem(token) for token in tokens if token.lower() not in eng_stops]
result

['hello',
 'everyon',
 "'s",
 'good',
 'see',
 'let',
 "'s",
 'start',
 'text',
 'mine',
 'class']

In [14]:
# Preprocessing pipeline: strip punctuation -> tokenize -> drop stopwords -> lemmatize.
para = "Hello everyone. It's good to see you. Let's start our text mining class!"
# Remove every character that is not a word character, whitespace or apostrophe.
para = re.sub(r"[^\w\s']", "", para)
tokens = word_tokenize(para)
lemma = WordNetLemmatizer()
eng_stops = set(stopwords.words('english'))
# NLTK's stopword list is all lowercase, so compare the lowercased token;
# a case-sensitive test would let capitalized stopwords such as "It" through.
# The original casing of the surviving tokens is kept for lemmatization.
result = [lemma.lemmatize(token) for token in tokens if token.lower() not in eng_stops]
result

['Hello',
 'everyone',
 "'s",
 'good',
 'see',
 'Let',
 "'s",
 'start',
 'text',
 'mining',
 'class']

In [18]:
# Show how the `pos` argument changes the lemma: 'cooking' is lemmatized once
# as a noun and once as a verb; the other two words use the default setting.
lemma = WordNetLemmatizer()
cooking_as_noun = lemma.lemmatize('cooking', pos='n')
cooking_as_verb = lemma.lemmatize('cooking', pos='v')
(cooking_as_noun, cooking_as_verb, lemma.lemmatize('cookery'), lemma.lemmatize('cookbooks'))

('cooking', 'cook', 'cookery', 'cookbook')

In [19]:
# Part-of-speech tagging: tokenize the sample text, then print (token, tag)
# pairs. `tokens` is reused by the tag-filtering cells that follow.
text = "Hello everyone. It's good to see you. Let's start our text mining class!"
tokens = word_tokenize(text)
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('Hello', 'NNP'), ('everyone', 'NN'), ('.', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('good', 'JJ'), ('to', 'TO'), ('see', 'VB'), ('you', 'PRP'), ('.', '.'), ('Let', 'VB'), ("'s", 'POS'), ('start', 'VB'), ('our', 'PRP$'), ('text', 'NN'), ('mining', 'NN'), ('class', 'NN'), ('!', '.')]


In [20]:
# Print NLTK's built-in description and example words for the 'NNP' tag.
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [21]:
# Keep only tokens whose tag is exactly one of these three; the match is
# exact, so other tags such as 'NNP' or 'VBZ' are excluded.
my_tag_set = ['NN', 'VB', 'JJ']
tagged_tokens = nltk.pos_tag(tokens)
my_words = [word for word, tag in tagged_tokens if tag in my_tag_set]
my_words

['everyone', 'good', 'see', 'Let', 'start', 'text', 'mining', 'class']

In [23]:
# Build "token/TAG" strings for the tokens whose tag is in `my_tag_set`.
words_with_tag = []
for word, tag in nltk.pos_tag(tokens):
    if tag in my_tag_set:
        words_with_tag.append(word + '/' + tag)
words_with_tag

['everyone/NN',
 'good/JJ',
 'see/VB',
 'Let/VB',
 'start/VB',
 'text/NN',
 'mining/NN',
 'class/NN']

In [24]:
# Create the KoNLPy Okt analyzer once; the morphs/nouns/pos cells below reuse `okt`.
okt = Okt()

In [25]:
# Multi-line Korean sample text fed to the Okt cells below. The triple-quoted
# literal keeps its embedded newlines, which show up as '\n' tokens in the
# analyzer output. Do not edit the text: the recorded outputs depend on it.
sentence = '''절망의 반대가 희망은 아니다.
어두운 밤하늘에 별이 빛나듯
희망은 절망 속에 싹트는 거지
만약에 우리가 희망함이 적다면
그 누가 세상을 비출어줄까.
정희성, 희망 공부'''

In [27]:
# Split the sample text into morphemes (newlines come back as their own tokens).
morphemes = okt.morphs(sentence)
print(morphemes)

['절망', '의', '반대', '가', '희망', '은', '아니다', '.', '\n', '어', '두운', '밤하늘', '에', '별', '이', '빛나듯', '\n', '희망', '은', '절망', '속', '에', '싹트는', '거지', '\n', '만약', '에', '우리', '가', '희망', '함', '이', '적다면', '\n', '그', '누가', '세상', '을', '비출어줄까', '.', '\n', '정희성', ',', '희망', '공부']


In [28]:
# Extract only the tokens that Okt classifies as nouns.
noun_tokens = okt.nouns(sentence)
print(noun_tokens)

['절망', '반대', '희망', '어', '두운', '밤하늘', '별', '희망', '절망', '속', '거지', '만약', '우리', '희망', '함', '그', '누가', '세상', '정희성', '희망', '공부']


In [29]:
# Tag each morpheme with its part of speech and print the (morpheme, tag) pairs.
tagged_morphemes = okt.pos(sentence)
print(tagged_morphemes)

[('절망', 'Noun'), ('의', 'Josa'), ('반대', 'Noun'), ('가', 'Josa'), ('희망', 'Noun'), ('은', 'Josa'), ('아니다', 'Adjective'), ('.', 'Punctuation'), ('\n', 'Foreign'), ('어', 'Noun'), ('두운', 'Noun'), ('밤하늘', 'Noun'), ('에', 'Josa'), ('별', 'Noun'), ('이', 'Josa'), ('빛나듯', 'Verb'), ('\n', 'Foreign'), ('희망', 'Noun'), ('은', 'Josa'), ('절망', 'Noun'), ('속', 'Noun'), ('에', 'Josa'), ('싹트는', 'Verb'), ('거지', 'Noun'), ('\n', 'Foreign'), ('만약', 'Noun'), ('에', 'Josa'), ('우리', 'Noun'), ('가', 'Josa'), ('희망', 'Noun'), ('함', 'Noun'), ('이', 'Josa'), ('적다면', 'Verb'), ('\n', 'Foreign'), ('그', 'Noun'), ('누가', 'Noun'), ('세상', 'Noun'), ('을', 'Josa'), ('비출어줄까', 'Verb'), ('.', 'Punctuation'), ('\n', 'Foreign'), ('정희성', 'Noun'), (',', 'Punctuation'), ('희망', 'Noun'), ('공부', 'Noun')]
