In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag

from konlpy.tag import Okt

import re

In [3]:
#nltk.download('punkt')
#nltk.download('punkt_tab')
#nltk.download('webtext')
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('averaged_perceptron_tagger_eng')
#nltk.download('tagsets_json')

In [4]:
para = "Hello everyone. It's good to see you. Let's start our text mining class!"
print(sent_tokenize(para))

['Hello everyone.', "It's good to see you.", "Let's start our text mining class!"]


In [5]:
para_kor = "안녕하세요, 여러분. 만나서 반갑습니다. 이제 텍스트마이닝 클래스를 시작해봅시다!"
print(sent_tokenize(para_kor))

['안녕하세요, 여러분.', '만나서 반갑습니다.', '이제 텍스트마이닝 클래스를 시작해봅시다!']


In [6]:
print(word_tokenize(para)), print(WordPunctTokenizer().tokenize(para))

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']
['Hello', 'everyone', '.', 'It', "'", 's', 'good', 'to', 'see', 'you', '.', 'Let', "'", 's', 'start', 'our', 'text', 'mining', 'class', '!']


(None, None)

In [7]:
print(re.findall(r'[abc]', 'How are you, boy?'))
print(re.findall(r'[\w]', '3a 7b_ ".^&5Cp9D'))
print(re.findall(r'[_]+', 'a_b, c__d, e___f'))
print(re.findall(r'[\w]+', 'How are you, boy?'))
print(re.findall(r'[o]{2,4}', 'How are you, boy? booy? booooy? boooooy?'))

['a', 'b']
['3', 'a', '7', 'b', '_', '5', 'C', 'p', '9', 'D']
['_', '__', '___']
['How', 'are', 'you', 'boy']
['oo', 'oooo', 'oooo']


In [8]:
text1 = "Sorry, I can't go there."
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize("Sorry, I can't go there."))

['Sorry', 'I', "can't", 'go', 'there']


In [9]:
tokenizer = RegexpTokenizer(r"[\w']{3,}")
print(tokenizer.tokenize(text1.lower()))

['sorry', "can't", 'there']


In [10]:
eng_stops = set(stopwords.words('english'))
print(eng_stops)

{'by', 'of', 'after', 'now', 'am', 'be', 'on', 'the', 'an', 'so', 'into', 'was', 'having', 'itself', 'while', 'but', 'her', 'where', 'yours', 'shouldn', "i've", "couldn't", 'just', 'other', "that'll", 'through', "wasn't", 'further', "doesn't", 'only', "you'd", 'or', 'more', 'shan', "he's", 'themselves', 'who', 'me', "hasn't", 'herself', 'll', 'such', 'myself', 'most', 'she', 'd', "mustn't", 'should', 'didn', 'own', 'needn', 'then', 'don', 'haven', 'hadn', "i'll", 'ma', "aren't", 'isn', 'at', "shan't", "you've", 'again', 'few', 'off', 'each', "they're", 'all', 'same', 'from', 'my', 'with', 'won', 'hasn', "hadn't", 'did', 'do', 'between', 'has', 'if', "it's", 'no', 'to', 'both', "should've", 'how', "isn't", "they'll", 'm', 'couldn', 'any', 'against', 'below', 'their', 'some', "he'd", 'in', 'why', 'is', 'being', 'yourself', 'during', 'than', "you'll", 're', 'its', 'o', "they've", 'very', 'can', "she'd", "didn't", 'his', "needn't", 'what', 'when', 'i', 'hers', 'doing', 'under', 'and', 'doe

In [11]:
text1 = "Sorry, I couldn't go to movie yesterday."
tokenizer = RegexpTokenizer(r"[\w']+")
tokens = tokenizer.tokenize(text1.lower())
result = [word for word in tokens if word not in eng_stops]
result

['sorry', 'go', 'movie', 'yesterday']

In [12]:
stemmers = [print(stemmer.stem('cooking'), stemmer.stem('cookery'), stemmer.stem('cookbooks')) for stemmer in [PorterStemmer(), LancasterStemmer()]]

cook cookeri cookbook
cook cookery cookbook


In [13]:
stemmer = PorterStemmer()
print(stemmer.stem('cooking'), stemmer.stem('cookery'), stemmer.stem('cookbooks'))
stemmer = LancasterStemmer()
print(stemmer.stem('cooking'), stemmer.stem('cookery'), stemmer.stem('cookbooks'))

cook cookeri cookbook
cook cookery cookbook


In [14]:
para = "Hello everyone. It's good to see you. Let's start our text mining class!"
para = re.sub(r"[^\w\s']", "", para)
print(para)
tokens = word_tokenize(para)
print(tokens)
result = [stemmer.stem(token) for token in tokens]
print(result)

Hello everyone It's good to see you Let's start our text mining class
['Hello', 'everyone', 'It', "'s", 'good', 'to', 'see', 'you', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class']
['hello', 'everyon', 'it', "'s", 'good', 'to', 'see', 'you', 'let', "'s", 'start', 'our', 'text', 'min', 'class']


In [15]:
lemma = WordNetLemmatizer()
print(lemma.lemmatize('cooking'), lemma.lemmatize('cooking', pos='v'), lemma.lemmatize('cookery'), lemma.lemmatize('cookbooks'))

cooking cook cookery cookbook


In [16]:
stemmer = PorterStemmer()
word = 'believes'
print(stemmer.stem(word), lemma.lemmatize(word), lemma.lemmatize(word, pos='v'))

believ belief believe


In [17]:
text = "Hello everyone. It's good to see you. Let's start our text mining class!"
tokens = word_tokenize(text)
print(tokens)
print(pos_tag(tokens))

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']
[('Hello', 'NNP'), ('everyone', 'NN'), ('.', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('good', 'JJ'), ('to', 'TO'), ('see', 'VB'), ('you', 'PRP'), ('.', '.'), ('Let', 'VB'), ("'s", 'POS'), ('start', 'VB'), ('our', 'PRP$'), ('text', 'NN'), ('mining', 'NN'), ('class', 'NN'), ('!', '.')]


In [18]:
nltk.help.upenn_tagset('CC')

CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet


In [19]:
my_tagset = ['NN', 'VB', 'JJ']
my_words = [(word, tag) for word, tag in pos_tag(tokens) if tag in my_tagset]
print(my_words)

[('everyone', 'NN'), ('good', 'JJ'), ('see', 'VB'), ('Let', 'VB'), ('start', 'VB'), ('text', 'NN'), ('mining', 'NN'), ('class', 'NN')]


In [20]:
words_with_tag = ['/'.join(item) for item in pos_tag(tokens)]
print(words_with_tag)

['Hello/NNP', 'everyone/NN', './.', 'It/PRP', "'s/VBZ", 'good/JJ', 'to/TO', 'see/VB', 'you/PRP', './.', 'Let/VB', "'s/POS", 'start/VB', 'our/PRP$', 'text/NN', 'mining/NN', 'class/NN', '!/.']


In [22]:
okt = Okt()
sentence = '''절망의 반대가 희망은 아니다.
어두운 밤하늘에 별이 빛나듯
희망은 절망 속에 싹트는 거지
만약에 우리가 희망함이 적다면
그 누가 세상을 비출어줄까.
정희성, 희망 공부'''

In [23]:
print(okt.morphs(sentence))
print(okt.nouns(sentence))
print(okt.pos(sentence))

['절망', '의', '반대', '가', '희망', '은', '아니다', '.', '\n', '어', '두운', '밤하늘', '에', '별', '이', '빛나듯', '\n', '희망', '은', '절망', '속', '에', '싹트는', '거지', '\n', '만약', '에', '우리', '가', '희망', '함', '이', '적다면', '\n', '그', '누가', '세상', '을', '비출어줄까', '.', '\n', '정희성', ',', '희망', '공부']
['절망', '반대', '희망', '어', '두운', '밤하늘', '별', '희망', '절망', '속', '거지', '만약', '우리', '희망', '함', '그', '누가', '세상', '정희성', '희망', '공부']
[('절망', 'Noun'), ('의', 'Josa'), ('반대', 'Noun'), ('가', 'Josa'), ('희망', 'Noun'), ('은', 'Josa'), ('아니다', 'Adjective'), ('.', 'Punctuation'), ('\n', 'Foreign'), ('어', 'Noun'), ('두운', 'Noun'), ('밤하늘', 'Noun'), ('에', 'Josa'), ('별', 'Noun'), ('이', 'Josa'), ('빛나듯', 'Verb'), ('\n', 'Foreign'), ('희망', 'Noun'), ('은', 'Josa'), ('절망', 'Noun'), ('속', 'Noun'), ('에', 'Josa'), ('싹트는', 'Verb'), ('거지', 'Noun'), ('\n', 'Foreign'), ('만약', 'Noun'), ('에', 'Josa'), ('우리', 'Noun'), ('가', 'Josa'), ('희망', 'Noun'), ('함', 'Noun'), ('이', 'Josa'), ('적다면', 'Verb'), ('\n', 'Foreign'), ('그', 'Noun'), ('누가', 'Noun'), ('세상', 'Noun'), ('을', 'Jo