# 字词前置处理

## 载入相关套件

In [1]:
import nltk

## 测试文章段落

In [17]:
# Sample paragraph (three sentences) used by the tokenization cells below
text = (
    "Today is a great day. It is even better than yesterday."
    " And yesterday was the best day ever."
)

## 分割字句

In [5]:
# Split the paragraph into a list of sentences
nltk.sent_tokenize(text)

['Today is a great day.',
 'It is even better than yesterday.',
 'And yesterday was the best day ever.']

## 分词

In [6]:
# Split the paragraph into word and punctuation tokens
nltk.word_tokenize(text)

['Today',
 'is',
 'a',
 'great',
 'day',
 '.',
 'It',
 'is',
 'even',
 'better',
 'than',
 'yesterday',
 '.',
 'And',
 'yesterday',
 'was',
 'the',
 'best',
 'day',
 'ever',
 '.']

## 词形还原

In [8]:
# Stemming: run the Porter stemmer over every whitespace-separated word
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
ps = nltk.porter.PorterStemmer()
' '.join(map(ps.stem, text.split()))

'My system keep crash hi crash yesterday, our crash daili'

In [9]:
# Lemmatization: dictionary-based word normalization using WordNet,
# applied to every whitespace-separated word
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
lem = nltk.WordNetLemmatizer()
' '.join(map(lem.lemmatize, text.split()))

'My system keep crashing his crashed yesterday, ours crash daily'

## 停用词(Stopwords)

In [19]:
# Punctuation characters provided by the standard library
import string
print('标点符号:', string.punctuation)

# Sample paragraph used to test stopword removal
text = (
    "Today is a great day. It is even better than yesterday."
    " And yesterday was the best day ever."
)
# Build the stopword set: NLTK's English stopwords plus every single
# punctuation character, so both are dropped by the same membership test
stopword_list = set(nltk.corpus.stopwords.words('english'))
stopword_list.update(string.punctuation)

# Removing stopwords
def remove_stopwords(text, is_lower_case=False):
    """Tokenize ``text`` and drop every token found in ``stopword_list``.

    Args:
        text: Raw text to process.
        is_lower_case: When true, ``text`` is lower-cased before it is
            tokenized. When false the text is used as-is; the stopword
            lookup is an exact match, so tokens whose casing differs from
            the entries in ``stopword_list`` are kept.

    Returns:
        A ``(filtered_text, filtered_tokens)`` tuple: the surviving tokens
        joined by single spaces, and the same tokens as a list.
    """
    source = text.lower() if is_lower_case else text
    kept = []
    for raw in nltk.word_tokenize(source):
        word = raw.strip()
        if word in stopword_list:
            continue
        kept.append(word)
    return ' '.join(kept), kept

# Filter the sample paragraph without lower-casing it, then display the
# remaining words as one space-joined string
filtered_text, filtered_tokens = remove_stopwords(text) 
filtered_text

标点符号: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


'Today great day It even better yesterday And yesterday best day ever'

## BOW 测试

In [25]:
import collections

# Load the article used for the bag-of-words test. The file is only read,
# so open it read-only ('r+' would also require write permission).
with open('./NLP_data/news.txt', 'r', encoding='UTF-8') as f:
    text = f.read()

# Lower-case the text, tokenize it and drop stopwords/punctuation
filtered_text, filtered_tokens = remove_stopwords(text, True)

# Vocabulary with occurrence counts; Counter tallies the iterable directly
word_freqs = collections.Counter(filtered_tokens)
# Show the 20 most frequent tokens as (token, count) pairs
print(word_freqs.most_common(20))

[('’', 35), ('stores', 15), ('convenience', 14), ('one', 8), ('—', 8), ('even', 8), ('seoul', 8), ('city', 7), ('korea', 6), ('korean', 6), ('cities', 6), ('people', 5), ('summer', 4), ('new', 4), ('also', 4), ('find', 4), ('store', 4), ('would', 4), ('like', 4), ('average', 4)]


In [29]:
# Removing stopwords, using a regex tokenizer plus lemmatization
lem = nltk.WordNetLemmatizer()

def remove_stopwords_regex(text, is_lower_case=False):
    """Tokenize ``text`` on ``\\w+`` runs, lemmatize, then drop stopwords.

    Args:
        text: Raw text to process.
        is_lower_case: When true, ``text`` is lower-cased before it is
            tokenized.

    Returns:
        A ``(filtered_text, filtered_tokens)`` tuple: the surviving lemmas
        joined by single spaces, and the same lemmas as a list.
    """
    source = text.lower() if is_lower_case else text
    # Only runs of word characters become tokens, so punctuation is dropped
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    kept = []
    for raw in word_tokenizer.tokenize(source):
        lemma = lem.lemmatize(raw.strip())
        # NOTE(review): the stopword test sees the lemma, not the original
        # token, so a stopword whose lemma differs from it is not removed
        # (presumably the source of 'ha' in the recorded output) -- confirm
        # whether filtering should happen before lemmatizing.
        if lemma not in stopword_list:
            kept.append(lemma)
    return ' '.join(kept), kept

# Repeat the frequency count on the regex-tokenized, lemmatized tokens
filtered_text, filtered_tokens = remove_stopwords_regex(text, True)
# Counter tallies the iterable directly; no manual counting loop is needed
word_freqs = collections.Counter(filtered_tokens)
# Show the 20 most frequent tokens as (token, count) pairs
print(word_freqs.most_common(20))

[('store', 19), ('convenience', 14), ('city', 13), ('one', 8), ('even', 8), ('seoul', 8), ('korea', 6), ('korean', 6), ('night', 6), ('food', 5), ('ha', 5), ('people', 5), ('summer', 4), ('new', 4), ('life', 4), ('also', 4), ('find', 4), ('would', 4), ('like', 4), ('chain', 4)]


In [32]:
# Check which lemma the lemmatizer produces for 'korean'
lem.lemmatize('korean')

'korean'

## 相似词(Synonyms)

In [9]:
# Look up the WordNet synsets for the word 'love' and display them
synonyms = nltk.corpus.wordnet.synsets('love')
synonyms

[Synset('love.n.01'),
 Synset('love.n.02'),
 Synset('beloved.n.01'),
 Synset('love.n.04'),
 Synset('love.n.05'),
 Synset('sexual_love.n.02'),
 Synset('love.v.01'),
 Synset('love.v.02'),
 Synset('love.v.03'),
 Synset('sleep_together.v.01')]

In [11]:
# Definition of the first synset
synonyms[0].definition()

'a strong positive emotion of regard and affection'

In [12]:
# Example sentences for the first synset
synonyms[0].examples()

['his love for his work', 'children need a lot of love']

## 相反词(Antonyms)

In [14]:
# Collect the first antonym of every lemma of 'ugly' that has at least one
antonyms = []
for syn in nltk.corpus.wordnet.synsets('ugly'):
    for l in syn.lemmas():
        # The [:1] slice is empty when the lemma has no antonyms, so
        # nothing is added in that case
        antonyms.extend(opposite.name() for opposite in l.antonyms()[:1])
antonyms

['beautiful']

## 词性标签(POS Tagging)

In [16]:
# Part-of-speech tagging: split the text into sentences, then tag each one
text='I am a human being, capable of doing terrible things'
sentences=nltk.sent_tokenize(text)
for sent in sentences:
    # Tokenize the sentence and print its (token, tag) pairs
    print(nltk.pos_tag(nltk.word_tokenize(sent)))

[('I', 'PRP'), ('am', 'VBP'), ('a', 'DT'), ('human', 'JJ'), ('being', 'VBG'), (',', ','), ('capable', 'JJ'), ('of', 'IN'), ('doing', 'VBG'), ('terrible', 'JJ'), ('things', 'NNS')]
