In [43]:
# 分词，不同模式
import jieba

seg_list = jieba.cut('我在学习自然语言处理', cut_all=True)
print('Full Mode: ', '/'.join(seg_list))  # 全模式

seg_list = jieba.cut('我在学习自然语言处理', cut_all=False)
print('Default Mode: ', '/'.join(seg_list))  # 精确模式

seg_list = jieba.cut('他来到了网易京研大厦')  
print('Default Mode: ', '/'.join(seg_list))  # 默认精确模式

seg_list = jieba.cut_for_search('小明硕士毕业于中国科学院计算所，后在日本京都大学深造')
print('Search Mode: ', '/'.join(seg_list))  # 搜索引擎模式


Full Mode:  我/在/学习/自然/自然语言/语言/处理
Default Mode:  我/在/学习/自然语言/处理
Default Mode:  他/来到/了/网易/京研/大厦
Search Mode:  小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/，/后/在/日本/京都/大学/日本京都大学/深造


In [82]:
# 示例，给一个篇文章分词
import jieba
import jieba.analyse
import re
import codecs
import string
from zhon.hanzi import punctuation

# 去掉标点符号
all_punctuation = punctuation + string.punctuation  # 中文标点符号和英文标点符号的集合
punctuation_pattern = '[{p}]+'.format(p=all_punctuation)

# 加载自定义字典和停用词
user_dict_file = 'D:/lijiangming/docs/algorithm/library/my_corpus/my.word.dict.txt'
jieba.load_userdict(user_dict_file)
stop_words_file = 'D:/lijiangming/docs/algorithm/library/my_corpus/my.stop.words.txt'
stop_words = [' ']
with open(stop_words_file, 'r', encoding='utf-8') as fr:
    for line in fr.readlines():
        line = line.strip()
        stop_words.append(line)

# 加载语料，做分词
my_corpus_file = 'D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.txt'
out_put = ''
total_word_count = 0
with open(my_corpus_file, 'r', encoding='utf-8') as fr:
    for line in fr.readlines():
        line = line.strip()
        if len(line) == 0:
            continue
        line = re.sub(punctuation_pattern, ' ', line)  # 去掉标点符号
        line = line.strip(' ')
        seg_list = jieba.cut(line, cut_all=False)
        
        result = []
        for seg in seg_list:
            if seg in stop_words:
                continue
            result.append(seg)
        
        total_word_count += len(result)
        out_put += ' '.join(result) + '\n'

out_file = 'D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.seg.txt'
fw = open(out_file, 'w', encoding='utf-8')
fw.write(out_put)
fw.close()


In [36]:
# 关键词提取

import codecs
import jieba.analyse as analyse

lines = codecs.open('D:/lijiangming/docs/algorithm/article/首届蓝莓大赏.txt', encoding='utf-8').read()

print('analyse: ', analyse.extract_tags(lines, topK=10, withWeight=True, allowPOS=()))
# print('lines: ', lines)
print('textrank: ', analyse.textrank(lines, topK=20, withWeight=False, allowPOS=()))

analyse:  [('VIPKID', 0.46233907469779), ('蓝莓', 0.29721797659143645), ('评测', 0.23389380244254146), ('教研', 0.1651210981063536), ('机构', 0.10281262053458563), ('外教', 0.09698994260718233), ('教材', 0.09522070633414365), ('服务', 0.08854393254812154), ('在线', 0.08843672689878454), ('家长', 0.08365698062375691)]
textrank:  []


In [29]:
# 词性标注

import jieba.posseg as pseg
words = pseg.cut('我爱北京天安门')
for word, flag in words:
    print(word, '\t', flag)


我 	 r
爱 	 v
北京 	 ns
天安门 	 ns
