In [1]:
from basic_methods.feature_extraction import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
import pandas as pd
import warnings
import numpy as np

In [2]:
# 读取数据
dataset = pd.read_csv('./resources/train.csv',sep="\t",names=["label","comment"],skiprows=1,encoding='utf-8')
comments = dataset['comment'].tolist()

In [3]:
# 使用 jieba、pkuseg、thulac 对首句切词
import jieba
import pkuseg
import thulac
jieba_cuts = jieba.cut(comments[0])
print(f"jieba result:{' '.join(jieba_cuts)}")
thu_lac = thulac.thulac(seg_only=True)
thu_cuts = thu_lac.cut(comments[0], text=True)
thu_cuts = list(filter(lambda s: not str.isspace(s),thu_cuts.split()))
print(f"thulac result:{' '.join(thu_cuts)}")
pku_seg = pkuseg.pkuseg()
pku_result = pku_seg.cut(comments[0])
print(f"pkuseg result:{' '.join(pku_result)}")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lonel\AppData\Local\Temp\jieba.cache
Loading model cost 1.171 seconds.
Prefix dict has been built succesfully.
jieba result:一如既往 地 好吃 ， 希望 可以 开 到 其他 城市
Model loaded succeed
thulac result:一如既往 地 好吃 ， 希望 可以 开 到 其他 城市
pkuseg result:一如既往 地 好吃 ， 希望 可以 开到 其他 城市


In [4]:
# 读取停用词并冻结
with open('./resources/stop_words_zh_cn.txt', 'r', encoding='utf-8') as f:
    stopwords = list(map(lambda line:line.rstrip('\n'),f.readlines()))
stopwords = frozenset(stopwords)

In [5]:
# 所有文档切词
cut_comments = list(map(lambda x: ' '.join(jieba.cut(x)), comments))

In [6]:
# 使用 sklearn 的计数器统计词频，并使用自己编写的 tfidf transformer 转换
warnings.filterwarnings("ignore")
count_vector = CountVectorizer(stop_words=stopwords)
X_train_counts = count_vector.fit_transform(cut_comments)
my_tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tf = my_tf_transformer.transform(X_train_counts)
feature_names = count_vector.get_feature_names()

In [7]:
# 部分 idf 值
for idx, word in enumerate(feature_names[-10:]):
  print("{}\t{}".format(word, my_tf_transformer.idf_[idx]))

龙啸	9.111828078308406
龙头	9.517293186416572
龙抄手	9.517293186416572
龙湖	8.824146005856626
龙湾	9.517293186416572
龙眼	9.517293186416572
龙虾	9.111828078308406
龙骨	9.517293186416572
龟苓膏	9.111828078308406
龟速	6.185088676241367


In [8]:
# 对于第 3 条文档，抽取其关键词
doc_idx = 2
feature_index = X_train_tf[doc_idx,:].nonzero()[1]
tfidfs = list(zip([feature_names[x] for x in feature_index],[X_train_tf[doc_idx, x] for x in feature_index]))
sorted(tfidfs, key=lambda x: x[1],reverse=True)
tfidfs[0:10]

[('打扫', 0.3720893477066808),
 ('阿姨', 0.2808786855211425),
 ('场地', 0.36034160249402514),
 ('服务', 0.13519599086077247),
 ('服务行业', 0.3720893477066808),
 ('古色古香', 0.36034160249402514),
 ('干净', 0.18014118338832386),
 ('环境', 0.13619610275722627),
 ('火爆', 0.33203634035971624),
 ('想象', 0.294618819111222)]

In [9]:
# 每文档的五个特征：子句数，词数，每词次数，每词词频，停用词量

In [10]:
# 子句数，展示部分
import re
def cut_sent(para):
    para = re.sub('([。！？\?])([^”’])', r"\1\n\2", para)
    para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)
    para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)
    para = re.sub('([。！？\?][”’])([^，。！？\?])', r'\1\n\2', para)
    para = para.rstrip()
    return para.split("\n")
subsents = list(map(lambda doc: len(list(filter(lambda subsent: not str.isspace(subsent),cut_sent(doc)))),comments))
subsents[0:10]

[1, 1, 2, 1, 1, 1, 1, 1, 1, 2]

In [11]:
# 词数，展示部分
wordnums = list(map(lambda sent: len(sent.split(' ')),cut_comments))
wordnums[0:10]

[10, 11, 44, 10, 30, 5, 5, 32, 9, 9]

In [12]:
# 每词次数, 以第三条部分为例
row_counts = list(zip(feature_names,X_train_counts.toarray()[doc_idx]))
row_counts[-10:]

[('龙啸', 0),
 ('龙头', 0),
 ('龙抄手', 0),
 ('龙湖', 0),
 ('龙湾', 0),
 ('龙眼', 0),
 ('龙虾', 0),
 ('龙骨', 0),
 ('龟苓膏', 0),
 ('龟速', 0)]

In [13]:
# 每词频率，以第三条部分为例
sum_counts = sum(X_train_counts.toarray()[doc_idx])
row_freqs = list(zip(feature_names,X_train_counts.toarray()[doc_idx]/sum_counts))
row_freqs[-10:]

[('龙啸', 0.0),
 ('龙头', 0.0),
 ('龙抄手', 0.0),
 ('龙湖', 0.0),
 ('龙湾', 0.0),
 ('龙眼', 0.0),
 ('龙虾', 0.0),
 ('龙骨', 0.0),
 ('龟苓膏', 0.0),
 ('龟速', 0.0)]

In [14]:
# 停用词数
stopword_nums = [len(list(filter(lambda x: x in stopwords, comment.split(' ')))) for comment in cut_comments]
stopword_nums[0:10]

[5, 5, 30, 4, 12, 2, 1, 21, 4, 6]

In [15]:
# 引入斯坦福 nlp
from stanfordcorenlp import StanfordCoreNLP

In [18]:
sentence = comments[0]

In [17]:
nlp = StanfordCoreNLP(r'resources/stanford-corenlp-full-2018-10-05', lang='zh')

In [19]:
# 分词
print(nlp.word_tokenize(sentence))

['一', '如', '既往', '地', '好吃', '，', '希望', '可以', '开到', '其他', '城市']


In [20]:
# 词性分析
print(nlp.pos_tag(sentence))

[('一', 'CD'), ('如', 'CS'), ('既往', 'VA'), ('地', 'DEV'), ('好吃', 'VA'), ('，', 'PU'), ('希望', 'VV'), ('可以', 'VV'), ('开到', 'VV'), ('其他', 'DT'), ('城市', 'NN')]


In [21]:
# 命名实体识别
print(nlp.ner(sentence))

[('一', 'NUMBER'), ('如', 'O'), ('既往', 'O'), ('地', 'O'), ('好吃', 'O'), ('，', 'O'), ('希望', 'O'), ('可以', 'O'), ('开到', 'O'), ('其他', 'IDEOLOGY'), ('城市', 'O')]


In [22]:
# 解析
print(nlp.parse(sentence))

(ROOT
  (NP
    (QP (CD 一))
    (IP
      (VP
        (ADVP (CS 如))
        (VP
          (VP
            (DVP
              (VP (VA 既往))
              (DEV 地))
            (VP (VA 好吃)))
          (PU ，)
          (VP (VV 希望)
            (IP
              (VP (VV 可以)
                (VP (VV 开到)
                  (NP
                    (DP (DT 其他))
                    (NP (NN 城市))))))))))))


In [23]:
# 依存句法分析
print(nlp.dependency_parse(sentence))

[('ROOT', 0, 7), ('dep', 5, 1), ('advmod', 5, 2), ('advmod:dvp', 5, 3), ('mark', 3, 4), ('dep', 7, 5), ('punct', 7, 6), ('aux:modal', 9, 8), ('ccomp', 7, 9), ('det', 11, 10), ('dobj', 9, 11)]


In [24]:
nlp.close()