# In [None]:  (notebook cell marker left over from export)
import os
import jieba
import jieba.analyse
import math
#from New_word_find import New_word

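"""
a. file_name: input file name; the file holds one text per line
b. out_file: output file name; each line is written as "segment idf"
c. out_file: written to the same directory as file_name
"""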

class Key_word_exact():
    """Mine keywords from a corpus file that holds one text per line.

    All file names are resolved relative to the directory that contains
    this source file.  Configure the instance through the ``set_*``
    methods, call ``word_exact()`` to obtain the keywords, and optionally
    persist them with ``save()``.
    """

    def __init__(self):
        self.__corpus_file_name = None
        self.__idf_file_name = None
        self.__stop_word_file_name = None
        self.__special_list = []
        self.__exact = None
        self.__topK = 0
        self.__num = 10000
        self.__percentage = 100

    @staticmethod
    def _resolve_path(file_name):
        """Return ``file_name`` joined onto this source file's directory."""
        # os.path.join picks the platform separator; a hard-coded '\\' only
        # yields a usable path on Windows.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(base_dir, file_name)

    def set_corpus_file_name(self, corpus_file_name):
        """Set the corpus file name (one text per line)."""
        assert isinstance(corpus_file_name, str)
        self.__corpus_file_name = corpus_file_name

    def set_idf_file_name(self, idf_file_name):
        """Set the name of the file the corpus IDF table is written to."""
        assert isinstance(idf_file_name, str)
        self.__idf_file_name = idf_file_name

    def set_stop_word_file_name(self, stop_word_file_name):
        """Set the name of the stop-word file handed to jieba."""
        assert isinstance(stop_word_file_name, str)
        self.__stop_word_file_name = stop_word_file_name

    def set_special_list(self, special_list=None):
        """Set the segments ignored while counting document frequencies.

        Args:
            special_list: list of segments to ignore.  Omitted or ``None``
                means ``['\\n']``.
        """
        if special_list is None:
            # Build a fresh list per call: a list literal used as the default
            # argument would be one object shared by every instance.
            special_list = ['\n']
        assert isinstance(special_list, list)
        # Copy so that later changes to the caller's list do not leak in.
        self.__special_list = list(special_list)

    def set_exact(self, exact='tfidf'):
        """Select the mining mode: ``'tfidf'``, ``'textrank'`` or ``'tf'``."""
        assert isinstance(exact, str) and exact in ('tfidf', 'textrank', 'tf')
        self.__exact = exact

    def set_topK(self, topK=10):
        """Set how many keywords are requested per text (tfidf / textrank)."""
        assert isinstance(topK, int) and topK > 0
        self.__topK = topK

    def set_num(self, num=10000):
        """Set how many leading corpus lines are mined for keywords."""
        assert isinstance(num, int)
        self.__num = num

    def set_percentage(self, percentage=100):
        """Set the percentage (0-100) of ranked words kept in ``'tf'`` mode."""
        assert isinstance(percentage, int) and 0 <= percentage <= 100
        self.__percentage = percentage

    def _write_idf_file(self, corpus_path, idf_path):
        """Compute log(N / (df + 1)) per segment and write "segment idf" lines."""
        corpus_scale = 0
        count_dict = {}
        with open(corpus_path, 'r', encoding='utf-8') as file:
            for text in file:
                corpus_scale += 1
                # A set counts each segment at most once per text.
                for segment in set(jieba.cut(text)):
                    if segment not in self.__special_list:
                        count_dict[segment] = count_dict.get(segment, 0) + 1

        with open(idf_path, 'w', encoding='utf-8') as file:
            for segment, doc_count in count_dict.items():
                # Each line is "<segment> <idf>" separated by one space, so a
                # segment that is empty or contains whitespace cannot be
                # represented; skip it instead of writing a malformed line.
                # NOTE(review): jieba's IDF loader appears to reject such
                # lines -- confirm against the installed jieba version.
                if segment.split() != [segment]:
                    continue
                segment_idf = str(math.log(corpus_scale / (doc_count + 1)))
                file.write(segment + ' ' + segment_idf + '\n')

    @staticmethod
    def _unique_keywords(contents, extract):
        """Apply ``extract`` to every text; return keywords in first-seen order."""
        seen = set()
        ordered = []
        for content in contents:
            for keyword in extract(content):
                # Set membership keeps de-duplication linear in the number of
                # keywords instead of rescanning the result list every time.
                if keyword not in seen:
                    seen.add(keyword)
                    ordered.append(keyword)
        return ordered

    def word_exact(self):
        """Mine keywords from the corpus using the configured mode.

        The corpus holds one text per line.  When an IDF file name is set,
        the IDF table of the whole corpus is written to it and loaded into
        jieba; when a stop-word file name is set, it is loaded into jieba.
        Only the first ``num`` lines are then mined:

        * ``'tfidf'`` / ``'textrank'``: ``topK`` is passed to jieba for each
          text and the results are merged without duplicates, in first-seen
          order.
        * ``'tf'``: every ``cut_for_search`` segment is counted over all
          mined lines and the top ``percentage`` percent, ordered by
          descending count, is returned.

        Returns:
            list: the mined keywords (empty when no mode has been selected).

        Raises:
            ValueError: if the corpus file name has not been set.
        """
        if not self.__corpus_file_name:
            raise ValueError(
                'corpus file name is not set; call set_corpus_file_name() first')
        corpus_path = self._resolve_path(self.__corpus_file_name)

        # IDF and stop-word files are optional: touch them only when set.
        if self.__idf_file_name:
            idf_path = self._resolve_path(self.__idf_file_name)
            self._write_idf_file(corpus_path, idf_path)
            jieba.analyse.set_idf_path(idf_path)
        if self.__stop_word_file_name:
            jieba.analyse.set_stop_words(
                self._resolve_path(self.__stop_word_file_name))

        with open(corpus_path, 'r', encoding='utf-8') as file:
            contents = file.readlines()[:self.__num]

        top_k = self.__topK
        if self.__exact == 'tfidf':
            return self._unique_keywords(
                contents,
                lambda text: jieba.analyse.extract_tags(text, topK=top_k))

        if self.__exact == 'textrank':
            return self._unique_keywords(
                contents,
                lambda text: jieba.analyse.textrank(text, topK=top_k))

        if self.__exact == 'tf':
            keyword_dict = {}
            for content in contents:
                for keyword in jieba.cut_for_search(content.replace('\n', '')):
                    keyword_dict[keyword] = keyword_dict.get(keyword, 0) + 1

            # Stable sort: words with equal counts keep first-seen order.
            ranked = sorted(keyword_dict, key=keyword_dict.get, reverse=True)
            # Integer arithmetic avoids float truncation: (29 / 100) * 100
            # evaluates to 28.999999999999996 and would floor to 28.
            keep = len(ranked) * self.__percentage // 100
            return ranked[:keep]

        return []

    def save(self, file_name, all_key_word):
        """Write the keywords, one per line, next to this source file.

        Args:
            file_name: name of the output file.
            all_key_word: list of keyword strings to write.
        """
        assert isinstance(file_name, str) and isinstance(all_key_word, list)
        save_path = self._resolve_path(file_name)

        with open(save_path, 'w', encoding='utf-8') as file:
            file.writelines(key_word + '\n' for key_word in all_key_word)
        print('save finish ! the save path:', save_path)

if __name__=='__main__':

    # Configure a single extractor and reuse it for every mining mode below.
    extractor = Key_word_exact()
    extractor.set_corpus_file_name('text2.txt')
    extractor.set_idf_file_name('text_idf.txt')
    extractor.set_stop_word_file_name('stopword.txt')
    extractor.set_special_list()

    extractor.set_topK(5)

    # Called without an argument, so the default line limit (10000) applies.
    extractor.set_num()

    # (mode, output file, console label), listed in the order they are run.
    runs = (
        ('textrank', 'textrank.txt', 'textrank : '),
        ('tfidf', 'tfidf.txt', 'tfidf :'),
        ('tf', 'tf1.txt', 'tf :'),
    )
    for mode, out_name, label in runs:
        if mode == 'tf':
            # The percentage is set right before the 'tf' run.
            extractor.set_percentage(100)
        extractor.set_exact(mode)
        keywords = extractor.word_exact()
        extractor.save(out_name, keywords)
        print(label, keywords)
