# 关键词抽取


* TF-IDF
* TextRank
* [EmbedRank](https://github.com/luozhouyang/embedrank)

In [6]:
import os
import re

In [4]:
!pip install -q jieba

In [7]:
import jieba

## TF-IDF

* **TF**: term frequency, 词语在文档中出现的次数
* **IDF**: inverse doucment frequence, 包含改词语的文档占总文档数量的比例的倒数

$$tf = \frac{count(w)}{\sum_{w_i} count(w_i)}$$

$$idf = \log{\frac{N}{\sum_{i=1}^N I(w, N_i)}}$$

防止分母为零，需要平滑处理，一般采用 **+1** 平滑

$$idf = \log{\frac{N+1}{\sum_{i=1}^N I(w, N_i) + 1}}$$

In [8]:
class KeywordsExtractor:
    
    def __init__(self, stopwords_file=None):
        self.stopwords = self._load_stopwords(stopwords_file) if stopwords_file else None
        
    def _load_stopwords(self, file):
        words = set()
        if not os.path.exists(file):
            print('File %s does not exist.' % file)
            return words
        with open(file, mode='rt', encoding='utf8') as fin:
            for lin in fin:
                line = line.strip('\n').strip()
                if not line:
                    continue
                words.add(line)
        return words
    
    def extract_keywords(self, document, *args, **kwargs):
        raise NotImplementedError()

In [9]:
class TFIDFKeywordsExtractor(KeywordsExtractor):
    
    def __init__(self, idf_file, stopwords_file=None):
        super().__init__(stopwords_file=stopwords_file)
        self.idfmap = self._load_idf(idf_file) if idf_file else dict()
        self.median_idf = sorted(self.idfmap.values())[len(self.idfmap)//2]
        
    def _load_idf(self, file):
        m = dict()
        if not os.path.exists(file):
            print('File %s does not exist.' % file)
            return m
        with open(file, mode='rt', encoding='utf8') as fin:
            for line in fin:
                line = line.strip('\n').strip()
                parts = line.split(' ')
                if len(parts) != 2:
                    continue
                m[parts[0]] = float(parts[1])
        return m
    
    def extract_keywords(self, document, topk=20):
        freq = {}
        for word in jieba.cut(document):
            word = word.strip()
            if len(word) < 2:
                continue
            if word in self.stopwords:
                continue
            freq[word] = freq.get(word, 0) + 1
        
        total_freq = sum(freq.values())
        idf = {}
        for k in freq.keys():
            idf[k] = freq[k] * self.idfmap.get(k, self.median_idf)
        return sorted(idf.items(), key=lambda x:x[1], reverse=True)
        

idf一般需要大量的数据统计得到。

pyspark提供了教程：

* [ml-features](https://spark.apache.org/docs/latest/ml-features)
* [tf-idf](https://spark.apache.org/docs/latest/ml-features#tf-idf)

以下是一个使用spark在Hadoop统计idf的代码：

In [13]:
!pip install -q -i https://mirrors.aliyun.com/pypi/simple pyspark numpy

In [None]:
import re
import logging

import jieba

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, SparkSession
from pyspark import Row
from pyspark.ml.feature import IDF, HashingTF, Tokenizer


jieba.initialize()


def get_spark(master='local[*]', app_name='idf'):
    spark = SparkSession.builder \
        .appName(app_name) \
        .master(master) \
        .config('spark.executor.memory', '8g') \
        .config('spark.executor.cores', '8') \
        .config('spark.cores.max', '8') \
        .config('spark.driver.memory', '8g') \
        .getOrCreate()
    return spark


def _collect_documents(x):
    segs = x.split('\t')
    if len(segs) != 9:
        return []
    jd_json = segs[8]
    jd_json = re.sub(r'\n\t', '', jd_json)
    jd_json = re.sub(r'\\s+', ' ', jd_json)
    jd_json = jd_json.lower()
    return [jd_json]  # 整个JD作为一个document


def _tokenize(x):
    words = []
    for w in jieba.cut(x):
        w = w.strip()
        if not w:
            continue
        words.append(w)
    return words


def _idf_flat_map(x):
    items = []
    for w, _id, tf, idf in zip(x.words, x.tf.indices, x.tf.values, x.idf.values):
        items.append((w, idf))
    return items


def _debug(x):
    print(type(x))
    return x


def _filter_idf(x):
    w, v = x[0], x[1]
    if len(w) <= 1:
        return False
    if re.match(r'^[0-9]+$', x):
        return False
    if re.match(r'[0-9]{6,}', x):
        return False
    if re.match(r'^[0-9]+.[0-9]+$', w):
        return False
    return True


def calculate(input_path, output_path, parts=16):
    spark = get_spark()
    sc = spark.sparkContext

    rdd = sc.textFile(input_path)
    rdd = rdd.filter(lambda x: len(x.split('\t')) == 9)
    rdd = rdd.flatMap(_collect_documents).filter(lambda x: x)
    rdd = rdd.map(_tokenize).filter(lambda x: x).map(lambda x: Row(words=x))
    # rdd = rdd.map(_debug)

    df = rdd.toDF()
    # numFeatures即hash桶数
    hashingTF = HashingTF(inputCol='words', outputCol='tf', numFeatures=2 << 20)
    featuredData = hashingTF.transform(df)

    idf = IDF(inputCol='tf', outputCol='idf')
    idfModel = idf.fit(featuredData)
    res = idfModel.transform(featuredData)

    rdd = res.rdd.flatMap(_idf_flat_map).reduceByKey(lambda a, b: a).sortBy(lambda x: x[0], ascending=True)
    rdd = rdd.filter(_filter_idf)
    rdd = rdd.map(lambda x: x[0] + '\t' + str(x[1]))
    rdd.repartition(parts).saveAsTextFile(output_path)


if __name__ == "__main__":
    input_file = 'hdfs:///basic_data/tob/tob_ats/recruit_step_v3/part-00099-8d87777f-34ee-431a-be5d-8a6f0b92fea9-c000.txt'
    output_file = 'hdfs:///user/kdd_luozhouyang/idf/jd/20200509'
    calculate(input_file, output_file, parts=1)


## TextRank



In [2]:
!pip install -q -i https://mirrors.aliyun.com/pypi/simple networkx

In [20]:
import itertools

import networkx as nx
import jieba.posseg as jp

In [61]:
class TextRankKeywordsExtractor(KeywordsExtractor):
    
    def _unique_tokens(self, all_words):
        words = []
        for k in all_words:
            if k in words:
                continue
            words.append(k)
        return words
    
    def _edit_distance(self, a, b):
        m, n = len(a)-1, len(b)-1
        dp = [[0]*(n+1) for _ in range(m+1)] # (m+1)*(n+1)
        for i in range(m+1):
            dp[i][0] = i
        for j in range(n+1):
            dp[0][j] = j
        for i in range(1, m+1):
            for j in range(1, n+1):
                if a[i-1] == b[j-1]:
                    dp[i][j] = dp[i-1][j-1]
                else:
                    dp[i][j] = 1 + max(dp[i-1][j], dp[i][j-1])
        return dp[m][n]
    
    def _build_graph(self, words):
        g = nx.Graph()
        g.add_nodes_from(words)
        pairs = list(itertools.combinations(words, 2))
        
        for p in pairs:
            first, second = p[0], p[1]
            # 使用编辑距离来作为词语的相似度衡量，可以使用其他方式
            ed = self._edit_distance(first, second)
            g.add_edge(first, second, weight=ed)
        
        return g
    
    def extract_keywords(self, document):
        words = [w.strip() for w in jieba.cut(document) if w.strip()]
        unique_words = self._unique_tokens(words)
        
        graph = self._build_graph(unique_words)
        textrank = nx.pagerank(graph, weight='weight')
        print(textrank)
        # 所有的节点
        keyphrase = sorted(textrank, key=textrank.get, reverse=True)
        # 取1/3
        keyphrase = keyphrase[0:len(unique_words)//3 + 1]
        print(keyphrase)
        
        # 相邻的词合并成短语
        res, dealt = set(), set()
        i, j = 0, 0
        while j < len(words):
            a, b = words[i], words[j]
            if a in keyphrase and b in keyphrase:
                res.add(a + ' ' + b)
                dealt.add(a)
                dealt.add(b)
            else:
                if a in keyphrase and a not in dealt:
                    res.add(a)
                if j == len(words)-1 and b in keyphrase and b not in dealt:
                    res.add(b)
            i += 1
            j += 1
        return res


In [62]:
textrank = TextRankKeywordsExtractor()

In [63]:
res = textrank.extract_keywords('java开发工程师')
print(res)

{'java': 0.37078347266331135, '开发': 0.2962171231279811, '工程师': 0.3329994042087073}
['java', '工程师']
{'java java', '工程师 工程师'}
