In [1]:
import os
import sys
# Put the directory two levels above the current working directory at the front
# of sys.path, so the project-local import that follows (`offline`) resolves
# when this notebook is run from inside the project tree.
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))
# Interpreter path exported through both PySpark environment variables below.
PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# Set the interpreter explicitly: when several Python versions are installed,
# leaving it unspecified is likely to cause errors.
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON
from offline import SparkSessionBase

class KeywordsToTfidf(SparkSessionBase):
    """Session holder used by the keyword / TF-IDF cells of this notebook.

    The class attributes are configuration values; how they are consumed is
    decided by ``SparkSessionBase`` (imported from ``offline``, not shown here).
    After construction the session is available as ``self.spark``.
    """

    ENABLE_HIVE_SUPPORT = True
    SPARK_APP_NAME = "keywordsByTFIDF"
    # SPARK_EXECUTOR_MEMORY = "7g"  (left disabled, as in the original cell)

    def __init__(self):
        # _create_spark_session is not defined in this class; presumably it is
        # inherited from SparkSessionBase.
        session = self._create_spark_session()
        self.spark = session

In [2]:

ktt = KeywordsToTfidf()

In [3]:
# Switch the session to the `article` database, then pull a 20-row sample of
# `article_data` to prototype the rest of the notebook on.
ktt.spark.sql("use article")
article_dataframe = ktt.spark.sql("select * from article_data limit 20")

In [4]:
article_dataframe.show()

+----------+----------+------------+--------------------+--------------------+--------------------+
|article_id|channel_id|channel_name|               title|             content|            sentence|
+----------+----------+------------+--------------------+--------------------+--------------------+
|         1|        17|          前端|     Vue props用法小结原荐|<p><strong>Vue pr...|前端,Vue props用法小结原...|
|         2|        17|          前端|vue.js响应式原理解析与实现—...|<p>上次我们已经分析了vue.j...|前端,vue.js响应式原理解析与...|
|         3|        17|          前端|JavaScript中浅拷贝和深拷...|<p>要理解 JavaScript...|前端,JavaScript中浅拷贝...|
|         4|        17|          前端|基于vue2.0 +vuex+ e...|<p>效果演示地址,</p><p>...|前端,基于vue2.0 +vuex...|
|         5|        17|          前端|immutability因Reac...|<p><img src="http...|前端,immutability因R...|
|         6|        17|          前端|简单了解 node npm cnp...|<span id="OSC_h1_...|前端,简单了解 node npm ...|
|         7|        17|          前端|       Web工程师以太坊入门原荐|<p>我经常构建使用以太坊的Web...|前端,Web工程师以太坊入门原荐,...|


In [5]:
article_dataframe.rdd.take(1)

[Row(article_id=1, channel_id=17, channel_name='前端', title='Vue props用法小结原荐', content='<p><strong>Vue props用法详解</strong>组件接受的选项之一 props 是 Vue 中非常重要的一个选项。父子组件的关系可以总结为：</p><p><code>props down, events up</code></p><p>父组件通过 props 向下传递数据给子组件；子组件通过 events 给父组件发送消息。</p><p><strong>父子级组件</strong>比如我们需要创建两个组件 parent 和 child。需要保证每个组件可以在相对隔离的环境中书写，这样也能提高组件的可维护性。</p><p>这里我们先定义父子两个组件和一个 Vue 对象：</p><pre><code>var childNode = { template: `    &lt;div&gt;childNode&lt;/div&gt;    `};var parentNode = { template: `    &lt;div&gt;     &lt;child&gt;&lt;/child&gt;     &lt;child&gt;&lt;/child&gt;    &lt;/div&gt;    `, components: {  child: childNode }//前端全栈学习交流圈：866109386};//帮助1-3Ian前端人员，突破技术瓶颈，提升思维能力new Vue({ el: "#example", components: {  parent: parentNode }});</code></pre><pre><code>&lt;div id="example"&gt; &lt;parent&gt;&lt;/parent&gt;&lt;/div&gt;</code></pre><p>这里的 childNode 定义的 template 是一个 div，并且内容是"childNode"字符串。而在 parentNode 的 template 中定义了 div 的 class 名叫 parent 并且包含了两个 child 组件。</p><p><strong>静态 pr

In [6]:
def segmentation(partition):
    """Tokenise the ``sentence`` field of every row in one partition.

    Intended to be passed to ``RDD.mapPartitions``: every import and every
    resource (user dictionary, stop words) is loaded inside the function, once
    per call, so the function is self-contained on the worker side.

    Args:
        partition: iterable of row objects exposing ``article_id``,
            ``channel_id`` and ``sentence`` attributes.

    Yields:
        ``(article_id, channel_id, words)`` where ``words`` is the list of
        tokens that survive the stop-word / part-of-speech / length filter.
    """
    import os
    import re
    import codecs

    import jieba
    import jieba.posseg as pseg

    abspath = "/root/words"

    # Load the user dictionary into jieba.
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # Stop-word file, one word per line.
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords():
        """Return the stop words as a set (constant-time membership tests)."""
        # NOTE(review): no encoding is passed, so the platform default is used;
        # confirm the file's encoding and pass it explicitly if it can differ.
        # The `with` block guarantees the handle is closed on every worker.
        with codecs.open(stopwords_path) as stopwords_file:
            return {line.strip() for line in stopwords_file}

    stopwords = get_stopwords()

    # Compiled once per partition instead of once per row.
    tag_pattern = re.compile("<.*?>")

    def cut_sentence(sentence):
        """Segment ``sentence`` and keep only the tokens worth indexing.

        Kept: tokens longer than one character that are not stop words and are
        either English words longer than two characters, tagged with a flag
        starting with ``n``, or tagged ``x``.
        """
        # pseg.lcut returns pairs, e.g.
        # [pair('今天', 't'), pair('有', 'd'), pair('雾', 'n'), pair('霾', 'g')]
        kept_words = []
        for seg in pseg.lcut(sentence):
            word, flag = seg.word, seg.flag
            if word in stopwords or len(word) <= 1:
                continue
            if flag == "eng":
                # English tokens of one or two characters are dropped.
                if len(word) > 2:
                    kept_words.append(word)
            elif flag.startswith("n") or flag == "x":
                kept_words.append(word)
        return kept_words

    for row in partition:
        # Strip tag-like "<...>" spans before segmenting.
        sentence = tag_pattern.sub("", row.sentence)
        yield row.article_id, row.channel_id, cut_sentence(sentence)

In [7]:
# Run the tokeniser through mapPartitions and name the three yielded fields as
# DataFrame columns.
words_df = article_dataframe.rdd.mapPartitions(segmentation).toDF(["article_id", "channel_id", "words"])


In [8]:
words_df.show()

+----------+----------+--------------------+
|article_id|channel_id|               words|
+----------+----------+--------------------+
|         1|        17|[Vue, props, 用法, ...|
|         2|        17|[vue, 响应式, 原理, mo...|
|         3|        17|[JavaScript, 浅拷贝,...|
|         4|        17|[vue2, vuex, elem...|
|         5|        17|[immutability, Re...|
|         6|        17|[node, npm, cnpm,...|
|         7|        17|[Web, 工程师, 以太坊, 入...|
|         8|        17|[Web, pa, api, we...|
|         9|        17|[vue, 中用, 数据驱动, 视...|
|        10|        17|[程序, WebSocket, 长...|
|        11|        17|[flux, 架构, flux, ...|
|        12|        17|[合格, TypeScript, ...|
|        13|        17|[专属, 插件, Easy, Sl...|
|        14|        17|[前后端分离, vue, 网站前台...|
|        15|        17|[ajax, 页面, 重复提交, ...|
|        17|        17|[JSsearch, 购物网站, ...|
|        18|        17|[web, pa, react, ...|
|        19|        17|[合格, 事顶, 项目, 自我介绍...|
|        20|        17|[jQuery, 用法, jque...|
|        2

In [9]:
words_df.rdd.take(1)

[Row(article_id=1, channel_id=17, words=['Vue', 'props', '用法', '小结', 'Vue', 'props', '用法', '组件', '选项', 'props', 'Vue', '选项', '父子', '组件', '关系', 'props', 'events', '组件', 'props', '传递数据', '组件', '组件', 'events', '组件', '发送消息', '父子', '组件', '组件', 'pa', 'rent', 'child', '组件', '环境', '书写', '组件', '可维护性', '定义', '父子', '组件', 'Vue', '对象', 'var', 'childNode', 'template', 'div', 'childNode', 'div', 'var', 'pa', 'rentNode', 'template', 'div', 'child', 'child', 'child', 'child', 'div', 'components', 'child', 'childNode', '全栈', '交流', 'Ian', '人员', '技术', '瓶颈', '思维能力', 'Vue', 'components', 'pa', 'rent', 'pa', 'rentNode', 'div', 'pa', 'rent', 'pa', 'rent', 'div', 'childNode', '定义', 'template', 'div', '内容', 'childNode', '字符串', 'pa', 'rentNode', 'template', '定义', 'div', 'class', 'pa', 'rent', 'child', '组件', '静态', 'props', '组件', '实例', '作用域', '组件', '模板', '饮用', '组件', '数据', '组件', '组件', '数据', '组件', 'props', '选项', '组件', '向子', '组件', '传递数据', '方式', '动态', '静态', '静态', '方式', '组件', 'props', '声明', '数据', '上例', '代码', 'childNode

In [10]:
# Term vocabulary and per-document term counts.
from pyspark.ml.feature import CountVectorizer
# vocabSize caps the vocabulary (200 * 10000 = 2,000,000 terms); minDF is the
# minimum number of documents a term must appear in to be kept.
cv = CountVectorizer(inputCol="words", outputCol="countFeatures", vocabSize=200*10000, minDF=1.0)
# Fit the term-count model on the tokenised sample.
cv_model = cv.fit(words_df)

In [13]:
cv_result = cv_model.transform(words_df)


In [14]:
cv_result.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|
+----------+----------+--------------------+--------------------+
|         1|        17|[Vue, props, 用法, ...|(2509,[0,1,2,3,4,...|
|         2|        17|[vue, 响应式, 原理, mo...|(2509,[0,1,3,4,5,...|
|         3|        17|[JavaScript, 浅拷贝,...|(2509,[0,2,3,5,6,...|
|         4|        17|[vue2, vuex, elem...|(2509,[0,3,4,7,8,...|
|         5|        17|[immutability, Re...|(2509,[0,1,2,3,4,...|
|         6|        17|[node, npm, cnpm,...|(2509,[0,3,6,7,10...|
|         7|        17|[Web, 工程师, 以太坊, 入...|(2509,[0,1,2,3,4,...|
|         8|        17|[Web, pa, api, we...|(2509,[0,3,7,8,16...|
|         9|        17|[vue, 中用, 数据驱动, 视...|(2509,[0,4,5,9,15...|
|        10|        17|[程序, WebSocket, 长...|(2509,[0,1,9,10,1...|
|        11|        17|[flux, 架构, flux, ...|(2509,[0,1,2,4,5,...|
|        12|        17|[合格, TypeScript, ...|(2509,[0,1,3,4,5,...|
|        1

In [17]:
len(cv_model.vocabulary)

2509

In [15]:
cv_result.rdd.take(1)

[Row(article_id=1, channel_id=17, words=['Vue', 'props', '用法', '小结', 'Vue', 'props', '用法', '组件', '选项', 'props', 'Vue', '选项', '父子', '组件', '关系', 'props', 'events', '组件', 'props', '传递数据', '组件', '组件', 'events', '组件', '发送消息', '父子', '组件', '组件', 'pa', 'rent', 'child', '组件', '环境', '书写', '组件', '可维护性', '定义', '父子', '组件', 'Vue', '对象', 'var', 'childNode', 'template', 'div', 'childNode', 'div', 'var', 'pa', 'rentNode', 'template', 'div', 'child', 'child', 'child', 'child', 'div', 'components', 'child', 'childNode', '全栈', '交流', 'Ian', '人员', '技术', '瓶颈', '思维能力', 'Vue', 'components', 'pa', 'rent', 'pa', 'rentNode', 'div', 'pa', 'rent', 'pa', 'rent', 'div', 'childNode', '定义', 'template', 'div', '内容', 'childNode', '字符串', 'pa', 'rentNode', 'template', '定义', 'div', 'class', 'pa', 'rent', 'child', '组件', '静态', 'props', '组件', '实例', '作用域', '组件', '模板', '饮用', '组件', '数据', '组件', '组件', '数据', '组件', 'props', '选项', '组件', '向子', '组件', '传递数据', '方式', '动态', '静态', '静态', '方式', '组件', 'props', '声明', '数据', '上例', '代码', 'childNode

In [21]:
cv_model.vocabulary

['pa',
 'data',
 'var',
 'ul',
 '数据',
 'return',
 'function',
 'node',
 'web',
 '组件',
 'config',
 'obj',
 'console',
 'log',
 'div',
 'const',
 'npm',
 '文件',
 '节点',
 'index',
 'keys',
 'vue',
 '.a',
 'bar',
 'key',
 'child',
 '代码',
 '属性',
 'amp',
 '项目',
 'type',
 'class',
 '事件',
 '方法',
 '函数',
 '对象',
 'url',
 '元素',
 '文本',
 '内容',
 'fragment',
 'test',
 '浏览器',
 '&#',
 '服务器',
 'WebSocket',
 'msg',
 'left',
 'string',
 'props',
 'true',
 '用户',
 'loader',
 'position',
 'style',
 '模块',
 '资源',
 '页面',
 'state',
 'func',
 '版本',
 '合约',
 'export',
 '信息',
 'css',
 'slider',
 '定义',
 '时间',
 '数组',
 'DOM',
 'item',
 'model',
 '程序',
 'width',
 'val',
 'http',
 'html',
 'document',
 'Vue',
 '参数',
 'public',
 '状态',
 'defa',
 'script',
 'json',
 '.h',
 'web3',
 '类型',
 'template',
 'window',
 'obj2',
 'Array',
 'JSON',
 'forChildMsg',
 '以太坊',
 '方式',
 '交易',
 'error',
 'Object',
 'update',
 'import',
 '文章',
 '语法',
 '情况',
 'layer',
 'top',
 'mod',
 'mongodb',
 '用法',
 'Function',
 'input',
 'click',
 'store',
 

In [18]:
from pyspark.ml.feature import IDF
# Fit inverse-document-frequency weights on the count vectors in `cv_result`;
# the fitted model writes its output to an `idfFeatures` column.
idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
idfModel = idf.fit(cv_result)

In [22]:
idfModel.idf

DenseVector([0.0, 0.6466, 0.6466, 0.1542, 0.5596, 0.4055, 0.3365, 0.8473, 0.9651, 0.7419, 1.0986, 1.2528, 0.4796, 0.4055, 0.9651, 0.7419, 1.0986, 0.5596, 1.2528, 0.8473, 1.6582, 0.8473, 0.5596, 1.9459, 1.0986, 1.6582, 0.2719, 1.2528, 0.6466, 0.6466, 0.9651, 0.8473, 0.6466, 0.3365, 0.6466, 0.5596, 1.0986, 1.4351, 1.6582, 0.4796, 2.3514, 1.2528, 0.8473, 1.9459, 0.9651, 2.3514, 1.6582, 1.4351, 1.0986, 1.9459, 0.6466, 0.7419, 1.6582, 1.6582, 1.2528, 1.2528, 1.9459, 0.6466, 1.6582, 1.9459, 0.8473, 2.3514, 1.4351, 0.8473, 1.0986, 2.3514, 1.0986, 0.9651, 0.7419, 1.4351, 1.4351, 1.6582, 1.6582, 1.2528, 1.4351, 0.9651, 1.2528, 1.0986, 1.4351, 0.4796, 1.4351, 1.0986, 1.6582, 1.0986, 0.9651, 0.6466, 2.3514, 0.7419, 1.6582, 1.2528, 2.3514, 1.0986, 1.4351, 2.3514, 2.3514, 0.6466, 2.3514, 1.4351, 0.8473, 1.6582, 1.9459, 0.8473, 1.6582, 0.8473, 1.9459, 1.4351, 1.4351, 1.9459, 1.0986, 1.2528, 1.2528, 1.6582, 1.6582, 1.6582, 2.3514, 1.6582, 1.6582, 1.0986, 1.2528, 1.4351, 1.2528, 1.2528, 1.4351, 2.3514

### 计算N篇文章数据的TFIDF值

In [23]:
from pyspark.ml.feature import CountVectorizerModel
from pyspark.ml.feature import IDFModel


In [24]:
cv_model = CountVectorizerModel.load('/headlines/countVectorizerOfArticleWords.model')

In [25]:
idf_model = IDFModel.load('/headlines/IDFOfArticleWords.model')

In [26]:
# Pair every vocabulary term with the IDF weight at the same position. zip pairs
# purely by position, so this relies on the IDF vector being indexed in the same
# order as the vocabulary.
keywords_list_with_idf = list(zip(cv_model.vocabulary, idf_model.idf.toArray()))

In [27]:
keywords_list_with_idf

[('&#', 1.417829594344155),
 ('pa', 0.6651385256756351),
 ('ul', 0.8070591229443697),
 ('代码', 0.7368239176481552),
 ('方法', 0.7506253985501485),
 ('数据', 0.9375297590538404),
 ('return', 1.1584986818528347),
 ('对象', 1.2765716628665975),
 ('name', 1.3833429138490618),
 ('this', 1.6247297855214076),
 ('public', 1.7540399682870398),
 ('int', 1.6612207991983439),
 ('new', 1.3335127364488795),
 ('问题', 0.8151384673357938),
 ('函数', 1.4147095597213706),
 ('.a', 1.2475641921221166),
 ('class', 1.3562548221032567),
 ('文件', 1.2163286406564702),
 ('amp', 1.5313880611157102),
 ('com', 0.9229090811983397),
 ('元素', 1.7964130603067618),
 ('function', 1.7912981813618367),
 ('用户', 1.2794959063944176),
 ('String', 1.9240331178651056),
 ('内容', 0.7449604375370865),
 ('时候', 0.9169217434065458),
 ('var', 1.8734436358233233),
 ('参数', 1.2650616316269794),
 ('for', 1.254061482600779),
 ('属性', 1.5263663508338299),
 ('方式', 0.9338361580079647),
 ('void', 1.841161167761987),
 ('data', 1.8557185317252085),
 ('__', 2.5

In [28]:
len(keywords_list_with_idf)

1234544

In [29]:
def func(data):
    """Rewrite ``(keyword, idf)`` entries as ``[keyword, idf, index]`` in place.

    Each entry is replaced by a fresh three-element list built from its first
    two fields plus its position in ``data``. Because only the first two fields
    are read, calling the function again on an already-converted list leaves it
    unchanged (the previous version appended another index on every call).

    Note: a later cell of this notebook defines a different function that is
    also named ``func``; whichever cell ran last wins.

    Args:
        data: mutable sequence whose items are indexable with at least two
            fields: the keyword and a value accepted by ``float()``.

    Returns:
        The same ``data`` object, for convenience; callers that rely on the
        in-place update keep working.
    """
    for index, row in enumerate(data):
        # float() turns numpy scalars into plain Python floats.
        data[index] = [row[0], float(row[1]), index]
    return data


In [30]:
func(keywords_list_with_idf)

In [31]:
keywords_list_with_idf

[['&#', 1.417829594344155, 0],
 ['pa', 0.6651385256756351, 1],
 ['ul', 0.8070591229443697, 2],
 ['代码', 0.7368239176481552, 3],
 ['方法', 0.7506253985501485, 4],
 ['数据', 0.9375297590538404, 5],
 ['return', 1.1584986818528347, 6],
 ['对象', 1.2765716628665975, 7],
 ['name', 1.3833429138490618, 8],
 ['this', 1.6247297855214076, 9],
 ['public', 1.7540399682870398, 10],
 ['int', 1.6612207991983439, 11],
 ['new', 1.3335127364488795, 12],
 ['问题', 0.8151384673357938, 13],
 ['函数', 1.4147095597213706, 14],
 ['.a', 1.2475641921221166, 15],
 ['class', 1.3562548221032567, 16],
 ['文件', 1.2163286406564702, 17],
 ['amp', 1.5313880611157102, 18],
 ['com', 0.9229090811983397, 19],
 ['元素', 1.7964130603067618, 20],
 ['function', 1.7912981813618367, 21],
 ['用户', 1.2794959063944176, 22],
 ['String', 1.9240331178651056, 23],
 ['内容', 0.7449604375370865, 24],
 ['时候', 0.9169217434065458, 25],
 ['var', 1.8734436358233233, 26],
 ['参数', 1.2650616316269794, 27],
 ['for', 1.254061482600779, 28],
 ['属性', 1.52636635083382

### 计算tfidf

In [32]:
# Vectorise the tokenised sample, then weight the counts with the IDF model.
# Note: this rebinds `cv_result`, which an earlier cell also assigned, and uses
# whatever `cv_model` / `idf_model` currently refer to.
cv_result = cv_model.transform(words_df)
tfidf_result = idf_model.transform(cv_result)

In [33]:
tfidf_result.show()

+----------+----------+--------------------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|         idfFeatures|
+----------+----------+--------------------+--------------------+--------------------+
|         1|        17|[Vue, props, 用法, ...|(1234544,[1,2,3,5...|(1234544,[1,2,3,5...|
|         2|        17|[vue, 响应式, 原理, mo...|(1234544,[1,2,3,4...|(1234544,[1,2,3,4...|
|         3|        17|[JavaScript, 浅拷贝,...|(1234544,[0,1,2,4...|(1234544,[0,1,2,4...|
|         4|        17|[vue2, vuex, elem...|(1234544,[1,2,4,5...|(1234544,[1,2,4,5...|
|         5|        17|[immutability, Re...|(1234544,[1,2,4,5...|(1234544,[1,2,4,5...|
|         6|        17|[node, npm, cnpm,...|(1234544,[0,1,2,3...|(1234544,[0,1,2,3...|
|         7|        17|[Web, 工程师, 以太坊, 入...|(1234544,[1,2,3,4...|(1234544,[1,2,3,4...|
|         8|        17|[Web, pa, api, we...|(1234544,[1,2,3,1...|(1234544,[1,2,3,1...|
|         9|        17|[vue, 中用, 数据驱动, 视...

In [35]:
tfidf_result.rdd.take(1)

[Row(article_id=1, channel_id=17, words=['Vue', 'props', '用法', '小结', 'Vue', 'props', '用法', '组件', '选项', 'props', 'Vue', '选项', '父子', '组件', '关系', 'props', 'events', '组件', 'props', '传递数据', '组件', '组件', 'events', '组件', '发送消息', '父子', '组件', '组件', 'pa', 'rent', 'child', '组件', '环境', '书写', '组件', '可维护性', '定义', '父子', '组件', 'Vue', '对象', 'var', 'childNode', 'template', 'div', 'childNode', 'div', 'var', 'pa', 'rentNode', 'template', 'div', 'child', 'child', 'child', 'child', 'div', 'components', 'child', 'childNode', '全栈', '交流', 'Ian', '人员', '技术', '瓶颈', '思维能力', 'Vue', 'components', 'pa', 'rent', 'pa', 'rentNode', 'div', 'pa', 'rent', 'pa', 'rent', 'div', 'childNode', '定义', 'template', 'div', '内容', 'childNode', '字符串', 'pa', 'rentNode', 'template', '定义', 'div', 'class', 'pa', 'rent', 'child', '组件', '静态', 'props', '组件', '实例', '作用域', '组件', '模板', '饮用', '组件', '数据', '组件', '组件', '数据', '组件', 'props', '选项', '组件', '向子', '组件', '传递数据', '方式', '动态', '静态', '静态', '方式', '组件', 'props', '声明', '数据', '上例', '代码', 'childNode

In [36]:
def func(partition, topk=20):
    """Emit the highest-weighted vocabulary indices of every row in a partition.

    Note: this rebinds the name ``func``, which an earlier cell of this
    notebook used for a different helper.

    Args:
        partition: iterable of rows exposing ``article_id``, ``channel_id`` and
            ``idfFeatures``; the latter must provide parallel ``indices`` and
            ``values`` sequences.
        topk: maximum number of entries emitted per row (default 20, the value
            that used to be hard-coded). Expected to be non-negative.

    Yields:
        ``(article_id, channel_id, word_index, tfidf)`` with ``word_index`` as
        ``int`` and ``tfidf`` rounded to four decimals, highest weight first.
    """
    for row in partition:
        features = row.idfFeatures
        # Pair each vocabulary index with its weight and rank by weight,
        # largest first; sorted() is stable, so ties keep their index order.
        ranked = sorted(zip(features.indices, features.values),
                        key=lambda pair: pair[1], reverse=True)
        for word_index, tfidf in ranked[:topk]:
            yield row.article_id, row.channel_id, int(word_index), round(float(tfidf), 4)

# One output row per (article, selected vocabulary index); the index is turned
# back into a keyword by a join in a later cell.
_keywordsByTFIDF = tfidf_result.rdd.mapPartitions(func).toDF(["article_id", "channel_id", "index", "tfidf"])


In [38]:
_keywordsByTFIDF.show()

+----------+----------+------+--------+
|article_id|channel_id| index|   tfidf|
+----------+----------+------+--------+
|         1|        17| 96675|256.3809|
|         1|        17|115370|200.6459|
|         1|        17|   515|174.1541|
|         1|        17| 20134|141.6573|
|         1|        17|   591|126.9896|
|         1|        17|    62|125.4149|
|         1|        17|    45|101.2453|
|         1|        17|   391| 85.5727|
|         1|        17|   491| 56.6656|
|         1|        17|  5650| 51.6382|
|         1|        17|  1465| 37.7835|
|         1|        17|415962|  33.441|
|         1|        17|426351|  33.441|
|         1|        17|   347| 31.9532|
|         1|        17|151480| 28.0657|
|         1|        17|   314| 26.1522|
|         1|        17|  1353| 25.2814|
|         1|        17|  5808|  23.998|
|         1|        17|     5| 23.4382|
|         1|        17|  5371| 20.4942|
+----------+----------+------+--------+
only showing top 20 rows



In [39]:
# Keyword <-> vocabulary-index lookup table. `index` is aliased to `idx`,
# presumably so it does not clash with the `index` column of _keywordsByTFIDF.
keywordsIndex = ktt.spark.sql("select keyword, index idx from idf_keywords_values")

In [40]:
# Resolve each vocabulary index to its keyword (inner join on idx == index) and
# keep only the article, channel, keyword and weight columns.
keywordsByTFIDF = _keywordsByTFIDF.join(keywordsIndex, keywordsIndex.idx == _keywordsByTFIDF.index).select(["article_id", "channel_id", "keyword", "tfidf"])


In [41]:
keywordsByTFIDF.show()

+----------+----------+---------+--------+
|article_id|channel_id|  keyword|   tfidf|
+----------+----------+---------+--------+
|         3|        17|      var| 22.4813|
|         7|        17|      var| 41.2158|
|        13|        17|      var| 74.9377|
|        21|        17|      var| 14.9875|
|        11|        17|  Actions| 98.2025|
|        13|        17|barNumber|  55.735|
|        18|        17|   loader|125.6564|
|         7|        17|     uint| 29.6332|
|        17|        17| document| 16.8342|
|         9|        17|  caidan2| 10.7415|
|         3|        17|       &#| 41.1171|
|        12|        17|    Watch| 42.4374|
|        20|        17|  closest| 33.8248|
|         4|        17|      cmd| 15.7818|
|        10|        17|       石头| 44.6937|
|        18|        17|      mod| 42.2456|
|         2|        17|     元素节点|  83.279|
|        20|        17|     提交表单|  27.922|
|         1|        17|      Vue| 31.9532|
|        12|        17|      Vue| 46.1546|
+----------

### 计算textrank值

In [44]:
# TextRank keyword extraction, one call per partition.
def textrank(partition):
    """Yield the TextRank keywords of every row in one partition.

    Intended to be passed to ``RDD.mapPartitions``: imports, the jieba user
    dictionary and the stop words are all loaded inside the function, once per
    call.

    Args:
        partition: iterable of row objects exposing ``article_id``,
            ``channel_id`` and ``sentence`` attributes.

    Yields:
        ``(article_id, channel_id, keyword, weight)``, up to 20 entries per
        row, as returned by the TextRank model.
    """
    import os
    import re
    import codecs

    import jieba
    import jieba.analyse

    abspath = "/root/words"

    # Load the user dictionary into jieba.
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # Stop-word file, one word per line.
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords():
        """Return the stop words as a set (constant-time membership tests)."""
        # NOTE(review): no encoding is passed, so the platform default is used;
        # confirm the file's encoding and pass it explicitly if it can differ.
        # The `with` block guarantees the handle is closed on every worker.
        with codecs.open(stopwords_path) as stopwords_file:
            return {line.strip() for line in stopwords_file}

    stopwords = get_stopwords()

    class TextRank(jieba.analyse.TextRank):
        """jieba TextRank with a configurable window and a custom word filter."""

        def __init__(self, window=20, word_min_len=2):
            super(TextRank, self).__init__()
            self.span = window  # co-occurrence window size
            self.word_min_len = word_min_len  # minimum word length
            # Part-of-speech tags to keep; tag names follow the jieba README,
            # see also https://github.com/baidu/lac
            # NOTE(review): the textrank() call below also passes allowPOS;
            # confirm which of the two filters jieba ends up applying.
            self.pos_filt = frozenset(
                ('n', 'x', 'eng', 'f', 's', 't', 'nr', 'ns', 'nt', "nw", "nz", "PER", "LOC", "ORG"))

        def pairfilter(self, wp):
            """Return True when the (word, flag) pair should be kept, else False."""
            # English tokens of one or two characters are always rejected.
            if wp.flag == "eng" and len(wp.word) <= 2:
                return False
            return (wp.flag in self.pos_filt
                    and len(wp.word.strip()) >= self.word_min_len
                    and wp.word.lower() not in stopwords)

    # TextRank with a window of 5 and a minimum word length of 2.
    textrank_model = TextRank(window=5, word_min_len=2)
    allowPOS = ('n', "x", 'eng', 'nr', 'ns', 'nt', "nw", "nz", "c")

    # Compiled once per partition instead of once per row.
    tag_pattern = re.compile("<.*?>")

    for row in partition:
        # Strip tag-like "<...>" spans before ranking.
        sentence = tag_pattern.sub("", row.sentence)
        tags = textrank_model.textrank(sentence, topK=20, withWeight=True, allowPOS=allowPOS, withFlag=False)
        for keyword, weight in tags:
            yield row.article_id, row.channel_id, keyword, weight

In [45]:
# Compute the TextRank keywords of the sampled articles, one mapPartitions call
# per partition, and name the four yielded fields as DataFrame columns.
textrank_keywords_df = article_dataframe.rdd.mapPartitions(textrank).toDF(
["article_id", "channel_id", "keyword", "textrank"])

# Persisting to Hive is left disabled in this notebook:
# textrank_keywords_df.write.insertInto("textrank_keywords_values")

In [46]:
textrank_keywords_df.show()

+----------+----------+-----------+-------------------+
|article_id|channel_id|    keyword|           textrank|
+----------+----------+-----------+-------------------+
|         1|        17|         组件|                1.0|
|         1|        17|      props| 0.5014665932761055|
|         1|        17|        msg| 0.4456533603873207|
|         1|        17|         数据|0.44060755114598205|
|         1|        17|      child|0.33105197799128705|
|         1|        17|         pa|0.23957543716597018|
|         1|        17|        Vue| 0.2235228074555735|
|         1|        17|  childNode|0.20705374199882054|
|         1|        17|         ul| 0.1826491258545327|
|         1|        17|forChildMsg|0.16026445493034286|
|         1|        17|     String|0.15910269591934798|
|         1|        17|       defa|0.15017376805500693|
|         1|        17|         定义|0.14735936484781245|
|         1|        17|      class|0.13948100302879202|
|         1|        17|       rent|0.13237113947

### 文章画像构建

In [48]:
# Load the stored per-keyword IDF table. Note: this rebinds `idf`, which an
# earlier cell used for the IDF estimator object.
idf = ktt.spark.sql("select * from idf_keywords_values")

In [49]:
# Rename the key column so the two sides of the upcoming join do not both carry
# a column called `keyword`. Re-running this cell on its own is harmless only
# if withColumnRenamed ignores a missing source column — TODO confirm.
idf = idf.withColumnRenamed("keyword", "keyword1")


In [50]:
# Attach the stored IDF value to every TextRank keyword (inner join, so
# keywords missing from the IDF table are dropped).
result = textrank_keywords_df.join(idf,textrank_keywords_df.keyword==idf.keyword1)


In [52]:
result.show()

+----------+----------+----------+-------------------+----------+------------------+------+
|article_id|channel_id|   keyword|           textrank|  keyword1|               idf| index|
+----------+----------+----------+-------------------+----------+------------------+------+
|         2|        17|     input|0.29571012149785497|     input|2.5936612831652797|   139|
|         1|        17| childNode|0.20705374199882054| childNode| 7.869848788205214| 20134|
|        12|        17|    import| 0.6365577601955468|    import|1.9502451027406746|    44|
|         3|        17|       amp|0.23615437035522693|       amp|1.5313880611157102|    18|
|        13|        17|       amp| 0.2758702473634155|       amp|1.5313880611157102|    18|
|        20|        17|  dragsort| 0.4264886869985381|  dragsort|10.741528413089226|153844|
|         4|        17|        文章| 0.2816273109396151|        文章|1.5470125914600148|   107|
|        20|        17|   indexOf| 0.2924464165952302|   indexOf| 4.096871000928

In [53]:
# Final keyword weight = TextRank score * IDF; keep only the columns needed for
# the article profile.
keywords_res = result.withColumn("weights", result.textrank * result.idf).select(["article_id", "channel_id", "keyword", "weights"])


In [54]:
keywords_res.show()

+----------+----------+----------+-------------------+
|article_id|channel_id|   keyword|            weights|
+----------+----------+----------+-------------------+
|         2|        17|     input| 0.7669718931690873|
|         1|        17| childNode| 1.6294816405627728|
|        12|        17|    import|  1.241443654432938|
|         3|        17|       amp| 0.3616439833422923|
|        13|        17|       amp| 0.4224644032293722|
|        20|        17|  dragsort|  4.581140349255914|
|         4|        17|        文章|0.43568099612260935|
|        20|        17|   indexOf| 1.1981152434744045|
|         2|        17|   textReg| 3.1731673970653205|
|        11|        17|        事件| 0.7786776127586685|
|        20|        17|        事件| 2.2151923160728315|
|        21|        17|        感觉| 0.8579537018884125|
|         5|        17|       API| 0.7433559368554415|
|         2|        17|  fragment| 3.5597674935629273|
|         4|        17|        文件|0.43198866656514556|
|         

In [55]:
# Expose the per-keyword weights to Spark SQL under the name `temptable`.
# createOrReplaceTempView replaces registerTempTable, which PySpark documents as
# deprecated since 2.0.
keywords_res.createOrReplaceTempView("temptable")
# One row per article: parallel lists of its keywords and their weights.
merge_keywords = ktt.spark.sql("select article_id, min(channel_id) channel_id, collect_list(keyword) keywords, collect_list(weights) weights from temptable group by article_id")

def _func(row):
    """Fold a row's parallel keyword / weight lists into one dict.

    Returns ``(article_id, channel_id, {keyword: weight})``. Pairing is purely
    positional, so it relies on both collect_list columns listing the grouped
    rows in the same order.
    """
    return row.article_id, row.channel_id, dict(zip(row.keywords, row.weights))

keywords_info = merge_keywords.rdd.map(_func).toDF(["article_id", "channel_id", "keywords"])


In [56]:
# Topic words per article: keywords that appear in both the TF-IDF table and the
# TextRank table. The join now matches on article_id as well as keyword; matching
# on keyword alone paired every TF-IDF keyword with the TextRank rows of all
# articles that share the word, so nearly every TF-IDF keyword became a "topic"
# and the join grew many-to-many.
# NOTE(review): assumes textrank_keywords_values has an `article_id` column with
# the same meaning as in tfidf_keywords_values — confirm against the table schema.
topic_sql = """
                select t.article_id article_id2, collect_set(t.keyword) topics from tfidf_keywords_values t
                inner join
                textrank_keywords_values r
                on t.article_id=r.article_id and t.keyword=r.keyword
                group by article_id2
                """
article_topics = ktt.spark.sql(topic_sql)

In [57]:
article_topics.show()

+-----------+--------------------+
|article_id2|              topics|
+-----------+--------------------+
|        148|[transform, solid...|
|        463|[clone, 按钮, 空格键, ...|
|        471|[font, DOCTYPE, m...|
|        496|[lock, 线程, 容器, 元素...|
|        833|[modal, close, bu...|
|       1088|[外边距, 宽度, 内边距, 像素...|
|       1238|[内存, 服务员, 语言, 软件,...|
|       1342|[速度, 距离, canvas, ...|
|       1580|[filename, requir...|
|       1591|[内容, keywords, li...|
|       1645|[圆角, solid, width...|
|       1829|[mirrors, https, ...|
|       1959|[GNU, 软件, openjdk...|
|       2122|[宽度, 样式, CSS+DIV,...|
|       2142|[tuple, File, cod...|
|       2366|[weights, &#, ste...|
|       2659|[语言, pic, head, 结...|
|       2866|[和子, 企业开发, class,...|
|       3175|[stretch, transfo...|
|       3749|[entry, ble, comp...|
+-----------+--------------------+
only showing top 20 rows



In [58]:
# Article profile = weighted keyword dict + topic words, matched per article
# (inner join, so articles without any topic row are dropped).
article_profile = keywords_info.join(article_topics, keywords_info.article_id==article_topics.article_id2).select(["article_id", "channel_id", "keywords", "topics"])
