# Keyword Extraction

In [1]:
import jieba
import jieba.posseg as psg
from operator import itemgetter
from jieba import analyse

def load_stopwords(stopword_filename='./data/stopword.txt'):
    with open(stopword_filename, 'r', encoding='utf8') as f:
        lines = f.readlines()
        lines = [line.strip() for line in lines]
    return lines

def tokenize(document):
    tokens = list()
    # tokenize & remove nouns & remove token with length less than 2
    for token, tag in psg.cut(document):
        if tag.startswith('n') and len(token) >= 2:
            tokens.append(token)
    stopwords = load_stopwords()
    # remove stopwords
    tokens = [token for token in tokens if token not in stopwords]
    return tokens

def tf(word, document):
    word = tokenize(word)[0]
    words = tokenize(document)
    return (sum(1 for word1 in words if word1 == word)) / len(words)

def idf(word, tokens_list):
    
    token = tokenize(word)[0]
    import math
    try:
        return math.log(len(tokens_list) / (1 + sum(1 for tokens in tokens_list if token in tokens)))
    except ValueError:
        return 0

def tf_idf(word, document, tokens_list):
    return tf(word, document) * idf(word, tokens_list)

def load_data(corpus_filename='./data/corpus.txt'):
    documents = list()
    with open(corpus_filename, 'r', encoding='utf8') as f:
        lines = f.readlines()
        for line in lines:
            line.strip()
            documents.append(line)
    return documents

def extract_keyword_tf_idf(text, num_keywords):
    documents = load_data()
    tokens_list = [tokenize(document) for document in documents]
    stopwords = load_stopwords()
    tokens = tokenize(text)
    tokens = set(tokens)
    tf_idfs = {}
    for token in tokens:
        tf_idfs[token] = tf_idf(token, text, tokens_list)
    keyword_tf_idfs = sorted(tf_idfs.items(), key=itemgetter(1), reverse=True)
    keywords = [keyword for keyword, tf_idf in keyword_tf_idfs]
    keywords = keywords[:num_keywords]
    return keywords




In [2]:
def my_textrank1(text, num_keywords, allowPOS=('ns', 'n', 'vn', 'v'), window_size=5):
    d = 0.85
    keywords = list()
    tokens = tokenize(text)
    in_set = {token: set() for token in tokens}
    out_set = {token: set() for token in tokens}

    for i in range(len(tokens) - window_size):
        sub_tokens = tokens[i: i + window_size]
        for token in sub_tokens:
            for other_token in sub_tokens:
                if other_token != token:
                    in_set[token] |= {other_token}
                    out_set[other_token] |= {token}
    scores = {token: 1.0 for token in tokens}
    tmp_scores = {token: 0.0 for token in tokens}
    #d = 
    for _ in range(1000):
        for token in tokens:
            tmp_scores[token] = 0.0
            for in_token in in_set[token]:
                tmp_scores[token] += scores[in_token] / len(out_set[in_token])
            tmp_scores[token] = (1 - d) + d * tmp_scores[token]
            scores = tmp_scores
            
    scores = sorted(scores.items(), key=itemgetter(1), reverse=True)
    keywords = [keyword for keyword, score in scores[:num_keywords]]
    return keywords

def my_textrank(text, num_keywords, allowPOS=('ns', 'n', 'vn', 'v'), window_size=5):
    d = 0.85
    keywords = list()
    tokens = tokenize(text)
    in_set = {token: dict() for token in tokens} # {token1: {in_token1: weight, ...}, ...}
    out_set = {token: dict() for token in tokens}

    for i in range(len(tokens) - window_size):
        sub_tokens = tokens[i: i + window_size]
        for token in sub_tokens:
            for other_token in sub_tokens:
                if other_token != token:
                    # The number of A->B is set to be the weight
                    in_set[token][other_token] = in_set[token].get(other_token, 0) + 1
                    out_set[other_token][token] = out_set[other_token].get(token, 0) + 1
                    
    scores = {token: 1.0 for token in tokens}
    tmp_scores = {token: 0.0 for token in tokens}
    for _ in range(10):
        for token in tokens:
            tmp_scores[token] = 0.0
            for in_token in in_set[token].keys():
                tmp_scores[token] += scores[in_token] / len(out_set[in_token]) * in_set[token].get(other_token, 1.0)
            tmp_scores[token] = (1 - d) + d * tmp_scores[token]
            scores = tmp_scores
            
    scores = sorted(scores.items(), key=itemgetter(1), reverse=True)
    keywords = [keyword for keyword, score in scores[:num_keywords]]
    return keywords



def extract_keyword_textrank(text, num_keywords):
    keywords = list()
    textrank = analyse.textrank
    keywords = textrank(text, num_keywords, allowPOS=('ns', 'n'))
    return keywords

def my_extract_keyword_textrank(text, num_keywords):
    keywords = list()
    textrank = analyse.textrank
    keywords = my_textrank(text, num_keywords, allowPOS=('ns', 'n'))
    
    return keywords

In [3]:
from gensim import corpora, models
from operator import itemgetter
import math

def extract_keyword_topic(text, num_keywords, mode, num_topics=4):
    # create BoW tf_idf
    documents = load_data()
    tokens_list = [tokenize(document) for document in documents]
    
    # construct index<->word map
    d = corpora.Dictionary(tokens_list) 
    # Convert tokens_list to BoW format
    bow_tokens_list = [d.doc2bow(tokens) for tokens in tokens_list]
    # Train tfidf Model using BoW data
    tfidf_model = models.TfidfModel(bow_tokens_list)
    # Calculates tfidf features(BoW format) for each document using BoW data
    bow_tfidf_features_list = [tfidf_model[bow_tokens] for bow_tokens in bow_tokens_list]
    
    # Train LSI/LDA model using tfidf features(BoW format)
    if mode == 'lsi':
        model = models.LsiModel(bow_tfidf_features_list, id2word=d, num_topics=num_topics)
    elif mode == 'lda':
        model = models.LsiModel(bow_tfidf_features_list, id2word=d, num_topics=num_topics)
    else:
        return list()
    
    # Get token set
    tmp_tokens = list()
    for tokens in tokens_list:
        tmp_tokens.extend(tokens)
    token_set = set(tmp_tokens)
    
    # Get 'token->topic feature' map
    token_topics = dict()
    for token in token_set:
        single_tokens_list = [token]
        # convert to tfidf features(BoW format)
        bow_tfidf_feature = tfidf_model[d.doc2bow(single_tokens_list)]
        topic = model[bow_tfidf_feature]
        
        token_topics[token] = topic
    
    # Get document topic feature
    tokens = tokenize(text)
    bow_tfidf_features = tfidf_model[d.doc2bow(tokens)]
    text_topic = model[bow_tfidf_features]
    
    # Calculate the topic features for each word in document
    def calculate_simlilarity(l1, l2):
        try:
            sim = math.sqrt(sum(topic ** 2 for topic_id, topic in l1) / sum(topic ** 2 for topic_id, topic in l2))
            # 貌似用余弦相似度结果会比较差
            #sim = math.sqrt(sum([(id_topic1[1] - id_topic2[1]) ** 2 for id_topic1, id_topic2 in zip(l1, l2)]))
        except ZeroDivisionError:
            sim = 0.0
        return sim

    sim_dict = dict()
    for token in tokens:
        topic = token_topics[token]
        sim = calculate_simlilarity(topic, text_topic) # the order of 'topic' and 'text_topic' cannot be changed
        sim_dict[token] = sim
    keyword_topics = sorted(sim_dict.items(), key=itemgetter(1), reverse=True)
    print('-'*30)
    print(keyword_topics[:num_keywords])
    print('-'*30)
    keywords = [keyword for keyword, topic in keyword_topics[:num_keywords]]
    
    return keywords
    


In [4]:
def extract_keyword(text, mode, num_keywords=10):
    keywords = list()
    if mode == 'tf-idf':
        keywords = extract_keyword_tf_idf(text, num_keywords)
    elif mode == 'textrank':
        keywords = extract_keyword_textrank(text, num_keywords)
    elif mode == 'mytextrank':
        keywords = my_extract_keyword_textrank(text, num_keywords)
    elif mode == 'lsi' or mode == 'lda':
        keywords =extract_keyword_topic(text, num_keywords, mode=mode)
    return keywords

In [5]:
text = '6月19日,《2012年度“中国爱心城市”公益活动新闻发布会》在京举行。' + \
       '中华社会救助基金会理事长许嘉璐到会讲话。基金会高级顾问朱发忠,全国老龄' + \
       '办副主任朱勇,民政部社会救助司助理巡视员周萍,中华社会救助基金会副理事长耿志远,' + \
       '重庆市民政局巡视员谭明政。晋江市人大常委会主任陈健倩,以及10余个省、市、自治区民政局' + \
       '领导及四十多家媒体参加了发布会。中华社会救助基金会秘书长时正新介绍本年度“中国爱心城' + \
       '市”公益活动将以“爱心城市宣传、孤老关爱救助项目及第二届中国爱心城市大会”为主要内容,重庆市' + \
       '、呼和浩特市、长沙市、太原市、蚌埠市、南昌市、汕头市、沧州市、晋江市及遵化市将会积极参加' + \
       '这一公益活动。中国雅虎副总编张银生和凤凰网城市频道总监赵耀分别以各自媒体优势介绍了活动' + \
       '的宣传方案。会上,中华社会救助基金会与“第二届中国爱心城市大会”承办方晋江市签约,许嘉璐理' + \
       '事长接受晋江市参与“百万孤老关爱行动”向国家重点扶贫地区捐赠的价值400万元的款物。晋江市人大' + \
       '常委会主任陈健倩介绍了大会的筹备情况。'

## 1. TF-IDF

In [6]:
keywords = extract_keyword(text, mode='tf-idf')
print(keywords)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.779 seconds.
Prefix dict has been built succesfully.


['晋江市', '城市', '大会', '爱心', '中华', '基金会', '重庆市', '许嘉璐', '人大常委会', '巡视员']


## 2. TextRank

In [7]:
keywords = extract_keyword(text, mode='textrank')
print('textrank:')
print(keywords)

keywords = extract_keyword(text, mode='mytextrank')
print('\nmytextrank:')
print(keywords)

textrank:
['城市', '爱心', '中国', '社会', '基金会', '晋江市', '公益活动', '大会', '地区', '发布会']

mytextrank:
['城市', '晋江市', '爱心', '中国', '承办方', '许嘉璐', '大会', '重庆市', '内容', '理事长']


## 3. LSI & LDA

In [8]:
keywords = extract_keyword(text, mode='lsi')
print('Lsi:')
print(keywords)

keywords = extract_keyword(text, mode='lda')
print('Lda:')
print(keywords)

------------------------------
[('中国', 1.0218805794057921), ('中华', 0.9898930199536915), ('爱心', 0.8340576057836584), ('项目', 0.6760489770421642), ('基金会', 0.6696490642548523), ('社会', 0.6583389852464987), ('城市', 0.4604600466140951), ('公益活动', 0.42613597541935094), ('全国', 0.3173819304575886), ('年度', 0.28499260676411087)]
------------------------------
Lsi:
['中国', '中华', '爱心', '项目', '基金会', '社会', '城市', '公益活动', '全国', '年度']
------------------------------
[('中国', 1.0197322743315775), ('中华', 0.985991726468228), ('爱心', 0.8341924835701112), ('基金会', 0.6732561528476236), ('项目', 0.6637376085633867), ('社会', 0.6601663563325462), ('城市', 0.4542197366005631), ('公益活动', 0.4312422739683259), ('全国', 0.3101277260781903), ('国家', 0.2870316266024203)]
------------------------------
Lda:
['中国', '中华', '爱心', '基金会', '项目', '社会', '城市', '公益活动', '全国', '国家']
