## 3.2 Gensim进行LDA

In [47]:
import codecs
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import os
from lxml import etree
import lxml
import jieba
import numpy as np
import re

### 一. 原始文本处理
1. 搜狗新闻文件是xml文件, 我们只抽取content标签中的文字. 形如  
  ```xml
  <docs>
    <doc>
        <contenttitle>新股发行＂减速＂本周拟融资额下降逾４成</contenttitle>
        <content>对于新股扩容是否影响股市涨跌的讨论近期不绝于耳</content>
    </doc>
  </docs>
  ```
2. 语料中都是全角字符, 应转换成半角
3. 文件中包含不可见字符以及英文字母和数字, 简单过滤掉  
    ```python
    re.sub('[\u3000,\ue40c]','',text)
    re.sub('[a-zA-Z0-9]+','',text)
    ```

In [30]:
# Full-width to half-width conversion
def strQ2B(ustring):
    '''Convert full-width characters to their half-width equivalents.

    ustring : the string to convert; it is walked character by character
    Returns a new string in which U+3000 becomes an ASCII space, the code
    points U+FF01..U+FF5E are shifted down by 0xFEE0, and every other
    character is copied unchanged.
    '''
    def _to_half(uchar):
        code_point = ord(uchar)
        if code_point == 0x3000:  # full-width space maps straight to ' '
            return ' '
        if 0xFF01 <= code_point <= 0xFF5E:  # remaining full-width forms
            return chr(code_point - 0xFEE0)
        return uchar

    return ''.join(_to_half(uchar) for piece in ustring for uchar in piece)

# Parse the <content> text out of a single XML file
def getContentFromFile(_filepath,x):
    '''Extract the cleaned <content> text of every <doc> in one XML file.

    _filepath: path of the Sogou news file to parse (XML format)
    x: list used as the corpus; each cleaned text is appended to it in
       place. Nothing is returned.
    '''
    parser = etree.XMLParser(encoding='utf-8',huge_tree=True)  # XML parser
    # Decode explicitly as UTF-8 (matching the parser setting) instead of the
    # platform default, and close the file as soon as it has been read.
    with open(_filepath, encoding='utf-8') as xml_file:
        xml_text = xml_file.read()
    root = etree.fromstring(xml_text,parser=parser)
    for doc in root.findall('doc'):  # direct <doc> children of the root
        # Iterate the element itself; getchildren() is deprecated in lxml.
        for child_elem in doc:
            if child_elem.tag != 'content':  # e.g. <contenttitle> is skipped
                continue
            content = child_elem.text
            if not content:  # None or empty string
                continue
            # NOTE(review): the comma inside the character class is a literal,
            # so ASCII commas are removed together with U+3000 and U+E40C.
            content = re.sub('[\u3000,\ue40c]','',content)
            content = strQ2B(content)
            content = re.sub('[a-zA-Z0-9]+','',content)
            x.append(content)


In [31]:
# Parse every XML file in the data directory
# NOTE(review): data_dir is an absolute, machine-specific path.
data_dir = '/home/lj/data/sogou_new2012'
x_origion = []
# os.listdir returns entries in arbitrary order; sorting keeps the document
# order (and therefore every document index used later) stable between runs.
for subfile in sorted(os.listdir(data_dir)):
    subfile_path = os.path.join(data_dir,subfile)
    getContentFromFile(subfile_path,x_origion)
print('corpus has %s documents' % len(x_origion))

corpus has 22978 documents


### 二. 对过滤后的文本进行分词, 去除停用词

In [32]:
jieba.set_dictionary('../../data/dict.txt.small')
x_cut = []  # tokenized corpus with stop words removed

# Load the stop-word list, one word per line.
# Explicit encoding: the file name indicates UTF-8 (TODO confirm).
with open('../../data/stop_words_utf8.txt', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f]
# Set copy for constant-time membership tests; `stopwords` itself stays a list.
stopword_set = set(stopwords)

# Tokenize every document of the corpus and drop the stop words
for doc in x_origion:
    tokens = jieba.cut(doc)
    x_cut.append([w for w in tokens if w not in stopword_set])

Building prefix dict from /home/lj/ideaprojects/DeepLearning/data/dict.txt.small ...
Loading model from cache /tmp/jieba.ube6b3622f3ed3770dfb4b3dbec1af2a2.cache
Loading model cost 0.210 seconds.
Prefix dict has been built succesfully.


### 三. 从分好词的文本中训练LDA
1. gensim可以自动从语料中学习dictionary, 免除手动创建的繁琐  
2. gensim统计文本出现的单词词频,生成带词频的BOW模型. 格式如下 :   
 ```python
  [(word1_index,freq1),(word2_index,freq2) ... ]
 ```
3. 使用gensim的`LdaMulticore`(`LdaModel`的多核并行版本)训练, 需要指定词典,语料库,主题数量
4. 获取每个主题下的词分布  
  ```python
  lda.print_topics(20)
  ```
5. 获取某个文章下的主题分布  
  ```python
  lda[document]
  ```

In [33]:
# gensim: build the dictionary from the tokenized corpus
dictionary = Dictionary(documents=x_cut)

In [34]:
# Using the dictionary, gensim counts the tokens that occur in a document and
# returns them as a list of (index, count) pairs.
# Only the first 10 pairs of the first document are printed.
index_count_list = dictionary.doc2bow(x_cut[0])
print('第1片文档形成的(index,count): \n',index_count_list[:10],'..')

第1片文档形成的(index,count): 
 [(0, 4), (1, 4), (2, 1), (3, 1), (4, 1), (5, 5), (6, 10), (7, 1), (8, 2), (9, 1)] ..


In [35]:
# Turn every tokenized document into its list of (index, count) pairs
corpus = [dictionary.doc2bow(tokens) for tokens in x_cut]

In [50]:
# LDA model
# random_state fixes the seed so that training is repeatable.
# NOTE(review): with several worker processes the result may still vary
# slightly between runs - confirm against the gensim documentation.
lda = LdaMulticore(corpus=corpus,       # bag-of-words training corpus
                   id2word=dictionary,  # id -> word mapping
                   num_topics=20,       # number of LDA topics
                   random_state=0)      # fixed seed for reproducibility


In [51]:
# Word distribution of every topic; the count comes from the model itself so
# it cannot drift from the num_topics used for training.
lda.print_topics(lda.num_topics)

[(0,
  '0.090*"容量" + 0.077*"系列" + 0.048*"英寸" + 0.044*"主频" + 0.042*"尺寸" + 0.042*"硬盘" + 0.041*"屏幕" + 0.040*"内存" + 0.039*"芯片" + 0.039*"显卡"'),
 (1,
  '0.007*"月" + 0.007*"支持" + 0.005*"市场" + 0.005*"年" + 0.005*"公司" + 0.005*"中国" + 0.005*"产品" + 0.004*"功能" + 0.004*"经济" + 0.004*"汽车"'),
 (2,
  '0.009*"座椅" + 0.007*"市场" + 0.006*"调节" + 0.006*"电动" + 0.004*"公司" + 0.004*"系统" + 0.004*"年" + 0.004*"后" + 0.004*"月" + 0.004*"中国"'),
 (3,
  '0.009*"电话" + 0.008*"地址" + 0.007*"联系" + 0.007*"店铺" + 0.006*"手机" + 0.005*"号码" + 0.005*"座椅" + 0.005*"市场" + 0.005*"公司" + 0.004*"月"'),
 (4,
  '0.009*"市场" + 0.006*"月" + 0.004*"产品" + 0.004*"公司" + 0.004*"经济" + 0.004*"中国" + 0.004*"年" + 0.003*"企业" + 0.003*".%" + 0.003*"表示"'),
 (5,
  '0.008*"座椅" + 0.006*"调节" + 0.005*"类型" + 0.005*"年" + 0.004*"市场" + 0.004*"电动" + 0.004*"方向盘" + 0.004*"容量" + 0.004*"后" + 0.004*"系统"'),
 (6,
  '0.046*"容量" + 0.026*"硬盘" + 0.026*"系列" + 0.025*"内存" + 0.025*"主频" + 0.025*"英寸" + 0.024*"芯片" + 0.024*"屏幕" + 0.022*"尺寸" + 0.022*"产品"'),
 (7,
  '0.006*"公司" + 0.004*"市场" + 0.

In [39]:
# Topic distribution of the document at index 11 (i.e. the 12th document).
# The result is a list of (topic_id, probability) pairs.
lda[corpus[11]]

[(0, 0.08417053),
 (2, 0.07729106),
 (6, 0.12837616),
 (10, 0.02020849),
 (11, 0.39385492),
 (14, 0.25584596),
 (15, 0.019009368)]

In [40]:
# Collect the topic distribution of every document into one ndarray
m = len(corpus)
# One row per document, one column per topic; the column count is read from
# the model instead of repeating the literal topic number.
docs_distribute = np.zeros((m, lda.num_topics))
for i,doc in enumerate(corpus):
    distribute = lda[doc]  # (topic_id, probability) pairs of this document
    # Topics that are not listed for the document keep the initial 0.
    for topic_id, prob in distribute:
        docs_distribute[i,topic_id] = prob

In [41]:
# Topic distribution of the first document (row 0 of the matrix)
docs_distribute[0,:]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.97060573, 0.02147073,
       0.        , 0.        , 0.        , 0.        , 0.        ])