# 预处理

In [1]:
import pandas as pd
import numpy as np
from collections import OrderedDict
import codecs
import os
import random
# Current working directory of the kernel; cachewordidmap() uses it as the
# location prefix for the vocabulary file.
path = os.getcwd()
path

'D:\\JupyterNotebook\\Numpy_practice\\第四次作业'

In [2]:
# Read the stop-word list (one entry per line). The context manager makes sure
# the file handle is closed instead of being left to the garbage collector.
with open('stopword.txt', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().split('\n')

# Load the labelled news corpus and preview the first rows.
df = pd.read_csv('chinese_news.csv')
df.head()

Unnamed: 0,label,content
0,体育,鲍勃库西奖归谁属？ NCAA最强控卫是坎巴还是弗神新浪体育讯如今，本赛季的NCAA进入到了末...
1,体育,麦基砍28+18+5却充满寂寞 纪录之夜他的痛阿联最懂新浪体育讯上天对每个人都是公平的，贾维...
2,体育,黄蜂vs湖人首发：科比冲击七连胜 火箭两旧将登场新浪体育讯北京时间3月28日，NBA常规赛洛...
3,体育,双面谢亚龙作秀终成做作 谁来为低劣行政能力埋单是谁任命了谢亚龙？谁放纵了谢亚龙？谁又该为谢亚...
4,体育,兔年首战山西换帅后有虎胆 张学文用乔丹名言励志今晚客场挑战浙江稠州银行队，是山西汾酒男篮的兔...


In [3]:
import jieba

def text2tokens(raw_text):
    """Segment ``raw_text`` with jieba and drop very short tokens.

    Args:
        raw_text: The text to segment.

    Returns:
        A list with the tokens produced by ``jieba.lcut`` whose length is
        greater than one, in their original order.
    """
    # jieba.lcut is the variant of jieba.cut that returns a list directly.
    segmented = jieba.lcut(raw_text)
    return list(filter(lambda token: len(token) > 1, segmented))

In [4]:
# Number of rows per label value.
df['label'].value_counts()

时政    1000
房产    1000
体育    1000
娱乐    1000
游戏    1000
时尚    1000
家居    1000
科技    1000
财经    1000
教育    1000
Name: label, dtype: int64

In [5]:
# Tokenize every entry of the ``content`` column, one token list per row.
documents = list(map(text2tokens, df['content']))
# documents[0]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\OYL\AppData\Local\Temp\jieba.cache
Loading model cost 0.881 seconds.
Prefix dict has been built successfully.


In [6]:
class Document(object):
    """Container for one document: its word ids and its token count."""

    def __init__(self):
        # Both fields start empty and are filled in by the code that builds
        # the document.
        self.length = 0
        self.words = list()

In [7]:
class DataPreProcessing(object):
    """Bag-of-words view of a corpus.

    Attributes are plain counters/containers that the code building the corpus
    fills in:
        docs_count:  number of documents.
        words_count: number of distinct words.
        docs:        list of document objects.
        word2id:     ordered mapping from word to integer id.
    """

    def __init__(self):
        self.docs_count = 0
        self.words_count = 0
        self.docs = []
        self.word2id = OrderedDict()

    def cachewordidmap(self, directory=None):
        """Write the vocabulary to ``<directory>/wordidmapfile``.

        One ``word<TAB>id`` pair is written per line, UTF-8 encoded, in the
        insertion order of ``word2id``.

        Args:
            directory: Target directory. Defaults to the notebook-level
                ``path`` variable when omitted.
        """
        base_dir = path if directory is None else directory
        # os.path.join inserts the separator that plain string concatenation
        # omitted (which produced "<cwd>wordidmapfile" next to the working
        # directory instead of a file inside it).
        target = os.path.join(base_dir, 'wordidmapfile')
        with codecs.open(target, 'w', 'utf-8') as f:
            for word, word_id in self.word2id.items():
                f.write(word + '\t' + str(word_id) + '\n')

In [8]:
# 数据预处理
def preprocessing(df):
    """Turn the ``content`` column of ``df`` into id-encoded documents.

    Each text is tokenized with ``text2tokens``. Every distinct token receives
    the next free integer id, in order of first appearance, and each document
    stores its tokens as a list of those ids.

    Args:
        df: Object whose ``df['content']`` yields the raw texts.

    Returns:
        The populated ``DataPreProcessing`` instance (documents, vocabulary
        and both counters).
    """
    corpus = DataPreProcessing()
    vocabulary = corpus.word2id
    for raw_text in df['content']:
        document = Document()
        tokens = text2tokens(raw_text)
        for token in tokens:
            # Ids are handed out sequentially starting at 0, so the next free
            # id always equals the current vocabulary size; setdefault keeps
            # the existing id for words that were seen before.
            document.words.append(vocabulary.setdefault(token, len(vocabulary)))
        document.length = len(tokens)
        corpus.docs.append(document)

    corpus.docs_count = len(corpus.docs)
    corpus.words_count = len(vocabulary)

    # corpus.cachewordidmap()
    return corpus

# LDA训练模型

In [9]:
class LDAmodel(object):
    """LDA topic model trained by Gibbs sampling over word-topic assignments.

    Args:
        dpre: Preprocessed corpus exposing ``docs`` (objects with ``words`` and
            ``length``), ``docs_count`` and ``words_count``.
        K: Number of topics.
        alpha: Smoothing added to the per-document topic counts.
        beta: Smoothing added to the per-topic word counts.
        iter_times: Number of full sampling sweeps over the corpus.
        top_words_num: Number of feature words (stored only).
        seed: Optional seed. When given, both ``random`` and ``numpy.random``
            global generators are seeded so a run can be reproduced.

    Any of ``K``, ``alpha``, ``beta``, ``iter_times`` and ``top_words_num``
    left as ``None`` is read from the module-level variable of the same name,
    which keeps ``LDAmodel(dpre)`` working as before.
    """

    def __init__(self, dpre, K=None, alpha=None, beta=None, iter_times=None,
                 top_words_num=None, seed=None):
        self.dpre = dpre
        self.K = self._setting("K", K)
        self.beta = self._setting("beta", beta)
        self.alpha = self._setting("alpha", alpha)
        self.iter_times = self._setting("iter_times", iter_times)
        self.top_words_num = self._setting("top_words_num", top_words_num)

        if seed is not None:
            # Topic initialisation uses `random`, sampling uses `np.random`.
            random.seed(seed)
            np.random.seed(seed)

        n_docs = self.dpre.docs_count
        n_words = self.dpre.words_count

        self.p = np.zeros(self.K)                              # scratch: unnormalised topic weights
        self.nw = np.zeros((n_words, self.K), dtype="int")     # nw[w, k]: times word w is assigned to topic k
        self.nwsum = np.zeros(self.K, dtype="int")             # nwsum[k]: number of words assigned to topic k
        self.nd = np.zeros((n_docs, self.K), dtype="int")      # nd[d, k]: words of document d assigned to topic k
        self.ndsum = np.zeros(n_docs, dtype="int")             # ndsum[d]: number of words in document d

        # Topic assignment of every word position. Documents differ in length,
        # so this is a list of 1-D arrays; building one ragged ndarray raises
        # ValueError on recent NumPy releases.
        self.Z = [np.zeros(self.dpre.docs[x].length, dtype="int") for x in range(n_docs)]

        # Start from a uniformly random topic for every word position.
        for x in range(n_docs):
            doc = self.dpre.docs[x]
            self.ndsum[x] = doc.length
            for y in range(doc.length):
                topic = random.randint(0, self.K - 1)  # inclusive on both ends
                self.Z[x][y] = topic
                self.nw[doc.words[y]][topic] += 1
                self.nd[x][topic] += 1
                self.nwsum[topic] += 1

        self.theta = np.zeros((n_docs, self.K))    # per-document topic distribution
        self.phi = np.zeros((self.K, n_words))     # per-topic word distribution

    @staticmethod
    def _setting(name, value):
        """Return ``value``, or the module-level variable ``name`` if it is None."""
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                "name %r is not defined; pass it to LDAmodel(...) or define it "
                "at module level" % name
            ) from None

    def sampling(self, i, j):
        """Resample the topic of word ``j`` in document ``i``.

        The word's current assignment is removed from all count tables, a new
        topic is drawn from the resulting conditional distribution, and the
        counts are updated with the new topic.

        Returns:
            The newly drawn topic index. The caller stores it in ``Z``.
        """
        topic = self.Z[i][j]
        word = self.dpre.docs[i].words[j]

        # Take the current assignment out of the statistics.
        self.nw[word][topic] -= 1
        self.nwsum[topic] -= 1
        self.nd[i][topic] -= 1
        self.ndsum[i] -= 1

        Vbeta = self.dpre.words_count * self.beta
        Kalpha = self.K * self.alpha
        self.p = (self.nw[word] + self.beta) / (self.nwsum + Vbeta) * \
                 (self.nd[i] + self.alpha) / (self.ndsum[i] + Kalpha)

        # self.p is already 1-D with K entries, so it is only normalised here
        # (squeezing it would turn the K == 1 case into a 0-d array).
        p = self.p / np.sum(self.p)
        # One multinomial trial yields a one-hot vector; argmax is the topic.
        topic = np.argmax(np.random.multinomial(1, p))

        self.nw[word][topic] += 1
        self.nwsum[topic] += 1
        self.nd[i][topic] += 1
        self.ndsum[i] += 1
        return topic

    def est(self):
        """Run ``iter_times`` sampling sweeps, then compute theta and phi."""
        for _ in range(self.iter_times):
            for i in range(self.dpre.docs_count):
                for j in range(self.dpre.docs[i].length):
                    self.Z[i][j] = self.sampling(i, j)
        self._theta()
        self._phi()
        self.save()

    def _theta(self):
        """Fill ``theta`` with the smoothed, normalised per-document topic counts."""
        self.theta[:] = (self.nd + self.alpha) / (self.ndsum[:, None] + self.K * self.alpha)

    def _phi(self):
        """Fill ``phi`` with the smoothed, normalised per-topic word counts."""
        self.phi[:] = (self.nw.T + self.beta) / \
                      (self.nwsum[:, None] + self.dpre.words_count * self.beta)

    # The phi update used to be defined under the name `_alpha` while est()
    # called `_phi`; the old name is kept as an alias.
    _alpha = _phi

    def save(self):
        """Placeholder; nothing is persisted."""
        pass

In [10]:
# Hyperparameters read by LDAmodel, followed by corpus preprocessing.
K = 10  # number of topics
alpha = 0.1  # smoothing added to the per-document topic counts
beta = 0.01  # smoothing added to the per-topic word counts
iter_times = 1000  # number of sampling sweeps over the whole corpus
top_words_num = 10  # number of feature words (stored by the model)
dpre = preprocessing(df)  # tokenize the corpus and build the word-id vocabulary

In [11]:
# Flat list holding the first two word ids of every document.
X = []
for i in range(dpre.docs_count):
    X.extend(dpre.docs[i].words[:2])

In [12]:
print(type(X))  # show the container type of X

<class 'list'>


In [None]:
# Build the model from the preprocessed corpus and run the training loop.
lda = LDAmodel(dpre)
lda.est()