In [1]:
#用到的包
import numpy as np
import pandas as pd
import re
from gensim import corpora, models, similarities
import gensim
import pyLDAvis.gensim_models
import math
import matplotlib.pyplot as plt

In [2]:
"""第一步：用正则表达式清洗数据，并去除停用词"""
data=pd.read_csv(r"C:\Users\123\LDA\HillaryEmails.csv",encoding='ISO-8859-1')
# 原邮件数据中有很多Nan的值，直接扔了。
data=data[['Id','ExtractedBodyText']].dropna()  #只选取 Id和 ExtractedBodyText两列 去除空值 #只选取 Id和 ExtractedBodyText两列 去除空值

In [3]:
# 用正则表达式清洗数据

def clean_email_text(text):
    text= text.replace('\n'," ") #用空格代替新的行
    text=re.sub(r"-"," ",text)  #把用“-”连接的单词分开 （close-up分成close和 up）
    text=re.sub(r"\d+\d+\d+","",text) #删除日期
    text=re.sub(r"[0-2]?[1-9]:[0-6][0-9]","",text)  #删除时间
    text=re.sub(r"[\w]+@[\.\w]+","",text)  #删除邮件地址
    text=re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i","",text) #删除网址
    
    # 以防还有其他除了单词以外的特殊字符（数字）等等，我们把特殊字符过滤掉
    # 只留下字母和空格
    # 再把单个字母去掉，留下单词
    pure_text=''
    for letter in text:        #去除字母和空格
        if letter.isalpha() or letter==' ':
            pure_text+=letter
            
    text=' '.join(word for word in pure_text.split() if len(word)>1)
    return text

In [4]:
docs=data['ExtractedBodyText']
print(docs)
docs=docs.apply(lambda s:clean_email_text(s))
docs.head(10).values

1       B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...
2                                                     Thx
4       H <hrod17@clintonemail.com>\nFriday, March 11,...
5       Pis print.\n-â¢-...-^\nH < hrod17@clintonerna...
7       H <hrod17@clintonemail.corn>\nFriday, March 11...
                              ...                        
7938    Hi. Sorry I haven't had a chance to see you, b...
7939    B6\nI assume you saw this by now -- if not, it...
7941    Big change of plans in the Senate. Senator Rei...
7943    PVerveer B6\nFriday, December 17, 2010 12:12 A...
7944                                           See below.
Name: ExtractedBodyText, Length: 6742, dtype: object


array(['Thursday March PM Latest How Syria is aiding Qaddafi and more Sid hrc memo syria aiding libya docx hrc memo syria aiding libya docx March For Hillary',
       'Thx',
       'Friday March PM Huma Abedin Fw Latest How Syria is aiding Qaddafi and more Sid hrc memo syria aiding libya docx Pis print',
       'Pis print Wednesday September PM Fw Meet The Right Wing Extremist Behind Anti fvluslim Film That Sparked Deadly Riots From meat Sent Wednesday September PM To Subject Meet The Right Wing Extremist Behind Anti Muslim Film That Sparked Deadly Riots httemaxbiumenthalcommeet the right wing extremist behind anti musiim tihn that sparked deadly riots Sent from my Verizon Wireless LTE DROID US Department of State Case No Doc No Date STATE DEPT PRODUCED TO HOUSE SELECT BENGHAZI COMM SUBJECT TO AGREEMENT ON SENSITIVE INFORMATION REDACTIONS NO FOIA WAIVER STATE CB',
       'Friday March PM Huma Abedin Fw Latest How Syria is aiding Qaddafi and more Sid hrc memo Syria aiding libya docx Pis

In [5]:
doclist = docs.values#把值都取出来做成一个list
print("一共有",len(doclist),"封邮件。\n")
print("第一封邮件经过清洗后的内容为: \n",doclist[:1],'\n')

一共有 6742 封邮件。

第一封邮件经过清洗后的内容为: 
 ['Thursday March PM Latest How Syria is aiding Qaddafi and more Sid hrc memo syria aiding libya docx hrc memo syria aiding libya docx March For Hillary'] 



In [6]:
# 去除停用词，处理成gensim需要的输入格式
def remove_stopword():
    stopword = []
    with open(r'C:\Users\123\LDA\english.txt', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.replace('\n', '')
            stopword.append(line)
    return stopword
stop_words=remove_stopword()
texts=[[word for word in doc.lower().split() if word not in stop_words] for doc in doclist]
print("第一封邮件去除停用词并处理成gensim需要的格式为：\n",texts[0],'\n')

第一封邮件去除停用词并处理成gensim需要的格式为：
 ['march', 'syria', 'aiding', 'qaddafi', 'sid', 'hrc', 'memo', 'syria', 'aiding', 'libya', 'docx', 'hrc', 'memo', 'syria', 'aiding', 'libya', 'docx', 'march', 'hillary'] 



In [7]:
"""第二步：构建字典，将文本ID化"""
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# 将每一篇邮件ID化
print("第一封邮件ID化后的结果为：\n",corpus[0],'\n')

第一封邮件ID化后的结果为：
 [(0, 3), (1, 2), (2, 1), (3, 2), (4, 2), (5, 2), (6, 2), (7, 1), (8, 1), (9, 3)] 



In [11]:
"""第三步：训练LDA模型"""
#LDA建模
LDA=gensim.models.ldamodel.LdaModel(corpus,id2word=dictionary,num_topics=3,random_state=100,
                chunksize=1000, passes=50,iterations=500)

In [12]:
#指定话题数和每个话题的词数（话题数用后面的困惑度和pyLDAvis来确定）
LDA.print_topics(num_topics=3,num_words=7)

[(0,
  '0.019*"fyi" + 0.008*"cheryl" + 0.008*"huma" + 0.007*"pls" + 0.007*"fw" + 0.006*"thx" + 0.006*"mills"'),
 (1,
  '0.029*"office" + 0.021*"secretarys" + 0.018*"meeting" + 0.010*"arrive" + 0.010*"route" + 0.009*"depart" + 0.009*"house"'),
 (2,
  '0.006*"obama" + 0.005*"president" + 0.004*"government" + 0.003*"political" + 0.003*"policy" + 0.003*"house" + 0.003*"support"')]

In [10]:
#用pyLDAvis来确定话题数
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(LDA, corpus, dictionary)
pyLDAvis.save_html(vis, 'lda_1.html')#//将结果用HTML保存
vis

In [11]:
#困惑度计算
def perplexity(ldamodel, testset, dictionary, size_dictionary, num_topics):
    prep = 0.0
    prob_doc_sum = 0.0
    topic_word_list = [] 
    for topic_id in range(num_topics):
        topic_word = ldamodel.show_topic(topic_id, size_dictionary)
        dic = {}
        for word, probability in topic_word:
            dic[word] = probability
        topic_word_list.append(dic)
    doc_topics_ist = [] 
    for doc in testset:
        doc_topics_ist.append(ldamodel.get_document_topics(doc, minimum_probability=0))
    testset_word_num = 0
    for i in range(len(testset)):
        prob_doc = 0.0 
        doc = testset[i]
        doc_word_num = 0 
        for word_id, num in dict(doc).items():
            prob_word = 0.0 
            doc_word_num += num
            word = dictionary[word_id]
            for topic_id in range(num_topics):
                prob_topic = doc_topics_ist[i][topic_id][1]
                prob_topic_word = topic_word_list[topic_id][word]
                prob_word += prob_topic*prob_topic_word
            prob_doc += math.log(prob_word) 
        prob_doc_sum += prob_doc
        testset_word_num += doc_word_num
    prep = math.exp(-prob_doc_sum/testset_word_num)
    return prep

In [None]:
#topic个数hepreplexity的关系图（通过曲线的下降速度来确定合适的topic数）
topic=[]
prep_value=[]

for i in range(1,20,1): #话题数从1到20
    topic.append(str(i))
    ldamodel=gensim.models.ldamodel.LdaModel(corpus,id2word=dictionary,num_topics=i,random_state=100,
               chunksize=1000, passes=50,iterations=100)
    testset=[]
    num_topics=i
    for j in range(int(6740/20)):
        testset.append(corpus[j*20])
    dictionary=corpora.Dictionary(texts)
    size_dictionary=len(dictionary.keys())
    prep=perplexity(ldamodel, testset, dictionary,size_dictionary,num_topics)
    prep_value.append(prep)
    
plt.plot(topic,prep_value,linewidth=2,color="red")
plt.xlabel("Number of topic")
plt.ylabel("Preplexity")
#plt.savefig('./picture4-5.jpg', dpi=300)
plt.show()

In [13]:
"""第四步：查看某封邮件所属的主题"""
print("第一封邮件的大致内容为：\n",texts[0],'\n')
topic = LDA.get_document_topics(corpus[0])
print("第一封邮件的主题分布为：\n",topic,'\n')

第一封邮件的大致内容为：
 ['march', 'syria', 'aiding', 'qaddafi', 'sid', 'hrc', 'memo', 'syria', 'aiding', 'libya', 'docx', 'hrc', 'memo', 'syria', 'aiding', 'libya', 'docx', 'march', 'hillary'] 

第一封邮件的主题分布为：
 [(0, 0.9624864), (1, 0.02039479), (2, 0.017118836)] 



In [14]:
# 希拉里发的两条推特
# 给大伙翻译一下这两句：
# 这是选举的一天!数以百万计的美国人投了希拉里的票。加入他们吧，确定你投给谁。
# 希望今天每个人都能度过一个安乐的感恩节，和家人朋友共度美好时光——来自希拉里的问候。
twitter = ["It's Election Day! Millions of Americans have cast their votes for Hillary—join them and confirm where you vote ",
       "Hoping everyone has a safe & Happy Thanksgiving today, & quality time with family & friends. -H"]

text_twitter = [clean_email_text(s) for s in twitter]
text_twitter = [[word for word in text.lower().split() if word not in stop_words] for text in text_twitter]
corpus_twitter = [dictionary.doc2bow(text) for text in text_twitter]
topics_twitter = LDA.get_document_topics(corpus_twitter)
print("这两条推特的主题分布分别为：\n",topics_twitter[0] ,'\n',topics_twitter[1])


这两条推特的主题分布分别为：
 [(0, 0.17469877), (1, 0.041711666), (2, 0.78358954)] 
 [(0, 0.6086528), (1, 0.045040857), (2, 0.34630635)]
