In [4]:
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import codecs
import re
import jieba
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.model_selection import train_test_split

filename1 = '../data/trec06c/data1.csv'
filename2 = '../data/trec06c/data2.csv'
filename3 = '../data/trec06c/data3.csv'

In [5]:
email_labels = [] # 用来存邮件的标签数据 是不是垃圾邮件
email_contents = [] # 用来保存邮件的内容
sample_number = 500

for line in open('../data/trec06c/full/index',errors='ignore'):
    label,data = line.strip().split()
    file_name = '../data/trec06c'+data[2:]
    file_data = codecs.open(file_name,'r','gbk',errors='ignore').read()

    email_labels.append(label)
    email_contents.append(file_data)
email_data = pd.DataFrame({'content':email_contents,'label':email_labels})

In [6]:
email_data.content[1]

'Received: from jdl.ac.cn ([159.226.42.8])\n\tby spam-gw.ccert.edu.cn (MIMEDefang) with ESMTP id j7C1ceuQ019050\n\tfor <shi@ccert.edu.cn>; Sun, 14 Aug 2005 10:02:01 +0800 (CST)\nReceived: (qmail 5448 invoked from network); Sun, 14 Aug 2005 02:12:48 -0000\nReceived: from unknown (HELO d47db5334f2a479) (192.168.0.233)\n  by 159.226.42.8 with SMTP; Sun, 14 Aug 2005 02:12:48 -0000\nMessage-ID: <000b01c59ee0$a1f666b0$e900a8c0@d47db5334f2a479>\nFrom: "pan" <pan@jdl.ac.cn>\nTo: shi@ccert.edu.cn\nSubject: =?gb2312?B?ofEgzsrSu7K/zrrX2s3ytcS159Oww/uzxg==?=\nDate: Sun, 14 Aug 2005 10:16:47 +0800\nMIME-Version: 1.0\nContent-Type: text/plain;\n\tcharset="gb2312"\nContent-Transfer-Encoding: base64\nX-Priority: 3\nX-MSMail-Priority: Normal\nX-Mailer: Microsoft Outlook Express 6.00.2800.1506\nX-MimeOLE: Produced By Microsoft MimeOLE V6.00.2800.1506\n\n讲的是孔子后人的故事。一个老领导回到家乡，跟儿子感情不和，跟贪财的孙子孔为本和睦。\n老领导的弟弟魏宗万是赶马车的。\n有个洋妞大概是考察民俗的，在他们家过年。\n孔为本总想出国，被爷爷教育了。\n最后，一家人基本和解。\n顺便问另一类电影，北京青年电影制片厂的。中越战背景。一军人被介绍了一个对象，去相

In [7]:
spam_mail = email_data[email_data['label']=='spam'].sample(sample_number)
ham_mail = email_data[email_data['label']=='ham'].sample(sample_number)

email_data = pd.concat([spam_mail,ham_mail])
email_data.to_csv(filename1)

In [8]:
email_data.shape

(1000, 2)

## Mail data processing, removing line breaks, non-Chinese, blanks, and then tokenization

In [9]:
# 1. 读取数据
email_data = pd.read_csv('../data/trec06c/data1.csv')
contents=[]
for index,email in enumerate(email_data['content'],1):
    email = email.replace('\n',' ')
    # 2. remove non-chinese
    email = re.sub('[^\u4e00-\u9fff]', '', email)
    # 3. remove blanks
    email = ' '.join(email.split())
    # 4. Tokenization
    email = ' '.join(jieba.lcut(email)) # 使用jieba 库做中文的分词

    contents.append(email)
    if index % 100 ==0:
        print('已预处理%d封邮件' % index)

Building prefix dict from the default dictionary ...


Dumping model to file cache /var/folders/dx/1rczl6h97j97y80gd0snfrxh0000gn/T/jieba.cache
Loading model cost 0.755 seconds.
Prefix dict has been built successfully.


已预处理100封邮件
已预处理200封邮件
已预处理300封邮件
已预处理400封邮件
已预处理500封邮件
已预处理600封邮件
已预处理700封邮件
已预处理800封邮件
已预处理900封邮件
已预处理1000封邮件


In [10]:
data = pd.DataFrame({'contents':contents,'label':email_data['label']})

In [11]:
data.to_csv(filename2)

## 邮件数据向量化

In [13]:
email = pd.read_csv(filename2)
email.dropna(inplace=True)
# 准备加载停用词表
stop_words=[]
for word in open('../data/trec06c/stoplist.txt','r',encoding='gbk'):
    stop_words.append(word.strip())
transformer = CountVectorizer(stop_words=stop_words)
x = transformer.fit_transform(email['contents']).toarray()
y = np.where(email['label'].values =='ham',0,1)
data = pd.DataFrame(x)
data[x.shape[1]] = y
data.to_csv(filename3)

## 训练朴素贝叶斯模型

In [None]:
data = pd.read_csv(filename3)
x = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)
estimator = MultinomialNB()
estimator.fit(x_train,y_train)

estimator.score(x_test,y_test)

0.8860103626943006