In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from collections import defaultdict
from functools import reduce

In [2]:
# Add the project-local NLTK data directory to NLTK's resource search path
# (presumably where the tokenizer models and the stop-word corpus live — verify).
nltk.data.path.append("./data/nltk_data")

In [3]:
# Preprocessing: load the dataset, encode the labels, and normalise the message text.
file_path = './data/spam.csv'

df = pd.read_csv(file_path, encoding='ISO-8859-1')
ps = PorterStemmer()
# Explicit copy: the column assignments below then write to an independent frame
# instead of a selection of the raw one (avoids SettingWithCopyWarning).
df = df[['v1', 'v2']].copy()
# Label encoding: spam -> 1, ham -> 0.
df['v1'] = df['v1'].map({'spam': 1, 'ham': 0})
# Build the stop-word set once. Evaluating stopwords.words('english') inside the
# lambda repeated the lookup for every single token and scanned a list each time.
english_stopwords = set(stopwords.words('english'))
# Tokenise, stem, drop stop words, and re-join with single spaces.
# NOTE(review): tokens are stemmed *before* the stop-word check, so stop words whose
# stem differs from the listed form (e.g. "this" -> "thi") survive the filter. The
# order is kept as-is so downstream results do not change; consider filtering first.
df['v2'] = df['v2'].map(
    lambda sentence: ' '.join(
        stem
        for stem in (ps.stem(token) for token in word_tokenize(sentence))
        if stem not in english_stopwords
    )
)

In [4]:
# Preview the preprocessed frame: v1 is the 0/1 label, v2 the cleaned message text.
df

Unnamed: 0,v1,v2
0,0,"go jurong point , crazi .. avail onli bugi n g..."
1,0,ok lar ... joke wif u oni ...
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor ... u c alreadi say ...
4,0,"nah n't think goe usf , live around though"
...,...,...
5567,1,thi 2nd time tri 2 contact u. u å£750 pound pr...
5568,0,ì_ b go esplanad fr home ?
5569,0,"piti , * wa mood . ... ani suggest ?"
5570,0,guy bitch act like 'd interest buy someth els ...


In [5]:
# Split into training and test sets (33% held out; fixed random_state for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(df.v2, df.v1, test_size=0.33, random_state=42)

In [6]:
# Sanity check: feature and label counts must match within each split.
len(X_train), len(X_test), len(y_train), len(y_test)

(3733, 1839, 3733, 1839)

In [7]:
# Put the training texts (v2) and their labels (v1) side by side in one frame.
# Both Series come from the same split and therefore share the same index.
df_train = pd.concat([X_train, y_train], axis=1)
df_train

Unnamed: 0,v2,v1
3235,"aight text 're back mu 'll swing , need somebo...",0
945,cant wait see ! photo use ? : ),0
5319,kothi print marandratha .,0
5528,effect irrit . ignor,0
247,kalli wont bat 2nd inning .,0
...,...,...
3772,came hostel . go sleep . plz call befor class ...,0
5191,"sorri , 'll call later",0
5226,prabha .. 'm soryda .. reali .. frm heart 'm sori,0
5390,nt joke serious told,0


In [8]:
# Word-frequency statistics for normal (ham, v1 == 0) training messages:
#   norm_row_num   - number of non-empty messages
#   norm_word_cnt  - total number of tokens
#   norm_word_dict - token -> occurrence count
norm_row_num, norm_word_cnt = 0, 0
norm_word_dict = defaultdict(int)
ham_messages = df_train.loc[df_train['v1'] == 0, 'v2']
for message in ham_messages:
    if message == '':
        continue  # nothing left after preprocessing
    norm_row_num += 1
    tokens = message.strip().split(' ')
    norm_word_cnt += len(tokens)
    for token in tokens:
        norm_word_dict[token] += 1

In [9]:
# norm_row_num, norm_word_cnt, norm_word_dict

In [10]:
# Word-frequency statistics for spam (v1 == 1) training messages:
#   spam_row_num   - number of non-empty messages
#   spam_word_cnt  - total number of tokens
#   spam_word_dict - token -> occurrence count
spam_row_num, spam_word_cnt = 0, 0
spam_word_dict = defaultdict(int)
spam_messages = df_train.loc[df_train['v1'] == 1, 'v2']
for message in spam_messages:
    if message == '':
        continue  # nothing left after preprocessing
    spam_row_num += 1
    tokens = message.strip().split(' ')
    spam_word_cnt += len(tokens)
    for token in tokens:
        spam_word_dict[token] += 1

In [11]:
# spam_row_num, spam_word_cnt, spam_word_dict

In [12]:
# Class priors, measured as each class's share of all training tokens
# (token counts, not message counts, are the basis here).
word_cnt = norm_word_cnt + spam_word_cnt
norm_prob, spam_prob = norm_word_cnt / word_cnt, spam_word_cnt / word_cnt
norm_prob, spam_prob

(0.7796392295696886, 0.22036077043031146)

In [13]:
# For every training word, compute
#   [0] the conditional probability that a message is spam given the word occurs, and
#   [1] the word's relative frequency over all training tokens.
spam_prob_dict = dict()

# Sorted so the dict is built in the same order on every run; iterating a set of
# strings directly follows hash order, which is not stable across kernel restarts.
words = sorted(set(norm_word_dict) | set(spam_word_dict))
for w in words:
    # .get() rather than []: indexing a defaultdict with a word unseen in that class
    # would insert a zero entry and silently grow the count dictionaries.
    spam_hits = spam_word_dict.get(w, 0)
    norm_hits = norm_word_dict.get(w, 0)
    a = spam_prob * (spam_hits / spam_word_cnt)   # P(spam) * P(word | spam)
    b = norm_prob * (norm_hits / norm_word_cnt)   # P(ham)  * P(word | ham)
    c = (spam_hits + norm_hits) / (spam_word_cnt + norm_word_cnt)
    # a + b > 0 because every word in `words` occurs in at least one class.
    spam_prob_dict[w] = (a / (a + b), c)

In [14]:
# Mean relative word frequency across the vocabulary; used as the frequency
# component of the padding entries when scoring a message.
default_freq_prob = np.mean([freq for _, freq in spam_prob_dict.values()])
default_freq_prob

0.0001543448062972681

In [15]:
# Mean per-word spam probability across the vocabulary (for reference).
np.mean([p_spam for p_spam, _ in spam_prob_dict.values()])

0.2734875363138543

In [16]:
def calc_spam_prob(sentence, spam_prob_dict=spam_prob_dict, default_spam_prob=0.4, default_freq_prob=default_freq_prob, limit=15):
    """Return the combined probability that a preprocessed message is spam.

    Args:
        sentence: Message text whose tokens are separated by single spaces.
        spam_prob_dict: Maps word -> (spam probability, relative frequency).
            The default is bound when this function is defined; re-run this
            cell after rebuilding the dictionary.
        default_spam_prob: Spam probability of each padding entry.
        default_freq_prob: Frequency component of each padding entry.
        limit: Number of entries combined into the score. Messages with fewer
            known words are padded up to this many entries.

    Returns:
        A float in [0, 1]; 0.5 when no entries are combined (``limit == 0``).
    """
    # Words missing from the dictionary are ignored, not given a default.
    known = [spam_prob_dict[w] for w in sentence.strip().split(' ') if w in spam_prob_dict]

    # Pad short messages with default entries so `limit` factors are available.
    if len(known) < limit:
        known += [(default_spam_prob, default_freq_prob)] * (limit - len(known))

    # Keep the `limit` entries whose spam probability is closest to 0 or 1.
    # sorted() is stable, so ties keep their original order.
    selected = sorted(known, key=lambda t: min(1 - t[0], t[0]))[:limit]

    # Clamp each factor to at least 0.001 so one word cannot zero out a product.
    spam_factors = [max(t[0], 0.001) for t in selected]
    ham_factors = [max(1 - t[0], 0.001) for t in selected]

    # The 1.0 initializer keeps results identical for non-empty input and makes
    # an empty selection yield 1.0 instead of raising TypeError.
    a = reduce(lambda x, y: x * y, spam_factors, 1.0)
    c = reduce(lambda x, y: x * y, ham_factors, 1.0)
    if a + c > 0:
        return a / (a + c)

    # Both products underflowed to 0.0 (possible for very large `limit`).
    # Evaluate the same ratio via log-odds instead of dividing by zero.
    log_odds = float(sum(np.log(s) - np.log(h) for s, h in zip(spam_factors, ham_factors)))
    if log_odds >= 0:
        return 1.0 / (1.0 + float(np.exp(-log_odds)))
    odds = float(np.exp(log_odds))
    return odds / (1.0 + odds)

In [17]:
# Spot check on a spam-style message (already preprocessed text).
calc_spam_prob(sentence='thi 2nd time tri 2 contact u. u å£750 pound', limit=15)

0.004633749620396715

In [18]:
# Spot check on a ham-style message (already preprocessed text).
calc_spam_prob(sentence='guy bitch act like \'d interest buy someth', limit=15)

5.532986296963259e-17

In [19]:
# Score every test message with the default parameters.
pred = list(map(calc_spam_prob, X_test))

In [20]:
# Collect test text, spam score and true label in one frame.
# `pred` is a plain list in X_test order, so it lines up row-for-row with the Series.
df_res = pd.DataFrame({'X_test': X_test, 'score': pred, 'actual': y_test})
df_res

Unnamed: 0,X_test,score,actual
3245,"funni fact nobodi teach volcano 2 erupt , tsun...",1.463723e-24,0
944,sent score sopha secondari applic school . thi...,1.943147e-25,0
1044,know someon know fanci . call 09058097218 find...,9.998136e-01,1
2484,onli promis get soon . 'll text morn let know ...,1.143428e-14,0
812,congratul ur award either å£500 cd gift vouche...,1.000000e+00,1
...,...,...,...
4944,"anyway n't think secur anyth , lem know want d...",2.771890e-21,0
3313,oh gei . happend tron . mayb ill dl 3d,2.697375e-15,0
3652,ha issu right . ill fix tomorrow .,5.081893e-11,0
14,date sunday ! !,1.312554e-05,0


In [21]:
# Classify as spam when the score reaches the cut-off, then record whether the
# prediction agrees with the true label.
spam_cutoff = 0.90
is_flagged = df_res['score'] >= spam_cutoff
df_res['predict'] = np.where(is_flagged, 1, 0)
is_match = df_res['actual'] == df_res['predict']
df_res['pred_correct'] = np.where(is_match, 'Yes', 'No')
df_res

Unnamed: 0,X_test,score,actual,predict,pred_correct
3245,"funni fact nobodi teach volcano 2 erupt , tsun...",1.463723e-24,0,0,Yes
944,sent score sopha secondari applic school . thi...,1.943147e-25,0,0,Yes
1044,know someon know fanci . call 09058097218 find...,9.998136e-01,1,1,Yes
2484,onli promis get soon . 'll text morn let know ...,1.143428e-14,0,0,Yes
812,congratul ur award either å£500 cd gift vouche...,1.000000e+00,1,1,Yes
...,...,...,...,...,...
4944,"anyway n't think secur anyth , lem know want d...",2.771890e-21,0,0,Yes
3313,oh gei . happend tron . mayb ill dl 3d,2.697375e-15,0,0,Yes
3652,ha issu right . ill fix tomorrow .,5.081893e-11,0,0,Yes
14,date sunday ! !,1.312554e-05,0,0,Yes


In [22]:
# 95th percentile of the test-set spam scores.
np.percentile(df_res['score'], 95)

0.9999999999999999

In [23]:
# Number of actual spam messages vs. number predicted as spam.
(df_res['actual'] == 1).sum(), (df_res['predict'] == 1).sum()

(252, 194)

In [24]:
# Number of actual ham messages vs. number predicted as ham.
(df_res['actual'] == 0).sum(), (df_res['predict'] == 0).sum()

(1587, 1645)

In [25]:
# Total number of test messages; denominator for the error rates computed below.
# len() replaces count()[0]: count() returns a Series labelled by column name, and
# an integer key on it is resolved positionally only through a deprecated fallback.
base = len(df_res)
base

1839

In [26]:
# Misses (false negatives): actual spam predicted as ham.
# The ratio is taken over all test messages, not over the spam messages only.
is_missed = (df_res['actual'] == 1) & (df_res['predict'] == 0)
a = is_missed.sum()
a, a/base

(58, 0.031538879825992384)

In [27]:
# False alarms (false positives): actual ham predicted as spam.
# The ratio is taken over all test messages, not over the ham messages only.
is_false_alarm = (df_res['actual'] == 0) & (df_res['predict'] == 1)
b = is_false_alarm.sum()
b, b/base

(0, 0.0)