In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

In [2]:
# Add the project-local directory to NLTK's resource search path so that the
# corpora/tokenizer data used below can be looked up under ./data/nltk_data.
# Note: appending again on every re-run of this cell adds a duplicate entry.
nltk.data.path.append("./data/nltk_data")

In [3]:
file_path = './data/spam.csv'

df = pd.read_csv(file_path, encoding='ISO-8859-1')
ps = PorterStemmer()

# Build the stop-word lookup once. Previously stopwords.words('english') was
# re-evaluated (and scanned as a list) for every single token of every message.
english_stopwords = set(stopwords.words('english'))


def stem_and_filter(sentence):
    """Tokenize `sentence`, stem each token, drop stop words, re-join with spaces.

    Stemming happens BEFORE the stop-word check (same order as the original
    one-liner), so the resulting text is unchanged.
    NOTE(review): because the check runs on stems, stop words whose stem differs
    from the word itself (e.g. 'this' -> 'thi') are not removed -- confirm
    whether that is intended.
    """
    stemmed_tokens = (ps.stem(token) for token in word_tokenize(sentence))
    return ' '.join(token for token in stemmed_tokens if token not in english_stopwords)


# Keep only the label (v1) and message text (v2). .copy() gives an independent
# frame so the column assignments below cannot trigger SettingWithCopyWarning.
df = df[['v1', 'v2']].copy()
# Encode labels numerically: spam -> 1, ham -> 0.
df['v1'] = df['v1'].map({'spam': 1, 'ham': 0})
df['v2'] = df['v2'].map(stem_and_filter)

In [4]:
# Display the preprocessed frame: v1 is the 0/1 label, v2 the stemmed message text.
df

Unnamed: 0,v1,v2
0,0,"go jurong point , crazi .. avail onli bugi n g..."
1,0,ok lar ... joke wif u oni ...
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor ... u c alreadi say ...
4,0,"nah n't think goe usf , live around though"
...,...,...
5567,1,thi 2nd time tri 2 contact u. u å£750 pound pr...
5568,0,ì_ b go esplanad fr home ?
5569,0,"piti , * wa mood . ... ani suggest ?"
5570,0,guy bitch act like 'd interest buy someth els ...


In [5]:
# Hold out 33% of the messages for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.v2, df.v1, test_size=0.33, random_state=42
)

# Print every partition under its own heading.
for heading, part in (
    ('X_train:\n', X_train),
    ('\nX_test:\n', X_test),
    ('\ny_train:\n', y_train),
    ('\ny_test:\n', y_test),
):
    print(heading, part)

X_train:
 3235    aight text 're back mu 'll swing , need somebo...
945                       cant wait see ! photo use ? : )
5319                            kothi print marandratha .
5528                                 effect irrit . ignor
247                           kalli wont bat 2nd inning .
                              ...                        
3772    came hostel . go sleep . plz call befor class ...
5191                               sorri , 'll call later
5226    prabha .. 'm soryda .. reali .. frm heart 'm sori
5390                                 nt joke serious told
860                                       work . go min .
Name: v2, Length: 3733, dtype: object

X_test:
 3245    funni fact nobodi teach volcano 2 erupt , tsun...
944     sent score sopha secondari applic school . thi...
1044    know someon know fanci . call 09058097218 find...
2484    onli promis get soon . 'll text morn let know ...
812     congratul ur award either å£500 cd gift vouche...
              

In [6]:
# Show the container type of each of the four split parts.
tuple(type(part) for part in (X_train, X_test, y_train, y_test))

(pandas.core.series.Series,
 pandas.core.series.Series,
 pandas.core.series.Series,
 pandas.core.series.Series)

In [7]:
# Debug aid (disabled): uncomment to shrink the training set to its first three rows.
# X_train = X_train[0:3]
# X_train

In [8]:
# Learn the vocabulary from the training messages and turn them into a sparse
# document-term count matrix.
# Note: X_train is rebound from the text Series to that matrix here, so this
# cell expects X_train to still hold the raw text when it runs.
cv = CountVectorizer()
train_counts = cv.fit_transform(X_train)
X_train = train_counts.astype(int)
X_train

<3733x6032 sparse matrix of type '<class 'numpy.int64'>'
	with 32730 stored elements in Compressed Sparse Row format>

In [9]:
# Print the stored entries of the sparse matrix as "(row, column)  count" lines.
print(X_train)

  (0, 775)	1
  (0, 5275)	1
  (0, 4394)	1
  (0, 1028)	1
  (0, 3660)	1
  (0, 3285)	1
  (0, 5172)	1
  (0, 3743)	1
  (0, 4902)	1
  (0, 2513)	1
  (0, 1965)	1
  (1, 1383)	1
  (1, 5713)	1
  (1, 4664)	1
  (1, 4101)	1
  (1, 5606)	1
  (2, 3159)	1
  (2, 4265)	1
  (2, 3441)	1
  (3, 2064)	1
  (3, 2976)	1
  (3, 2878)	1
  (4, 3103)	1
  (4, 5877)	1
  (4, 1067)	1
  :	:
  (3728, 1354)	1
  (3728, 1371)	1
  (3728, 1111)	1
  (3728, 4837)	1
  (3728, 1525)	1
  (3728, 4155)	1
  (3728, 2799)	1
  (3728, 2819)	1
  (3729, 3285)	1
  (3729, 1354)	1
  (3729, 3198)	1
  (3729, 4928)	1
  (3730, 2701)	1
  (3730, 4222)	1
  (3730, 2433)	1
  (3730, 4403)	1
  (3730, 4933)	1
  (3730, 4927)	1
  (3731, 3841)	1
  (3731, 5397)	1
  (3731, 4696)	1
  (3731, 3062)	1
  (3732, 2544)	1
  (3732, 5884)	1
  (3732, 3545)	1


In [10]:
# Cast the training labels to an integer dtype before fitting, then display them.
y_train = y_train.astype(int)
y_train

3235    0
945     0
5319    0
5528    0
247     0
       ..
3772    0
5191    0
5226    0
5390    0
860     0
Name: v1, Length: 3733, dtype: int64

In [11]:
# Train a multinomial Naive Bayes classifier on the token counts.
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = MultinomialNB(force_alpha=True).fit(X_train, y_train)
clf

In [12]:
# Encode the held-out messages with the vocabulary learned on the training set
# (transform only, no re-fitting), then predict their labels.
X_test_counts = cv.transform(X_test)
clf_pred = clf.predict(X_test_counts)
(clf_pred, len(clf_pred))

(array([0, 0, 1, ..., 0, 0, 1]), 1839)

In [13]:
# F1 score of the predictions against the held-out labels.
score = f1_score(y_true=y_test, y_pred=clf_pred)
score

0.9306122448979592

In [14]:
# One row per test message, indexed like y_test so the join can pull the
# matching message text (v2) from df by index label.
predictions = pd.DataFrame({'predict': clf_pred}, index=y_test.index.values)
df_merge = predictions.join(df[['v2']])

# The true labels are attached positionally; they are in the same order as the
# index taken from y_test above.
df_merge['actual'] = y_test.tolist()

# Flag each row by whether the prediction matches the true label.
is_hit = df_merge['predict'] == df_merge['actual']
df_merge['pred_correct'] = np.where(is_hit, 'Yes', 'No')
df_merge.head(5)

Unnamed: 0,predict,v2,actual,pred_correct
3245,0,"funni fact nobodi teach volcano 2 erupt , tsun...",0,Yes
944,0,sent score sopha secondari applic school . thi...,0,Yes
1044,1,know someon know fanci . call 09058097218 find...,1,Yes
2484,0,onli promis get soon . 'll text morn let know ...,0,Yes
812,1,congratul ur award either å£500 cd gift vouche...,1,Yes


In [15]:
# Persist the per-message predictions (including the index) for later inspection.
df_merge.to_csv('./data/clf_pred.csv')

In [26]:
# Count how many test messages were classified correctly ('Yes') vs not ('No').
(
    df_merge['pred_correct']
    .to_frame()
    .reset_index()
    .groupby(by='pred_correct')
    .count()
)

Unnamed: 0_level_0,index
pred_correct,Unnamed: 1_level_1
No,34
Yes,1805


In [39]:
# Inspect the column dtypes of the comparison frame.
df_merge.dtypes

predict          int64
v2              object
actual           int64
pred_correct    object
dtype: object

In [48]:
# Denominator for the error rates below: the number of test messages.
# len() replaces `df_merge.count()[0]`, which indexed a label-indexed Series by
# integer position -- deprecated in pandas 2.x (emits a FutureWarning).
# The first column ('predict') has no missing values, so the value is unchanged.
base = len(df_merge)

In [49]:
# Missed spam (false negatives): actually spam (1) but predicted ham (0).
# Shown as a count and as a share of all test messages.
missed_spam = (df_merge['actual'] == 1) & (df_merge['predict'] == 0)
a = np.where(missed_spam, 1, 0).sum()
a, a/base

(24, 0.013050570962479609)

In [50]:
# False alarms (false positives): actually ham (0) but predicted spam (1).
# Shown as a count and as a share of all test messages.
false_alarm = (df_merge['actual'] == 0) & (df_merge['predict'] == 1)
b = np.where(false_alarm, 1, 0).sum()
b, b/base

(10, 0.005437737901033171)