#### Data source :https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer




In [2]:
from butterfingers import butterfinger

add the typo when we have the spam

In [3]:
df=pd.read_csv("../../datasets/SMSSpamCollection",sep='\t',names=['Status','Message'])
print(df.head())


  Status                                            Message
0    ham  Go until jurong point, crazy.. Available only ...
1    ham                      Ok lar... Joking wif u oni...
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...
3    ham  U dun say so early hor... U c already then say...
4    ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
for i in  range(df.count()[0]):
    x = df['Status'][i]
    if x=="spam":
        df.loc[i,'Message']=butterfinger(df['Message'][i])



print(df.head())

  Status                                            Message
0    ham  Go until jurong point, crazy.. Available only ...
1    ham                      Ok lar... Joking wif u oni...
2   spam  Xvew entst ii 2 z cyoj colp fi win GA Dul fhiq...
3    ham  U dun say so early hor... U c already then say...
4    ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
len(df)

5572

In [6]:
len(df[df.Status=='spam'])

747

In [7]:
len(df[df.Status=='ham'])

4825

In [8]:
df.loc[df["Status"]=='ham',"Status",]=1

In [9]:
df.loc[df["Status"]=='spam',"Status",]=0

In [10]:
df.head()

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Xvew entst ii 2 z cyoj colp fi win GA Dul fhiq...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df_x=df["Message"]
df_y=df["Status"]

In [12]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [13]:
x_train.head()

1457    U sleeping now.. Or you going to take? Haha.. ...
472     How long has it been since you screamed, princ...
2481    Mdgxht! gapl 09066612661 fcim lwndkpne. Yjis f...
243     Okay. No no, just shining on. That was meant t...
1413    Wen ur lovable bcums angry wid u, dnt take it ...
Name: Message, dtype: object

In [14]:
cv1 = TfidfVectorizer(min_df=1,stop_words='english')

In [15]:
x_traincv=cv1.fit_transform(x_train)

In [16]:
a=x_traincv.toarray()

In [17]:
a[0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [18]:
cv1.inverse_transform(a[0])

[array(['checking', 'going', 'got', 'haha', 'lor', 'mails', 'online',
        'replying', 'sleeping', 'spys', 'wat'],
       dtype='<U27')]

In [19]:
x_train.iloc[0]

'U sleeping now.. Or you going to take? Haha.. I got spys wat.. Me online checking n replying mails lor..'

In [20]:
x_testcv=cv1.transform(x_test)

In [21]:
x_testcv.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [22]:
 mnb = MultinomialNB()

In [23]:
y_train=y_train.astype('int')

In [24]:
mnb.fit(x_traincv,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
x_testcv

<1115x15782 sparse matrix of type '<class 'numpy.float64'>'
	with 6706 stored elements in Compressed Sparse Row format>

In [26]:
testmessage=x_test.iloc[0]

In [27]:
testmessage

'somewhere out there beneath the pale moon light someone think in of u some where out there where dreams come true... goodnite &amp; sweet dreams'

In [28]:
cv1.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxvxzxf',
 '07781482378',
 '07786200117',
 '077xsb',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789vxbrxez',
 '07946746291',
 '07973788240',
 '07dsrqzfdex',
 '07frxxxexxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '0844',
 '08448714184',
 '0845',
 '08450542832',
 '08452810071',
 '08452810073',
 '0870',
 '08700621170150k',
 '08700621170150o',
 '08701417012',
 '0870

In [29]:
predictions=mnb.predict(x_testcv)

In [30]:
predictions

array([1, 1, 1, ..., 1, 1, 1])

In [31]:
actual=np.array(y_test)

In [32]:
actual

array([1, 1, 1, ..., 1, 1, 0], dtype=object)

In [33]:
count=0

In [34]:
for i in range (len(predictions)):
    if predictions[i]==actual[i]:
        count=count+1

In [35]:
count

1001

In [36]:
len(predictions)

1115

In [37]:
1001/1115

0.8977578475336323

In [47]:
pre=predictions.tolist()
act=actual.tolist()

In [49]:
from sklearn.metrics import f1_score
f1_score(act, pre, average='macro')


0.71485678905997629