In [1]:
import numpy as np
import pandas as pd
import random
from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob
import nltk

- Easy to use, slow processing
- https://textblob.readthedocs.io/en/dev/

### Load & inspect data

In [2]:
df= pd.read_csv('spam.csv',encoding='latin-1',sep=',')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df=df[['v2','v1']]

In [5]:
df.head()

Unnamed: 0,v2,v1
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v2      5572 non-null   object
 1   v1      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### Data preprocess

**Convert df to a list**

In [7]:
records= df.to_records(index=False)
records

rec.array([('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'ham'),
           ('Ok lar... Joking wif u oni...', 'ham'),
           ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'spam'),
           ...,
           ('Pity, * was in mood for that. So...any other suggestions?', 'ham'),
           ("The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free", 'ham'),
           ('Rofl. Its true to its name', 'ham')],
          dtype=[('v2', 'O'), ('v1', 'O')])

In [8]:
df_new= list(records)
df_new[:5]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'ham'),
 ('Ok lar... Joking wif u oni...', 'ham'),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'spam'),
 ('U dun say so early hor... U c already then say...', 'ham'),
 ("Nah I don't think he goes to usf, he lives around here though", 'ham')]

**train & test split**

In [9]:
train= random.sample(df_new,int(0.8*len(df_new)))
len(train)

4457

In [10]:
test= [x for x in df_new if x not in train]
len(test)

1005

**train model**

In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tangh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# build model
c1= NaiveBayesClassifier(train)

**Evaluate**

In [13]:
c1.accuracy(test)

0.9830845771144279

In [14]:
# show classified result
c1.classify("tomorrow we will work from home")

'ham'

In [15]:
# show probability
s1= c1.prob_classify("tomorrow we will work from home")
print(round(s1.prob("spam"),2))
print(round(s1.prob("ham"),2))

0.0
1.0


In [16]:
TextBlob("tomorrow we will work from home",classifier=c1).classify()

'ham'

**Important features**

In [17]:
c1.show_informative_features(10)


Most Informative Features
          contains(FREE) = True             spam : ham    =    322.5 : 1.0
            contains(16) = True             spam : ham    =    160.2 : 1.0
           contains(Txt) = True             spam : ham    =    144.8 : 1.0
       contains(service) = True             spam : ham    =    106.4 : 1.0
         contains(apply) = True             spam : ham    =     96.1 : 1.0
        contains(Orange) = True             spam : ham    =     91.8 : 1.0
          contains(Text) = True             spam : ham    =     91.0 : 1.0
      contains(landline) = True             spam : ham    =     87.6 : 1.0
          contains(draw) = True             spam : ham    =     70.5 : 1.0
         contains(await) = True             spam : ham    =     70.5 : 1.0
