## Sử dụng thư viện TextBlob để phân loại mail là ham/spam?

In [33]:
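# One-time environment setup -- run these in a terminal (or use %pip inside the notebook):
# pip install nltk
# pip install textblob
# python -m nltk.downloader all   # downloads every NLTK package; this is a large download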

In [34]:
import pandas as pd

import nltk
# Download only the tokenizer models instead of calling nltk.download() with no
# arguments: the bare call launches NLTK's interactive downloader, which blocks
# "Restart Kernel -> Run All" until a human closes it.
# NOTE(review): 'punkt' is the resource name on older NLTK releases and
# 'punkt_tab' on newer ones; requesting both keeps the cell version-agnostic.
# An unknown id is reported by nltk.download() and returns False; it does not raise.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource, quiet=True)

from textblob.classifiers import NaiveBayesClassifier # Mô hình phân loại
from textblob import TextBlob

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


### Read Data File

In [35]:
# Load the raw CSV into a DataFrame. The encoding is given explicitly as
# Latin-1 (presumably the file is not valid UTF-8 -- confirm if the source changes).
data = pd.read_csv(
    "data/spam.csv",
    sep=",",
    encoding="latin-1",
)
# Preview the first rows.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [36]:
# (rows, columns) of the raw frame exactly as loaded from the CSV.
data.shape

(5572, 5)

In [37]:
# Keep only the message text (v2) and its label (v1), in that order, and
# discard rows that are exact duplicates on both columns.
text_and_label = ["v2", "v1"]
data_sub = data.loc[:, text_and_label].drop_duplicates()
data_sub.head()

Unnamed: 0,v2,v1
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [38]:
# (rows, columns) left after selecting v2/v1 and dropping duplicate rows.
data_sub.shape

(5169, 2)

In [39]:
# Column dtypes and non-null counts for the de-duplicated frame.
data_sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5169 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v2      5169 non-null   object
 1   v1      5169 non-null   object
dtypes: object(2)
memory usage: 121.1+ KB


In [40]:
# Label distribution: ham heavily outnumbers spam, so the classes are imbalanced.
data_sub.v1.value_counts() # imbalance

ham     4516
spam     653
Name: v1, dtype: int64

### Chia ra train và test

In [41]:
# Draw a reproducible 80% random sample for training (random_state pins the draw).
train_data = data_sub.sample(frac=0.8, random_state=42)
# Everything not drawn for training becomes the test set. The index labels are
# unique here, so masking on index membership selects the same rows as dropping them.
in_train = data_sub.index.isin(train_data.index)
test_data = data_sub.loc[~in_train]

In [42]:
# Row counts of the training and test partitions.
len(train_data), len(test_data)

(4135, 1034)

In [48]:
# Training rows as a 2-column array of [text, label] pairs (v2 = message, v1 = label).
train_random_sample = train_data.loc[:, ["v2", "v1"]].to_numpy()

In [49]:
# Test rows as a 2-column array of [text, label] pairs (v2 = message, v1 = label).
test_random_sample = test_data.loc[:, ["v2", "v1"]].to_numpy()

### Build model

In [50]:
# Hand the [text, label] training rows to TextBlob's NaiveBayesClassifier.
cl = NaiveBayesClassifier(train_random_sample)

### Compute accuracy

In [51]:
# Fraction of held-out messages whose predicted label matches the true label.
# Caveat: the labels are imbalanced (4516 ham vs 653 spam after de-duplication),
# so always predicting "ham" would already score roughly 87% on the full set;
# read this figure against that baseline.
test_accuracy = cl.accuracy(test_random_sample)
print("Accuracy: {0}".format(test_accuracy))

Accuracy: 0.9864603481624759


### Classify some text

In [52]:
# Classify two hand-written messages and print one predicted label per line.
sample_messages = [
    "Tomorrow We will work from home.",
    "Call 090 123456. This is a subscrition service with weekly newspapers costing 15$.",
]
for message in sample_messages:
    print(cl.classify(message))

ham
spam


### Displays a listing of the most informative features for this classifier

In [53]:
# Print the 10 most informative features of the classifier.
cl.show_informative_features(10)

Most Informative Features
          contains(FREE) = True             spam : ham    =    301.7 : 1.0
       contains(service) = True             spam : ham    =    158.7 : 1.0
          contains(Free) = True             spam : ham    =    136.3 : 1.0
         contains(Nokia) = True             spam : ham    =    131.8 : 1.0
       contains(attempt) = True             spam : ham    =     91.6 : 1.0
        contains(Orange) = True             spam : ham    =     82.7 : 1.0
        contains(latest) = True             spam : ham    =     82.7 : 1.0
           contains(Txt) = True             spam : ham    =     81.2 : 1.0
        contains(Mobile) = True             spam : ham    =     78.2 : 1.0
            contains(To) = True             spam : ham    =     76.7 : 1.0


### Classify text through a TextBlob object

In [54]:
# Attach the classifier to a TextBlob and ask the blob for its label.
ham_text = "Tomorrow We will work from home weekly."
blob = TextBlob(ham_text, classifier=cl)
blob.classify()

'ham'

In [55]:
# Same flow for a promotional-style message: wrap it in a TextBlob, then classify.
spam_text = "Call 090 123456. This is a subscrition service with weekly newspapers costing 15$."
blob = TextBlob(spam_text, classifier=cl)
blob.classify()

'spam'