<a href="https://colab.research.google.com/github/look4pritam/ArtificialIntelligence/blob/master/MachineLearning/NaiveBayes/Notebooks/SpamFilter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes Classifier
In this example, we will use Naive Bayes for classifying emails as either ham or spam.

We will use SMS spam collection dataset provided by UCI machine learning repository.

See [link](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection) for more details.

# Set the root directory for processing.

In [1]:
import os

root_dir = '/content/'
os.chdir(root_dir)

!ls -al   

total 16
drwxr-xr-x 1 root root 4096 Feb  9 14:42 .
drwxr-xr-x 1 root root 4096 Feb 13 07:18 ..
drwxr-xr-x 4 root root 4096 Feb  9 14:41 .config
drwxr-xr-x 1 root root 4096 Feb  9 14:42 sample_data


# Download SMS spam collection dataset.

### Download the dataset.

In [2]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip

--2023-02-13 07:19:10--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 203415 (199K) [application/x-httpd-php]
Saving to: ‘smsspamcollection.zip’


2023-02-13 07:19:11 (382 KB/s) - ‘smsspamcollection.zip’ saved [203415/203415]



In [3]:
!ls -al

total 216
drwxr-xr-x 1 root root   4096 Feb 13 07:19 .
drwxr-xr-x 1 root root   4096 Feb 13 07:18 ..
drwxr-xr-x 4 root root   4096 Feb  9 14:41 .config
drwxr-xr-x 1 root root   4096 Feb  9 14:42 sample_data
-rw-r--r-- 1 root root 203415 Jun 22  2012 smsspamcollection.zip


### Extract the dataset.

In [4]:
!unzip smsspamcollection.zip
!ls -al

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  
total 692
drwxr-xr-x 1 root root   4096 Feb 13 07:19 .
drwxr-xr-x 1 root root   4096 Feb 13 07:18 ..
drwxr-xr-x 4 root root   4096 Feb  9 14:41 .config
-rw-r--r-- 1 root root   5868 Apr 18  2011 readme
drwxr-xr-x 1 root root   4096 Feb  9 14:42 sample_data
-rw-r--r-- 1 root root 477907 Mar 15  2011 SMSSpamCollection
-rw-r--r-- 1 root root 203415 Jun 22  2012 smsspamcollection.zip


# Import required python modules.

In [5]:
import pandas as pd

In [6]:
sms_data = pd.read_csv('SMSSpamCollection', header=None, sep='\t', names=['Label', 'SMS'])

In [7]:
sms_data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
sms_data.shape

(5572, 2)

In [9]:
sms_data.groupby('Label').count()

Unnamed: 0_level_0,SMS
Label,Unnamed: 1_level_1
ham,4825
spam,747


### Prepare data

In [10]:
sms_data_clean = sms_data.copy()

In [11]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()

  sms_data_clean['SMS'] = sms_data_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()


In [12]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.lower()

In [13]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.split()

In [14]:
sms_data_clean['SMS'].head()

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, don, t, think, he, goes, to, usf, he,...
Name: SMS, dtype: object

In [15]:
sms_data_clean['Label'].value_counts() / sms_data.shape[0] * 100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

### Split to train and test data

In [16]:
train_data = sms_data_clean.sample(frac=0.8,random_state=1).reset_index(drop=True)
test_data = sms_data_clean.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

In [17]:
train_data['Label'].value_counts() / train_data.shape[0] * 100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [18]:
train_data.shape

(4458, 2)

In [19]:
test_data['Label'].value_counts() / test_data.shape[0] * 100

ham     86.983842
spam    13.016158
Name: Label, dtype: float64

In [20]:
test_data.shape

(1114, 2)

In [21]:
test_data.head()

Unnamed: 0,Label,SMS
0,ham,"[aight, should, i, just, plan, to, come, up, l..."
1,ham,"[die, i, accidentally, deleted, e, msg, i, sup..."
2,spam,"[welcome, to, uk, mobile, date, this, msg, is,..."
3,ham,"[this, is, wishing, you, a, great, day, moji, ..."
4,ham,"[thanks, again, for, your, reply, today, when,..."


### Prepare vocabulary - the list of all the words from the dataset

In [22]:
vocabulary = list(set(train_data['SMS'].sum()))

In [23]:
vocabulary[11:20]

['sunlight',
 'but',
 'nice',
 'anythingtomorrow',
 'ship',
 'hilarious',
 'knackered',
 'praying',
 'ü']

In [24]:
len(vocabulary)

7783

### Calculate frequencies of the words for each message

In [25]:
word_counts_per_sms = pd.DataFrame([
    [row[1].count(word) for word in vocabulary]
    for _, row in train_data.iterrows()], columns=vocabulary)

In [26]:
train_data = pd.concat([train_data.reset_index(), word_counts_per_sms], axis=1).iloc[:,1:]

In [27]:
train_data.head()

Unnamed: 0,Label,SMS,planet,rate,ennal,tbs,system,inches,09050000878,question,...,reassurance,spring,time,florida,nasdaq,wishes,summers,0207,suggestion,wer
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Calculate values for the Bayes formula

In [28]:
alpha = 1

In [29]:
Nvoc = len(train_data.columns) - 3

In [30]:
Pspam = train_data['Label'].value_counts()['spam'] / train_data.shape[0]

In [31]:
Pham = train_data['Label'].value_counts()['ham'] / train_data.shape[0]

In [32]:
Nspam = train_data.loc[train_data['Label'] == 'spam', 'SMS'].apply(len).sum()

In [33]:
Nham = train_data.loc[train_data['Label'] == 'ham', 'SMS'].apply(len).sum()

In [34]:
def p_w_spam(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['Label'] == 'spam', word].sum() + alpha) / (Nspam + alpha*Nvoc)
    else:
        return 1

In [35]:
def p_w_ham(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['Label'] == 'ham', word].sum() + alpha) / (Nham + alpha*Nvoc)
    else:
        return 1

### Prepare the classificator

In [36]:
def classify(message):
    p_spam_given_message = Pspam
    p_ham_given_message = Pham
    for word in message:
        p_spam_given_message *= p_w_spam(word)
        p_ham_given_message *= p_w_ham(word)
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [37]:
classify('secret')

'ham'

In [38]:
classify(['secret', 'source', 'of', 'infinite', 'power'])

'ham'

### Use test data

In [39]:
test_data['predicted'] = test_data['SMS'].apply(classify)

In [40]:
test_data.head()

Unnamed: 0,Label,SMS,predicted
0,ham,"[aight, should, i, just, plan, to, come, up, l...",ham
1,ham,"[die, i, accidentally, deleted, e, msg, i, sup...",ham
2,spam,"[welcome, to, uk, mobile, date, this, msg, is,...",spam
3,ham,"[this, is, wishing, you, a, great, day, moji, ...",ham
4,ham,"[thanks, again, for, your, reply, today, when,...",ham


In [41]:
correct = (test_data['predicted'] == test_data['Label']).sum() / test_data.shape[0] * 100

In [42]:
test_data.loc[test_data['predicted'] != test_data['Label']]

Unnamed: 0,Label,SMS,predicted
56,spam,"[money, i, have, won, wining, number, 946, wot...",ham
99,ham,"[gettin, rdy, to, ship, comp]",spam
142,ham,"[have, you, laid, your, airtel, line, to, rest]",spam
218,spam,"[hi, babe, its, chloe, how, r, u, i, was, smas...",ham
245,ham,[anytime],spam
404,ham,"[nokia, phone, is, lovly]",spam
491,spam,"[hi, this, is, amy, we, will, be, sending, you...",ham
588,ham,"[we, have, sent, jd, for, customer, service, c...",spam
646,ham,"[a, boy, loved, a, gal, he, propsd, bt, she, d...",needs human classification
912,spam,"[dating, i, have, had, two, of, these, only, s...",ham


In [43]:
correct

99.10233393177738