In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
spam = pd.read_csv('/content/drive/MyDrive/SPAM text message 20170820 - Data.csv', encoding='ISO-8859-1')
print(spam)

     label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will Ã¼ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [4]:
total_ham_messages = 0
total_spam_messages = 0
label = list(spam['label'])

for i, message in enumerate(list(spam['label'])):
    if label[i] == 'ham':
        total_ham_messages += 1
    else:
        total_spam_messages += 1

print("Number of ham messages:", total_ham_messages, "or", round(total_ham_messages / (total_ham_messages + total_spam_messages) * 100, 2), "% of the dataset")
print("Number of spam messages:", total_spam_messages, "or", round(total_spam_messages / (total_ham_messages + total_spam_messages) * 100, 2), "% of the dataset")

Number of ham messages: 4825 or 86.59 % of the dataset
Number of spam messages: 747 or 13.41 % of the dataset


In [5]:
def data_process(text):
    # case-lowering
    text = text.lower()
    # remove punctuation
    text = ''.join([t for t in text if t not in string.punctuation])
    # remove stopwords
    text = [t for t in text.split() if t not in stopwords.words('english')]
    # stemming
    stem = Stemmer()
    text = [stem.stem(t) for t in text]
    # return token list
    return text

In [6]:
# test with the dataset
spam['message'][:11].apply(data_process)

0     [go, jurong, point, crazi, avail, bugi, n, gre...
1                          [ok, lar, joke, wif, u, oni]
2     [free, entri, 2, wkli, comp, win, fa, cup, fin...
3         [u, dun, say, earli, hor, u, c, alreadi, say]
4     [nah, dont, think, goe, usf, live, around, tho...
5     [freemsg, hey, darl, 3, week, word, back, id, ...
6     [even, brother, like, speak, treat, like, aid,...
7     [per, request, mell, mell, oru, minnaminungint...
8     [winner, valu, network, custom, select, receiv...
9     [mobil, 11, month, u, r, entitl, updat, latest...
10    [im, gonna, home, soon, dont, want, talk, stuf...
Name: message, dtype: object

In [7]:
# random test
data_process("It's 6:45 a.m. in the morning and we are studying at HUST. Life sucks!")

['645', 'morn', 'studi', 'hust', 'life', 'suck']

In [8]:
tfidfv = TfidfVectorizer(analyzer=data_process)
data = tfidfv.fit_transform(spam['message'])

In [9]:
test_mess = spam.iloc[6]['message']
print(test_mess)

Even my brother is not like to speak with me. They treat me like aids patent.


In [10]:
print(tfidfv.transform([test_mess]))

  (0, 7301)	0.34401730829227406
  (0, 6618)	0.3135180454566497
  (0, 5424)	0.46349458150382344
  (0, 4373)	0.4305926574465684
  (0, 2820)	0.2659732583187343
  (0, 1684)	0.34148566861813384
  (0, 1042)	0.4424556780354621


In [11]:
# more visually
arr = tfidfv.transform([test_mess]).toarray()[0]
print('index\tidf\ttfidf\tterm')
for i in range(len(arr)):
    if arr[i] != 0:
        print(i, format(tfidfv.idf_[i], '.4f'), format(arr[i], '.4f'), tfidfv.get_feature_names_out()[i], sep='\t')

index	idf	tfidf	term
1042	8.5271	0.4425	aid
1684	6.5812	0.3415	brother
2820	5.1259	0.2660	even
4373	4.1492	0.4306	like
5424	8.9325	0.4635	patent
6618	6.0422	0.3135	speak
7301	6.6300	0.3440	treat


In [12]:
spam_filter = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer = data_process)),       # messages to weighted TFIDF score
    ('classifier', MultinomialNB())  ])                             # train on TFIDF vectors with Naive Bayes

In [125]:
# train-test
rs = random.randint(4, 44)
ts = round(random.uniform(0.15, 0.35), 2)

x_train, x_test, y_train, y_test = train_test_split(spam['message'], spam['label'], test_size = ts, random_state = rs)   # after 66 randomizations, ts = 0.24 and rs = 21 performs best
#x_train, x_test, y_train, y_test = train_test_split(spam['message'], spam['label'], test_size = 0.24, random_state = 21)

print('Test size = ', ts)
print('Random state = ', rs)

Test size =  0.16
Random state =  34


In [126]:
print(y_test)
print(x_test)

2982     ham
2167     ham
5319     ham
4209     ham
2742    spam
        ... 
2396     ham
4936     ham
1804     ham
4911     ham
2731     ham
Name: label, Length: 1338, dtype: object
2982    7 wonders in My WORLD 7th You 6th Ur style 5th...
2167              Thank you. And by the way, I just lost.
5319                         Kothi print out marandratha.
4209    Or i go home first lar Ã¼ wait 4 me lor.. I pu...
2742    I don't know u and u don't know me. Send CHAT ...
                              ...                        
2396               Babe, I'm back ... Come back to me ...
4936    Hey babe, how's it going ? Did you ever figure...
1804                   I'm in class. Did you get my text.
4911    WE REGRET TO INFORM U THAT THE NHS HAS MADE A ...
2731                         I havent lei.. Next mon can?
Name: message, Length: 1338, dtype: object


In [127]:
# train spam_filter
spam_filter.fit(x_train, y_train)

In [128]:
# predict each test case using the x_test part of the dataset
predictions = spam_filter.predict(x_test)

In [129]:
# wrong predictions
x_test[y_test != predictions]

2742    I don't know u and u don't know me. Send CHAT ...
3942    Free Msg: get Gnarls Barkleys "Crazy" ringtone...
2079                       85233 FREE>Ringtone!Reply REAL
1069    Someone U know has asked our dating service 2 ...
4968    You can donate Â£2.50 to UNICEF's Asian Tsunam...
3360    Sorry I missed your call let's talk when you h...
607     XCLUSIVE@CLUBSAISAI 2MOROW 28/5 SOIREE SPECIAL...
4643    You are being ripped off! Get your mobile cont...
2915    Sorry! U can not unsubscribe yet. THE MOB offe...
4460    Welcome to UK-mobile-date this msg is FREE giv...
3862    Free Msg: Ringtone!From: http://tms. widelive....
3981                                   ringtoneking 84484
2364    Fantasy Football is back on your TV. Go to Sky...
5466    http//tms. widelive.com/index. wml?id=820554ad...
1544    Hello from Orange. For 1 month's free access t...
2003    TheMob>Yo yo yo-Here comes a new selection of ...
3954    Refused a loan? Secured or Unsecured? Can't ge...
2965    Do you

In [130]:
def detect_spam(message):
    return spam_filter.predict([message])[0]

print("The message #1 is:", detect_spam("Congratulations! You've won a $1,000 Walmart giftcard! Go to https://bit.ly/123456 to claim now!"))
print("The message #2 is:", detect_spam("Hello son, it's dad. I've just wondered if you would come back home for Christmas. Love you."))
print("The message #3 is:", detect_spam("URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! CLAIM to No: 81010 T&C www.dbuk.net"))
print("The message #4 is:", detect_spam("XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL"))
print("The message #5 is:", detect_spam("Bro where are you?"))
print("The message #6 is:", detect_spam("I miss you so much, please come back to me :((. I swear I'm not gay."))

The message #1 is: spam
The message #2 is: ham
The message #3 is: spam
The message #4 is: spam
The message #5 is: ham
The message #6 is: ham


In [131]:
count = 0
for i in range(len(y_test)):
    if y_test.iloc[i] != predictions[i]:
        count += 1
print('Number of test cases:', len(y_test))
print('Number of wrong predictions:', count)

print('Accuracy of the classifier:', round((1 - (count / len(y_test))) * 100, 2), '%')

Number of test cases: 1338
Number of wrong predictions: 47
Accuracy of the classifier: 96.49 %


In [132]:
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

         ham       1.00      0.96      0.98      1200
        spam       0.75      0.99      0.85       138

    accuracy                           0.96      1338
   macro avg       0.87      0.98      0.92      1338
weighted avg       0.97      0.96      0.97      1338

