## Importing neccessary Libraries

In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy
print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

Python: 3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]
NLTK: 3.4.4
Scikit-learn: 0.21.2
Pandas: 0.24.2
Numpy: 1.16.4


## Loading the dataset
https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [2]:
import pandas as pd
import numpy as np

In [3]:
# load the dataset of SMS messages
df = pd.read_table('SMSSpamCollection', header=None, encoding='utf-8') # don't use latin-1

  


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [5]:
df.shape ##here are 2 columns and 5571 rows...

(5572, 2)

In [6]:
#show some of the data of the dataset
df.head()


Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.columns # returning the first 

Int64Index([0, 1], dtype='int64')

In [8]:
df[0]

0        ham
1        ham
2       spam
3        ham
4        ham
5       spam
6        ham
7        ham
8       spam
9       spam
10       ham
11      spam
12      spam
13       ham
14       ham
15      spam
16       ham
17       ham
18       ham
19      spam
20       ham
21       ham
22       ham
23       ham
24       ham
25       ham
26       ham
27       ham
28       ham
29       ham
        ... 
5542     ham
5543     ham
5544     ham
5545     ham
5546     ham
5547    spam
5548     ham
5549     ham
5550     ham
5551     ham
5552     ham
5553     ham
5554     ham
5555     ham
5556     ham
5557     ham
5558     ham
5559     ham
5560     ham
5561     ham
5562     ham
5563     ham
5564     ham
5565     ham
5566    spam
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: 0, Length: 5572, dtype: object

In [9]:
df[1][2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [10]:
# check class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## Preprocess Data

In [12]:
from sklearn.preprocessing import LabelEncoder
#Now convert ham = 0 and spam = 1
encoder = LabelEncoder()
y = encoder.fit_transform(classes)


In [13]:
# see what changes are made by the label encoder
# list(y)
# for i in y:
#     print(i, end =" ")
print(y)
print(type(y))

[0 0 1 ... 0 0 0]
<class 'numpy.ndarray'>


##### Now there are some common things in msg, that are phone numbers, email, websites that are need to be replaced by the common words or need to be removed, because they makes no sense here

In [14]:
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [15]:
# Replace email addresses with 'email'
# you can use any regex expression they are basically taken from the wikipedia

processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')


In [16]:
# Replace URLs with 'webaddress'
# you can use any regex expression they are basically taken from the wikipedia

processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

In [17]:
# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
# you can use any regex expression they are basically taken from the wikipedia

processed = processed.str.replace(r'£|\$', 'moneysymb')
    


In [18]:
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
# you can use any regex expression they are basically taken from the wikipedia

processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')


In [19]:
# Replace numbers with 'numbr'
# you can use any regex expression they are basically taken from the wikipedia

processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [20]:
# Remove punctuation
# you can use any regex expression they are basically taken from the wikipedia

processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [21]:
#as HORse horse Horse are same SO conver are letters to lower case
processed = processed.str.lower()

In [22]:
processed = processed.str.lower()

In [23]:
processed

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been numbr week...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile numbr months or more u r entit...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from numbr to numbr nu...
12      urgent you have won a numbr week free membersh...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [24]:
# Now you have to remove stopwords, these are common words, use in every sentence and make nosense in prediction

In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/saurabh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
from nltk.corpus import stopwords
s = stopwords.words('english')
print(set(s))

{'over', "couldn't", 'did', 'a', 'below', 'weren', 'few', 'was', 'its', 'him', 'being', 'she', 'but', 'do', 'or', 'will', 'them', 'at', "didn't", 'here', 'don', "that'll", 'you', "wasn't", 'shouldn', 'themselves', 'should', 'above', 'nor', 'o', "hadn't", 'am', 'further', 'this', 'isn', 'both', 'only', "shan't", 'they', 'about', 'didn', 'me', "hasn't", 'down', 'what', 'most', "you'll", 'while', 'aren', 'and', 'with', 'ma', 'no', 'until', 'in', 'all', 'have', 'when', "you've", 'up', 'into', 'her', "mightn't", 'ain', 'theirs', 'these', 'each', 'can', "shouldn't", 'yourselves', "mustn't", 'your', 'more', 'be', 'does', 'doing', 'their', 'there', 'hadn', 'wouldn', "isn't", 've', 'under', 'mightn', "it's", 'that', 'been', 'after', "you'd", 'through', 'yourself', 'as', 'is', 'before', 'haven', 'needn', 'the', 'to', 'very', 'hers', 'it', "she's", 'mustn', 'itself', 'such', 'if', 'off', 'we', 'are', 'because', "won't", 'has', 'having', 'just', "wouldn't", 'any', 'doesn', 'won', 'wasn', "should'v

In [27]:
processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in s))

In [28]:
# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer() # it removes the synonyms and similar sounding words..

processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

In [29]:
for i in processed:
    print(i) # just checking everything at this point
    # here you can see effects of stemming
    # crazy -> crazi
    #early, earli, earlii -> earli

go jurong point crazi avail bugi n great world la e buffet cine got amor wat
ok lar joke wif u oni
free entri numbr wkli comp win fa cup final tkt numbrst may numbr text fa numbr receiv entri question std txt rate c appli numbrovernumbr
u dun say earli hor u c alreadi say
nah think goe usf live around though
freemsg hey darl numbr week word back like fun still tb ok xxx std chg send moneysymbnumbr rcv
even brother like speak treat like aid patent
per request mell mell oru minnaminungint nurungu vettam set callertun caller press numbr copi friend callertun
winner valu network custom select receivea moneysymbnumbr prize reward claim call numbr claim code klnumbr valid numbr hour
mobil numbr month u r entitl updat latest colour mobil camera free call mobil updat co free numbr
gonna home soon want talk stuff anymor tonight k cri enough today
six chanc win cash numbr numbr numbr pound txt cshnumbr send numbr cost numbrp day numbrday numbr tsandc appli repli hl numbr info
urgent numbr week f

hey book kb sat alreadi lesson go ah keep sat night free need meet confirm lodg
chk ur belovd ms dict
time want come
awesom lemm know whenev around
shb b ok lor thanx
beauti truth graviti read care heart feel light someon feel heavi someon leav good night
also rememb get dobbi bowl car
filthi stori girl wait
sorri c ur msg yar lor poor thing numbr one night tmr u brand new room numbr sleep
love decis feel could decid love life would much simpler less magic
welp appar retir
sort code acc bank natwest repli confirm sent right person

u sure u take sick time
urgent tri contact u today draw show moneysymbnumbr prize guarante call numbr land line claim mnumbr validnumbrhr
watch cartoon listen music amp eve go templ amp church u
yo chad gymnast class wanna take site say christian class full
much buzi
better still catch let ask sell lt gt
sure night menu know noon menu
u want come back beauti necklac token heart that give wife like see one give dont call wait till come
will go aptitud class
w

watch tv got new job
pen thing beyond joke wont biro master ever
parti alex nichol
u secret admir look numbr make contact u find r reveal think ur special call numbr
see miss call dear brother grnumbr day
ok ü finish soon
sorri help
come slave go shell unconsci avoid make unhappi
love ass enjoy doggi style
think ask gym excus lazi peopl jog
dear numbrxxxxxxx u invit xchat final attempt contact u txt chat numbr numbrp msgrcvdhg suitenumbr numbrland row wnumbrjnumbrhl ldn numbryr
urgent pleas call numbr landlin abta complimentari numbr tenerif holiday moneysymbnumbr cash await collect sae cs box numbr cwnumbrwx numbrppm
way home long dri spell season would
gotta collect da car numbr lei
ok knacker came home went sleep good full time work lark
probabl earlier station think
call numbr listen extrem dirti live chat go offic right total privaci one know sic listen numbrp min numbr numbrmp numbr
good morn plz call sir
freemsg hey u got numbr video pic fone repli wild txt ill send u pic hurri 

smith wast da wanna gayl
mum sent mani mani messag sinc got want know actual get enjoy rest day
aight tomorrow around lt gt
hous maid murder coz man murder lt gt th januari public holiday govt instituit close includ post offic understand
chanc realiti fantasi show call numbr numbrp per min ntt ltd po box numbr croydon crnumbr numbrwb numbr nation rate call
actual first time went bed long spoke woke numbr night
see
dont understand messag
crucifi c told earlier
idk keep say sinc move keep but head freedom vs respons tire much shit deal bare keep togeth get ad
fuck cedar key fuck come anyway tho
twenti past five said train durham alreadi coz reserv seat
hey boy want hot xxx pic sent direct numbr ur phone txt porn numbr numbrhr free numbrp per day stop text stopbcm sf wcnumbrnnumbrxx
u still paint ur wall
last chanc claim ur moneysymbnumbr worth discount voucher today text shop numbr savamob offer mobil cs savamob poboxnumbr mnumbruz moneysymbnumbr sub numbr
printer cool mean groovi wine g

ok im sure time finish tomorrow wanna spend even co would vewi vewi lubli love xxx
hello per request lt gt rs numbr transfer
tirupur call da
winner special select receiv moneysymbnumbr cash moneysymbnumbr award speak live oper claim call numbram numbrpm cost numbrp
luck numbr catch put
noe ü specifi da domain nusstu ü still sch
oh ask fun haha take care ü
shall get pouch
hey loverboy love tell look pictur ach feel leg fuck want need crave
boy sweet word left morn sigh goe day love start studi
kent vale lor ü wait numbr ar
ok good make money
read gud habit nan bari hudgi yorg pataistha ertini kano
aight still want get money
free top rington sub weekli rington get numbrst week free send subpoli numbr numbr per week stop sm numbr
ok ok ok what ur today plan
town v import
sorri pa dont knw ru pa
wat u
meet ü rite go home lor ü dun feel like comin ok
oh get paid outstand one commerci hasbro august made us jump mani hoop get paid still
late call tomorrow morn take care sweet dream u ummifi b

urgent call numbrfrom landlin complimentari numbr ibiza holiday moneysymbnumbr numbr cash await collect sae cs po box numbr sknumbr numbrwp numbrppm numbr
holi live christ take long
ü thk wat eat tonight
thanx yup come back sun finish dinner go back numbr hotel time fli tog numbr exactli mth today hope haf mani mth come
opposit side drop
yup izzit still rain heavili co e mrt c outsid
send resum
gd luck numbr ur exam
u ask next sat make ok lor
sorri uncl keep touch
saw guy doll last night patrick swayz great
urgent numbrnd attempt contact u moneysymbnumbr prize yesterday still await collect claim call numbr
santa call would littl one like call santa xma eve call numbr book time callsnumbrppm last numbrmin numbr c www santacal com
come home want u miser
dont know get messag
cool tyler take gonna buy drop place later tonight total order quarter got enough
guy car shop flirt got phone number paperwork call text nervou cours may address call boss tell know may get fire
revers cheat mathemat

total disappoint text craziest shit got
effect irrit ignor
one
think tantrum finish yeah point
compliment away system side
happen adventur
hey chief give bell get need talk royal visit numbrst june
ok anoth number
know thinkin malaria relax children cant handl malaria would wors gastroenter take enough replac loss temp reduc give malaria med vomit self limit ill mean day complet stop
aiyah ok wat long got improv alreadi wat
want explicit sex numbr sec ring numbr cost numbrp min gsex pobox numbr wcnumbrn numbrxx
believ attach see everi day know best get babe go teach class midnight
sleep surf
ask numbrmobil numbr chatlin inclu free min india cust serv sed ye lnumbrer got mega bill numbr dont giv shit bailiff due day moneysymbnumbr numbr want moneysymbnumbr
yeah ju rite
armand say get ass epsilon
u still havent got urself jacket ah
take derek amp taylor walmart back time done leav mous desk text priscilla readi
hi durban still number
ic lotta childporn car
contract mobil numbr mnth lates

In [30]:
from nltk.tokenize import word_tokenize

In [31]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/saurabh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [32]:
from nltk.tokenize import word_tokenize

# create bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [33]:
print(len(all_words))

6579


In [34]:
print(all_words.most_common(100))

[('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266), ('like', 261), ('got', 252), ('time', 252), ('good', 248), ('want', 247), ('text', 231), ('send', 214), ('txt', 190), ('need', 190), ('one', 185), ('today', 181), ('take', 174), ('ü', 173), ('see', 173), ('stop', 168), ('home', 167), ('think', 166), ('repli', 163), ('r', 162), ('lor', 162), ('sorri', 160), ('still', 158), ('tell', 157), ('n', 155), ('numbrp', 154), ('back', 153), ('mobil', 153), ('da', 151), ('dont', 149), ('make', 148), ('k', 147), ('week', 141), ('pleas', 141), ('phone', 141), ('say', 140), ('hi', 140), ('work', 136), ('new', 136), ('pl', 135), ('later', 135), ('hope', 134), ('miss', 133), ('ask', 133), ('co', 131), ('meet', 128), ('msg', 127), ('messag', 125), ('night', 124), ('dear', 122), ('c', 121), ('wait', 121), ('happi', 121), ('well', 120)

In [35]:
word_features = list(all_words.keys()) #using all most common words as features to increase accuracy

In [36]:
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features


In [37]:
messages = zip(processed, y)

In [38]:
# just for debugging
for i,j in messages:
    print(i,j)

go jurong point crazi avail bugi n great world la e buffet cine got amor wat 0
ok lar joke wif u oni 0
free entri numbr wkli comp win fa cup final tkt numbrst may numbr text fa numbr receiv entri question std txt rate c appli numbrovernumbr 1
u dun say earli hor u c alreadi say 0
nah think goe usf live around though 0
freemsg hey darl numbr week word back like fun still tb ok xxx std chg send moneysymbnumbr rcv 1
even brother like speak treat like aid patent 0
per request mell mell oru minnaminungint nurungu vettam set callertun caller press numbr copi friend callertun 0
winner valu network custom select receivea moneysymbnumbr prize reward claim call numbr claim code klnumbr valid numbr hour 1
mobil numbr month u r entitl updat latest colour mobil camera free call mobil updat co free numbr 1
gonna home soon want talk stuff anymor tonight k cri enough today 0
six chanc win cash numbr numbr numbr pound txt cshnumbr send numbr cost numbrp day numbrday numbr tsandc appli repli hl numbr in

headin toward busetop 0
messag text miss sender name miss number miss sent date miss miss u lot that everyth miss sent via fullonsm com 0
come room point iron plan weekend 0
co want thing 0
oki go yan jiu skip ard oso go cine den go mrt one blah blah blah 0
bring home wendi 0
numbr date servic cal l numbr boxnumbrsknumbrch 1
whatsup dont u want sleep 0
alright new goal 0
free entri moneysymbnumbr weekli competit text word win numbr numbr c www txttowin co uk 1
alright head minut text meet 0
send logo numbr ur lover numbr name join heart txt love namenumbr namenumbr mobno eg love adam eve numbr numbr yahoo poboxnumbrwnumbrwq txtno numbr ad numbrp 1
ye last week take live call 0
someon contact date servic enter phone fanci find call landlin numbr poboxnumbrnnumbrtfnumbrp 1
siva hostel aha 0
urgent mobil number award moneysymbnumbr prize guarante call numbr land line claim numbr valid numbrhr 1
send ur friend receiv someth ur voic speak express numbr childish numbr naughti numbr sentiment

yo guess drop 0
carlo say mu lt gt minut 0
offic call lt gt min 0
geeee miss alreadi know think fuck wait till next year togeth love kiss 0
yun ah ubi one say ü wan call tomorrow call numbr look iren ere got busnumbr numbr numbr numbr numbr numbr ubi cre ubi tech park numbrph numbrst numbrwkg day èn 0
ugh gotta drive back sd la butt sore 0
numbrth juli 0
hi im relax time ever get numbram everi day parti good night get home tomorrow numbrish 0
ü ü wan come come lor din c stripe skirt 0
xma stori peac xma msg love xma miracl jesu hav bless month ahead amp wish u merri xma 0
number 0
chang e one next escal 0
yetund class run water make ok pl 0
lot happen feel quiet beth aunt charli work lot helen mo 0
ü wait numbr bu stop aft ur lect lar dun c ü go get car come back n pick ü 0
aight thank comin 0
heard abt tat 0
pleas call custom servic repres freephon numbr numbr numbr numbram numbrpm guarante moneysymbnumbr cash moneysymbnumbr prize 1
ye realli great bhaji told kalli best cricket sachin

wont touch permiss 0
hi luci hubbi meetin day fri b alon hotel u fanci cumin pl leav msg numbrday numbr luci x callsmoneysymbnumbr minmobsmorelkpoboxnumbrhpnumbrfl 1
numbr wonder world numbrth numbrth ur style numbrth ur smile numbrth ur person numbrrd ur natur numbrnd ur sm numbrst ur love friendship good morn dear 0
take small dose tablet fever 0
oh u must taken real valentin shop first 0
sent email address incomm right 0
gonna blake night might abl get littl earli 0
friendship game play word say start march end may tomorrow yesterday today e 0
nice wait text right gonna pay ticket ya know 0
watch lotr w si di aft u wan numbr meet numbr dinner nite 0
keep away like 0
think far find check googl map place dorm 0
trip ok quit tire lor uni start today ok numbr co take modul ju concentr final yr project 0
alway say welp 0
guy browsin compulsori 0
ok 0
puriti friendship two smile read forward messag smile see name gud evng musthu 0
sorri call later 0
add realli care least get dude fuck hey

good night dear sleepwel amp take care 0
wondarful song 0
freemsg claim ur numbr sm messag text ok numbr use webnumbrmobil numbr ur mate etc join txtnumbr com numbrp wk c boxnumbr lanumbrwu numbr remov txtx stop 1
yar lor actual quit fast co da ge slow wat haha 0
must come later normal bath da afternoon mah 0
trust even 0
hey hun onbu goin numbr meet want numbrgo numbra meal donyt feel like cuz numbr get last bu home he sweet latelyxxx 0
numbr free rington repli real 1
take like noon 0
open mca 0
aight wat happen side 0
done oredi 0
sweet well princess pleas tell like dislik bed 0
wish great semest 0
moji love word rich day 0
dude like buff wind 0
alright babe justthought sayhey u doin nearli endof wk offdam nevamind numbrhook sn uwant mnumbr lovejen x 0
well done england get offici poli rington colour flag yer mobil text tone flag numbr opt txt eng stop boxnumbr wnumbrwx moneysymbnumbr 1
give everyth want need actual could better yor got money get work get man pay rent even fill fuck 

onum ela pa normal 0
k k sister kid 0
cool text way 0
nope meanwhil talk say make greet 0
cant talk call dont keep call 0
anyth lar 0
rose need water season need chang poet need imagin phone need ur sm need ur love frndship forev 0
good afternoon babe goe day job prospect yet miss love sigh 0
pick drop car problem 0
think wast rr 0
world famamu 0
come friday leav pongal get news work place 0
lol well without could big sale togeth 0
way 0
eat old airport road numbr oredi got lot pple 0
sri talk phone parent 0
final chanc claim ur moneysymbnumbr worth discount voucher today text ye numbr savamob member offer mobil cs savamob poboxnumbr mnumbruz moneysymbnumbr sub numbr 1
ok lor wat time ü finish 0
princess like make love lt gt time per night hope that problem 0
mm way railway 0
dnt wnt tlk wid u 0
done sorri hope next space give everyth want rememb furnitur around move lock lock leav key jenn 0
yet like keep touch easiest way barcelona way ru hous 0
sppok ur mob halloween collect nokia l

thank chikku gud nyt 0
xy ur car u pick 0
thanx numbr time spent numbrgeva bin mint ur babi want u xxxx 0
yo way could pick someth tonight 0
sent send 0
fine simpli sit 0
tht god gift bird human hav natur gift frm god 0
come day class 0
im done studyn librari 0
ok u enjoy ur show 0
anyth 0
wuld without babi thought alon mite break wanna go crazi everyboy need ladi xxxxxxxx 0
wat dear sleep ah 0
hi test lt gt rd 0
numbr student solv cat question xam numbr numbr numbr lt gt numbr numbr numbr lt gt numbr numbr numbr lt gt numbr numbr numbr tell answer u r brilliant numbrth got answr 0
yo know anyon lt gt otherwis abl buy liquor guy flake right get hold somebodi numbr loko night 0
yup n fren lor meet fren numbr 0
yeah got one line us 0
stop wonder wow ever go stop tm ing tm whenev want mine laugh 0
lol yep yesterday alreadi got fireplac anoth icon sit 0
hey book pilat yoga lesson alreadi haha 0
ok happen behav like 0
numbr new messag pleas call numbr 1
supervisor find numbr one lor thk stu

ask around lot term mid 0
sure check yahoo email sent photo yesterday 0
look 0
wherr boytoy 0
want new video phonenumbr anytim network min numbr text five pound per week call numbr repli deliveri tomorrow 1
hello love goe day wish well fine babe hope find job prospect miss boytoy teas kiss 0
tell bad charact u dnt lik tri chang lt gt add tat numbr new year resolut wait ur repli frank good morn 0
got rumour go buy apart chennai 0
yeah probabl earlier 0
chang window logoff sound 0
still check da 0
also came room 0
huh got lesson numbr lei n thinkin go sch earlier n tot parkin kent vale 0
ok 0
reach offic around lt decim gt amp mobil problem cann get voic call asa free 0
cool text head 0
contact date servic someon know find call land line numbr poboxnumbrwnumbrtgnumbrp 1
wannumbr win meet greet westlif numbr u mnumbr current tour numbr unbreak numbr untam numbr unkempt text numbr numbr numbr numbr cost numbrp std text 1
happi birthday may u find ur princ charm soon n dun work hard 0
oh gr

dont make ne plan nxt wknd coz want us come ok 0
school start stay weather like food social support system like friend school thing import 0
ha ha nan yalrigu heltini iyo kothi chikku u share mani thing wit far told bodi even utter word abt u ur trust much tell other plz nxt time dont use word ok chikku b 0
noic text 0
hi di yiju meet numbr pm esaplanad tonight 0
mobi pub quiz win moneysymbnumbr high street prize u know new duchess cornwal txt first name numbr unsub stop moneysymbnumbr numbr sp 1
week savamob member offer access call numbr detail savamob pobox numbr lanumbr numbrwu moneysymbnumbr week savamob offer mobil 1
aight set free think could text blake address occur quit sure thought 0
hi dear saw dear happi batteri low 0
age abj 0
prof pass paper sem congrat student enna kalaachutaarama prof gud mrng 0
dont kick coco 0
fyi gonna call sporad start like lt gt bc doin shit 0
contact date servic someon know find call mobil landlin numbr poboxnumbrldnsnumbr 1
tb persolvo chase us s

wa u effici gee thanx 0
numbr receiv mobil content enjoy 1
abl sleep 0
want explicit sex numbr sec ring numbr cost numbrp min 1
meet soon princess ttyl 0
pick numbrpm go taunton still want come 0
oh numbr outsid player allow play know 0
anyth lor 0
erutupalam thandiyachu 0
cant u tri new invent fli joke 0
ful song lyric 0
u reckon need numbr arrang transport u thank 0
true lov n care wil nevr go unrecogn though somon often make mistak valu definitli undrstnd start miss 0
shop eh ger toke abt syd leh haha 0
stand 0
good weekend 0
miss call miss call khelat kintu opponent miss call dhort lage that rule one great phone receiv qualiti win 0
call get chanc plz lt numbr 0
new deu ex game comin earli next yr 0
comput fri essenti part keep spare fuck idiot roommat looovvv leav thing run full lt gt numbr 0
friend studi warwick plan go shop concert tmw may cancel havn seen age yeah get togeth sometim 0
probabl coupl hour top 0
lol grin babe thank think 0
man bu slow think gonna get 0
hope text m

hi probabl much fun get messag thought id txt u co im bore jame fart night 0
hi babi im sat bloodi bu mo wont home numbr numbr wanna somethin later call later ortxt back jess xx 0
welcom select onumbr servic ad benefit call special train advisor free mobil diall numbr 1
lost numbr pound sinc doc visit last week woot woot gonna celebr stuf face 0
u come back numbr dinner rite dad ask confirm wif u 0
master buy bb co sale bf 0
ahhhh woken bad dream u tho dont like u right didnt know anyth comedi night guess im 0
vivek got call number 0
u call lunch 0
mean left earli check co work numbr numbr 0
want lt gt rs da 0
bit ur smile hppnss drop ur tear sorrow part ur heart life heart like mine wil care u forevr goodfriend 0
yup ok 0
want see pretti pussi 0
dear voucher holder next meal us use follow link pc numbr enjoy numbr numbr numbr dine experiencehttp www vouchnumbrm com etlp dine asp 1
peopl game mall iouri kaila 0
urgent tri contact u today draw show moneysymbnumbr prize guarante call num

In [105]:
type(processed)
print(y[0:10])
print(processed[5])

[0 0 1 0 0 1 0 0 1 1]
freemsg hey darl numbr week word back like fun still tb ok xxx std chg send moneysymbnumbr rcv


In [39]:
# Now lets do it for all the messages
messages = zip(processed, y)

# define a seed for reproducibility
seed = 1
np.random.seed = seed
#np.random.shuffle(messages)

# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [40]:
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [95]:
training[2][1]
    

1

In [97]:
## training data format
#training data -> [({"word": True or False}, 1 or 0)] // 1 tolds that the msg is spam, and 0 tolds that msg is ham
## True means it's in our word bag and decides the msg is spam or not
# No histogram because I don't think it's needed
# basically training set it [({},1), ({},0), ({},1), ({},0)]

In [96]:
len(testing)
featuresets[1]

({'go': False,
  'jurong': False,
  'point': False,
  'crazi': False,
  'avail': False,
  'bugi': False,
  'n': False,
  'great': False,
  'world': False,
  'la': False,
  'e': False,
  'buffet': False,
  'cine': False,
  'got': False,
  'amor': False,
  'wat': False,
  'ok': True,
  'lar': True,
  'joke': True,
  'wif': True,
  'u': True,
  'oni': True,
  'free': False,
  'entri': False,
  'numbr': False,
  'wkli': False,
  'comp': False,
  'win': False,
  'fa': False,
  'cup': False,
  'final': False,
  'tkt': False,
  'numbrst': False,
  'may': False,
  'text': False,
  'receiv': False,
  'question': False,
  'std': False,
  'txt': False,
  'rate': False,
  'c': False,
  'appli': False,
  'numbrovernumbr': False,
  'dun': False,
  'say': False,
  'earli': False,
  'hor': False,
  'alreadi': False,
  'nah': False,
  'think': False,
  'goe': False,
  'usf': False,
  'live': False,
  'around': False,
  'though': False,
  'freemsg': False,
  'hey': False,
  'darl': False,
  'week': Fals

## Sckit Learn classifier with NLTK

In [44]:
from nltk.classify.scikitlearn import SklearnClassifier

In [46]:
#SVM classsifier (support vector machine)
from sklearn.svm import SVC
model1 = SklearnClassifier(SVC(kernel = 'linear'))
model1.train(training)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False))>

In [47]:
accuracy = nltk.classify.accuracy(model1, testing)


## SVC Classifier

In [48]:
#import math
print("SVC Classifier accuracy {}%".format(round(accuracy * 100,4)))

SVC Classifier accuracy 98.7796%


In [119]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
txt_features, labels = zip(*testing)
prediction = model1.classify_many(txt_features)
print(classification_report(prediction,labels))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1217
           1       0.93      0.98      0.95       176

    accuracy                           0.99      1393
   macro avg       0.96      0.98      0.97      1393
weighted avg       0.99      0.99      0.99      1393



## KNeighbors classifier

In [120]:
from sklearn.neighbors import KNeighborsClassifier

In [121]:
model2 = SklearnClassifier(KNeighborsClassifier())
model2.train(training)

<SklearnClassifier(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform'))>

In [122]:
accuracy = nltk.classify.accuracy(model2, testing )

In [123]:
#import math
print("KNeighbor Classifier accuracy {}%".format(round(accuracy * 100,4)))

KNeighbor Classifier accuracy 92.8212%


In [124]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
txt_features, labels = zip(*testing)
prediction = model2.classify_many(txt_features)
print(classification_report(prediction,labels))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96      1308
           1       0.46      1.00      0.63        85

    accuracy                           0.93      1393
   macro avg       0.73      0.96      0.79      1393
weighted avg       0.97      0.93      0.94      1393



## RandomForest Classifier

In [None]:
from sklearn.ensemblen import 

In [56]:
(training[0])

({'go': False,
  'jurong': False,
  'point': False,
  'crazi': False,
  'avail': False,
  'bugi': False,
  'n': False,
  'great': False,
  'world': False,
  'la': False,
  'e': False,
  'buffet': False,
  'cine': False,
  'got': False,
  'amor': False,
  'wat': False,
  'ok': False,
  'lar': False,
  'joke': False,
  'wif': False,
  'u': False,
  'oni': False,
  'free': True,
  'entri': False,
  'numbr': True,
  'wkli': False,
  'comp': False,
  'win': False,
  'fa': False,
  'cup': False,
  'final': False,
  'tkt': False,
  'numbrst': False,
  'may': False,
  'text': False,
  'receiv': False,
  'question': False,
  'std': False,
  'txt': False,
  'rate': False,
  'c': False,
  'appli': False,
  'numbrovernumbr': False,
  'dun': False,
  'say': False,
  'earli': False,
  'hor': False,
  'alreadi': False,
  'nah': False,
  'think': False,
  'goe': False,
  'usf': False,
  'live': False,
  'around': False,
  'though': False,
  'freemsg': False,
  'hey': False,
  'darl': False,
  'week': 

In [65]:
txt_features[0]

{'go': False,
 'jurong': False,
 'point': False,
 'crazi': False,
 'avail': False,
 'bugi': False,
 'n': False,
 'great': False,
 'world': False,
 'la': False,
 'e': False,
 'buffet': False,
 'cine': False,
 'got': False,
 'amor': False,
 'wat': False,
 'ok': False,
 'lar': False,
 'joke': False,
 'wif': False,
 'u': False,
 'oni': False,
 'free': False,
 'entri': False,
 'numbr': False,
 'wkli': False,
 'comp': False,
 'win': False,
 'fa': False,
 'cup': False,
 'final': False,
 'tkt': False,
 'numbrst': False,
 'may': False,
 'text': False,
 'receiv': False,
 'question': False,
 'std': False,
 'txt': False,
 'rate': False,
 'c': False,
 'appli': False,
 'numbrovernumbr': False,
 'dun': False,
 'say': False,
 'earli': False,
 'hor': False,
 'alreadi': False,
 'nah': False,
 'think': False,
 'goe': False,
 'usf': False,
 'live': False,
 'around': False,
 'though': False,
 'freemsg': False,
 'hey': False,
 'darl': False,
 'week': False,
 'word': False,
 'back': False,
 'like': False,
 '

# printing OUTPUT

In [115]:
my_msg = find_features("freemsg hey darl numbr week word back like fun still tb ok xxx std chg send moneysymbnumbr rcv")

In [116]:
prediction = model1.classify_many(my_msg)

In [117]:
prediction[0]
## 0 means no spam
## 1 means spam

1