# Filtering Mails as Spam or Ham 

In [None]:
import pandas as pd

In [13]:
#Loading Dataset
df=pd.read_csv("/Users/nisha/Downloads/spam 3.csv",encoding='latin-1')

In [14]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [15]:
#dropping nul value columns
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)

In [16]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
#renaming columns
df.rename(columns={'v1':'labels','v2':'message'},inplace=True)

In [18]:
df.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
#Null value assessment

df.isnull().sum()

labels     0
message    0
dtype: int64

In [20]:
df['message'][1]

'Ok lar... Joking wif u oni...'

In [21]:
#Function to clean the messages from special characters and hyperlinks

import re
import string
def cleaner(text):
    text=text.lower()
    text=re.sub(r'https?:\/\/\S+',"",text)
    text=re.sub('[%s]' % re.escape(string.punctuation),'',text)
  #  text=re.sub('\[.*?\]',"",text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text=re.sub(r'@[A-Za-z0-9]+','',text)
    text=re.sub('#',"",text)
    return text

In [22]:
#applying the function on DF

df['message']=df['message'].apply(cleaner)

In [23]:
df.head()

Unnamed: 0,labels,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [24]:
#Performing Stemming 
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
porter=PorterStemmer()

[nltk_data] Downloading package stopwords to /Users/nisha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
corpus=[]
for i in range(0,len(df)):
    review=df['message'][i]
    review=review.lower()
    review=review.split()
    review=[porter.stem(w) for w in review if not w in stopwords.words('english')]
    review=" ".join(review)
    corpus.append(review)

In [26]:
print(corpus[:5])

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat', 'ok lar joke wif u oni', 'free entri wkli comp win fa cup final tkt may text fa receiv entri questionstd txt ratetc appli', 'u dun say earli hor u c alreadi say', 'nah dont think goe usf live around though']


In [27]:
#Bag of words limiting the features to 5000 only
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')
X=cv.fit_transform(corpus).toarray()


In [28]:
vec=cv.fit(corpus)

In [29]:
y=pd.get_dummies(df['labels'])

In [30]:
y.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [31]:
y=y.iloc[:,1].values

In [32]:
y[:10]

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1], dtype=uint8)

In [33]:
#splitting data

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

In [34]:
#applying naive Bayes
from sklearn.naive_bayes import MultinomialNB

In [35]:
modelling=MultinomialNB().fit(X_train,y_train)

In [36]:
y_pred=modelling.predict(X_test)

In [37]:
y_pred[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=uint8)

In [38]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [39]:
#Confusion Matrix
print(confusion_matrix(y_test,y_pred))

[[1402   32]
 [  15  223]]


In [40]:
#printing accuracy
print("Accuracy: ", accuracy_score(y_test,y_pred))

Accuracy:  0.97188995215311


In [41]:
print(vec.get_feature_names())

['aa', 'aah', 'aathilov', 'aathiwher', 'abi', 'abil', 'abiola', 'abj', 'abl', 'abt', 'abta', 'aburo', 'abus', 'ac', 'academ', 'acc', 'accept', 'access', 'accid', 'accident', 'accomod', 'accordingli', 'account', 'ach', 'act', 'action', 'activ', 'actor', 'actual', 'ad', 'adam', 'add', 'addamsfa', 'addi', 'addict', 'address', 'admin', 'administr', 'admir', 'ador', 'adult', 'advanc', 'adventur', 'advic', 'advis', 'aeronaut', 'aeroplan', 'affair', 'affect', 'affection', 'afraid', 'aft', 'afternoon', 'aftr', 'ag', 'agalla', 'age', 'agent', 'ago', 'agre', 'ah', 'aha', 'ahead', 'ahmad', 'ahsen', 'aid', 'aight', 'aint', 'air', 'airport', 'airtel', 'aiya', 'aiyah', 'aiyar', 'aiyo', 'aka', 'al', 'alaipayuth', 'album', 'alcohol', 'alert', 'alex', 'alfi', 'algarv', 'aliv', 'allah', 'allow', 'alon', 'alreadi', 'alright', 'alrit', 'alway', 'alwi', 'amaz', 'american', 'ami', 'amp', 'amt', 'amus', 'andr', 'andro', 'angri', 'anim', 'anna', 'anni', 'anniversari', 'announc', 'annoy', 'anot', 'anoth', 'ans

In [43]:
X[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [44]:
vec.get_feature_names()[:10]

['aa',
 'aah',
 'aathilov',
 'aathiwher',
 'abi',
 'abil',
 'abiola',
 'abj',
 'abl',
 'abt']

In [45]:
df1=pd.DataFrame(X[:5000],index=vec.get_feature_names()[:5000])

In [46]:
#summation along the columns to find the importance of each word

df1['sum']=df1.sum(axis=1)

In [47]:
df1['sum'][:10]

aa           11
aah           5
aathilov     17
aathiwher     6
abi           6
abil         15
abiola        6
abj          14
abl          13
abt          12
Name: sum, dtype: int64

In [48]:

df2=df1.sort_values(['sum'],ascending="True")

In [49]:
df2['sum']

strangersaw                      0
wana                             0
sday                             0
aiyo                             0
budget                           0
fromm                            0
diesel                           0
forå                             0
ringtonefrom                     0
trubl                            0
eir                              0
advic                            0
fri                              0
ennal                            0
shld                             0
unredeem                         0
wyli                             0
youd                             0
trywal                           0
qlynnbv                          0
stillmayb                        0
attractioni                      0
program                          0
money                            0
gb                               0
tui                              0
floor                            0
wwwbridalpetticoatdreamscouk     0
tsc                 

In [None]:
#TFIDF

In [None]:
#tokenizing

In [50]:
def tokenizer_p(text):
    return [porter.stem(w) for w in text.split()]

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(strip_accents=None, lowercase=False,preprocessor=None,
                     tokenizer=tokenizer_p,use_idf=True,
                     norm='l2',smooth_idf=True)

In [52]:
X1=tfidf.fit_transform(df.message)

In [53]:
type(X1)

scipy.sparse.csr.csr_matrix

In [54]:
X1_train, X1_test, y1_train, y1_test=train_test_split(X1,y,test_size=0.3,random_state=0)


In [55]:
type(X1_test)

scipy.sparse.csr.csr_matrix

In [56]:
print(X1[:10])

  (0, 2308)	0.1279893290676052
  (0, 6255)	0.22421039216410404
  (0, 3070)	0.31822891238857515
  (0, 4468)	0.21729342944200922
  (0, 1251)	0.2464463867642385
  (0, 402)	0.24198735006385153
  (0, 4151)	0.15363563633168983
  (0, 2812)	0.10513626499475971
  (0, 780)	0.2688410711448906
  (0, 3847)	0.1726627504612227
  (0, 2393)	0.17712178716160965
  (0, 6663)	0.2162607253430146
  (0, 3206)	0.2688410711448906
  (0, 1685)	0.18863700694241692
  (0, 778)	0.3037838948290126
  (0, 1040)	0.2688410711448906
  (0, 5903)	0.15294047677674996
  (0, 2358)	0.14949806876101196
  (0, 210)	0.31822891238857515
  (0, 6460)	0.17678409935536454
  (1, 4116)	0.27688324362563543
  (1, 3238)	0.41207318189732706
  (1, 3036)	0.46742635237301533
  (1, 6579)	0.4355907679554564
  (1, 6184)	0.2024772238069804
  :	:
  (8, 832)	0.12361326053272054
  (8, 1093)	0.23606829748806687
  (8, 6321)	0.24582615727704726
  (8, 2688)	0.21634708466655145
  (9, 6184)	0.09917198170641212
  (9, 2133)	0.2837707045514668
  (9, 6007)	0.1482

In [57]:
#applying Logistic Regression
from sklearn.linear_model import LogisticRegressionCV

In [62]:
clf=LogisticRegressionCV(cv=6,scoring='accuracy', random_state=0, n_jobs=-1,
                        verbose=3, max_iter=300).fit(X1_train,y1_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:    6.5s remaining:   13.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    7.7s finished


In [59]:
X1_test.shape

(1672, 6905)

In [63]:
y_pred=clf.predict(X1_test)

In [65]:
#Accuracy of model
print(accuracy_score(y_test,y_pred))

0.9814593301435407
