In [1]:
import numpy as np
import pandas as pd
import re
import string
import math


In [2]:
data = pd.read_csv('../input/emails-spam-ham/emails.csv', usecols=['spam','text'])

In [3]:
data

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1.0
1,Subject: the stock trading gunslinger fanny i...,1.0
2,Subject: unbelievable new homes made easy im ...,1.0
3,Subject: 4 color printing special request add...,1.0
4,"Subject: do not have money , get software cds ...",1.0
...,...,...
5722,Subject: re : research and development charges...,0.0
5723,"Subject: re : receipts from visit jim , than...",0.0
5724,Subject: re : enron case study update wow ! a...,0.0
5725,"Subject: re : interest david , please , call...",0.0


In [4]:
#frequency distribution of the class attribute
print(pd.crosstab(index=data["spam"],columns="count"))

col_0  count
spam        
0.0     4359
1.0     1367


### ------Data Preprocessing--------

In [5]:
data.rename(columns={'spam':'class'},inplace=True)
data['label'] = np.where(data['class']==1,'spam','ham')
data.drop_duplicates(inplace=True)

In [6]:
data

Unnamed: 0,text,class,label
0,Subject: naturally irresistible your corporate...,1.0,spam
1,Subject: the stock trading gunslinger fanny i...,1.0,spam
2,Subject: unbelievable new homes made easy im ...,1.0,spam
3,Subject: 4 color printing special request add...,1.0,spam
4,"Subject: do not have money , get software cds ...",1.0,spam
...,...,...,...
5722,Subject: re : research and development charges...,0.0,ham
5723,"Subject: re : receipts from visit jim , than...",0.0,ham
5724,Subject: re : enron case study update wow ! a...,0.0,ham
5725,"Subject: re : interest david , please , call...",0.0,ham


In [7]:
data['text'][0]

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

In [8]:
contractions_dict = {"ain't": "are not","'s":" is","aren't": "are not"}
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)
data['text']=data['text'].apply(lambda x:expand_contractions(x))

In [9]:
data['text'][0]

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

In [10]:
data['text'] = data['text'].str.lower()

In [11]:
data['text'][0]

"subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

In [12]:
string.punctuation
#'!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~'

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
#remove punctuation
data['text'] = data['text'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '' , x))

In [14]:
data['text'][0]

'subject naturally irresistible your corporate identity  lt is really hard to recollect a company  the  market is full of suqgestions and the information isoverwhelminq  but a good  catchy logo  stylish statlonery and outstanding website  will make the task much easier   we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader  it isguite ciear that  without good products  effective business organization and practicable aim it  will be hotat nowadays market  but we do promise that your marketing efforts  will become much more effective  here is the list of clear  benefits  creativeness  hand  made  original logos  specially done  to reflect your distinctive company image  convenience  logo and stationery  are provided in all formats  easy  to  use content management system letsyou  change your website content and even its structure  promptness  you  will see logo drafts within three business days  affordability  your  marketing break  through 

In [15]:
data['text'][3]

'subject 4 color printing special  request additional information now  click here  click here for a printable version of our order form  pdf format   phone   626  338  8090 fax   626  338  8102 e  mail  ramsey  goldengraphix  com  request additional information now  click here  click here for a printable version of our order form  pdf format   golden graphix  printing 5110 azusa canyon rd  irwindale  ca 91706 this e  mail message is an advertisement and  or solicitation  '

In [16]:
#remove words and digits
data['text'] = data['text'].apply(lambda x: re.sub(r'\b[0-9]+\b\s*', '',x))


In [17]:

data['text'][3]

'subject color printing special  request additional information now  click here  click here for a printable version of our order form  pdf format   phone   fax   e  mail  ramsey  goldengraphix  com  request additional information now  click here  click here for a printable version of our order form  pdf format   golden graphix  printing azusa canyon rd  irwindale  ca this e  mail message is an advertisement and  or solicitation  '

In [19]:
#remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words.add('subject')
stop_words.add('http')
stop_words.add('aa')
stop_words.add('aaa')
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
data['text'] = data['text'].apply(lambda x: remove_stopwords(x))

In [20]:
data['text'][3]

'color printing special request additional information click click printable version order form pdf format phone fax e mail ramsey goldengraphix com request additional information click click printable version order form pdf format golden graphix printing azusa canyon rd irwindale ca e mail message advertisement solicitation'

In [21]:
#stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
data["text"] = data["text"].apply(lambda x: stem_words(x))

In [22]:
data["text"][3]

'color print special request addit inform click click printabl version order form pdf format phone fax e mail ramsey goldengraphix com request addit inform click click printabl version order form pdf format golden graphix print azusa canyon rd irwindal ca e mail messag advertis solicit'

In [23]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
data["text"] = data["text"].apply(lambda text: lemmatize_words(text))

In [24]:
data["text"][3]

'color print special request addit inform click click printabl version order form pdf format phone fax e mail ramsey goldengraphix com request addit inform click click printabl version order form pdf format golden graphix print azusa canyon rd irwindal ca e mail messag advertis solicit'

In [25]:
data["text"] = data["text"].apply(lambda x: re.sub(' +', ' ', x))

In [26]:
data["text"]

0       natur irresist corpor ident lt realli hard rec...
1       stock trade gunsling fanni merril muzo colza a...
2       unbeliev new home made easi im want show homeo...
3       color print special request addit inform click...
4       money get softwar cd softwar compat great grow...
                              ...                        
5722    research develop charg gpg forward shirley cre...
5723    receipt visit jim thank invit visit lsu shirle...
5724    enron case studi updat wow day super thank muc...
5725    interest david plea call shirley crenshaw assi...
5726    news aurora updat aurora version fastest model...
Name: text, Length: 5694, dtype: object

In [27]:
data

Unnamed: 0,text,class,label
0,natur irresist corpor ident lt realli hard rec...,1.0,spam
1,stock trade gunsling fanni merril muzo colza a...,1.0,spam
2,unbeliev new home made easi im want show homeo...,1.0,spam
3,color print special request addit inform click...,1.0,spam
4,money get softwar cd softwar compat great grow...,1.0,spam
...,...,...,...
5722,research develop charg gpg forward shirley cre...,0.0,ham
5723,receipt visit jim thank invit visit lsu shirle...,0.0,ham
5724,enron case studi updat wow day super thank muc...,0.0,ham
5725,interest david plea call shirley crenshaw assi...,0.0,ham


In [28]:
#frequency distribution of the class attribute
print(pd.crosstab(index=data["label"],columns="count"))

col_0  count
label       
ham     4327
spam    1367


In [30]:
from sklearn.model_selection import train_test_split
dataTrain, dataTest = train_test_split(data,train_size=0.8,random_state=2022,stratify=data['label'])

In [31]:
dataTrain

Unnamed: 0,text,class,label
1607,jinbaek kim molli pay plane ticket make sure s...,0.0,ham
2578,research dept move hello everyon attach churn ...,0.0,ham
89,think might interest hello found site call gra...,1.0,spam
4315,pierr philipp thank messag shall glad make pre...,0.0,ham
2471,fwd optic network engin enron research offsit ...,0.0,ham
...,...,...,...
1957,mgmt rice univers roster mgmt let know need li...,0.0,ham
3140,time sensit execut impact influenc program sur...,0.0,ham
4042,price credit thousand name continu discus tues...,0.0,ham
2403,statu enron project howard sorri delay shall a...,0.0,ham


In [32]:
dataTrain.groupby(['label']).size()

label
ham     3461
spam    1094
dtype: int64

In [33]:
dataTest.groupby(['label']).size()

label
ham     866
spam    273
dtype: int64

### --------Generating the document term matrix---------

In [34]:

from sklearn.feature_extraction.text import CountVectorizer
parseur = CountVectorizer(binary=True)
XTrain = parseur.fit_transform(dataTrain['text'])

In [35]:
XTrain

<4555x22680 sparse matrix of type '<class 'numpy.int64'>'
	with 373494 stored elements in Compressed Sparse Row format>

In [37]:
#list of tokens
print(parseur.get_feature_names()[:100])

['aaaenerfax', 'aadedeji', 'aagraw', 'aal', 'aaldou', 'aanalysi', 'aaron', 'aawesom', 'ab', 'abacha', 'abacu', 'abahi', 'abandon', 'abargain', 'abarr', 'abb', 'abba', 'abbott', 'abbrevi', 'abc', 'abcsearch', 'abdalla', 'abdallat', 'abdelnour', 'abdul', 'abdulla', 'abdullah', 'abei', 'abel', 'abello', 'abernathi', 'abet', 'abey', 'abf', 'abhay', 'abi', 'abid', 'abidjan', 'abiiiti', 'abil', 'abilen', 'abit', 'abitibi', 'abl', 'abler', 'abli', 'ablig', 'ablx', 'abn', 'abneg', 'abnorm', 'aboard', 'abolish', 'abondantli', 'abook', 'aborigin', 'aborm', 'abort', 'about', 'aboutthi', 'aboutu', 'aboutvenita', 'aboveground', 'abovenet', 'abovetelefax', 'abqewvbgf', 'abr', 'abraham', 'abram', 'abramov', 'abramowicz', 'abras', 'abreast', 'abreo', 'abridg', 'abroad', 'abscissa', 'abscond', 'absenc', 'absens', 'absent', 'absolut', 'absolv', 'absorb', 'absorpt', 'abstract', 'abstrus', 'absult', 'absurd', 'abu', 'abund', 'abundancegroup', 'abundantli', 'abus', 'abutl', 'abyssinia', 'ac', 'acacia', 'ac

In [38]:
#number of  tokens
print(len(parseur.get_feature_names()))

22680


In [39]:
mdtTrain = XTrain.toarray()
print(type(mdtTrain))
print(mdtTrain.shape)

<class 'numpy.ndarray'>
(4555, 22680)


In [40]:
mdtTrain

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [41]:
#frequency of the terms
freq_mots = np.sum(mdtTrain,axis=0)#sum since we are using binary ponderation
print(freq_mots)
print('****')
index = np.argsort(freq_mots)
print(index)
print('****')
imp = {'terme':np.asarray(parseur.get_feature_names())[index],'freq':freq_mots[index]}
print(pd.DataFrame(imp).sort_values(by='freq', ascending=False))

[1 1 1 ... 1 2 3]
****
[    0  9001 17697 ...  6333 21390 15202]
****
            terme  freq
22679        plea  2258
22678        vinc  2212
22677       enron  2055
22676       thank  1962
22675       would  1722
...           ...   ...
7225        unarm     1
7224        wozni     1
7223     unattain     1
7222      arneson     1
0      aaaenerfax     1

[22680 rows x 2 columns]


In [42]:
parseurBis = CountVectorizer(stop_words='english',binary=True, min_df = 10)
XTrainBis = parseurBis.fit_transform(dataTrain['text'])
print(len(parseurBis.get_feature_names()))
mdtTrainBis = XTrainBis.toarray()

3790




### -------Training the classifier-------

#### KNN classifier

In [43]:
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=2)
knn_classifier.fit(mdtTrainBis,dataTrain['label'])



KNeighborsClassifier(n_neighbors=2)

In [44]:
from sklearn import metrics
mdtTestBis = parseurBis.transform(dataTest['text'])
predTestBis = knn_classifier.predict(mdtTestBis)
print('***Confusion matrix')
mcTestBis = metrics.confusion_matrix(dataTest['label'],predTestBis)
print(mcTestBis)

***Confusion matrix
[[751 115]
 [ 29 244]]


In [45]:
#recall
print('Recall')
print(metrics.recall_score(dataTest['label'],predTestBis,pos_label='spam'))

#precision
print('precision')
print(metrics.precision_score(dataTest['label'],predTestBis,pos_label='spam'))

#F1-Score
print('F1-Score')
print(metrics.f1_score(dataTest['label'],predTestBis,pos_label='spam'))

#accuracy rate
print('accuracy rate -->')
print(metrics.accuracy_score(dataTest['label'],predTestBis))

Recall
0.8937728937728938
precision
0.6796657381615598
F1-Score
0.7721518987341772
accuracy rate -->
0.8735733099209834


#### LogistiRegression

In [46]:
from sklearn.linear_model import LogisticRegression
modelBis = LogisticRegression()
modelBis.fit(mdtTrainBis,dataTrain['label'])
mdtTestBis = parseurBis.transform(dataTest['text'])
predTestBis = modelBis.predict(mdtTestBis)

#confusion matrix
print('***Confusion matrix')
mcTestBis = metrics.confusion_matrix(dataTest['label'],predTestBis)
print(mcTestBis)

#recall
print('Recall')
print(metrics.recall_score(dataTest['label'],predTestBis,pos_label='spam'))

#precision
print('precision')
print(metrics.precision_score(dataTest['label'],predTestBis,pos_label='spam'))

#F1-Score
print('F1-Score')
print(metrics.f1_score(dataTest['label'],predTestBis,pos_label='spam'))

#accuracy rate
print('accuracy rate -->')
print(metrics.accuracy_score(dataTest['label'],predTestBis))

***Confusion matrix
[[859   7]
 [  3 270]]
Recall
0.989010989010989
precision
0.9747292418772563
F1-Score
0.9818181818181818
accuracy rate -->
0.9912203687445127


### --------- Dimensionality reduction ----------

In [47]:
modelBis.coef_

array([[ 7.43619723e-02, -1.01476535e-01, -3.95979217e-01, ...,
        -7.90715942e-02, -3.18720007e-04,  1.74978497e-01]])

In [48]:
coef_abs = np.abs(modelBis.coef_[0,:])

coef_abs

array([7.43619723e-02, 1.01476535e-01, 3.95979217e-01, ...,
       7.90715942e-02, 3.18720007e-04, 1.74978497e-01])

In [49]:
thresholds = np.percentile(coef_abs,[0,25,50,75,90,100])
print(thresholds)

[1.71422867e-06 1.96260079e-02 5.52818188e-02 1.39161256e-01
 2.58193452e-01 2.89586554e+00]


In [50]:
indices = np.where(coef_abs > thresholds[2])
print(len(indices[0]))

1895


In [51]:
mdtTrainTer = mdtTrainBis[:,indices[0]]#
mdtTestTer = mdtTestBis[:,indices[0]]

print(mdtTrainTer.shape)
print(mdtTestTer.shape)

(4555, 1895)
(1139, 1895)


In [52]:

modelTer = LogisticRegression()
modelTer.fit(mdtTrainTer,dataTrain['label'])
predTestTer = modelTer.predict(mdtTestTer)
mcTestTer = metrics.confusion_matrix(dataTest['label'],predTestTer)
print(mcTestTer)

[[859   7]
 [  4 269]]


In [53]:

sel_terms = np.array(parseurBis.get_feature_names())[indices[0]]
sorted_indices = np.argsort(np.abs(modelTer.coef_[0,:]))
imp = {'term':np.asarray(sel_terms)[sorted_indices],'coef':modelTer.coef_[0,:][sorted_indices]}
print(pd.DataFrame(imp).sort_values(by='coef', ascending=False).head(10))

         term      coef
1890      com  1.224374
1888  softwar  1.162326
1884    click  1.062102
1881     love  0.960091
1879   viagra  0.912186
1878     site  0.904477
1877    money  0.897402
1876    medic  0.890194
1874    onlin  0.839651
1873    offer  0.830557




### -------- Deployment --------

In [54]:

doc = ['this is a new free service for you only']

doc=expand_contractions(str(doc))
doc = str(doc).lower()
doc = re.sub('[%s]' % re.escape(string.punctuation), '' , str(doc))
doc = re.sub('[%s]' % re.escape(string.punctuation), '' ,  str(doc))
doc =re.sub(r'\b[0-9]+\b\s*', '', str(doc))
doc = remove_stopwords(str(doc))

desc = parseurBis.transform([doc])
print(desc)

  (0, 1370)	1
  (0, 2282)	1


In [55]:

doc = ["Hey oumaima,Happy St. Patrick's Day! Wishing you a wee bit of fun and a lot of luck, this festive season. We're here to share the joy with you too. Get 30% off on InVideo PRO, for it's the perfect occasion to get cracking on your video creation journey. For all you know, this could be your pot of gold at the end of the rainbow! "]

doc=expand_contractions(str(doc))
doc = str(doc).lower()
doc = re.sub('[%s]' % re.escape(string.punctuation), '' , str(doc))
doc =re.sub(r'\b[0-9]+\b\s*', '', str(doc))
doc = remove_stopwords(str(doc))

desc = parseurBis.transform([doc])
print(desc)

  (0, 334)	1
  (0, 767)	1
  (0, 823)	1
  (0, 1090)	1
  (0, 1386)	1
  (0, 1443)	1
  (0, 1549)	1
  (0, 1870)	1
  (0, 2006)	1
  (0, 2022)	1
  (0, 2468)	1
  (0, 2497)	1
  (0, 2638)	1
  (0, 3000)	1
  (0, 3063)	1
  (0, 3215)	1
  (0, 3625)	1


In [56]:
doc

'hey oumaimahappy st patrick day wishing wee bit fun lot luck festive season share joy get invideo pro perfect occasion get cracking video creation journey know could pot gold end rainbow'

In [57]:
print(np.asarray(parseurBis.get_feature_names())[desc.indices])

['bit' 'creation' 'day' 'end' 'fun' 'gold' 'hey' 'know' 'lot' 'luck'
 'patrick' 'perfect' 'pro' 'season' 'share' 'st' 'video']




In [58]:
dense_desc = desc.toarray()
dense_sel = dense_desc[:,indices[0]]

In [59]:
dense_desc 

array([[0, 0, 0, ..., 0, 0, 0]])

In [60]:
pred_doc = modelTer.predict(dense_sel)
print(pred_doc)

['spam']


In [61]:
pred_proba = modelTer.predict_proba(dense_sel)
print(pred_proba)

[[0.40905906 0.59094094]]
