## Spam classsification using nlp
#### author : vijaytummala

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('emails.csv')

In [4]:
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [5]:
df.isnull().sum()

text    0
spam    0
dtype: int64

In [6]:
# cleaning

In [7]:
def remove_Subject(x):
    subj = x.split(':')[-1]
    return subj

In [8]:
df['text'] = df['text'].apply(remove_Subject)

In [9]:
df.head()

Unnamed: 0,text,spam
0,we provide unlimited amount of changes with n...,1
1,the stock trading gunslinger fanny is merril...,1
2,unbelievable new homes made easy im wanting ...,1
3,ramsey @ goldengraphix . com request additio...,1
4,"do not have money , get software cds from her...",1


In [10]:
df.spam.unique()

array([1, 0], dtype=int64)

In [11]:
# finding how many are spam mails and how many are non spam mails

df[df['spam']==1].count()

text    1368
spam    1368
dtype: int64

In [12]:
df[df['spam']==0].count()

text    4360
spam    4360
dtype: int64

In [13]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [14]:
def removePunctuation(text):
    return re.sub('[^a-zA-Z0-9]+', ' ',text)

In [15]:
df['text'] = df['text'].apply(removePunctuation)

In [16]:
df.head()

Unnamed: 0,text,spam
0,we provide unlimited amount of changes with n...,1
1,the stock trading gunslinger fanny is merrill...,1
2,unbelievable new homes made easy im wanting t...,1
3,ramsey goldengraphix com request additional i...,1
4,do not have money get software cds from here ...,1


In [17]:
def convert_to_lower(text):
    return text.lower()

In [18]:
df['text'] = df['text'].apply(convert_to_lower)

In [19]:
# convert words into tokens

In [20]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pardh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
def convert_text_into_tokens(text):
    token_list = nltk.word_tokenize(text)
    return token_list

In [22]:
df['text_tokens'] = df['text'].apply(convert_text_into_tokens)

In [23]:
df.text_tokens[0]

['we',
 'provide',
 'unlimited',
 'amount',
 'of',
 'changes',
 'with',
 'no',
 'extra',
 'fees',
 'for',
 'you',
 'to',
 'be',
 'surethat',
 'you',
 'will',
 'love',
 'the',
 'result',
 'of',
 'this',
 'collaboration',
 'have',
 'a',
 'look',
 'at',
 'our',
 'portfolio',
 'not',
 'interested']

In [24]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pardh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
def removeStopWords(text_token):
    tokens = []
    for i in text_token:
        if i not in  stopwords.words('english'):
            tokens.append(i)
    return tokens

In [26]:
df['text_tokens'] = df['text_tokens'].apply(removeStopWords)

In [27]:
df.head()

Unnamed: 0,text,spam,text_tokens
0,we provide unlimited amount of changes with n...,1,"[provide, unlimited, amount, changes, extra, f..."
1,the stock trading gunslinger fanny is merrill...,1,"[stock, trading, gunslinger, fanny, merrill, m..."
2,unbelievable new homes made easy im wanting t...,1,"[unbelievable, new, homes, made, easy, im, wan..."
3,ramsey goldengraphix com request additional i...,1,"[ramsey, goldengraphix, com, request, addition..."
4,do not have money get software cds from here ...,1,"[money, get, software, cds, software, compatib..."


In [28]:
# remove stem words

In [29]:
ps = PorterStemmer()

In [30]:
def remove_stem_words(text_tokens):
    stem_list = []
    for i in text_tokens:
        stem_list.append(ps.stem(i))
    return list(set(stem_list))

In [31]:
df['text_tokens'] = df['text_tokens'].apply(remove_stem_words)

In [32]:
df.head()

Unnamed: 0,text,spam,text_tokens
0,we provide unlimited amount of changes with n...,1,"[provid, extra, look, result, chang, surethat,..."
1,the stock trading gunslinger fanny is merrill...,1,"[colza, tanzania, mcdougal, chameleon, optima,..."
2,unbelievable new homes made easy im wanting t...,1,"[home, unbeliev, post, form, credit, websit, v..."
3,ramsey goldengraphix com request additional i...,1,"[pdf, form, solicit, graphix, order, com, prin..."
4,do not have money get software cds from here ...,1,"[finish, tradgedi, compat, great, marriag, alo..."


In [33]:
final_df = df.drop('text',axis=1)

In [34]:
# convert into final dataset

def list_to_str(text_tokens):
    out_string = '' 
    for i in text_tokens:
        out_string = out_string + i + ' '
    return out_string
        

In [35]:
# list_to_str(['hello','world','vijay'])

In [36]:
final_df['text_tokens'] = final_df['text_tokens'].apply(list_to_str)

In [37]:
final_df

Unnamed: 0,spam,text_tokens
0,1,provid extra look result chang surethat love i...
1,1,colza tanzania mcdougal chameleon optima hall ...
2,1,home unbeliev post form credit websit visit ma...
3,1,pdf form solicit graphix order com printabl ca...
4,1,finish tradgedi compat great marriag along yet...
...,...,...
5723,0,juli would 274 whether per charg watson date r...
5724,0,lsu html garven research edu
5725,0,home html martinj hsb edu baylor
5726,0,kaminski vinc interest


In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_df['text_tokens'], final_df['spam'], test_size = 0.30)

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [40]:
X = cv.fit_transform(X_train).toarray()

In [41]:
input_data = cv.transform(X_train)

In [42]:
input_data

<4009x14365 sparse matrix of type '<class 'numpy.int64'>'
	with 135295 stored elements in Compressed Sparse Row format>

In [43]:
# import numpy as np

In [44]:
data = input_data.toarray()

In [45]:
# d[10][2000:3000]

In [46]:
data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [47]:
from sklearn.naive_bayes import GaussianNB

In [48]:
nb = GaussianNB()

In [49]:
nb.fit(data,y_train)

GaussianNB()

In [50]:
out_data = cv.transform(X_test)
out = out_data.toarray()

In [51]:
out

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [52]:
predict = nb.predict(out)

In [53]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [54]:
accuracy_score(predict,y_test)

0.9139034322280396

In [55]:
confusion_matrix(predict,y_test)

array([[1224,   51],
       [  97,  347]], dtype=int64)

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer()

In [58]:
X = tv.fit_transform(X_train).toarray()

In [66]:
from sklearn.linear_model import LogisticRegression

In [67]:
# print(X[200][1000:2000])

In [68]:
lr = LogisticRegression()

In [69]:
lr.fit(X,y_train)

LogisticRegression()

In [70]:
test_data = tv.transform(X_test).toarray()

In [71]:
predicted_data = lr.predict(test_data)

In [72]:
accuracy_score(predicted_data,y_test)

0.9424083769633508

In [73]:
import pickle

In [74]:
filename = 'finalized_model.sav'
pickle.dump(lr, open(filename, 'wb'))

In [87]:
test_sub_mail =   

In [88]:
test_data = tv.transform(test_sub_mail)

In [89]:
t = test_data.toarray()

In [97]:
X_test[5:10]

94                     notic net legal www insurancemail 
1869    meet might would vinc invest next thank struct...
2287    pleas let know 88563580 crenshaw 5290 thank po...
172        gperk au net netspac rcpt expir command repli 
1272    measur email team ignor 03 secur receiv updat ...
Name: text_tokens, dtype: object

In [105]:
X_test[6:7]

1869    meet might would vinc invest next thank struct...
Name: text_tokens, dtype: object

In [99]:
y_test[5:10]

94      1
1869    0
2287    0
172     1
1272    1
Name: spam, dtype: int64

In [100]:
test_data = tv.transform(X_test[5:10])

In [101]:
t = test_data.toarray()

In [102]:
lr.predict(t)

array([1, 0, 0, 1, 1], dtype=int64)