In [55]:
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
import string
import os
import re
import matplotlib.pyplot as plt

In [56]:
# Load the e-mail corpus. Despite the .txt extension the file is parsed as
# comma-separated values; the resulting frame has a `text` column (raw message,
# starting with "Subject:") and a `spam` column holding 0/1 labels
# (presumably 1 marks spam -- confirm against the data source).
df = pd.read_csv('emails.txt') #read the CSV file

In [57]:
df.tail(100)

Unnamed: 0,text,spam
5628,Subject: retail markets conference i would li...,0
5629,Subject: re : friday morning meeting ? vince ...,0
5630,Subject: membership mixer tomorrow - paesanos ...,0
5631,Subject: re : your comments on metals var mode...,0
5632,Subject: term project : this is the list of p...,0
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [58]:
df.shape

(5728, 2)

In [59]:
df.spam.value_counts()

0    4360
1    1368
Name: spam, dtype: int64

In [60]:
df.columns

Index(['text', 'spam'], dtype='object')

In [61]:
# Remove exact duplicate rows in place. The index is NOT reset, so row labels
# keep their original values (with gaps) and positional access via .iloc no
# longer lines up with the labels shown when the frame is displayed.
df.drop_duplicates(inplace = True)

In [62]:
df.shape

(5695, 2)

In [63]:
df.spam.value_counts()

0    4327
1    1368
Name: spam, dtype: int64

In [64]:
df.isnull().sum()

text    0
spam    0
dtype: int64

In [65]:
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer

In [66]:
sent1 = "Subject: why is this"

In [67]:
sent2 = re.sub("Subject:","",sent1)
sent2

' why is this'

In [68]:
tokens = word_tokenize(sent1.lower())
tokens

['subject', ':', 'why', 'is', 'this']

In [69]:
def _clean_txt_stopwords():
    """Return the stop-word set used by ``clean_txt``, building it on first use.

    The set is NLTK's English stop-word list plus "would", "could" and "told".
    It is cached on the function object so the list is not rebuilt (and then
    scanned linearly for every token) each time a document is cleaned.
    """
    cached = getattr(_clean_txt_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english") + ["would", "could", "told"])
        _clean_txt_stopwords._cache = cached
    return cached


def clean_txt(sent):
    """Normalise one raw e-mail string into a space-separated string of tokens.

    Processing order:
      1. strip leading/trailing whitespace and collapse whitespace runs;
      2. delete every literal "Subject:" marker;
      3. delete every character that is neither a word character nor whitespace;
      4. lower-case and tokenise with ``word_tokenize``;
      5. drop stop words and tokens of two characters or fewer.

    Parameters
    ----------
    sent : str
        Raw message text. Any non-string value (e.g. ``None`` or a NaN cell
        coming from ``Series.apply``) is treated as an empty message.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``""`` if none survive.
    """
    # Missing cells arrive as None/float NaN; return an empty document instead
    # of failing on .strip().
    if not isinstance(sent, str):
        return ""
    #Stripping white spaces before and after the text
    sent = sent.strip()
    #Replacing multiple spaces with a single space
    # (raw strings: "\s" / "\w" are invalid escapes in a normal string literal)
    result = re.sub(r"\s+", " ", sent)
    result = re.sub("Subject:", "", result)
    #Replacing non-alphanumeric and non-space characters with nothing
    result = re.sub(r"[^\w\s]", "", result)

    tokens = word_tokenize(result.lower())
    stop_updated = _clean_txt_stopwords()
    # Punctuation was removed above, so stop words are matched on bare tokens.
    text = [term for term in tokens if term not in stop_updated and len(term) > 2]
    return " ".join(text)

In [70]:
df.iloc[4000]['text']

'Subject: prc review : list of key projects  hi dale & vince ,  for your benefit i have compiled a shortlist of the main projects worked on  over the past five months :  1 ) inflation curve modelling ( february and march )  2 ) uk power monthly vol curve generator  3 ) nordic power monthly vol curve generator  4 ) energydesk . com models & support  5 ) compound options for uk power desk ( options to build power stations )  6 ) continental power non - generic options ( using arbitrary trader - specified  distributions )  7 ) global products : non - generic options modelling and new commodity forward  curve construction ( benzene fwd curve from naphtha )  8 ) exotic options library upgrade / model test / bug fixes ( e . g . testing new / old  asian models )  9 ) continental gas volatility curve construction  the best summary for this is in the attached presentation that i gave to the  london and oslo staff recently .  regards ,  anjam  x 35383  presentation attached :'

In [71]:
# Store the cleaned version of every message next to the raw text.
df["ctext"] = df["text"].apply(clean_txt)

In [72]:
df

Unnamed: 0,text,spam,ctext
0,Subject: naturally irresistible your corporate...,1,naturally irresistible corporate identity real...
1,Subject: the stock trading gunslinger fanny i...,1,stock trading gunslinger fanny merrill muzo co...
2,Subject: unbelievable new homes made easy im ...,1,unbelievable new homes made easy wanting show ...
3,Subject: 4 color printing special request add...,1,color printing special request additional info...
4,"Subject: do not have money , get software cds ...",1,money get software cds software compatibility ...
...,...,...,...
5723,Subject: re : research and development charges...,0,research development charges gpg forwarded shi...
5724,"Subject: re : receipts from visit jim , than...",0,receipts visit jim thanks invitation visit lsu...
5725,Subject: re : enron case study update wow ! a...,0,enron case study update wow day super thank mu...
5726,"Subject: re : interest david , please , call...",0,interest david please call shirley crenshaw as...


In [73]:
df.head(10)

Unnamed: 0,text,spam,ctext
0,Subject: naturally irresistible your corporate...,1,naturally irresistible corporate identity real...
1,Subject: the stock trading gunslinger fanny i...,1,stock trading gunslinger fanny merrill muzo co...
2,Subject: unbelievable new homes made easy im ...,1,unbelievable new homes made easy wanting show ...
3,Subject: 4 color printing special request add...,1,color printing special request additional info...
4,"Subject: do not have money , get software cds ...",1,money get software cds software compatibility ...
5,"Subject: great nnews hello , welcome to medzo...",1,great nnews hello welcome medzonline groundsel...
6,Subject: here ' s a hot play in motion homela...,1,hot play motion homeland security investments ...
7,Subject: save your money buy getting this thin...,1,save money buy getting thing tried cialls yet ...
8,Subject: undeliverable : home based business f...,1,undeliverable home based business grownups mes...
9,Subject: save your money buy getting this thin...,1,save money buy getting thing tried cialls yet ...


In [74]:
# Separate the cleaned text (features) and the spam labels (target) into X and y
X_text, y = df["ctext"].values, df["spam"].values

In [75]:
X_text[4000]

'prc review list key projects dale vince benefit compiled shortlist main projects worked past five months inflation curve modelling february march power monthly vol curve generator nordic power monthly vol curve generator energydesk com models support compound options power desk options build power stations continental power non generic options using arbitrary trader specified distributions global products non generic options modelling new commodity forward curve construction benzene fwd curve naphtha exotic options library upgrade model test bug fixes testing new old asian models continental gas volatility curve construction best summary attached presentation gave london oslo staff recently regards anjam 35383 presentation attached'

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for evaluation; random_state fixes the shuffle so
# the split is reproducible. No stratify= argument is passed, so the class
# ratio of the two splits is left to chance.
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size = 0.20, random_state=244)
# Logistic regression with all-default settings; it is fitted in a later cell.
classifier = LogisticRegression()


In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Limit the TF-IDF vocabulary to at most 1000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
# Fit the vectorizer on the training messages only and transform them in one
# step; the test messages are later passed through transform() alone.
X_trainmat = tfidf_vectorizer.fit_transform(X_train)

In [78]:
# Fit directly on the sparse TF-IDF matrix: LogisticRegression accepts sparse
# input, so converting to a dense array only costs memory. This also matches
# the later cells, which pass sparse matrices to the same classifier.
classifier.fit(X_trainmat, y_train)

LogisticRegression()

In [79]:
X_testmat = tfidf_vectorizer.transform(X_test)
X_testmat

<1139x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 53773 stored elements in Compressed Sparse Row format>

In [80]:
# Predict test labels from the sparse matrix as-is; no dense copy is needed,
# and the later predict/predict_proba cells already pass sparse input.
y_pred = classifier.predict(X_testmat)

In [81]:
from sklearn.metrics import confusion_matrix, accuracy_score
accuracy_score(y_test,y_pred)

0.9929762949956101

In [82]:
confusion_matrix(y_test,y_pred)

array([[877,   3],
       [  5, 254]], dtype=int64)

In [83]:
y_predtrain = classifier.predict(X_trainmat)

In [84]:
accuracy_score(y_train,y_predtrain)

0.9901229148375769

In [85]:
np.round(classifier.predict_proba(X_testmat),2)

array([[0.98, 0.02],
       [0.94, 0.06],
       [0.98, 0.02],
       ...,
       [0.96, 0.04],
       [0.99, 0.01],
       [0.3 , 0.7 ]])

In [86]:
probabtest = np.round(classifier.predict_proba(X_testmat),2)

In [87]:
probdf = pd.DataFrame(probabtest)
probdf.loc[10]

0    1.0
1    0.0
Name: 10, dtype: float64

In [88]:
probdf[1].sort_values(ascending = False)

204    0.98
725    0.98
577    0.98
975    0.98
876    0.98
       ... 
623    0.00
387    0.00
624    0.00
383    0.00
478    0.00
Name: 1, Length: 1139, dtype: float64

In [89]:
# Re-label the test set with a custom cut-off instead of classifier.predict():
# a message is labelled 1 only when column 1 of predict_proba is at least 0.60.
# The boolean mask is cast to 0/1 integers so it can be compared with y_test.
# Per the confusion matrices in this notebook, this removes the false
# positives (3 -> 0) at the cost of more false negatives (5 -> 20).
y_pred_new_threshold = (classifier.predict_proba(X_testmat)[:,1]>=0.60).astype(int)

In [90]:
confusion_matrix(y_test,y_pred_new_threshold)

array([[880,   0],
       [ 20, 239]], dtype=int64)

In [91]:
np.array([0.4]).astype(int)

array([0])

In [92]:
np.array([0.8]).astype(int)

array([0])

In [93]:
accuracy_score(y_test,y_pred_new_threshold)

0.9824407374890255

In [94]:
probabtest = np.round(classifier.predict_proba(X_testmat),2)

In [95]:
probabtest

array([[0.98, 0.02],
       [0.94, 0.06],
       [0.98, 0.02],
       ...,
       [0.96, 0.04],
       [0.99, 0.01],
       [0.3 , 0.7 ]])

In [96]:
probdf = pd.DataFrame(probabtest)

In [97]:
probdf[1].sort_values(ascending = False)

204    0.98
725    0.98
577    0.98
975    0.98
876    0.98
       ... 
623    0.00
387    0.00
624    0.00
383    0.00
478    0.00
Name: 1, Length: 1139, dtype: float64