In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
email_data = pd.read_csv('emails.csv')
email_data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
email_data.isnull().sum()

text    0
spam    0
dtype: int64

In [7]:
port_stem = PorterStemmer()

In [8]:
def stemming(content):
    stemmingcontent = re.sub('[^a-zA-Z]',' ',content)
    stemmingcontent = stemmingcontent.lower()
    stemmingcontent = stemmingcontent.split()
    stemmingcontent = [port_stem.stem(word) for word in stemmingcontent if not word in stopwords.words('english')] 
    stemmingcontent = ' '.join(stemmingcontent)
    return stemmingcontent

In [9]:
email_data['text'] = email_data['text'].apply(stemming)

In [10]:
email_data.shape

(5728, 2)

In [11]:
X = email_data['text']
X.head()

0    subject natur irresist corpor ident lt realli ...
1    subject stock trade gunsling fanni merril muzo...
2    subject unbeliev new home made easi im want sh...
3    subject color print special request addit info...
4    subject money get softwar cd softwar compat gr...
Name: text, dtype: object

In [12]:
y = email_data['spam']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: spam, dtype: int64

In [13]:
vectorizer = TfidfVectorizer()

In [14]:
vectorizer.fit(X)
X = vectorizer.transform(X)

In [15]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 474998 stored elements and shape (5728, 25607)>

In [16]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 474998 stored elements and shape (5728, 25607)>
  Coords	Values
  (0, 374)	0.09615698001084273
  (0, 479)	0.09769480016493992
  (0, 812)	0.0766230909849381
  (0, 1523)	0.1409592834788327
  (0, 1965)	0.14993362938864338
  (0, 2063)	0.07670590981724348
  (0, 2673)	0.0854008107589886
  (0, 2870)	0.09087637865049199
  (0, 2966)	0.09603017481729875
  (0, 3362)	0.11537073712751421
  (0, 3594)	0.11423709103924454
  (0, 3861)	0.13180005473093287
  (0, 3997)	0.08324405029671961
  (0, 4186)	0.09280622286300132
  (0, 4305)	0.15915249094845213
  (0, 4583)	0.15425168215833238
  (0, 4630)	0.07534751360942472
  (0, 4726)	0.06899847622359105
  (0, 4893)	0.08219927447975249
  (0, 5353)	0.047324745219530444
  (0, 6052)	0.1065007121544151
  (0, 6208)	0.07028575473363927
  (0, 6308)	0.08153511539170166
  (0, 6574)	0.08058284319512465
  (0, 6575)	0.09342814342665173
  :	:
  (5727, 21253)	0.05830126312717506
  (5727, 21293)	0.03587191084923955
  

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state= 1)


In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
linreg = LogisticRegression()
linreg.fit(X_train, y_train)

In [20]:
y_pred = linreg.predict(X_train)

In [21]:
y.value_counts()

spam
0    4360
1    1368
Name: count, dtype: int64

In [22]:
accuracy = accuracy_score(y_pred, y_train)
accuracy

0.9967022308438409

In [23]:
y_pred_test = linreg.predict(X_test)

In [24]:
accuracy_test = accuracy_score(y_pred_test, y_test)

In [25]:
accuracy_test

0.9825479930191972

In [41]:
new_content = 'Why do universities teach C/C++?Top stories for ChannappaQuorWhy do universities teach C/C++?Joshua Gross, Associate Professor of Computer Science at CSUMB • Updated September 28There’s an old joke. Two fish are swimming along, and another, older fish swims by and says, “Water’s nice today, huh?” A moment later, one of the two fish turns to the othe... Read more »UpvoteComment	172I smoked cigarettes irregularly for 2 years like someday smoking 5-6 and someday not at all. Now I have quit. What are the chances of me developing any of the smoking related complications?Michael J. McFadden, Author of "Dissecting Antismokers Brainsand TobakkoNacht -- The Antismoking ... • Answered December 28, 2018According to the well-designed UK Million Women Study, if youd started smoking in your teens and smoked a half pack to a pack a day until you were 30 and then QUIT at 30, ... Read more »'

In [42]:
content=pd.DataFrame([[new_content]])

In [44]:
content[0].apply(stemming)

0    univers teach c c top stori channappaquorwhi u...
Name: 0, dtype: object

In [46]:
content = [str(doc) for doc in content]


In [49]:
content_vector = vectorizer.transform(content)
content_vector

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 0 stored elements and shape (1, 25607)>

In [48]:
linreg.predict(content_vector)

array([0])