In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
# Data Collection and Pre Processing
# 1 Load Data from csv file to a pandas datafram
raw_mail_data = pd.read_csv('./mail_data.csv')
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Replace the null values with a null string
mail_data = raw_mail_data.where(pd.notnull(raw_mail_data),'')

In [8]:
# Printing first 5 row of mail data
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Checking the number of rows and columns
mail_data.shape

(5572, 2)

In [10]:
#  Label Encoding
mail_data.loc[mail_data['Category'] == 'spam', 'Category'] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category'] = 1


In [11]:
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Seperating the text as texts and label
X = mail_data['Message']
Y = mail_data['Category']

In [13]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [14]:
Y.head()

0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: object

In [15]:
X_Train,X_test,Y_Train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=3)

In [16]:
X.shape

(5572,)

In [17]:
X_Train.shape

(4457,)

In [18]:
Y_test.shape

(1115,)

In [25]:
# Feature Extraction 
# Transform text data to feature vectors that can be used as input to the logistic regression
feature_extraction = TfidfVectorizer(min_df=1,stop_words='english')

In [26]:
X_train_feature = feature_extraction.fit_transform(X_Train)
X_test_feature = feature_extraction.transform(X_test)

#  Convert Y_train and T_test as Integers

Y_Train = Y_Train.astype('int')
Y_test = Y_test.astype('int')


In [27]:
print(X_train_feature)

  (0, 2329)	0.38783870336935383
  (0, 3811)	0.34780165336891333
  (0, 2224)	0.413103377943378
  (0, 4456)	0.4168658090846482
  (0, 5413)	0.6198254967574347
  (1, 3811)	0.17419952275504033
  (1, 3046)	0.2503712792613518
  (1, 1991)	0.33036995955537024
  (1, 2956)	0.33036995955537024
  (1, 2758)	0.3226407885943799
  (1, 1839)	0.2784903590561455
  (1, 918)	0.22871581159877646
  (1, 2746)	0.3398297002864083
  (1, 2957)	0.3398297002864083
  (1, 3325)	0.31610586766078863
  (1, 3185)	0.29694482957694585
  (1, 4080)	0.18880584110891163
  (2, 6601)	0.6056811524587518
  (2, 2404)	0.45287711070606745
  (2, 3156)	0.4107239318312698
  (2, 407)	0.509272536051008
  (3, 7414)	0.8100020912469564
  (3, 2870)	0.5864269879324768
  (4, 2870)	0.41872147309323743
  (4, 487)	0.2899118421746198
  :	:
  (4454, 2855)	0.47210665083641806
  (4454, 2246)	0.47210665083641806
  (4455, 4456)	0.24920025316220423
  (4455, 3922)	0.31287563163368587
  (4455, 6916)	0.19636985317119715
  (4455, 4715)	0.30714144758811196
  (

In [28]:
X_Train

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object

In [29]:
# Training the Model
# Logistic Regression
model = LogisticRegression()

In [30]:
Y_Train

3075    1
1787    1
1614    1
4304    1
3266    0
       ..
789     0
968     1
1667    1
3321    1
1688    0
Name: Category, Length: 4457, dtype: int64

In [31]:
model.fit(X_train_feature,Y_Train)

In [32]:
# Evaluating the Trained Model
# Predition on Training Model
prediction_on_Training_Data = model.predict(X_train_feature)
accuracy_on_training_data = accuracy_score(Y_Train,prediction_on_Training_Data)

In [33]:
print("Accuracy for Training : ",accuracy_on_training_data * 100)

Accuracy for Training :  96.76912721561588


In [34]:
# Predict on Test Data
prediction_on_Test_Data = model.predict(X_test_feature)
accuracy_on_test_data = accuracy_score(Y_test,prediction_on_Test_Data)

In [35]:
print("Accuracy for Training : ",accuracy_on_test_data * 100)


Accuracy for Training :  96.68161434977578


In [37]:
#  Building a Predictable System
input_mail = ["GitHub Galaxy Virtual EMEA is starting in 5 minutes! It's time to enter the event using the button below, grab a snack and get ready to learn something new. See you soon!"]

# Convert Text to feature vectors
input_data_feature = feature_extraction.transform(input_mail)

# Making Prediction
prediction = model.predict(input_data_feature)

print(prediction)

if(prediction == [1]):
    print("This is the Ham Mail.")
else:
    print("This is the Spam Mail.")


[1]
This is the Ham Mail.


In [42]:
# Email Spam Detection Project

import pandas as pd

# Load dataset
emails = pd.read_csv('./mail_data.csv')
emails.head()


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [43]:
import nltk
from nltk.corpus import stopwords
import string

In [44]:
# Download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/muhammad-
[nltk_data]     rizwan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [49]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower()  
    text = "".join([char for char in text if char not in string.punctuation])  
    words = text.split()
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in words if word not in stop_words]) 
    return text


In [50]:
# Apply preprocessing
emails['text'] = emails['Message'].apply(preprocess_text)
emails.head()

Unnamed: 0,Category,Message,text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(emails['text'])
y = emails['Category']


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import time

In [54]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [55]:
# Function to evaluate model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    return accuracy, training_time, conf_matrix, class_report


In [56]:
# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train.toarray(), y_train)
nb_accuracy, nb_training_time, nb_conf_matrix, nb_class_report = evaluate_model(nb_model, X_train.toarray(), y_train, X_test.toarray(), y_test)


In [69]:
print('accuracyy\n ', nb_accuracy)
print('training_time\n',nb_training_time)
print('conf_matrix\n',nb_conf_matrix)
print('class_report\n' , nb_class_report )

accuracyy
  0.8989234449760766
training_time
 0.7139842510223389
conf_matrix
 [[1300  148]
 [  21  203]]
class_report
               precision    recall  f1-score   support

         ham       0.98      0.90      0.94      1448
        spam       0.58      0.91      0.71       224

    accuracy                           0.90      1672
   macro avg       0.78      0.90      0.82      1672
weighted avg       0.93      0.90      0.91      1672



In [70]:
# Multinomial Naive Bayes
mnb_model = MultinomialNB()
mnb_accuracy, mnb_training_time, mnb_conf_matrix, mnb_class_report = evaluate_model(mnb_model, X_train, y_train, X_test, y_test)


In [71]:
print('accuracyy\n ', mnb_accuracy)
print('training_time\n',mnb_training_time)
print('conf_matrix\n',mnb_conf_matrix)
print('class_report\n' , mnb_class_report )

accuracyy
  0.9641148325358851
training_time
 0.05435490608215332
conf_matrix
 [[1448    0]
 [  60  164]]
class_report
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1448
        spam       1.00      0.73      0.85       224

    accuracy                           0.96      1672
   macro avg       0.98      0.87      0.91      1672
weighted avg       0.97      0.96      0.96      1672



In [72]:
# Decision Tree (J48 equivalent)
dt_model = DecisionTreeClassifier()
dt_accuracy, dt_training_time, dt_conf_matrix, dt_class_report = evaluate_model(dt_model, X_train, y_train, X_test, y_test)

In [73]:
print('accuracyy\n ', dt_accuracy)
print('training_time\n',dt_training_time)
print('conf_matrix\n',dt_conf_matrix)
print('class_report\n' , dt_class_report )

accuracyy
  0.9599282296650717
training_time
 0.7069339752197266
conf_matrix
 [[1420   28]
 [  39  185]]
class_report
               precision    recall  f1-score   support

         ham       0.97      0.98      0.98      1448
        spam       0.87      0.83      0.85       224

    accuracy                           0.96      1672
   macro avg       0.92      0.90      0.91      1672
weighted avg       0.96      0.96      0.96      1672



In [74]:
# Print Confusion Matrices
print("Naive Bayes Confusion Matrix:\n", nb_conf_matrix)
print("Multinomial Naive Bayes Confusion Matrix:\n", mnb_conf_matrix)
print("Decision Tree Confusion Matrix:\n", dt_conf_matrix)


Naive Bayes Confusion Matrix:
 [[1300  148]
 [  21  203]]
Multinomial Naive Bayes Confusion Matrix:
 [[1448    0]
 [  60  164]]
Decision Tree Confusion Matrix:
 [[1420   28]
 [  39  185]]


In [75]:
# Print Accuracy and Training Time
print("Naive Bayes Accuracy:", nb_accuracy)
print("Naive Bayes Training Time:", nb_training_time)
print("Naive Bayes Classification Report:\n", nb_class_report)

print("\nMultinomial Naive Bayes Accuracy:", mnb_accuracy)
print("Multinomial Naive Bayes Training Time:", mnb_training_time)
print("Multinomial Naive Bayes Classification Report:\n", mnb_class_report)

print("\nDecision Tree Accuracy:", dt_accuracy)
print("Decision Tree Training Time:", dt_training_time)
print("Decision Tree Classification Report:\n", dt_class_report)


Naive Bayes Accuracy: 0.8989234449760766
Naive Bayes Training Time: 0.7139842510223389
Naive Bayes Classification Report:
               precision    recall  f1-score   support

         ham       0.98      0.90      0.94      1448
        spam       0.58      0.91      0.71       224

    accuracy                           0.90      1672
   macro avg       0.78      0.90      0.82      1672
weighted avg       0.93      0.90      0.91      1672


Multinomial Naive Bayes Accuracy: 0.9641148325358851
Multinomial Naive Bayes Training Time: 0.05435490608215332
Multinomial Naive Bayes Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1448
        spam       1.00      0.73      0.85       224

    accuracy                           0.96      1672
   macro avg       0.98      0.87      0.91      1672
weighted avg       0.97      0.96      0.96      1672


Decision Tree Accuracy: 0.9599282296650717
Decision Tree Trai

In [77]:
import nbformat as nbf

# Define the content of the notebook
nb = nbf.v4.new_notebook()
nb.cells = [
    nbf.v4.new_markdown_cell("# Email Spam Detection Project"),
    nbf.v4.new_markdown_cell("## Step 1: Introduction and Abstract"),
    nbf.v4.new_markdown_cell("""
Email becomes a powerful tool for communication as it saves a lot of time and cost. It is one of the most popular and secure medium for online transferring and communication messages or data through the web. But, due to the social networks, most of the emails contain unwanted information which is called spam. To identify such spam email is one of the important challenges.

In this project we will use PYTHON text classification technique to identify or classify email spam message. We will find accuracy, time and error rate by applying suitable algorithms (such as NaiveBayes, NaiveBayesMultinomial and J48 etc.) on Email Dataset and we will also compare which algorithm is best for text classification.
"""),
    nbf.v4.new_markdown_cell("## Step 2: Data Collection"),
    nbf.v4.new_code_cell("""
import pandas as pd

# Load dataset
emails = pd.read_csv('/mnt/data/mail_data.csv')
emails.head()
"""),
    nbf.v4.new_markdown_cell("## Step 3: Pre-processing"),
    nbf.v4.new_code_cell("""
import nltk
from nltk.corpus import stopwords
import string

# Download stopwords
nltk.download('stopwords')

# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = "".join([char for char in text if char not in string.punctuation])  # Remove punctuation
    words = text.split()
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in words if word not in stop_words])  # Remove stopwords
    return text

# Apply preprocessing
emails['text'] = emails['text'].apply(preprocess_text)
emails.head()
"""),
    nbf.v4.new_markdown_cell("## Step 4: Feature Selection"),
    nbf.v4.new_code_cell("""
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(emails['text'])
y = emails['label']
"""),
    nbf.v4.new_markdown_cell("## Step 5: Apply Spam Filter Algorithms"),
    nbf.v4.new_code_cell("""
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import time

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Function to evaluate model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    return accuracy, training_time, conf_matrix, class_report

# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train.toarray(), y_train)
nb_accuracy, nb_training_time, nb_conf_matrix, nb_class_report = evaluate_model(nb_model, X_train.toarray(), y_train, X_test.toarray(), y_test)

# Multinomial Naive Bayes
mnb_model = MultinomialNB()
mnb_accuracy, mnb_training_time, mnb_conf_matrix, mnb_class_report = evaluate_model(mnb_model, X_train, y_train, X_test, y_test)

# Decision Tree (J48 equivalent)
dt_model = DecisionTreeClassifier()
dt_accuracy, dt_training_time, dt_conf_matrix, dt_class_report = evaluate_model(dt_model, X_train, y_train, X_test, y_test)
"""),
    nbf.v4.new_markdown_cell("## Step 6: Confusion Matrix"),
    nbf.v4.new_code_cell("""
# Print Confusion Matrices
print("Naive Bayes Confusion Matrix:\\n", nb_conf_matrix)
print("Multinomial Naive Bayes Confusion Matrix:\\n", mnb_conf_matrix)
print("Decision Tree Confusion Matrix:\\n", dt_conf_matrix)
"""),
    nbf.v4.new_markdown_cell("## Step 7: Accuracy and Comparison"),
    nbf.v4.new_code_cell("""
# Print Accuracy and Training Time
print("Naive Bayes Accuracy:", nb_accuracy)
print("Naive Bayes Training Time:", nb_training_time)
print("Naive Bayes Classification Report:\\n", nb_class_report)

print("\\nMultinomial Naive Bayes Accuracy:", mnb_accuracy)
print("Multinomial Naive Bayes Training Time:", mnb_training_time)
print("Multinomial Naive Bayes Classification Report:\\n", mnb_class_report)

print("\\nDecision Tree Accuracy:", dt_accuracy)
print("Decision Tree Training Time:", dt_training_time)
print("Decision Tree Classification Report:\\n", dt_class_report)
""")
]

# Save the notebook
path = "./Email_Spam_Detection_Project_Step_by_Step.ipynb"
with open(path, 'w') as f:
    nbf.write(nb, f)

path


'./Email_Spam_Detection_Project_Step_by_Step.ipynb'