In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD

# Download the NLTK 'punkt' and 'stopwords' resources
# (presumably what word_tokenize and stopwords.words need later in the notebook - confirm).
nltk.download('punkt')
nltk.download('stopwords')
# Prepare Data
# The CSV path is relative to the working directory; later cells read its 'text' and 'target' columns.
data = pd.read_csv('Spam_Email_Data.csv')


# Function for Model Evaluation
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report
def model_evaluation(model, x_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    x_test : array-like
        Features passed unchanged to ``model.predict``.
    y_test : array-like
        True labels for ``x_test``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``accuracy``, ``precision``,
        ``recall`` and ``f1``. Precision, recall and F1 are weighted averages.
    """
    y_pred = model.predict(x_test)

    # Same averaging and zero-division policy for every per-class metric, so
    # F1 is handled like precision/recall when a class is never predicted.
    weighted = {'average': 'weighted', 'zero_division': 0}

    scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, **weighted),
        'recall': recall_score(y_test, y_pred, **weighted),
        'f1': f1_score(y_test, y_pred, **weighted),
    }
    # Make a DataFrame (one row, one column per metric)
    return pd.DataFrame([scores])

# def model_evaluation(model, x_test, y_test):
#     y_pred = model.predict(x_test)
#     return classification_report(y_test, y_pred,zero_division=0)


In [4]:
# Inspect the loaded frame as the cell's rich output.
data

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0
...,...,...
5791,From ilug-admin@linux.ie Mon Jul 22 18:12:45 2...,0
5792,From fork-admin@xent.com Mon Oct 7 20:37:02 20...,0
5793,Received: from hq.pro-ns.net (localhost [127.0...,1
5794,From razor-users-admin@lists.sourceforge.net T...,0


In [5]:
# Preprocessing function
import re
from nltk.stem import PorterStemmer
# Patterns are compiled once instead of being looked up on every call.
# Inside a character class '|' is a literal pipe, so the TLD class is written
# [A-Za-z] rather than [A-Z|a-z]; otherwise text following a pipe after an
# address is swallowed as part of the "address".
_EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_URL_PATTERN = re.compile(r'http\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Filled lazily on first use so defining this cell never touches NLTK data.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, stemmer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess(text):
    """Normalise one raw email into a space-joined string of stemmed tokens.

    Steps, in order: lower-case, strip e-mail addresses, strip ``http...``
    URLs, drop every character that is not an ASCII letter or whitespace,
    tokenize, remove English stop words, Porter-stem.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The remaining stemmed tokens joined by single spaces.
    """
    # Convert to lower case
    text = text.lower()

    # Remove email addresses
    text = _EMAIL_PATTERN.sub('', text)

    # Remove URLs
    text = _URL_PATTERN.sub('', text)

    # Remove non-alphanumeric characters and punctuation (digits are dropped too)
    text = _NON_LETTER_PATTERN.sub('', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Stop-word set and stemmer are shared across calls; rebuilding them for
    # every document re-reads the stop-word list each time.
    stop_words, stemmer = _get_preprocess_resources()

    # Remove stopwords, then stem what is left
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)



# Clean every message in the 'text' column; each element becomes the
# space-joined string of stemmed tokens returned by preprocess.
processed_data = data['text'].apply(preprocess)


In [6]:
# Inspect the cleaned text (one string per message).
processed_data

0       mon jul returnpath deliveredto receiv localhos...
1       mon jun returnpath deliveryd tue jun receiv ma...
2       mon jul returnpath deliveredto receiv localhos...
3       mon jun returnpath deliveryd mon jun receiv ma...
4       mon aug returnpath deliveredto receiv localhos...
                              ...                        
5791    mon jul returnpath deliveredto receiv localhos...
5792    mon oct returnpath deliveredto receiv localhos...
5793    receiv hqpronsnet localhost hqpronsnet esmtp i...
5794    thu sep returnpath deliveredto receiv localhos...
5795    mon sep returnpath deliveredto receiv localhos...
Name: text, Length: 5796, dtype: object

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the messages for testing (test_size=0.4); random_state is fixed for a repeatable split.
# Features are the cleaned strings, labels are the 'target' column.
x_train, x_test, y_train, y_test = train_test_split(processed_data, data['target'], test_size=0.4, random_state=50)

In [8]:
# Import Text embedding libraries
from gensim.models.doc2vec import Doc2Vec, TaggedDocument # Doc2Vec
from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
from gensim.models import Word2Vec # word2vec


In [11]:
def train_Doc2Vec(train_data):
    """Fit a default-configured Doc2Vec model on an iterable of text strings.

    Every text is split on whitespace and tagged with its position in
    ``train_data`` (as a string). The vocabulary is built from those tagged
    documents and the model is then trained for 15 epochs.

    Returns the trained Doc2Vec model.
    """
    tagged_corpus = []
    for position, text in enumerate(train_data):
        tagged_corpus.append(TaggedDocument(words=text.split(), tags=[str(position)]))

    model = Doc2Vec()
    model.build_vocab(tagged_corpus)
    model.train(tagged_corpus, total_examples=model.corpus_count, epochs=15)
    return model

def get_Doc2Vec(doc2vec_model, corpus):
    """Infer one vector per document in ``corpus``.

    Each document is split on whitespace and handed to
    ``doc2vec_model.infer_vector``; the results are returned as a list in
    corpus order.
    """
    return [doc2vec_model.infer_vector(document.split()) for document in corpus]


# Train Doc2Vec on the training split only, then embed both splits with the same model.
# Training documents are re-inferred through get_Doc2Vec (infer_vector), not read back from the trained model.
modelD2V = train_Doc2Vec(x_train)
x_train_vector_Doc2v = get_Doc2Vec(modelD2V, x_train)
x_test_vector_Doc2v = get_Doc2Vec(modelD2V, x_test)

In [None]:
# Show only the first few vectors; the full list holds one array per training document.
x_train_vector_Doc2v[:5]

In [13]:
# Text Embedding using TF-IDF
# The vectorizer is fitted on the training split only and reused for the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(x_train)
X_test_tfidf = vectorizer.transform(x_test)

# Apply TruncatedSVD
# random_state is fixed because the default randomized solver would otherwise
# produce different components, and different downstream scores, on every run.
svd = TruncatedSVD(n_components=50, random_state=50)  # Set n_components based on available memory
# From here on X_train_tfidf / X_test_tfidf hold the 50-component projections.
X_train_tfidf = svd.fit_transform(X_train_tfidf)
X_test_tfidf = svd.transform(X_test_tfidf)

In [14]:
# Text Embedding using Bag of Words
from sklearn.preprocessing import StandardScaler
# NOTE: this rebinds `vectorizer`, which previously referred to the TF-IDF vectorizer.
vectorizer = CountVectorizer(max_features=1000)  # Limiting the number of features
x_train_bow = vectorizer.fit_transform(x_train)
x_test_bow = vectorizer.transform(x_test)

# Scale the data
# The scaler is fitted on the training matrix only; both matrices are converted with .toarray() first.
# NOTE(review): x_train_scaled / x_test_scaled are not referenced by the training or
# evaluation cells visible in this notebook, which use x_train_bow / x_test_bow -
# confirm whether the scaled arrays were meant to be used.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_bow.toarray())
x_test_scaled = scaler.transform(x_test_bow.toarray())

In [15]:
# Text Embedding using Word2Vec
def get_embedding_Word2Vec(data):
    """Train a Word2Vec model on an iterable of text strings.

    Each text is lower-cased and tokenized with ``word_tokenize``; the model
    uses 300-dimensional vectors, a window of 5, ``min_count=1`` and 4 workers.

    Returns the trained Word2Vec model.
    """
    sentences = []
    for document in data:
        sentences.append(word_tokenize(document.lower()))
    return Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=1, workers=4)

# Word2Vec is trained on the training split only.
model_w2v = get_embedding_Word2Vec(x_train)

def get_average_word2vec(words, model, size):
    """Average the vectors of the tokens in ``words`` that the model knows.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    model : object
        Model exposing ``wv.key_to_index`` and ``wv[token]`` lookups.
    size : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Mean of the matched token vectors; a float32 zero vector of length
        ``size`` when no token is in the vocabulary.
    """
    running_sum = np.zeros((size,), dtype='float32')
    matched = 0
    for token in words:
        # Out-of-vocabulary tokens contribute nothing to the sum or the count.
        if token not in model.wv.key_to_index:
            continue
        running_sum = running_sum + model.wv[token]
        matched += 1

    if matched == 0:
        return running_sum
    return running_sum / matched

# One averaged vector per document for each split.
# The size argument (300) must match vector_size in get_embedding_Word2Vec.
# The texts were already lower-cased by preprocess, so .lower() only mirrors the tokenization used for training.
X_train_w2v = [get_average_word2vec(word_tokenize(text.lower()), model_w2v, 300) for text in x_train]
X_test_w2v = [get_average_word2vec(word_tokenize(text.lower()), model_w2v, 300) for text in x_test]

In [16]:
# Import and initialize Logistic Regression classifiers
from sklearn.linear_model import LogisticRegression
# max_iter is raised above the library default so the solver has room to
# converge on the higher-dimensional embeddings.
lr_classifier = LogisticRegression(C=1.0, max_iter=1000)

# Import and initialize Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
# random_state is fixed so the tree, and therefore its reported scores, is repeatable.
dt_classifier = DecisionTreeClassifier(random_state=50)

In [17]:
# Model training using Logistic Regression Classifier and Doc2Vec Embedding
# lr_classifier is the single shared instance; later cells call fit on it again with other embeddings.
lr_classifier.fit(x_train_vector_Doc2v, y_train)

In [18]:
# Score on the held-out Doc2Vec vectors. Run this right after the matching fit cell,
# because lr_classifier is fitted again on other embeddings further down.
print("Evaluation of Logistic Regression with Doc2Vec:")
print(model_evaluation(lr_classifier, x_test_vector_Doc2v, y_test))

Evaluation of Logistic Regression with Doc2Vec:
   accuracy  precision    recall        f1
0  0.962484   0.962375  0.962484  0.962339


In [19]:
# Model training using Logistic Regression Classifier and TF-IDF Embedding
# X_train_tfidf holds the 50-component TruncatedSVD projection of the TF-IDF matrix.
lr_classifier.fit(X_train_tfidf, y_train)

In [20]:
# Score on the SVD-reduced TF-IDF test features; model_evaluation returns a one-row frame of weighted metrics.
print("Evaluation of Logistic Regression with TF-IDF:")
print(model_evaluation(lr_classifier, X_test_tfidf, y_test))

Evaluation of Logistic Regression with TF-IDF:
   accuracy  precision    recall        f1
0  0.971971   0.972599  0.971971  0.971683


In [None]:
# Model training using Logistic Regression Classifier and Bag of Words Embedding
# Fits on the raw count matrix x_train_bow, not on the standardized x_train_scaled.
lr_classifier.fit(x_train_bow, y_train)

In [22]:
# Score on the raw count matrix x_test_bow, matching the features used for fitting.
print("Evaluation of Logistic Regression with Bag of Words:")
print(model_evaluation(lr_classifier, x_test_bow, y_test))

Evaluation of Logistic Regression with Bag of Words:
   accuracy  precision    recall        f1
0  0.995257   0.995255  0.995257  0.995256


In [23]:
# Model training using Logistic Regression Classifier and Word2Vec Embedding
# X_train_w2v is a list with one averaged 300-dimensional vector per training document.
lr_classifier.fit(X_train_w2v, y_train)

In [24]:
# Score on the averaged Word2Vec test vectors.
print("Evaluation of Logistic Regression with Word2Vec:")
print(model_evaluation(lr_classifier, X_test_w2v, y_test))

Evaluation of Logistic Regression with Word2Vec:
   accuracy  precision    recall       f1
0  0.983614   0.983593  0.983614  0.98359


In [25]:
# Model training using Decision Tree Classifier and Doc2Vec Embedding
# dt_classifier is the single shared instance; later cells call fit on it again with other embeddings.
dt_classifier.fit(x_train_vector_Doc2v, y_train)

In [26]:
# Score on the held-out Doc2Vec vectors. Run this right after the matching fit cell,
# because dt_classifier is fitted again on other embeddings further down.
print("Evaluation of Decision Tree with Doc2Vec:")
print(model_evaluation(dt_classifier, x_test_vector_Doc2v, y_test))

Evaluation of Decision Tree with Doc2Vec:
   accuracy  precision    recall        f1
0  0.845623   0.847644  0.845623  0.846461


In [27]:
# Model training using Decision Tree Classifier and TF-IDF Embedding
# X_train_tfidf holds the 50-component TruncatedSVD projection of the TF-IDF matrix.
dt_classifier.fit(X_train_tfidf, y_train)

In [28]:
# Score on the SVD-reduced TF-IDF test features.
print("Evaluation of Decision Tree with TF-IDF:")
print(model_evaluation(dt_classifier, X_test_tfidf, y_test))

Evaluation of Decision Tree with TF-IDF:
   accuracy  precision    recall        f1
0  0.963346   0.963976  0.963346  0.963516


In [29]:
# Model training using Decision Tree Classifier and bag of words Embedding
# Fits on the raw count matrix x_train_bow, not on the standardized x_train_scaled.
dt_classifier.fit(x_train_bow, y_train)

In [30]:
# Score on the raw count matrix x_test_bow, matching the features used for fitting.
print("Evaluation of Decision Tree with Bag of Words:")
print(model_evaluation(dt_classifier, x_test_bow, y_test))

Evaluation of Decision Tree with Bag of Words:
   accuracy  precision    recall        f1
0  0.974127   0.974127  0.974127  0.974127


In [31]:
# Model training using Decision Tree Classifier and Word2Vec Embedding
# X_train_w2v is a list with one averaged 300-dimensional vector per training document.
dt_classifier.fit(X_train_w2v, y_train)

In [32]:
# Score on the averaged Word2Vec test vectors.
print("Evaluation of Decision Tree with Word2Vec:")
print(model_evaluation(dt_classifier, X_test_w2v, y_test))

Evaluation of Decision Tree with Word2Vec:
   accuracy  precision    recall        f1
0  0.963777   0.964034  0.963777  0.963866
