In [11]:
import pandas as pd
import re
import nltk
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [12]:
df = pd.read_csv("Spam_Email_Data.csv")

In [13]:
lemmatizer = WordNetLemmatizer()
tfidf_vectorizer = TfidfVectorizer()
ngram_vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=85000)
knn = KNeighborsClassifier(n_neighbors=5) 
logistic_regression = LogisticRegression(random_state=42, max_iter=1000)
decisionTree = DecisionTreeClassifier(random_state=42)

In [14]:
results_df = pd.DataFrame(columns=['label', 'accuracy'])

In [15]:
def clean_email_content(email_body):
    # Remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(email_body, 'html.parser')
    cleaned_text = soup.get_text()

    # Remove special characters and punctuation using regex
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', cleaned_text)

    # Convert text to lowercase
    cleaned_text = cleaned_text.lower()

    # Tokenize the text
    tokens = nltk.word_tokenize(cleaned_text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Return the cleaned tokens
    return lemmatized_tokens

In [16]:
def apply_knn(X, y, label, results_df):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
    # Train the classifier using the training data
    knn.fit(X_train, y_train)

    # Predict the labels for the testing data
    y_pred = knn.predict(X_test)

    # Evaluate the classifier's performance
    accuracy = accuracy_score(y_test, y_pred)
    classification_report_text = classification_report(y_test, y_pred)

    # Print the evaluation results
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report_text)

    # Create a dictionary with the results and label
    result = {
        'label': label,
        'accuracy': accuracy
    }
    
    # Convert the result dictionary to a DataFrame
    result_df = pd.DataFrame([result])
    
    # Use pd.concat to concatenate the result DataFrame with the results DataFrame
    results_df = pd.concat([results_df, result_df], ignore_index=True)
    
    # Return the updated DataFrame
    return results_df

In [17]:
def apply_logistic_regression(X, y, label, results_df):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
    # Train the classifier using the training data
    logistic_regression.fit(X_train, y_train)

    # Predict the labels for the testing data
    y_pred = logistic_regression.predict(X_test)

    # Evaluate the classifier's performance
    accuracy = accuracy_score(y_test, y_pred)
    classification_report_text = classification_report(y_test, y_pred)

    # Print the evaluation results
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report_text)

    # Create a dictionary with the results and label
    result = {
        'label': label,
        'accuracy': accuracy
    }
    
    # Convert the result dictionary to a DataFrame
    result_df = pd.DataFrame([result])
    
    # Use pd.concat to concatenate the result DataFrame with the results DataFrame
    results_df = pd.concat([results_df, result_df], ignore_index=True)
    
    # Return the updated DataFrame
    return results_df

In [18]:
def apply_decisionTree(X, y, label, results_df):
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    # Train the classifier using the training data
    decisionTree.fit(X_train, y_train)

    # Predict the labels for the testing data
    y_pred = decisionTree.predict(X_test)

    # Evaluate the classifier's performance
    accuracy = accuracy_score(y_test, y_pred)
    classification_report_text = classification_report(y_test, y_pred)

    # Print the evaluation results
    print(f"Label: {label}")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report_text)
    
    # Create a dictionary with the results and label
    result = {
        'label': label,
        'accuracy': accuracy
    }
    
    # Convert the result dictionary to a DataFrame
    result_df = pd.DataFrame([result])
    
    # Use pd.concat to concatenate the result DataFrame with the results DataFrame
    results_df = pd.concat([results_df, result_df], ignore_index=True)
    
    # Return the updated DataFrame
    return results_df

In [19]:
# Apply the cleaning function to each entry in the 'text' column
df['cleaned_text'] = df['text'].apply(clean_email_content)
# drop old text column
df = df.drop('text', axis=1)
# Display the first few rows of the DataFrame with the new 'cleaned_text' column
df.head()

Unnamed: 0,target,cleaned_text
0,0,"[ilugadminlinuxie, mon, jul, returnpath, deliv..."
1,1,"[gortexcitecom, mon, jun, returnpath, gortexci..."
2,1,"[forkadminxentcom, mon, jul, returnpath, deliv..."
3,1,"[dcmbtamailnetcn, mon, jun, returnpath, dcmbta..."
4,0,"[ilugadminlinuxie, mon, aug, returnpath, deliv..."


In [20]:
# term-frequency inverse document frequency technique

# Join the tokens back into single strings
df['cleaned_text'] = df['cleaned_text'].apply(lambda tokens: ' '.join(tokens))

# Fit and transform the cleaned_text column using TfidfVectorizer (TF-IDF)
tfidf_features = tfidf_vectorizer.fit_transform(df['cleaned_text'])
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print('TFIDF with KNN classifier')
results_df = apply_knn(tfidf_df, df['target'], 'tfidfKNN', results_df)
print('TFIDF with Logistic Regression classifier')
results_df = apply_logistic_regression(tfidf_df, df['target'], 'tfidfLogisticRegression', results_df)

TFIDF with KNN classifier
Accuracy: 0.9593
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1932
           1       0.99      0.89      0.94       966

    accuracy                           0.96      2898
   macro avg       0.97      0.94      0.95      2898
weighted avg       0.96      0.96      0.96      2898



  results_df = pd.concat([results_df, result_df], ignore_index=True)


TFIDF with Logistic Regression classifier
Accuracy: 0.9790
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1932
           1       1.00      0.94      0.97       966

    accuracy                           0.98      2898
   macro avg       0.98      0.97      0.98      2898
weighted avg       0.98      0.98      0.98      2898



In [79]:
# N-gram technique

# Fit and transform the cleaned_text column using the n-gram vectorizer
ngram_features = ngram_vectorizer.fit_transform(df['cleaned_text'])

# Convert the feature matrix to a pandas DataFrame
ngram_df = pd.DataFrame(ngram_features.toarray(), columns=ngram_vectorizer.get_feature_names_out())

print('N-gram with KNN classifier')
results_df = apply_knn(ngram_df, df['target'], 'ngramKNN', results_df)
print('N-gram with Logistic Regression classifier')
results_df = apply_logistic_regression(ngram_df, df['target'], 'ngramLogisticRegression', results_df)

N-gram with KNN classifier
Accuracy: 0.9489
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1932
           1       0.96      0.88      0.92       966

    accuracy                           0.95      2898
   macro avg       0.95      0.93      0.94      2898
weighted avg       0.95      0.95      0.95      2898

N-gram with Logistic Regression classifier
Accuracy: 0.9938
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1932
           1       1.00      0.98      0.99       966

    accuracy                           0.99      2898
   macro avg       0.99      0.99      0.99      2898
weighted avg       0.99      0.99      0.99      2898



In [80]:
del ngram_df
del tfidf_df

In [81]:
# Train your own Word2Vec model using your text data
# Example: Word2Vec model trained on your 'cleaned_text' data
word2vec_model = Word2Vec(df['cleaned_text'], vector_size=100, window=5, min_count=1, sg=1)

# Convert text data into Word2Vec embeddings by averaging the word vectors for each document
def get_word2vec_embedding(text):
    vectors = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        # Return a zero vector if no words in the text are in the Word2Vec model vocabulary
        return np.zeros(word2vec_model.vector_size)

# Apply Word2Vec embedding extraction to the 'cleaned_text' column
X = np.vstack(df['cleaned_text'].apply(get_word2vec_embedding))

print('W2V with KNN classifier')
results_df = apply_knn(X, df['target'], 'W2VKNN', results_df)
print('W2V with Decision Tree classifier')
results_df = apply_decisionTree(X, df['target'], 'W2VDecisionTree', results_df)

W2V with KNN classifier
Accuracy: 0.9089
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      1932
           1       0.93      0.79      0.85       966

    accuracy                           0.91      2898
   macro avg       0.91      0.88      0.89      2898
weighted avg       0.91      0.91      0.91      2898

W2V with Decision Tree classifier
Label: W2VDecisionTree
Accuracy: 0.8544
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      1932
           1       0.78      0.78      0.78       966

    accuracy                           0.85      2898
   macro avg       0.84      0.84      0.84      2898
weighted avg       0.85      0.85      0.85      2898



In [83]:
# Sample 5% of the DataFrame
df_sample = df.sample(frac=0.05, random_state=42)


In [84]:
# Bert technique for word embedding
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text in the sampled DataFrame
inputs = tokenizer(list(df_sample['cleaned_text']), padding=True, truncation=True, return_tensors='pt')


In [85]:
from transformers import BertModel
import torch

# Load BERT model
model = BertModel.from_pretrained('bert-base-uncased')

# Obtain embeddings using the BERT model
with torch.no_grad():
    outputs = model(**inputs)

# Use the pooled output (sentence embeddings)
sentence_embeddings = outputs.pooler_output


In [88]:
print('Bert with Logistic Regression classifier')
results_df = apply_logistic_regression(sentence_embeddings, df_sample['target'], 'bertLogisticRegression', results_df)
print('Bert with Decision Tree classifier')
results_df = apply_decisionTree(sentence_embeddings, df_sample['target'], 'bertDecisionTree', results_df)

Bert with Logistic Regression classifier
Accuracy: 0.9103
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.93        93
           1       0.93      0.81      0.87        52

    accuracy                           0.91       145
   macro avg       0.92      0.89      0.90       145
weighted avg       0.91      0.91      0.91       145

Bert with Decision Tree classifier
Label: bertDecisionTree
Accuracy: 0.7379
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.83      0.80        93
           1       0.65      0.58      0.61        52

    accuracy                           0.74       145
   macro avg       0.71      0.70      0.71       145
weighted avg       0.73      0.74      0.73       145



In [89]:
results_df

Unnamed: 0,label,accuracy
0,tfidfKNN,0.959282
1,tfidfLogisticRegression,0.978951
2,ngramKNN,0.94893
3,ngramLogisticRegression,0.993789
4,W2VKNN,0.908903
5,W2VDecisionTree,0.854382
6,bertLogisticRegression,0.910345
7,bertDecisionTree,0.737931
