In [None]:
%pip install nltk
%pip install lxml
%pip install -U scikit-learn


In [8]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import email
from bs4 import BeautifulSoup

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv('email_origin.csv')

def read_email_from_string(s):
    message = email.message_from_string(s)
    return message

def extract_email_body(message):
    if message.is_multipart():
        for part in message.walk():
            type_content = part.get_content_maintype()
            if type_content == 'text':
                message = part
                break
        else:
            return ''
    body = message.get_payload(decode=False)
    return body

def remove_html(s):
    soup = BeautifulSoup(s, 'lxml')
    for sp in soup(['script', 'style', 'head', 'meta', 'noscript']):
        sp.decompose()
    s = ' '.join(soup.stripped_strings)
    return s

# Function to preprocess text
def preprocess_text(text):
    # Extract email body
    body = extract_email_body(read_email_from_string(text))
    body = remove_html(body)

    # Tokenize the text
    words = nltk.word_tokenize(body)

    # Remove punctuation and convert to lower case
    words = [word.lower() for word in words if word.isalpha()]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    words = [word for word in words if word != 'subject']
    return ' '.join(words)

df['origin'] = df['origin'].apply(preprocess_text)

# Save the refactored emails back to the same file
df.to_csv('email_origin.csv', index=False)
df.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  soup = BeautifulSoup(s, 'lxml')


Unnamed: 0,label,origin
0,0,wrong bill grace forward original message rodr...
1,0,continued hilcorp old ocean deal dan hyvl writ...
2,0,several related issue resulted increase level ...
3,0,one year rate one mm volume greater mm day pri...
4,0,attached weekly deal report lex carroll enron ...


In [9]:
df = df.sample(frac = 1, random_state = 1)
df = df.reset_index(drop = True)

# Create split index for 80/20 split
split_index = int(len(df) * 0.8)

# Split data
train_df, test_df = df[:split_index], df[split_index:]
df = train_df.reset_index(drop = True)
df = test_df.reset_index(drop = True)

len(train_df)

26961

In [None]:
# Count tokens in training set

token_counter = {}
for message in train_df['origin']:
  words = nltk.word_tokenize(message)

  for token in words:
    if token in token_counter:
      token_counter[token] += 1
    else:
      token_counter[token] = 1

token_counter

In [11]:
def keep_token(proccessed_token, threshold):
  if proccessed_token not in token_counter:
    return False
  else:
    # Add condition to check length of token
    return token_counter[proccessed_token] > threshold and len(proccessed_token) > 1


In [None]:
# for the current dataset 500 is a good value which generates a
# set with an acceptable length to be considered a feature set for the
# machine learning algorithms

# this is the Bag of Words approach

features = set()

for token in token_counter:
  if keep_token(token, 8000):
    features.add(token)

features = list(features)
features

In [None]:
token_to_index_mapping = {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping

In [14]:
import numpy as np

def message_to_count_vector(message):
  count_vector = np.zeros(len(features))

  processed_list_of_tokens = nltk.word_tokenize(message)

  for token in processed_list_of_tokens:
    if token not in features:
      continue
    index = token_to_index_mapping[token]
    count_vector[index] += 1

  return count_vector

In [18]:
message_to_count_vector(train_df['origin'].iloc[9010])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [19]:
# 1 spam; 0 not spam
train_df.iloc[9010]

label                                                     1
origin    hello viagra med struggle men erectile dysfunc...
Name: 9010, dtype: object

In [20]:
def extract_features_and_labels(dataframe):
  # Extract labels and convert to integer type
  labels = dataframe['label'].values.astype(int)

  # Extract messages
  messages = dataframe['origin']
  vector_counts = []

  # Convert each message to a count vector
  for msg in messages:
    vector = message_to_count_vector(msg)
    vector_counts.append(vector)

  # Convert list of count vectors to a numpy array and cast to integer type
  feature_matrix = np.asarray(vector_counts).astype(int)

  return feature_matrix, labels


In [29]:
X_train_BOW, Y_train_BOW = extract_features_and_labels(train_df)

X_test_BOW, Y_test_BOW = extract_features_and_labels(test_df)

X_train_BOW.shape, Y_train_BOW.shape, X_test_BOW.shape, Y_test_BOW.shape

((26961, 26), (26961,), (6741, 26), (6741,))

In [41]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train_BOW)

X_train_BOW, X_test_BOW = scaler.transform(X_train_BOW), scaler.transform(X_test_BOW)

X_train_BOW

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.00133869, ..., 0.00142653, 0.        ,
        0.01851852],
       [0.        , 0.        , 0.        , ..., 0.        , 0.04878049,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lrBOW = LogisticRegression().fit(X_train_BOW, Y_train_BOW)
print(classification_report(Y_test_BOW, lrBOW.predict(X_test_BOW)))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80      3300
           1       0.81      0.81      0.81      3441

    accuracy                           0.80      6741
   macro avg       0.80      0.80      0.80      6741
weighted avg       0.80      0.80      0.80      6741



In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [36]:
X = df['origin']
Y = df['label']
X_train_TFIDF, X_test_TFIDF, Y_train_TFIDF, Y_test_TFIDF = train_test_split(X, Y, test_size = 0.2, random_state = 3)

In [38]:
feature_extraciton = TfidfVectorizer(min_df = 1, stop_words='english')
X_train_features = feature_extraciton.fit_transform(X_train_TFIDF)
X_test_features = feature_extraciton.transform(X_test_TFIDF)

Y_train_TFIDF = Y_train_TFIDF.astype('int')
Y_test_TFIDF = Y_test_TFIDF.astype('int')

In [None]:
print(X_train_features)

In [52]:
lrTFIDF = LogisticRegression()

lrTFIDF.fit(X_train_features, Y_train_TFIDF)

prediction_training_data = lrTFIDF.predict(X_train_features)

accuracy_training_data = accuracy_score(Y_train_TFIDF, prediction_training_data)

print('accuracy on training data: ', accuracy_training_data)

accuracy on training data:  0.9842359050445104


In [53]:
prediction_test_data = lrTFIDF.predict(X_test_features)
accuracy_test_data = accuracy_score(Y_test_TFIDF, prediction_test_data)
print('accuracy on training data: ', accuracy_test_data)

accuracy on training data:  0.9710896960711638


In [61]:
input_email = ["You have 18 unread notifications to review"]
input_data_features = feature_extraciton.transform(input_email)

prediction = lrTFIDF.predict(input_data_features)

print(prediction)

if(prediction[0]==1):
    print('Spam mail')

else:
    print('Ham mail')


[0]
Ham mail
