In [None]:
from sklearn.model_selection import train_test_split;
from sklearn.preprocessing import StandardScaler;
import pandas as pd;

In [None]:
# Read the raw email dataset from disk and preview its first rows.
raw_csv_path = './sample_data/phis_email.csv'
email_data = pd.read_csv(raw_csv_path)
email_data.head()

In [None]:
# Data-quality overview: schema/dtypes, duplicated rows, and missing values.

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly instead of being interpolated into an f-string (which
# printed the report followed by a stray "data info: None").
print('data info:')
email_data.info()
print(f'\n duplicate rows: {email_data.duplicated().sum()}')

# Per-column count of missing values.
print(f'\n checking for null rows: {email_data.isnull().sum()}')


In [None]:
# Class balance: number of emails per 'Email Type' value, drawn as a bar chart.
email_type_counts = email_data['Email Type'].value_counts()
email_type_counts.plot.bar();

In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index that was saved into the
# CSV -- confirm against the data source).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell re-runnable: the in-place version raised KeyError on a second run
# because the column was already gone.
email_data = email_data.drop(columns=['Unnamed: 0'], errors='ignore')


In [None]:
# Inspect the remaining columns and preview the data.
# Only the last expression of a cell is displayed, so the column index has to
# be printed explicitly; as a bare expression it was silently discarded.
print(email_data.columns)
email_data.head()

In [None]:
# Encode the target as integers: 1 = safe email, 0 = phishing email.
#
# An explicit Series.map + astype is used instead of replace(): replace() on a
# string column relies on implicit object->int downcasting, which pandas 2.2
# deprecates; without that downcast the column stays object dtype even though
# it only holds 0/1.
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run after the
# column has already been encoded.
# Any label outside this mapping becomes NaN and makes astype('int64') raise,
# so unexpected classes fail here rather than leaking into later cells.
EMAIL_TYPE_CODES = {'Safe Email': 1, 'Phishing Email': 0, 1: 1, 0: 0}
email_data['Email Type'] = email_data['Email Type'].map(EMAIL_TYPE_CODES).astype('int64')
email_data['Email Type']

In [None]:
from wordcloud import WordCloud

# Split the data by class (0 = phishing, 1 = safe). .copy() makes each subset
# an independent frame, so the text cleanup below does not write into a slice
# of email_data (which triggers SettingWithCopyWarning).
phishing_emails = email_data[email_data['Email Type'] == 0].copy()
non_phishing_emails = email_data[email_data['Email Type'] == 1].copy()


# Check for NaN values
print(phishing_emails['Email Text'].isnull().sum())

# fillna must run BEFORE astype(str): astype(str) turns NaN into the literal
# string 'nan', after which fillna('') has nothing left to fill and 'nan'
# would show up as a word in the word cloud.
phishing_emails['Email Text'] = phishing_emails['Email Text'].fillna('').astype(str)
# The safe-email subset is joined into a single string in a later cell as
# well, so it gets the same cleanup (str.join rejects NaN floats).
non_phishing_emails['Email Text'] = non_phishing_emails['Email Text'].fillna('').astype(str)


In [None]:
import matplotlib.pyplot as plt

# Build a single word cloud from the text of all phishing emails joined
# together with spaces.
phishing_text = ' '.join(phishing_emails['Email Text'])
wordcloud = WordCloud().generate(phishing_text)

# Render the cloud as an image on a 10x6 figure with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Write the phishing word cloud image to the sample_data folder.
phishing_png_path = './sample_data/phishing_wordcloud.png'
wordcloud.to_file(phishing_png_path)

In [None]:
# Generate WordCloud for non-phishing emails.
# A missing text is NaN (a float), which str.join rejects with a TypeError, so
# blank out missing values and force every entry to str before joining.
safe_text = ' '.join(non_phishing_emails['Email Text'].fillna('').astype(str))
non_phishing_wordcloud = WordCloud().generate(safe_text)

# Display the WordCloud for non-phishing emails on a 10x6 figure, axes hidden.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(non_phishing_wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Save the safe-email word cloud. This must use non_phishing_wordcloud: saving
# `wordcloud` (the phishing cloud) here wrote the phishing image under the
# safe-email filename.
non_phishing_wordcloud.to_file('./sample_data/safe_emails_wordcloud.png')

In [None]:
# Calculate email length in characters.
# Missing texts are treated as empty (length 0). Wrapping a NaN in str() would
# instead measure the literal string 'nan' and report a 3-character email.
email_data['email_length'] = email_data['Email Text'].fillna('').astype(str).str.len()

In [None]:
#Univariate visualisation

import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of email lengths (30 bins) with a KDE curve overlaid for smoothness.
email_lengths = email_data['email_length']
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(email_lengths, bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Email Lengths')
ax.set_xlabel('Email Length (Characters)')
ax.set_ylabel('Frequency')
# Dashed red vertical line marks the mean email length.
ax.axvline(x=email_lengths.mean(), color='red', linestyle='--', label='Mean Email Length')
ax.legend()
plt.show()

In [None]:
# Multivariate Visualization: Email Length vs. Word Count

# Whitespace-separated word count per email. Missing texts count as 0 words;
# str(NaN) is 'nan', which would otherwise be counted as a one-word email.
email_data['word_count'] = (
    email_data['Email Text'].fillna('').astype(str).str.split().str.len()
)

# Scatter of character length against word count, coloured by 'Email Type'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='email_length', y='word_count', hue='Email Type', data=email_data, alpha=0.7, ax=ax)
ax.set_title('Email Length vs. Word Count by Email Type')
ax.set_xlabel('Email Length (Characters)')
ax.set_ylabel('Word Count')
# Dashed red horizontal line marks the mean word count.
ax.axhline(y=email_data['word_count'].mean(), color='red', linestyle='--', label='Mean Word Count')
ax.legend()
plt.show()

In [None]:
#preprocessing

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Features and labels.
# TfidfVectorizer raises ValueError on NaN documents, so missing texts are
# replaced with empty strings and every entry is coerced to str up front.
X = email_data['Email Text'].fillna('').astype(str)
y = email_data['Email Type']

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text with TF-IDF, capped at 5000 features.
# The vocabulary is fitted on the training split only and then reused for the
# test split, so no test-set information leaks into the features.
# .toarray() densifies the sparse matrices; the models in the following cells
# are fed these dense arrays.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

In [None]:
# Classical ML Model: Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Fit a default-configured logistic regression on the TF-IDF training features.
logistic_model = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict labels for the held-out test features.
y_pred_logistic = logistic_model.predict(X_test_tfidf)

# Summarise test-set performance: per-class report, then the confusion matrix.
logistic_report = classification_report(y_test, y_pred_logistic)
logistic_confusion = confusion_matrix(y_test, y_pred_logistic)
print("Logistic Regression Classification Report:")
print(logistic_report)
print("Confusion Matrix:")
print(logistic_confusion)


In [None]:
# Neural Network: Feedforward Neural Network

import numpy as np
from tensorflow.keras import Input, Sequential
from tensorflow.keras.layers import Dense, Dropout  # Dropout is not used in this cell
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and batch shuffling are repeatable between runs (as far as
# the backend allows). Without a seed every run reports different metrics.
set_random_seed(42)

# Network: TF-IDF vector -> Dense(128, relu) -> Dense(64, relu) -> Dense(1, sigmoid).
# The input width is declared with an explicit Input layer rather than the
# input_shape= argument on the first Dense layer.
nn_model = Sequential([
    Input(shape=(X_train_tfidf.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),  # Binary classification
])

# Compile the model
nn_model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train for 10 epochs in batches of 32; 20% of the training data is held out
# by Keras for per-epoch validation.
nn_model.fit(X_train_tfidf, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Threshold the sigmoid outputs at 0.5 to get 0/1 labels. predict() returns an
# (n_samples, 1) array; ravel() flattens it to the 1-D shape of y_test.
y_pred_nn = (nn_model.predict(X_test_tfidf) > 0.5).astype("int32").ravel()

# Evaluate the model
print("Neural Network Classification Report:")
print(classification_report(y_test, y_pred_nn))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nn))
