## Importing Libraries

In [32]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense

## Data Preprocessing

In [33]:
df = pd.read_csv('Dataset I.csv')

In [34]:
df['phishing']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)
df.head()

Unnamed: 0,Category,Message,phishing
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [35]:
# Encode labels
le = LabelEncoder()
df['phishing'] = le.fit_transform(df['phishing'])

In [36]:
# Data Preprocessing
def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
        # Remove URLs and punctuation/numbers in one go
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        text = re.sub(r'\W|\d', ' ', text)
        # Tokenization and stop word removal
        tokens = word_tokenize(text.lower())
        tokens = [word for word in tokens if word not in stopwords.words('english')]
        return ' '.join(tokens)
    else:
        return ''  # Return an empty string for non-string inputs

In [37]:
# Apply preprocessing to the text_combined column
df['processed_text'] = df['Message'].apply(preprocess_text)

In [38]:
# Feature Engineering using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  
tfidf_features = tfidf_vectorizer.fit_transform(df['processed_text']).toarray()

In [39]:
# Create a DataFrame for TF-IDF features
tfidf_df = pd.DataFrame(tfidf_features, columns=tfidf_vectorizer.get_feature_names_out())

In [40]:
# Combine TF-IDF features with the original labels
features = pd.concat([tfidf_df, df['phishing']], axis=1)

In [41]:
# Train-test split
X = features.drop('phishing', axis=1)
y = features['phishing']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Isolation Forest Model

In [42]:
# Initialize and fit the Isolation Forest model
model = IsolationForest(contamination=0.10, random_state=42)
model.fit(X_train)



In [43]:
# Predict anomalies with Isolation Forest
y_pred_if = model.predict(X_test)
y_pred_if = np.where(y_pred_if == -1, 1, 0)  # Convert predictions

In [44]:
# Calculate accuracy and other metrics for Isolation Forest
cm_if = confusion_matrix(y_test, y_pred_if)
report_if = classification_report(y_test, y_pred_if, output_dict=True)
accuracy_if = accuracy_score(y_test, y_pred_if)

In [45]:
# Store Isolation Forest results
results_if = {
    'model': 'Isolation Forest',
    'precision': report_if['1']['precision'],
    'recall': report_if['1']['recall'],
    'f1-score': report_if['1']['f1-score'],
    'support': report_if['1']['support'],
    'accuracy': accuracy_if
}

## One Class SVM Model

In [46]:
# Initialize and fit the One-Class SVM model
svm_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)
svm_model.fit(X_train)

In [47]:
# Predict anomalies with One-Class SVM
y_pred_svm = svm_model.predict(X_test)
y_pred_svm = np.where(y_pred_svm == -1, 1, 0)  # Convert predictions

In [48]:
# Calculate accuracy and other metrics for SVM
cm_svm = confusion_matrix(y_test, y_pred_svm)
report_svm = classification_report(y_test, y_pred_svm, output_dict=True)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

In [49]:
# Store SVM results
results_svm = {
    'model': 'One-Class SVM',
    'precision': report_svm['1']['precision'],
    'recall': report_svm['1']['recall'],
    'f1-score': report_svm['1']['f1-score'],
    'support': report_svm['1']['support'],
    'accuracy': accuracy_svm
}

## AutoEncoder Results

In [50]:
# Autoencoder
# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [51]:
# Build the Autoencoder model
autoencoder = Sequential()
autoencoder.add(Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)))
autoencoder.add(Dense(32, activation='relu'))
autoencoder.add(Dense(64, activation='relu'))
autoencoder.add(Dense(X_train_scaled.shape[1], activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [52]:
# Compile the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [53]:
# Train the Autoencoder
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - loss: 1.0973 - val_loss: 1.0202
Epoch 2/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.9848 - val_loss: 1.0187
Epoch 3/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.9915 - val_loss: 1.0144
Epoch 4/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.9781 - val_loss: 1.0091
Epoch 5/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - loss: 0.9844 - val_loss: 1.0045
Epoch 6/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - loss: 0.9806 - val_loss: 0.9997
Epoch 7/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 0.9594 - val_loss: 0.9956
Epoch 8/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.9659 - val_loss: 0.9923
Epoch 9/100
[1m112/112[0m

<keras.src.callbacks.history.History at 0x2383ba88e10>

In [54]:
# Predict using the Autoencoder
X_test_pred = autoencoder.predict(X_test_scaled)
mse = np.mean(np.power(X_test_scaled - X_test_pred, 2), axis=1)
threshold = np.percentile(mse, 95)  # Set threshold for anomaly detection
y_pred_autoencoder = [1 if mse_i > threshold else 0 for mse_i in mse] 

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


In [55]:
# Calculate accuracy and other metrics for Autoencoder
cm_autoencoder = confusion_matrix(y_test, y_pred_autoencoder)
report_autoencoder = classification_report(y_test, y_pred_autoencoder, output_dict=True)
accuracy_autoencoder = accuracy_score(y_test, y_pred_autoencoder)

In [56]:
# Store Autoencoder results
results_autoencoder = {
    'model': 'Autoencoder',
    'precision': report_autoencoder['1']['precision'],
    'recall': report_autoencoder['1']['recall'],
    'f1-score': report_autoencoder['1']['f1-score'],
    'support': report_autoencoder['1']['support'],
    'accuracy': accuracy_autoencoder
}

In [57]:
# Print results for all models
print("Test Results:")
for result in [results_svm, results_if, results_autoencoder]:
    print(f"Model: {result['model']}")
    print(f"Precision: {result['precision']:.2f}")
    print(f"Recall: {result['recall']:.2f}")
    print(f"F1-Score: {result['f1-score']:.2f}")
    print(f"Support: {result['support']}")
    print(f"Accuracy: {result['accuracy']:.2f}")
    print("-" * 80)

Test Results:
Model: One-Class SVM
Precision: 0.16
Recall: 0.23
F1-Score: 0.19
Support: 149
Accuracy: 0.73
--------------------------------------------------------------------------------
Model: Isolation Forest
Precision: 0.52
Recall: 0.34
F1-Score: 0.41
Support: 149
Accuracy: 0.87
--------------------------------------------------------------------------------
Model: Autoencoder
Precision: 0.41
Recall: 0.15
F1-Score: 0.22
Support: 149
Accuracy: 0.86
--------------------------------------------------------------------------------


## Test Cases

In [58]:
legitimate_emails = [
    "Hi John, I hope you are doing well. Please find the attached report for your review. Best regards, Alice.",
    "Dear Customer, Thank you for your purchase! Your order will be shipped shortly. Sincerely, The Store Team.",
    "Hello, I wanted to follow up on our last meeting. Let me know if you have any questions. Regards, Bob.",
    "Reminder: Your appointment is scheduled for next Tuesday at 10 AM. Please confirm your attendance.",
    "Congratulations! You've been selected for a special offer. Click here to claim your prize."
]

In [59]:
phishing_emails = [
    "Dear User, Your account has been compromised. Please click the link below to verify your identity: http://fake-link.com",
    "Urgent: Your payment is overdue! Please provide your credit card information to avoid suspension.",
    "Congratulations! You've won a $1000 gift card. Click here to claim your prize: http://scam-link.com",
    "Dear Customer, We need you to verify your account information immediately. Failure to do so will result in account closure.",
    "Your invoice is attached. Please review and make payment immediately to avoid penalties."
]

In [60]:
# Function to test models with provided test cases
def test_models(legitimate_emails, phishing_emails, tfidf_vectorizer, svm_model, autoencoder, scaler):
    test_cases = legitimate_emails + phishing_emails
    processed_cases = [preprocess_text(email) for email in test_cases]
    tfidf_cases = tfidf_vectorizer.transform(processed_cases).toarray()
    tfidf_cases_scaled = scaler.transform(tfidf_cases)

    # Predictions
    svm_predictions = svm_model.predict(tfidf_cases)
    svm_predictions = np.where(svm_predictions == -1, 1, 0)

    autoencoder_predictions = autoencoder.predict(tfidf_cases_scaled)
    mse_cases = np.mean(np.power(tfidf_cases_scaled - autoencoder_predictions, 2), axis=1)
    autoencoder_threshold = np.percentile(mse_cases, 95)
    autoencoder_predictions = [1 if mse_i > autoencoder_threshold else 0 for mse_i in mse_cases]

    isolation_forest_predictions = model.predict(tfidf_cases)
    isolation_forest_predictions = np.where(isolation_forest_predictions == -1, 1, 0)

    # Print results
    for i, email in enumerate(test_cases):
        print(f"Email: {email}")
        print(f"SVM Prediction: {'Phishing' if svm_predictions[i] == 1 else 'Legitimate'}")
        print(f"Autoencoder Prediction: {'Phishing' if autoencoder_predictions[i] == 1 else 'Legitimate'}")
        print(f"Isolation Forest Prediction: {'Phishing' if isolation_forest_predictions[i] == 1 else 'Legitimate'}")
        print("-" * 80)

# Run the test with the defined test cases
test_models(legitimate_emails, phishing_emails, tfidf_vectorizer, svm_model, autoencoder, scaler)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step




Email: Hi John, I hope you are doing well. Please find the attached report for your review. Best regards, Alice.
SVM Prediction: Legitimate
Autoencoder Prediction: Legitimate
Isolation Forest Prediction: Legitimate
--------------------------------------------------------------------------------
Email: Dear Customer, Thank you for your purchase! Your order will be shipped shortly. Sincerely, The Store Team.
SVM Prediction: Legitimate
Autoencoder Prediction: Legitimate
Isolation Forest Prediction: Legitimate
--------------------------------------------------------------------------------
Email: Hello, I wanted to follow up on our last meeting. Let me know if you have any questions. Regards, Bob.
SVM Prediction: Legitimate
Autoencoder Prediction: Legitimate
Isolation Forest Prediction: Legitimate
--------------------------------------------------------------------------------
Email: Reminder: Your appointment is scheduled for next Tuesday at 10 AM. Please confirm your attendance.
SVM Pred



## Flask API

In [61]:
import pickle

In [62]:
# Assuming best_svm is your trained One-Class SVM model
with open("one_class_svm_model.pkl", "wb") as f:
    pickle.dump(svm_model, f)

In [63]:
# Assuming tfidf_vectorizer is your trained TF-IDF vectorizer
with open("one_class_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)