# **Phish in the Web**

### **Model Testing**

**Data Preprocessing**

In [1]:
#import libraries only ONCE
import pandas as pd
from sklearn.preprocessing import MinMaxScaler #no one hot // all columns numeric
from sklearn.model_selection import train_test_split #data splitting


#read in data & check for shape
phish_df = pd.read_csv("Phishing_Legitimate_full.csv")
phish_df.shape

#specifying the columns we want to keep
selected_columns = ['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
    'NumDashInHostname', 'AtSymbol','TildeSymbol', 'NumUnderscore','NumPercent',
    'NumQueryComponents','NumAmpersand','NumHash','NumNumericChars','NoHttps',
    'RandomString','IpAddress','DomainInSubdomains','DomainInPaths','HttpsInHostname','HostnameLength',
    'PathLength','QueryLength','DoubleSlashInPath','NumSensitiveWords','EmbeddedBrandName',
    'PctExtHyperlinks','PctExtResourceUrls','ExtFavicon','InsecureForms','RelativeFormAction',
    'ExtFormAction','AbnormalFormAction','PctNullSelfRedirectHyperlinks','FrequentDomainNameMismatch',
    'FakeLinkInStatusBar','RightClickDisabled','PopUpWindow','SubmitInfoToEmail','IframeOrFrame',
    'MissingTitle','ImagesOnlyInForm','SubdomainLevelRT','UrlLengthRT','PctExtResourceUrlsRT',
    'AbnormalExtFormActionR','ExtMetaScriptLinkRT','PctExtNullSelfRedirectHyperlinksRT', 'CLASS_LABEL']

#features
feature_columns = ['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash', 'NumDashInHostname', 'AtSymbol','TildeSymbol', 'NumUnderscore','NumPercent','NumQueryComponents','NumAmpersand','NumHash','NumNumericChars','NoHttps','RandomString','IpAddress','DomainInSubdomains','DomainInPaths','HttpsInHostname','HostnameLength','PathLength','QueryLength','DoubleSlashInPath','NumSensitiveWords','EmbeddedBrandName','PctExtHyperlinks','PctExtResourceUrls','ExtFavicon','InsecureForms','RelativeFormAction', 'ExtFormAction','AbnormalFormAction','PctNullSelfRedirectHyperlinks','FrequentDomainNameMismatch','FakeLinkInStatusBar','RightClickDisabled','PopUpWindow','SubmitInfoToEmail','IframeOrFrame','MissingTitle','ImagesOnlyInForm','SubdomainLevelRT','UrlLengthRT','PctExtResourceUrlsRT','AbnormalExtFormActionR','ExtMetaScriptLinkRT','PctExtNullSelfRedirectHyperlinksRT']
    #without class label

#label
label_columns = ['CLASS_LABEL']

phishing_features_df = phish_df[feature_columns]
phishing_label_df = phish_df[label_columns]

phishing_features_df.head()

#initiate scaler
min_scaler = MinMaxScaler()

#scale features // this includes id (maybe drop later)
phishing_features_df[feature_columns] = min_scaler.fit_transform(phishing_features_df[feature_columns])

phishing_features_df.head()

#data splitting

#split data!
x_train, x_test, y_train, y_test = train_test_split(phishing_features_df, phishing_label_df, test_size=0.2, random_state=42)

#print training data shape and label's shape
print (f"Training: Features' shape [no. of examples * feature vector size] =  {x_train.shape}")
print (f"Training: Label's shape [no. of examples * 1] = {y_train.shape}\n")

#print test data shape and label's shape
print (f"Test: Features' shape [no. of examples * feature vector size] =  {x_test.shape}")
print (f"Test: Label's shape [no. of examples * 1] = {y_test.shape}\n")

Training: Features' shape [no. of examples * feature vector size] =  (8000, 48)
Training: Label's shape [no. of examples * 1] = (8000, 1)

Test: Features' shape [no. of examples * feature vector size] =  (2000, 48)
Test: Label's shape [no. of examples * 1] = (2000, 1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phishing_features_df[feature_columns] = min_scaler.fit_transform(phishing_features_df[feature_columns])


### **Train all Models**

- Logistic Regression

- Random Forest

- SVM

- MLP

**import necessary files**

In [2]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

#do i need this
from sklearn.metrics import roc_auc_score, confusion_matrix

**testing all models**

In [3]:
#SVM
svm_model = SVC(kernel="linear")
svm_model.fit(x_train, y_train.values.ravel())  #ravel() to convert y_train to 1d array
svm_predictions = svm_model.predict(x_test)

print("SVM Classification Report:")
print(classification_report(y_test, svm_predictions))

#Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train.values.ravel())
rf_predictions = rf_model.predict(x_test)

print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

#MLP (Multi-Layer Perceptron)
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp_model.fit(x_train, y_train.values.ravel())
mlp_predictions = mlp_model.predict(x_test)

print("\nMLP Classification Report:")
print(classification_report(y_test, mlp_predictions))

# Logistic Regression
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(x_train, y_train.values.ravel())
logistic_predictions = logistic_model.predict(x_test)

print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, logistic_predictions))

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       988
           1       0.93      0.95      0.94      1012

    accuracy                           0.94      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.94      0.94      0.94      2000


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       988
           1       0.98      0.98      0.98      1012

    accuracy                           0.98      2000
   macro avg       0.98      0.98      0.98      2000
weighted avg       0.98      0.98      0.98      2000


MLP Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       988
           1       0.98      0.98      0.98      1012

    accuracy                           0.98      2000
   macro avg       0.98      0.98  

In [4]:
# Function to print evaluation metrics
def print_evaluation_metrics(y_true, y_pred):
    print("ROC AUC Score:", roc_auc_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

# SVM Evaluation Metrics
print("\nSVM Evaluation Metrics:")
print_evaluation_metrics(y_test, svm_predictions)

# Random Forest Evaluation Metrics
print("\nRandom Forest Evaluation Metrics:")
print_evaluation_metrics(y_test, rf_predictions)

# MLP Evaluation Metrics
print("\nMLP Evaluation Metrics:")
print_evaluation_metrics(y_test, mlp_predictions)

# Logistic Regression Evaluation Metrics
print("\nLogistic Regression Evaluation Metrics:")
print_evaluation_metrics(y_test, logistic_predictions)


SVM Evaluation Metrics:
ROC AUC Score: 0.940389416075915
Confusion Matrix:
 [[920  68]
 [ 51 961]]

Random Forest Evaluation Metrics:
ROC AUC Score: 0.9819974076266982
Confusion Matrix:
 [[970  18]
 [ 18 994]]

MLP Evaluation Metrics:
ROC AUC Score: 0.9785389096029828
Confusion Matrix:
 [[970  18]
 [ 25 987]]

Logistic Regression Evaluation Metrics:
ROC AUC Score: 0.938389128034437
Confusion Matrix:
 [[918  70]
 [ 53 959]]
