In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# This allows charts to show up right here in the page
%matplotlib inline

In [None]:
# Read both raw CSV sources used by this notebook.
uci_data = pd.read_csv('Data/phishing_uci.csv')
phishtank_data = pd.read_csv('Data/verified_online.csv')

# Report the (rows, columns) shape of each frame as a quick check on the load.
for label, frame in (
    ("UCI Data Size:", uci_data),
    ("PhishTank Data Size:", phishtank_data),
):
    print(label, frame.shape)

# Last expression of the cell: preview the first rows of the UCI table.
uci_data.head()

In [None]:
# Tally how many rows carry each value of the 'Result' label column.
counts = uci_data['Result'].value_counts()
print('Count of each type:')
print(counts)

# Bar chart of the same tally, so any class imbalance is visible at a glance.
# NOTE(review): the title assumes 1 = phishing and -1 = legitimate; confirm this
# against the dataset's own documentation before relying on the labels.
plt.figure(figsize=(8, 5))
sns.countplot(x='Result', data=uci_data, palette='magma')
plt.title('Distribution of Phishing (1) vs Legitimate (-1)')
plt.show()

In [None]:
from urllib.parse import urlparse  #it is used to break the url

def check_url_length(url):
    """Score a URL by its character count.

    Returns:
        -1 when the URL has fewer than 54 characters, 0 when it has
        54 to 74 characters, and 1 when it has 75 characters or more.
    """
    size = len(url)
    if size >= 75:
        return 1
    if size >= 54:
        return 0
    return -1

def check_symbol(url):
    """Return 1 if the URL contains an '@' character, otherwise -1."""
    return 1 if "@" in url else -1

def check_https(url):
    """Score whether the URL uses the HTTPS scheme.

    The check is case-insensitive and requires the full ``https://`` prefix,
    so a string that merely begins with those letters (for example
    ``httpsecure.example.com``) is not mistaken for an HTTPS URL.

    Args:
        url: URL string to inspect.

    Returns:
        -1 if the URL starts with ``https://`` (in any letter case),
        otherwise 1.
    """
    if url.lower().startswith("https://"):
        return -1
    return 1


# Try the three scorers defined above on one hand-written URL.
test_url = "http://login-secure-bank-update.com/verify@account"

report_lines = [
    f"URL:{test_url}",
    f"Length Score:{check_url_length(test_url)}",
    f"At Symbol Score: {check_symbol(test_url)}",
    f"HTTPS Score: {check_https(test_url)}",
]
print("\n".join(report_lines))

        

In [None]:
import re

def check_prefix_suffix(url):
    """Score whether the URL's host name contains a hyphen.

    Only the host name is inspected: any ``user:password@`` prefix and
    ``:port`` suffix of the authority part are ignored, so a hyphen in the
    user name does not count. URLs written without a scheme (for example
    ``secure-login.com/path``) are handled as well.

    Args:
        url: URL string to inspect.

    Returns:
        1 if the host name contains ``-``, otherwise -1 (including when no
        host name can be extracted).
    """
    # Without a scheme, urlparse treats the whole string as a path and finds
    # no host; prefixing "//" makes it read the leading part as the authority.
    parsed = urlparse(url if "://" in url else "//" + url)
    host = parsed.hostname or ""
    return 1 if "-" in host else -1

def check_subdomain(url):
    """Score a URL by the number of dots in its host name.

    Dots in the path, query string or fragment (for example ``index.v2.php``)
    are not counted; only the host name is considered. URLs written without a
    scheme are handled as well.

    Args:
        url: URL string to inspect.

    Returns:
        -1 when the host name has fewer than 3 dots, 0 when it has exactly 3,
        and 1 when it has more than 3.
    """
    # NOTE(review): the thresholds are the original ones; confirm they are the
    # intended cut-offs for a host-only dot count.
    # Without a scheme, urlparse finds no host; prefixing "//" makes it read
    # the leading part of the string as the authority.
    host = urlparse(url if "://" in url else "//" + url).hostname or ""
    dots = host.count(".")
    if dots > 3:
        return 1
    if dots == 3:
        return 0
    return -1

def check_https_token(url):
    """Score whether the text ``https`` appears inside the URL's host name.

    The scheme itself is not part of the host name, so an ordinary
    ``https://example.com`` URL scores -1. The host name is compared in lower
    case, and any ``user:password@`` prefix or ``:port`` suffix is ignored.
    URLs written without a scheme are handled as well.

    Args:
        url: URL string to inspect.

    Returns:
        1 if the host name contains ``https``, otherwise -1 (including when
        no host name can be extracted).
    """
    # Without a scheme, urlparse finds no host; prefixing "//" makes it read
    # the leading part of the string as the authority.
    parsed = urlparse(url if "://" in url else "//" + url)
    # .hostname is lower-cased and excludes userinfo and port, unlike .netloc.
    host = parsed.hostname or ""
    return 1 if "https" in host else -1

# Try the host-based scorers on one hand-written URL.
test_url = "https://secure-login-bank.com/update"
print(
    f"URL: {test_url}",
    f"Prefix/Suffix Score: {check_prefix_suffix(test_url)}",
    f"HTTPS Token Score: {check_https_token(test_url)}",
    sep="\n",
)

In [None]:
# Take the first 100 values of the 'url' column from the PhishTank export.
raw_data = pd.read_csv('Data/verified_online.csv')
urls = raw_data['url'].head(100)

# One record per URL: the URL itself plus the score from each rule-based check.
feature_list = [
    {
        'url': sample_url,
        'URL_Length': check_url_length(sample_url),
        'At_Symbol': check_symbol(sample_url),
        'Prefix_Suffix': check_prefix_suffix(sample_url),
        'HTTPS_Token': check_https_token(sample_url),
    }
    for sample_url in urls
]

# Build the frame once from the full list of records, then preview it.
feature_matrix = pd.DataFrame(feature_list)
print(feature_matrix.head())

In [None]:
from sklearn.model_selection import train_test_split

# Reload the UCI table without its 'id' column, then separate the 'Result'
# label from the remaining predictor columns.
df = pd.read_csv('Data/phishing_uci.csv').drop(columns=['id'])
y = df['Result']
X = df.drop(columns=['Result'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Data successfully split")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

In [None]:
# training random forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Fit a 100-tree random forest on the training split (fit() returns the
# estimator itself), then measure accuracy on the held-out test split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

print("Random Forest Training Complete")
print(f"Accuracy: {rf_accuracy * 100:.2f}%")

In [None]:
# training SVM model

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
# Fit a linear-kernel SVM on the training split (fit() returns the estimator
# itself), then measure accuracy on the held-out test split.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("SVM Training Complete")
print(f"SVM Accuracy: {svm_accuracy * 100:.2f}%")

In [None]:
#training XGBoost

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
# Remap the labels with (y + 1) / 2, which sends -1 to 0 and 1 to 1; both
# splits get the same transformation.
y_train_mapped, y_test_mapped = (
    (labels + 1) / 2 for labels in (y_train, y_test)
)

# Fit a 100-round booster on the remapped training labels and score it
# against the remapped test labels.
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train_mapped)
xgb_predictions = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test_mapped, xgb_predictions)

print("XGBoost Training Complete")
print(f"XGBoost Accuracy: {xgb_accuracy * 100:.2f}%")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def get_metrics(y_true, y_pred, model_name):
    """Build one row of the model-comparison table.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, in the same encoding as ``y_true``.
        model_name: Value stored under the 'Model' key.

    Returns:
        A dict with the keys 'Model', 'Accuracy', 'Precision', 'Recall' and
        'F1-Score', in that order; each score is computed by the matching
        scikit-learn metric function called with its default arguments.
    """
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    row = {'Model': model_name}
    for column, scorer in scorers:
        row[column] = scorer(y_true, y_pred)
    return row

# Score each model against the labels in the encoding its predictions use:
# XGBoost predictions are compared with the remapped test labels.
evaluated = (
    (y_test, rf_predictions, 'Random Forest'),
    (y_test, svm_predictions, 'SVM'),
    (y_test_mapped, xgb_predictions, 'XGBoost'),
)
rf_results, svm_results, xgb_results = (
    get_metrics(*arguments) for arguments in evaluated
)
comparison_df = pd.DataFrame([rf_results, svm_results, xgb_results])

# Render every score column as a percentage with two decimals.
as_percent = '{:,.2%}'.format
score_columns = ('Accuracy', 'Precision', 'Recall', 'F1-Score')

print("Model Comparison")
print(comparison_df.to_string(
    index=False,
    formatters={column: as_percent for column in score_columns},
))