In [4]:
# Initial imports
import pandas as pd
import hvplot.pandas
import plotly.express as px
import matplotlib as mp
import numpy as np

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
from collections import Counter
from urllib.parse import quote

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from path import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sqlalchemy import create_engine

from config import db_password

In [7]:
# Build the PostgreSQL connection URL for the local ToLegit_ToPhish database.
# The password is percent-encoded first: characters such as "@", ":", "/" or "%"
# have structural meaning inside a URL and would otherwise corrupt the
# user/host portion of the connection string. Passwords made only of
# letters, digits and "_.-~" are left unchanged by the encoding.
encoded_db_password = quote(str(db_password), safe="")
db_string = f"postgresql://postgres:{encoded_db_password}@localhost:5432/ToLegit_ToPhish"

# SQLAlchemy engine bound to that URL.
engine = create_engine(db_string)

In [8]:
# Read the Phishing_Legitimate_full dataset from the local CSV file.
initial_phish_df = pd.read_csv(filepath_or_buffer='Resources/Phishing_Legitimate_full.csv')

# Disabled: export of the same CSV into the `first_set` table through `engine`.
# initial_phish_df.to_sql(name='first_set', con=engine)
# for data in pd.read_csv('Resources/Phishing_Legitimate_full.csv', chunksize=1000000):
#     data.to_sql(name='first_set', con=engine, if_exists='append')

# Preview the first five rows.
initial_phish_df.head(n=5)

Unnamed: 0,id,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,1,3,1,5,72,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,2,3,1,3,144,0,0,0,0,2,...,0,0,0,1,-1,1,1,1,1,1
2,3,3,1,2,58,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,4,3,1,6,79,1,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,5,3,0,4,46,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


# Start Logistic Regression Model

In [9]:
# Target vector: the CLASS_LABEL column
y = initial_phish_df["CLASS_LABEL"]

# Feature matrix: every column except the label and the "id" column
X = initial_phish_df.drop(columns=["CLASS_LABEL", "id"])

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column
X.describe()

Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2.4451,0.5868,3.3003,70.2641,1.818,0.1389,0.0003,0.0131,0.3232,0.0738,...,0.1288,0.3396,0.0322,0.0304,0.9566,0.0202,0.3533,0.7932,0.1734,0.3141
std,1.346836,0.751214,1.863241,33.369877,3.106258,0.545744,0.017319,0.113709,1.11466,0.622248,...,0.334995,0.473597,0.17654,0.171694,0.248037,0.820036,0.888908,0.521019,0.755771,0.897843
min,1.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,2.0,0.0,2.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-1.0,-1.0,1.0,0.0,-1.0
50%,2.0,1.0,3.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
75%,3.0,1.0,4.0,84.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
max,21.0,14.0,18.0,253.0,55.0,9.0,1.0,1.0,18.0,19.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Check the balance of our target values: count how many rows carry each CLASS_LABEL
y.value_counts()

1    5000
0    5000
Name: CLASS_LABEL, dtype: int64

In [12]:
# Create our train and test sample sets.
# random_state=42 pins the shuffle so the split is the same on every run; no
# test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
# Train the Logistic Regression model on the training split.
# NOTE(review): X_train is used as produced by train_test_split — no resampling
# or scaling step is applied to it in this notebook section. Warnings are
# globally silenced earlier in the notebook, so a convergence warning from the
# lbfgs solver would not be visible here — confirm the solver converged.
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [14]:
# Predict on the held-out test split and calculate the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9247485301046852

In [15]:
# Display the confusion matrix for the logistic regression predictions
# (true labels are passed first, predicted labels second)
confusion_matrix(y_test, y_pred)

array([[1132,   96],
       [  92, 1180]], dtype=int64)

In [16]:
# Print the imbalanced classification report for the logistic regression
# predictions (per-class precision, recall, specificity, F1, geometric mean,
# index balanced accuracy and support)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.92      0.93      0.92      0.92      0.85      1228
          1       0.92      0.93      0.92      0.93      0.92      0.86      1272

avg / total       0.92      0.92      0.92      0.92      0.92      0.86      2500



In [17]:
# Collect the feature names and the fitted logistic-regression coefficients.
# Nothing is sorted here: the cell only displays the raw coefficient array,
# whose entries follow the column order of X.
feature_names = X.columns
feature_coefs = model.coef_

feature_coefs

array([[ 0.70794186, -1.42190338,  0.37188616, -0.57079729, -0.47591287,
         1.5879833 , -0.03921137,  0.39993031, -0.12164188, -0.94296245,
        -1.18992547,  0.50638538,  0.14051041,  0.05585954, -1.00332815,
        -0.44269348,  0.42326463,  0.23628242, -0.75046236,  0.        ,
         0.66193815,  0.59212549,  0.62052284,  0.02013584,  1.66364667,
        -0.05999056, -0.9133241 , -0.58208859,  1.44247151,  2.96583596,
        -0.17210033, -0.53606337, -0.38780902,  0.60660534,  3.0134935 ,
        -0.14329391,  0.0840036 , -0.08266576, -2.6952825 , -2.14882749,
         1.01714596, -0.00918363, -0.4868462 ,  0.86233227,  0.16736247,
         0.56656371,  0.68574914, -1.61544391]])

In [18]:
# Show the feature names (the column labels of X)
feature_names

Index(['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
       'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',
       'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash',
       'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress',
       'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname',
       'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath',
       'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtHyperlinks',
       'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms',
       'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction',
       'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch',
       'FakeLinkInStatusBar', 'RightClickDisabled', 'PopUpWindow',
       'SubmitInfoToEmail', 'IframeOrFrame', 'MissingTitle',
       'ImagesOnlyInForm', 'SubdomainLevelRT', 'UrlLengthRT',
       'PctExtResourceUrlsRT', 'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT',
       'PctExtNullSelfRedirectHyperlinksRT

In [19]:
# Feature importance for the logistic regression model.
# The coefficient array is 2-D with a single row (one row for this binary
# problem), so take row 0 and label each value with its feature name. Joining
# the 1-D names and the 2-D coefficients with np.concatenate cannot work
# because the two arrays have different numbers of dimensions.
lr_coefficients = pd.Series(feature_coefs[0], index=feature_names, name="coefficient")

# Rank by absolute value (largest effect first) while keeping the sign visible.
# NOTE(review): the model was fit on unscaled features, so each magnitude is
# tied to its feature's units and is only a rough guide to importance.
lr_coefficients.reindex(lr_coefficients.abs().sort_values(ascending=False).index)

# Balanced Random Forest Classifier

In [20]:
# Fit a balanced random forest of 100 trees on the training split.
# random_state is pinned to the same seed used for the train/test split and the
# logistic regression, so the scores and feature importances below are
# reproducible when the notebook is re-run.
rf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
rf = rf.fit(X_train, y_train)

In [21]:
# Predict on the test split with the random forest and compute its balanced accuracy
rf_y_pred = rf.predict(X_test)
balanced_accuracy_score(y_test, rf_y_pred)

0.9828157970212853

In [22]:
# Confusion matrix for the random forest predictions
# (true labels are passed first, predicted labels second)
confusion_matrix(y_test, rf_y_pred)

array([[1208,   20],
       [  23, 1249]], dtype=int64)

In [23]:
# Print the imbalanced classification report for the random forest predictions
print(classification_report_imbalanced(y_test, rf_y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.98      0.98      0.98      0.98      0.97      1228
          1       0.98      0.98      0.98      0.98      0.98      0.97      1272

avg / total       0.98      0.98      0.98      0.98      0.98      0.97      2500



In [24]:
# Pair every feature name with its random-forest importance and rank the
# pairs from most to least important.
rf_feature_names = X.columns
rf_list_of_features = zip(rf_feature_names, rf.feature_importances_)
sorted(
    rf_list_of_features,
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

[('PctExtHyperlinks', 0.18957391025730494),
 ('PctExtNullSelfRedirectHyperlinksRT', 0.1597679535017532),
 ('PctExtResourceUrls', 0.08648403154345367),
 ('FrequentDomainNameMismatch', 0.08141795577719084),
 ('PctNullSelfRedirectHyperlinks', 0.06028471851402238),
 ('NumDash', 0.04487677018537764),
 ('ExtMetaScriptLinkRT', 0.04303762453208684),
 ('SubmitInfoToEmail', 0.03478449145629864),
 ('NumNumericChars', 0.03426142660379655),
 ('InsecureForms', 0.02952041465331485),
 ('PathLevel', 0.025508759236350972),
 ('NumDots', 0.021089824392744694),
 ('PathLength', 0.019318740579215805),
 ('UrlLength', 0.017443894803032753),
 ('NumQueryComponents', 0.015263054516075396),
 ('NumSensitiveWords', 0.0141825833692426),
 ('HostnameLength', 0.014087335563819223),
 ('QueryLength', 0.013805146772317095),
 ('IframeOrFrame', 0.011971644860072676),
 ('ExtFavicon', 0.010439366183585002),
 ('PctExtResourceUrlsRT', 0.007177979174690212),
 ('NumUnderscore', 0.006286559535260959),
 ('NumDashInHostname', 0.00616