# Exploratory Data Analysis

## Import the PhiUSIIL dataset

In [1]:
# Install the ucimlrepo package, which provides the fetch_ucirepo helper imported below.
# NOTE(review): the version is not pinned; consider pinning it (ucimlrepo==<version>)
# so a fresh environment reproduces the same results.
%pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import pickle
import os

def load_dataset(id, path):
    """Return the UCI dataset `id`, using `path` as a local pickle cache.

    If `path` already exists as a file, the dataset is unpickled from it and
    no download happens. Otherwise the dataset is fetched with
    `fetch_ucirepo(id=id)`, pickled to `path`, and returned.

    Parameters
    ----------
    id : identifier passed through to `fetch_ucirepo` (name kept for
        backward compatibility even though it shadows the builtin).
    path : str
        Location of the pickle cache file. Missing parent directories are
        created on a cache miss.

    Returns
    -------
    The object returned by `fetch_ucirepo` (or its unpickled copy).
    """
    if os.path.isfile(path):
        # NOTE: pickle.load can execute arbitrary code. Only point `path` at
        # cache files written by this function, never at untrusted files.
        with open(path, "rb") as f:
            return pickle.load(f)

    data = fetch_ucirepo(id=id)

    # Create the cache directory on first use; a bare filename has no parent
    # component, and os.makedirs("") would raise.
    cache_dir = os.path.dirname(path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Write to a temporary sibling and rename, so an interrupted dump never
    # leaves a truncated file that later runs would mistake for a valid cache.
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(data, f)
    os.replace(tmp_path, path)
    return data

# Load the dataset with UCI repository id 967, reusing the local pickle cache when present
dataset_id = 967
fname = f"downloads/id_{dataset_id}.pkl"
phiusiil_dataset = load_dataset(dataset_id, fname)

# Feature and target tables (pandas dataframes)
X = phiusiil_dataset.data.features
y = phiusiil_dataset.data.targets
# Combine features and targets column-wise, matching rows on the shared index
dataset = X.merge(y, left_index=True, right_index=True)

In [3]:
from pprint import pprint

# Pretty-print the metadata attached to the fetched dataset object
# (abstract, citation details, and other descriptive fields)
pprint(phiusiil_dataset.metadata)

{'abstract': 'PhiUSIIL Phishing URL Dataset is a substantial dataset '
             'comprising 134,850 legitimate and 100,945 phishing URLs. Most of '
             'the URLs we analyzed, while constructing the dataset, are the '
             'latest URLs. Features are extracted from the source code of the '
             'webpage and URL. Features such as CharContinuationRate, '
             'URLTitleMatchScore, URLCharProb, and TLDLegitimateProb are '
             'derived from existing features.',
 'additional_info': {'citation': 'Prasad, A., & Chandra, S. (2023). PhiUSIIL: '
                                 'A diverse security profile empowered '
                                 'phishing URL detection framework based on '
                                 'similarity index and incremental learning. '
                                 'Computers & Security, 103545. doi: '
                                 'https://doi.org/10.1016/j.cose.2023.103545',
                     'funded_by': N

In [4]:
# Per-variable information table: name, role, type, description, units,
# and whether the variable has missing values
phiusiil_dataset.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,FILENAME,Other,Categorical,,,,no
1,URL,Feature,Categorical,,,,no
2,URLLength,Feature,Integer,,,,no
3,Domain,Feature,Categorical,,,,no
4,DomainLength,Feature,Integer,,,,no
5,IsDomainIP,Feature,Integer,,,,no
6,TLD,Feature,Categorical,,,,no
7,URLSimilarityIndex,Feature,Integer,,,,no
8,CharContinuationRate,Feature,Integer,,,,no
9,TLDLegitimateProb,Feature,Continuous,,,,no


## Data exploration

In [5]:
# Dimensions of the merged dataframe as (rows, columns): one row per sample;
# the columns are the features from X plus the target column(s) from y
dataset.shape

(235795, 55)

In [6]:
# Class distribution: number of rows per value of 'label' (0: phishing, 1: legitimate)
dataset.groupby('label').size()

label
0    100945
1    134850
dtype: int64

In [7]:
# Peek at the first 10 rows of the merged dataset (features plus label)
dataset.head(10)

Unnamed: 0,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,0.061933,...,0,0,1,34,20,28,119,0,124,1
1,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,0.050207,...,0,0,1,50,9,8,39,0,217,1
2,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,0.064129,...,0,0,1,10,2,7,42,2,5,1
3,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,0.057606,...,1,1,1,3,27,15,22,1,31,1
4,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,0.059441,...,1,0,1,244,15,34,72,1,85,1
5,https://www.globalreporting.org,30,www.globalreporting.org,23,0,org,100.0,1.0,0.079963,0.060614,...,0,0,1,35,1,11,86,0,14,1
6,https://www.saffronart.com,25,www.saffronart.com,18,0,com,100.0,1.0,0.522907,0.063549,...,0,0,1,32,4,14,44,2,17,1
7,https://www.nerdscandy.com,25,www.nerdscandy.com,18,0,com,100.0,1.0,0.522907,0.060486,...,0,0,1,24,2,22,36,0,15,1
8,https://www.hyderabadonline.in,29,www.hyderabadonline.in,22,0,in,100.0,1.0,0.005084,0.05698,...,0,0,1,71,4,9,40,1,317,1
9,https://www.aap.org,18,www.aap.org,11,0,org,100.0,1.0,0.079963,0.070497,...,0,0,1,10,1,12,173,6,65,1


In [8]:
# Statistical summary of the feature table X (targets excluded). By default describe()
# reports only numeric columns, so text columns such as URL, Domain, and TLD are omitted.
X.describe()

Unnamed: 0,URLLength,DomainLength,IsDomainIP,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,...,Bank,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef
count,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,...,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0
mean,34.573095,21.470396,0.002706,78.430778,0.845508,0.260423,0.055747,2.764456,1.164758,0.002057,...,0.127089,0.237007,0.023474,0.486775,26.075689,6.333111,10.522305,65.071113,2.377629,49.262516
std,41.314153,9.150793,0.051946,28.976055,0.216632,0.251628,0.010587,0.599739,0.600969,0.045306,...,0.333074,0.425247,0.151403,0.499826,79.411815,74.866296,22.312192,176.687539,17.641097,161.02743
min,13.0,4.0,0.0,0.155574,0.0,0.0,0.001083,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,16.0,0.0,57.024793,0.68,0.005977,0.050747,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,27.0,20.0,0.0,100.0,1.0,0.079963,0.05797,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0,8.0,2.0,6.0,12.0,0.0,10.0
75%,34.0,24.0,0.0,100.0,1.0,0.522907,0.062875,3.0,1.0,0.0,...,0.0,0.0,0.0,1.0,29.0,8.0,15.0,88.0,1.0,57.0
max,6097.0,110.0,1.0,100.0,1.0,0.522907,0.090824,13.0,10.0,1.0,...,1.0,1.0,1.0,1.0,8956.0,35820.0,6957.0,27397.0,4887.0,27516.0
