<a href="https://colab.research.google.com/github/maximsachs/AlienVault_office365_Phishing_analysis/blob/master/phishtank_deliverable3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://raw.githubusercontent.com/maximsachs/AlienVault_office365_Phishing_analysis/master/Phishing_Detection_AI/combined_online_valid.csv
!wget https://raw.githubusercontent.com/maximsachs/AlienVault_office365_Phishing_analysis/master/Phishing_Detection_AI/top-1m_umbrella.csv

--2020-10-22 16:04:25--  https://raw.githubusercontent.com/maximsachs/AlienVault_office365_Phishing_analysis/master/Phishing_Detection_AI/combined_online_valid.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4019565 (3.8M) [text/plain]
Saving to: ‘combined_online_valid.csv’


2020-10-22 16:04:25 (14.9 MB/s) - ‘combined_online_valid.csv’ saved [4019565/4019565]

--2020-10-22 16:04:25--  https://raw.githubusercontent.com/maximsachs/AlienVault_office365_Phishing_analysis/master/Phishing_Detection_AI/top-1m_umbrella.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting

In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict
import pprint
import urllib.request
from prettytable import PrettyTable
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tqdm import tqdm


In [3]:
# Phishing URL dataset downloaded in the first cell; the first CSV column becomes the index.
online_valid_df = pd.read_csv("combined_online_valid.csv", index_col=0)
# Whitelist of popular domains. The file has no header row, so the column names are supplied here.
# NOTE: the frame is called "alexa_whitelist_df" but it is loaded from the Umbrella top-1M file.
whitelist_file_umbrella = "top-1m_umbrella.csv"
alexa_whitelist_df = pd.read_csv(whitelist_file_umbrella, header=None, names=["rank", "domain_names"])



---

# Analysing http vs https


In [4]:
# Tally the scheme (protocol) of every phishing URL.
protocol_count = defaultdict(int)
for url in online_valid_df["url"]:
  # Only the first 6 characters are inspected when looking for http/https.
  url_head = url[:6]
  if "https" in url_head:
    scheme = "https"
  elif "http:" in url_head:
    scheme = "http"
  else:
    # Anything else: use whatever precedes the first colon as the scheme name.
    scheme = url.split(":")[0]
  protocol_count[scheme] += 1

pprint.pprint(dict(protocol_count))

{'http': 8755, 'https': 10077}




---

# Analysing top level domains

In [5]:
# Extracting tld and domain name.
# tld_count maps each TLD string to the number of URLs using it; domain_names collects one host per row.
tld_count = defaultdict(lambda: 0)
domain_names = []
for index, row in online_valid_df.iterrows():
    # Extracting the tld from the url
    # The scheme strings are removed wherever they occur and everything before the first "/" is kept.
    # Anything that directly follows the host without a "/" (port, query string, ...) stays in this value.
    domain_name = row["url"].replace("https://","").replace("http://","").split("/")[0]
    domain_names.append(domain_name)
    # The TLD is taken to be the text after the last dot of the extracted host.
    tld = domain_name.split(".")[-1]
    tld_count[tld] += 1

# TLD counts as a Series, sorted so that the most frequent TLD comes first.
tld_df = pd.Series(dict(tld_count))
tld_df.sort_values(ascending=False, inplace=True)

# Number of TLDs shown individually; all remaining TLDs are summed into an "OTHERS" entry.
show_top_n = 20

# NOTE(review): tld_print is a slice of tld_df that is then extended with a new label.
# Confirm this does not touch tld_df itself (the percentage printed below relies on tld_df being unchanged).
tld_print = tld_df.iloc[:show_top_n]
tld_print["OTHERS"] = tld_df.iloc[show_top_n:].sum()

# Store the extracted host next to each URL (this mutates online_valid_df in place).
online_valid_df["domain_names"] = domain_names


# NOTE: whitelist_file_alexa is assigned but not used in this cell; the Umbrella file is the one that is read.
# The whitelist is re-read here even though an earlier cell already loaded the same file.
whitelist_file_alexa = "top-1m_alexa.csv"
whitelist_file_umbrella = "top-1m_umbrella.csv"
alexa_whitelist_df = pd.read_csv(whitelist_file_umbrella, header=None, names=["rank", "domain_names"])

# Finding if there are any domains that are also in the whitelist.
domains_in_whitelist = np.intersect1d(online_valid_df["domain_names"], alexa_whitelist_df["domain_names"])
# Tagging the whitelisted domains as such.
# NOTE(review): np.in1d is deprecated in recent NumPy releases in favour of np.isin - confirm against the installed version.
online_valid_df["in_whitelist"] = np.in1d(online_valid_df["domain_names"], domains_in_whitelist)

# Summary output: row count, a sample of the frame, the top-N TLD table and the share covered by those TLDs.
print(online_valid_df.shape[0], "rows")
print(online_valid_df.head(20).to_string())
print(tld_print.to_frame(name="TLD Count").transpose().to_string())
print(f"Percentage of top {show_top_n} tlds: {np.round(100*tld_df.iloc[:show_top_n].sum()/tld_df.sum(), decimals=2)} %")
print()
print("Number of urls that have domains which are in the whilelist:", online_valid_df["in_whitelist"].sum())

# For the analysis, excluding all where the domain name is in the whitelist.
# Both sides are filtered: phishing rows whose host is whitelisted are dropped, and the shared hosts
# are removed from the whitelist as well, so the two resulting sets have no host in common.

online_valid_df_without_intersection = online_valid_df.loc[online_valid_df['in_whitelist'] == False]
alexa_whitelist_df_without_intersection = alexa_whitelist_df.loc[np.invert(alexa_whitelist_df['domain_names'].isin(domains_in_whitelist))]

18832 rows
                                                                                                                                                                                   url                                            phish_detail_url            submission_time verified          verification_time online     target                                                     domain_names  in_whitelist
0                                                                                                                                          https://mise-a-jour-orange7.webnode.com.co/  http://www.phishtank.com/phish_detail.php?phish_id=6800831  2020-10-08T12:04:27+00:00      yes  2020-10-08T12:09:47+00:00    yes      Other                               mise-a-jour-orange7.webnode.com.co         False
1                                                                                                                       https://rubysoap.hk/serv/fatturazione-staff/home/z0n51/lod.php 



---

# Target Company


In [6]:
# First analysing the targets as they are in the raw dataset.
# Calculating target counts
target_counts = online_valid_df["target"].value_counts()
# Rename the "Other" bucket to "unidentified": pop removes the old entry and the assignment adds the new label.
target_counts["unidentified"] = target_counts.pop("Other")
# Re-sort because the renamed entry was appended at the end.
target_counts.sort_values(inplace=True, ascending=False)
# NOTE: show_top_n is reassigned here (a previous cell set it to 20); later cells see this value.
show_top_n = 15
targets_print = target_counts.iloc[:show_top_n]
# Everything outside the top-N targets is summed into a single "OTHERS" row.
targets_print["OTHERS"] = target_counts.iloc[show_top_n:].sum()
print(targets_print.to_frame(name="Target Count").to_string())
# This lookup requires "unidentified" to be among the top-N entries of targets_print.
print(np.round(100*(targets_print["unidentified"]/online_valid_df.shape[0]), decimals=2), "% of targets are unidentified.")

                          Target Count
unidentified                     15776
RuneScape                          492
PayPal                             443
eBay, Inc.                         305
Facebook                           293
Halifax                            245
Microsoft                          222
Amazon.com                         132
Google                              77
Virustotal                          68
Internal Revenue Service            49
Orange                              49
HSBC Group                          44
Lloyds Bank                         39
ABSA Bank                           37
OTHERS                             561
83.77 % of targets are unidentified.




---

# Heuristics and Rule Implementation


In [7]:
# Heuristics
# Each heuristic returns 1 if phishing was detected, and returns 0 if not phishing.
# Heuristics required for rules 2,4,5,6,7:
# Number of slashes >= 5 , implemented
# Special characters = yes , implemented
# Transport layer security = http , implemented
# Top level domain = yes , implemented
# Length of host >= 75 , implemented
# Number of terms in host > 4 , implemented
# Length of url >= 75 , implemented
# Unicode in URL = yes, implemented
# Dots in hostname of url > 4 , implemented
# subdomain = yes , implemented

# All other heuristics are not implemented because they are not used in any predictive apriori rule.

# importing the string module
import string


def heuristic_1_len_host(domain_name, threshold=75):
  """
  Decides if phishing based on the length of the domain name (host).

  The rule is "length of host >= threshold": a host whose length reaches the
  threshold is flagged, anything shorter is not.

  Args:
    domain_name: host part of the URL as a string.
    threshold: minimum length (inclusive) that is considered phishing.

  Returns:
    1 if len(domain_name) >= threshold, otherwise 0.
  """
  # Inclusive comparison: a host of exactly `threshold` characters is flagged.
  return 1 if len(domain_name) >= threshold else 0

def heuristic_2_slashes_url(url, threshold=5):
  """
  Flags a URL as phishing when it contains at least `threshold` slashes.

  Returns 1 if the number of "/" characters in `url` is >= threshold, else 0.
  """
  slash_total = url.count("/")
  return int(slash_total >= threshold)

def heuristic_3_dots_host(domain_name, threshold=4):
  """
  Flags a host as phishing when it contains more than `threshold` dots.

  Returns 1 if the number of "." characters in `domain_name` is > threshold, else 0.
  """
  dot_total = domain_name.count(".")
  return int(dot_total > threshold)

def heuristic_4_terms_host(domain_name, threshold=4):
  """
  Flags a host as phishing when it is made of more than `threshold` terms.

  Terms are the pieces obtained by splitting the host on ".", so a host
  with k dots has k + 1 terms.

  Returns 1 if the number of terms is > threshold, else 0.
  """
  terms = domain_name.split(".")
  return int(len(terms) > threshold)

def heuristic_5_specialchar_host(domain_name):
  r"""
  If special characters in host then phishing.

  The special characters used in the heuristic are not well defined.
  Therefore every character of string.punctuation is used, except "."
  (which separates the labels of a host name):
  !"#$%&'()*+,-/:;<=>?@[\]^_`{|}~

  Args:
    domain_name: host part of the URL as a string.

  Returns:
    1 if at least one character of `domain_name` is a special character, else 0.
  """
  # Raw docstring above: the punctuation list contains a backslash, which would
  # otherwise be read as an (invalid) escape sequence.
  special_chars = string.punctuation.replace(".", "")
  # Generator expression lets any() stop at the first special character found.
  return int(any(char in special_chars for char in domain_name))

def heuristic_7_unicode_in_url(url):
  """
  Checks if any non-ASCII (unicode) characters are used in the URL; if so, phishing.

  In python, strict conversion to ascii raises UnicodeEncodeError when a
  non-ASCII character is present.

  Args:
    url: the URL as a string.

  Returns:
    0 if the URL is pure ASCII, 1 if it contains at least one non-ASCII character.
  """
  try:
    url.encode('ascii', 'strict')
  except UnicodeEncodeError:
    # Only the encoding failure means "unicode present"; other errors
    # (e.g. a non-string input) are no longer silently reported as phishing.
    return 1
  return 0

def heuristic_8_tls_url(url):
  """
  If the URL does not use https then phishing.

  The scheme is matched as a prefix and case-insensitively (URL schemes are
  case-insensitive), so "HTTPS://..." counts as https. Leading whitespace
  is ignored.

  Args:
    url: the URL as a string.

  Returns:
    0 if the URL starts with the https scheme, 1 otherwise.
  """
  uses_https = url.lstrip().lower().startswith("https:")
  return 0 if uses_https else 1

def heuristic_9_subdomain_host(domain_name):
  """
  Returns 1 if the host uses a subdomain, else 0.

  One dot is needed for the TLD, so a host with more than one dot is treated
  as having a subdomain. Delegates to the dot counting heuristic.
  """
  # One dot is accounted for by the TLD; any further dot indicates a subdomain.
  dots_used_by_tld = 1
  return heuristic_3_dots_host(domain_name, threshold=dots_used_by_tld)

def heuristic_11_TLD_host(domain_name):
  """
  Returns 1 if the domain name contains a TLD (i.e. at least one dot), otherwise 0.
  This implements the heuristic "Top level domain = yes".
  Since the samples are all valid internet addresses they must include a TLD to be reachable.
  There is at least 1 dot for the TLD. Making use of the dot counting heuristic.
  """
  # threshold=0 means "more than zero dots", i.e. at least one dot is present.
  return heuristic_3_dots_host(domain_name, threshold=0)

def heuristic_14_len_url(url, threshold=75):
  """
  Decides if phishing based on the length of the url.

  The rule is "length of url >= threshold": a URL whose length reaches the
  threshold is flagged, anything shorter is not.

  Args:
    url: the full URL as a string.
    threshold: minimum length (inclusive) that is considered phishing.

  Returns:
    1 if len(url) >= threshold, otherwise 0.
  """
  # Inclusive comparison: a URL of exactly `threshold` characters is flagged.
  return 1 if len(url) >= threshold else 0

# Quick manual check of one heuristic on a single URL.
# The [10] lookup is label-based, so it needs index label 10 to survive the whitelist filtering.
test_url = online_valid_df_without_intersection["url"][10]
# test_url = "asdf"
print(test_url)
# Last expression of the cell: its value is shown as the cell output.
heuristic_8_tls_url(test_url)

http://vvvvvv.amazcn.co.jp-ce24fdb6b628b008fd45b60ce3d15dd264d9388e.ph/


1

In [8]:
# Rules
# Implementing apriori rules 3, 4, 5, 6

def apriori_rule_3(url, domain_name):
  """Apriori rule 3: special characters in host AND no https AND more than 4 terms in host."""
  triggered = (
      heuristic_5_specialchar_host(domain_name)
      and heuristic_8_tls_url(url)
      and heuristic_4_terms_host(domain_name, threshold=4)
  )
  return 1 if triggered else 0

def apriori_rule_4(url, domain_name):
  """Apriori rule 4: more than 4 dots in host AND no https AND more than 4 terms in host."""
  triggered = (
      heuristic_3_dots_host(domain_name, threshold=4)
      and heuristic_8_tls_url(url)
      and heuristic_4_terms_host(domain_name, threshold=4)
  )
  return 1 if triggered else 0

def apriori_rule_5(url, domain_name):
  """Apriori rule 5: slash heuristic AND more than 4 dots in host AND URL length heuristic."""
  triggered = (
      heuristic_2_slashes_url(url, threshold=5)
      and heuristic_3_dots_host(domain_name, threshold=4)
      and heuristic_14_len_url(url, threshold=75)
  )
  return 1 if triggered else 0

def apriori_rule_6(url, domain_name):
  """Apriori rule 6: special characters in host AND no https AND TLD heuristic."""
  triggered = (
      heuristic_5_specialchar_host(domain_name)
      and heuristic_8_tls_url(url)
      and heuristic_11_TLD_host(domain_name)
  )
  return 1 if triggered else 0
  

# Implementing predictive apriori rules 2, 4, 5, 6, 7. The other rules are not evaluated because heuristic 10 (keywords in path of url) is not sufficiently described to be reproduced.
def predictive_apriori_rule_2(url, domain_name):
  """Predictive apriori rule 2: slash heuristic AND unicode in URL AND no https AND URL length heuristic."""
  triggered = (
      heuristic_2_slashes_url(url, threshold=5)
      and heuristic_7_unicode_in_url(url)
      and heuristic_8_tls_url(url)
      and heuristic_14_len_url(url, threshold=75)
  )
  return 1 if triggered else 0

def predictive_apriori_rule_4(url, domain_name):
  """Predictive apriori rule 4: slashes AND special chars in host AND unicode in URL AND TLD AND URL length."""
  triggered = (
      heuristic_2_slashes_url(url, threshold=5)
      and heuristic_5_specialchar_host(domain_name)
      and heuristic_7_unicode_in_url(url)
      and heuristic_11_TLD_host(domain_name)
      and heuristic_14_len_url(url, threshold=75)
  )
  return 1 if triggered else 0

def predictive_apriori_rule_5(url, domain_name):
  """Predictive apriori rule 5: slashes AND more than 4 dots in host AND special chars AND unicode in URL AND TLD."""
  triggered = (
      heuristic_2_slashes_url(url, threshold=5)
      and heuristic_3_dots_host(domain_name, threshold=4)
      and heuristic_5_specialchar_host(domain_name)
      and heuristic_7_unicode_in_url(url)
      and heuristic_11_TLD_host(domain_name)
  )
  return 1 if triggered else 0

def predictive_apriori_rule_6(url, domain_name):
  """Predictive apriori rule 6: slashes AND more than 4 dots in host AND special chars in host AND URL length."""
  triggered = (
      heuristic_2_slashes_url(url, threshold=5)
      and heuristic_3_dots_host(domain_name, threshold=4)
      and heuristic_5_specialchar_host(domain_name)
      and heuristic_14_len_url(url, threshold=75)
  )
  return 1 if triggered else 0

def predictive_apriori_rule_7(url, domain_name):
  """Predictive apriori rule 7: special chars in host AND subdomain present AND terms-in-host heuristic (default threshold).

  Only `domain_name` is inspected; `url` is accepted for a uniform rule signature.
  """
  triggered = (
      heuristic_5_specialchar_host(domain_name)
      and heuristic_9_subdomain_host(domain_name)
      and heuristic_4_terms_host(domain_name)
  )
  return 1 if triggered else 0



---
# Rule evaluation


In [9]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is zero."""
    return numerator / denominator if denominator else 0


def statistics_table_printer(predictions_binary, y_binary, decimals=3):
    """
    Print the confusion-matrix counts and the derived rates for binary predictions.

    The hypothesis is "is phishing": positive = phishing (1), negative = not phishing (0).

    Args:
        predictions_binary: iterable of 0/1 predictions.
        y_binary: iterable of 0/1 true labels, aligned with `predictions_binary`.
        decimals: number of decimals used when rounding the percentages.

    Prints:
        A line with the TN/FP/FN/TP counts followed by a PrettyTable holding
        PPV, FDR, TPR, FPR, FOR, NPV, FNR and TNR as percentages.
        A rate whose denominator is zero is reported as 0.
    """
    # Each (label, prediction) pair is encoded as a 2-bit number with the LABEL as the
    # high bit and the PREDICTION as the low bit:
    #   label 0, prediction 0 -> 0 (TN)      label 0, prediction 1 -> 1 (FP)
    #   label 1, prediction 0 -> 2 (FN)      label 1, prediction 1 -> 3 (TP)
    outcome_codes = [2 * int(label) + int(prediction) for prediction, label in zip(predictions_binary, y_binary)]
    # Counting the number of times each outcome code occurs (as plain Python ints).
    codes, code_counts = np.unique(outcome_codes, return_counts=True)
    tallies = dict(zip(codes.tolist(), code_counts.tolist()))
    outcome_labels = ["TN", "FP", "FN", "TP"]
    evaluation_ratios_counts = {name: tallies.get(code, 0) for code, name in enumerate(outcome_labels)}
    print("Evaluation counts:", evaluation_ratios_counts)

    tn = evaluation_ratios_counts["TN"]
    fp = evaluation_ratios_counts["FP"]
    fn = evaluation_ratios_counts["FN"]
    tp = evaluation_ratios_counts["TP"]

    # Rates conditioned on the prediction (rows of the table).
    positive_predictive_value = _safe_ratio(tp, tp + fp)
    false_discovery_rate = _safe_ratio(fp, tp + fp)
    false_omission_rate = _safe_ratio(fn, tn + fn)
    negative_predictive_value = _safe_ratio(tn, tn + fn)
    # Rates conditioned on the true label (columns of the table).
    true_positive_rate = _safe_ratio(tp, tp + fn)
    false_negative_rate = _safe_ratio(fn, tp + fn)
    false_positive_rate = _safe_ratio(fp, fp + tn)
    true_negative_rate = _safe_ratio(tn, tn + fp)

    t = PrettyTable(["", 'Is phishing', "Not phishing"])
    t.add_row(['Predicted phishing', "TP: {TP}".format(**evaluation_ratios_counts), "FP: {FP}".format(**evaluation_ratios_counts)])
    t.add_row(['', f"PPV: {np.round(positive_predictive_value*100, decimals=decimals)}%", f"FDR: {np.round(false_discovery_rate*100, decimals=decimals)}%"])
    t.add_row(['', f"TPR: {np.round(true_positive_rate*100, decimals=decimals)}%", f"FPR: {np.round(false_positive_rate*100, decimals=decimals)}%"])
    t.add_row(['Predicted safe', "FN: {FN}".format(**evaluation_ratios_counts), "TN: {TN}".format(**evaluation_ratios_counts)])
    t.add_row(['', f"FOR: {np.round(false_omission_rate*100, decimals=decimals)}%", f"NPV: {np.round(negative_predictive_value*100, decimals=decimals)}%"])
    t.add_row(['', f"FNR: {np.round(false_negative_rate*100, decimals=decimals)}%", f"TNR: {np.round(true_negative_rate*100, decimals=decimals)}%"])
    print(t)

In [21]:
# One result list per rule; each collects the 0/1 verdict of that rule for every phishing URL.
evaluation_rule_3, evaluation_rule_4, evaluation_rule_5, evaluation_rule_6 = [], [], [], []
evaluation_predictive_rule_2, evaluation_predictive_rule_4, evaluation_predictive_rule_5 = [], [], []
evaluation_predictive_rule_6, evaluation_predictive_rule_7 = [], []

# (title printed above the table, rule function, list receiving the verdicts) - in print order.
rule_runs = [
  ("Evaluation rule 3", apriori_rule_3, evaluation_rule_3),
  ("Evaluation rule 4", apriori_rule_4, evaluation_rule_4),
  ("Evaluation rule 5", apriori_rule_5, evaluation_rule_5),
  ("Evaluation rule 6", apriori_rule_6, evaluation_rule_6),
  ("Evaluation predictive rule 2", predictive_apriori_rule_2, evaluation_predictive_rule_2),
  ("Evaluation predictive rule 4", predictive_apriori_rule_4, evaluation_predictive_rule_4),
  ("Evaluation predictive rule 5", predictive_apriori_rule_5, evaluation_predictive_rule_5),
  ("Evaluation predictive rule 6", predictive_apriori_rule_6, evaluation_predictive_rule_6),
  ("Evaluation predictive rule 7", predictive_apriori_rule_7, evaluation_predictive_rule_7),
]

phishing_urls = online_valid_df_without_intersection["url"]
phishing_hosts = online_valid_df_without_intersection["domain_names"]
for url, domain_name in zip(phishing_urls, phishing_hosts):
  for _title, rule, verdicts in rule_runs:
    verdicts.append(rule(url, domain_name))

# Printing the statistics about each rule. The true target should be all phishing, so target is 1
for title, _rule, verdicts in rule_runs:
  print(title)
  statistics_table_printer(verdicts, [1]*len(verdicts))

Evaluation rule 3
Evaluation counts: {'TN': 0, 'FP': 0, 'FN': 16799, 'TP': 655}
+--------------------+--------------+--------------+
|                    | Is phishing  | Not phishing |
+--------------------+--------------+--------------+
| Predicted phishing |   TP: 655    |    FP: 0     |
|                    | PPV: 100.0%  |  FDR: 0.0%   |
|                    | TPR: 3.753%  |   FPR: 0%    |
|   Predicted safe   |  FN: 16799   |    TN: 0     |
|                    | FOR: 100.0%  |  NPV: 0.0%   |
|                    | FNR: 96.247% |   TNR: 0%    |
+--------------------+--------------+--------------+
Evaluation rule 4
Evaluation counts: {'TN': 0, 'FP': 0, 'FN': 17287, 'TP': 167}
+--------------------+--------------+--------------+
|                    | Is phishing  | Not phishing |
+--------------------+--------------+--------------+
| Predicted phishing |   TP: 167    |    FP: 0     |
|                    | PPV: 100.0%  |  FDR: 0.0%   |
|                    | TPR: 0.957%  |   FPR: 

In [22]:
# A URL counts as detected when at least one rule fired: element-wise maximum over all rule verdicts.
all_rule_verdicts = [
    evaluation_rule_3,
    evaluation_rule_4,
    evaluation_rule_5,
    evaluation_rule_6,
    evaluation_predictive_rule_2,
    evaluation_predictive_rule_4,
    evaluation_predictive_rule_5,
    evaluation_predictive_rule_6,
    evaluation_predictive_rule_7,
]
combined_evaluation = np.max(all_rule_verdicts, axis = 0)
print("combined_evaluation of apriori rules 3, 4, 5, 6 and predictive apriori rule 2, 4, 5, 6, 7")
all_phishing_labels = [1]*len(combined_evaluation)
statistics_table_printer(combined_evaluation, all_phishing_labels)

combined_evaluation of apriori rules 3, 4, 5, 6 and predictive apriori rule 2, 4, 5, 6, 7
Evaluation counts: {'TN': 0, 'FP': 0, 'FN': 14118, 'TP': 3336}
+--------------------+--------------+--------------+
|                    | Is phishing  | Not phishing |
+--------------------+--------------+--------------+
| Predicted phishing |   TP: 3336   |    FP: 0     |
|                    | PPV: 100.0%  |  FDR: 0.0%   |
|                    | TPR: 19.113% |   FPR: 0%    |
|   Predicted safe   |  FN: 14118   |    TN: 0     |
|                    | FOR: 100.0%  |  NPV: 0.0%   |
|                    | FNR: 80.887% |   TNR: 0%    |
+--------------------+--------------+--------------+




---
# AI Prediction with whitelisted data.


In [12]:
# Seed for numpy's global RNG (also reused for the train/test split in a later cell).
random_seed = 15
oversampling_rate = 1 # Set this to 1 to have the positive samples match the phishing samples. Set to greater than 1 to use more negative samples.
# Seeding must happen before np.random.choice below so the sampled whitelist domains are reproducible.
np.random.seed(random_seed)

# Positive class: hosts of the phishing URLs that are not in the whitelist.
phishing_domains = online_valid_df_without_intersection["domain_names"].values
# Negative class: whitelist domains sampled without replacement, oversampling_rate times as many as phishing hosts.
whitelist_domains = np.random.choice(alexa_whitelist_df_without_intersection["domain_names"].values, size=int(oversampling_rate*len(phishing_domains)), replace=False)

# Calling a phishing url 1 and a benign url 0.
# Using character encoding as the vocabulary.
# Feeding the url as the sequence.

print()
print("Selected Data Examples:")
print("Phishing domains:", phishing_domains, len(phishing_domains))
print("Benign domains:", whitelist_domains, len(whitelist_domains))

# Creating the samples array and the label array
print()
# X holds the phishing hosts first, then the benign ones; y and sample_weights follow the same order.
X = list(phishing_domains) + list(whitelist_domains)
y = [1]*len(phishing_domains) + [0]*len(whitelist_domains)
# Benign samples are down-weighted by 1/oversampling_rate so both classes carry the same total weight.
sample_weights = [1]*len(phishing_domains) + [1/oversampling_rate]*len(whitelist_domains)


Selected Data Examples:
Phishing domains: ['mise-a-jour-orange7.webnode.com.co' 'rubysoap.hk' 'rubysoap.hk' ...
 'ffqyw.com' 'smcc-cacc.smdboc.top' 'vozyvisionradio.com'] 17454
Benign domains: ['mt.com.gccdn.net' 'img-d01.moneycontrol.co.in' 'merlin-business.com' ...
 'cdn.video.abc.com' 'cefetmg.br' 'linier.rctiplus.id'] 17454



In [13]:
# Vocabulary generation for the character encoding used in neural network.
# Every distinct character that occurs in any sample of X, in descending character order.
vocab = sorted(set("".join(X)), reverse=True)
# Inserting a space at index 0, since it is not used in url and will be used for padding the examples.
vocab.insert(0, " ")
vocab_size = len(vocab)

print()
print(f"Encoding Vocabulary ({vocab_size}) used:")
print(vocab)
# Creating a mapping from unique characters to indices
# char2idx: character -> integer index; idx2char: integer index -> character (numpy array for fancy indexing).
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

def text_to_int(text):
  """Encode `text` as a numpy array of vocabulary indices, one per character (looked up in char2idx)."""
  indices = []
  for character in text:
    indices.append(char2idx[character])
  return np.array(indices)

# Show the integer encoding of the first phishing host as an example.
print("Encoding example:")
print(text_to_int(phishing_domains[0]))

def int_to_text(ints):
  """
  Decode a sequence of vocabulary indices back into a string.

  Args:
    ints: array-like of integer indices into idx2char, or an object exposing
      a `.numpy()` method that returns such an array.

  Returns:
    The decoded string.
  """
  try:
    ints = ints.numpy()
  except AttributeError:
    # Input has no .numpy() method (e.g. it already is a numpy array): use it as-is.
    pass
  return ''.join(idx2char[ints])

# Round trip: encoding and then decoding should print the original host again.
print(int_to_text(text_to_int(phishing_domains[0])))


Encoding Vocabulary (73) used:
[' ', 'z', 'y', 'x', 'w', 'v', 'u', 't', 's', 'r', 'q', 'p', 'o', 'n', 'm', 'l', 'k', 'j', 'i', 'h', 'g', 'f', 'e', 'd', 'c', 'b', 'a', '_', 'Z', 'Y', 'X', 'W', 'V', 'U', 'T', 'S', 'R', 'Q', 'P', 'O', 'N', 'M', 'L', 'K', 'J', 'I', 'H', 'G', 'F', 'E', 'D', 'C', 'B', 'A', '@', '?', '=', ';', ':', '9', '8', '7', '6', '5', '4', '3', '2', '1', '0', '.', '-', '&', '#']
Encoding example:
[14 18  8 22 70 26 70 17 12  6  9 70 12  9 26 13 20 22 61 69  4 22 25 13
 12 23 22 69 24 12 14 69 24 12]
mise-a-jour-orange7.webnode.com.co


In [14]:
# Investigating the domain name length for the combined domain names:
X_elem_len = [len(domain_name) for domain_name in X]
print("Longest domain name:", np.max(X_elem_len))

print(sorted(X_elem_len, reverse=True)[:show_top_n])
# Only 10 urls are longer than 100 characters. So going with that for sequence length.
max_seq_len = 50
print((np.array(X_elem_len) > max_seq_len).sum(), "URLs longer than the cutoff length", max_seq_len)

Longest domain name: 926
[926, 253, 179, 150, 150, 150, 122, 113, 113, 111, 108, 105, 104, 102, 102]
1637 URLs longer than the cutoff length 50


In [15]:
# Creating test and training datasets
print()
# 85 % / 15 % split of samples, labels and sample weights, seeded with random_seed for reproducibility.
X_train, X_test, y_train, y_test, sample_weights_train, sample_weights_test = train_test_split(np.array(X), np.array(y), np.array(sample_weights), test_size=0.15, random_state=random_seed)

# NOTE: show_top_n is reassigned here and controls how many examples the prints below show.
show_top_n = 5
print(f"Training and testing data: (showing first {show_top_n})")
print(f"Train data {len(X_train)} samples")
print(list(zip(X_train[:show_top_n], y_train[:show_top_n], sample_weights_train[:show_top_n])))
print(f"Test data {len(X_test)} samples")
print(list(zip(X_test[:show_top_n], y_test[:show_top_n], sample_weights_test[:show_top_n])))

# Encoding the domain names using the vocabulary
# Each domain name becomes an integer array whose length equals the length of the name.

X_train_encoded = [text_to_int(domain_name) for domain_name in X_train]
X_test_encoded = [text_to_int(domain_name) for domain_name in X_test]
print()
print(f"Encoded data: (showing first {show_top_n})")
print(f"Train data {len(X_train_encoded)} samples, encoded")
print(list(zip(X_train_encoded[:show_top_n], y_train[:show_top_n])))
print(f"Test data {len(X_test_encoded)} samples, encoded")
print(list(zip(X_test_encoded[:show_top_n], y_test[:show_top_n])))

# Padding to the right sequence length.
# pad_sequences is called with its default settings, so every sample ends up with exactly max_seq_len entries.
# NOTE(review): which side is padded / truncated is decided by the Keras defaults - confirm it matches the intent.
X_train_encoded_padded = sequence.pad_sequences(X_train_encoded, max_seq_len)
X_test_encoded_padded = sequence.pad_sequences(X_test_encoded, max_seq_len)
print()
print(f"Encoded and padded data: (showing first {show_top_n})")
print(f"Train data {len(X_train_encoded_padded)} samples, encoded")
print(list(zip(X_train_encoded_padded[:show_top_n], y_train[:show_top_n])))
print(f"Test data {len(X_test_encoded_padded)} samples, encoded")
print(list(zip(X_test_encoded_padded[:show_top_n], y_test[:show_top_n])))


Training and testing data: (showing first 5)
Train data 29671 samples
[('txxtxx.cn', 0, 1.0), ('valleyobgynassociates.com', 1, 1.0), ('centralusr-notifyp.svc.ms', 0, 1.0), ('dahe.cn', 0, 1.0), ('cm16ps01.ohsum01.ohsu.edu', 0, 1.0)]
Test data 5237 samples
[('cp1sawprxngp04d.saw.msft.net', 0, 1.0), ('ekspert-kujawy.pl', 0, 1.0), ('4980338.fls.doubleclick.net', 0, 1.0), ('ticsoetangspar1936.blogspot.bg', 1, 1.0), ('centromusicalpaternense.es', 1, 1.0)]

Encoded data: (showing first 5)
Train data 29671 samples, encoded
[(array([ 7,  3,  3,  7,  3,  3, 69, 24, 13]), 0), (array([ 5, 26, 15, 15, 22,  2, 12, 25, 20,  2, 13, 26,  8,  8, 12, 24, 18,
       26,  7, 22,  8, 69, 24, 12, 14]), 1), (array([24, 22, 13,  7,  9, 26, 15,  6,  8,  9, 70, 13, 12,  7, 18, 21,  2,
       11, 69,  8,  5, 24, 69, 14,  8]), 0), (array([23, 26, 19, 22, 69, 24, 13]), 0), (array([24, 14, 67, 62, 11,  8, 68, 67, 69, 12, 19,  8,  6, 14, 68, 67, 69,
       12, 19,  8,  6, 69, 22, 23,  6]), 0)]
Test data 5237 samples



---

# Testing predictive apriori rule 7 for false positives

In [23]:
# Rule 7 only looks at the host, so an empty string is passed as the URL.
# The test split contains benign hosts as well, which makes false positives measurable.
evaluation_predictive_rule_7 = [
  predictive_apriori_rule_7("", domain_name)
  for domain_name in X_test
]

print("Evaluation predictive rule 7")
statistics_table_printer(evaluation_predictive_rule_7, y_test)

Evaluation predictive rule 7
Evaluation counts: {'TN': 2504, 'FP': 106, 'FN': 2396, 'TP': 231}
+--------------------+--------------+--------------+
|                    | Is phishing  | Not phishing |
+--------------------+--------------+--------------+
| Predicted phishing |   TP: 231    |   FP: 106    |
|                    | PPV: 68.546% | FDR: 31.454% |
|                    | TPR: 8.793%  | FPR: 4.061%  |
|   Predicted safe   |   FN: 2396   |   TN: 2504   |
|                    | FOR: 48.898% | NPV: 51.102% |
|                    | FNR: 91.207% | TNR: 95.939% |
+--------------------+--------------+--------------+




---

# Running the Neural Network

In [17]:
#Evaluating the model
def evaluate_nn_model(X, y, threshold=0.5):
    """
    Custom nn evaluation to get the TP, TN, FP, FN rates.

    Runs the global `model` on `X`, binarises the predictions with `threshold`
    and prints the resulting statistics table.

    Anything above threshold is considered phishing (1).
    Anything at or below threshold is considered not phishing (0).

    Args:
        X: encoded and padded samples to predict on.
        y: true 0/1 labels aligned with `X`.
        threshold: cut-off applied to the model output.

    Returns:
        The mean of the raw predictions.
    """
    print()
    # Predict on the samples that were passed in (not on a fixed global test set).
    predictions = model.predict(X).flatten()
    mean_prediction = np.mean(predictions)
    print(f"Calculated {len(predictions)} predictions with a mean value of {mean_prediction}")
    print(f"Evaluating using threshold {threshold}")
    # Turning the predictions into 0 and 1 by checking the threshold. (0 safe, 1 phishing)
    predictions_boolean = predictions > threshold
    # Builtin int instead of the np.int alias, which no longer exists in current NumPy.
    predictions_binary = predictions_boolean.astype(int)
    print(f"Cut-off threshold: {np.round(threshold, decimals=4)}")
    statistics_table_printer(predictions_binary, y)
    return mean_prediction

In [18]:
# Creating the recurrent model for the predictions:
print("\n---------------Tensorflow magic------------------\n")
# print(tf.config.list_physical_devices('GPU'))
# For some reason needed so the code runs properly on the gpu.
# Memory growth is switched on for every visible GPU; the loop body does nothing when no GPU is present.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print('gpu', gpu)
    tf.config.experimental.set_memory_growth(gpu, True)
    print('memory growth:' , tf.config.experimental.get_memory_growth(gpu))

# https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM
# Architecture: character embedding (vocab_size -> 64) -> LSTM(512) -> Dense(512, tanh) -> Dense(1, sigmoid).
# The single sigmoid output is the score that later cells threshold into phishing / not phishing.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64),
    # tf.keras.layers.LSTM(512, return_sequences=True),
    tf.keras.layers.LSTM(512),
    # tf.keras.layers.LSTM(128, go_backwards=True),
    # tf.keras.layers.Dense(512),
    # tf.keras.layers.Dense(128,activation="tanh"),
    # tf.keras.layers.Dense(128),
    tf.keras.layers.Dense(512,activation="tanh"),
    # tf.keras.layers.Dense(512,activation="tanh"),
    # tf.keras.layers.Dense(512,activation="tanh"),
    # tf.keras.layers.Dense(512,activation="tanh"),
    # tf.keras.layers.Dense(32,activation="sigmoid"),
    # tf.keras.layers.Dense(32),
    # tf.keras.layers.Dense(16),
    tf.keras.layers.Dense(1, activation="sigmoid")
])


# Compiling the model
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as the only metric.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['acc'])
print(model.summary())

# Training the model
# NOTE(review): the test split is passed as validation_data, so the validation figures reported during
# training are computed on the same data that is later used for the final evaluation.
history = model.fit(X_train_encoded_padded, y_train, epochs=10, validation_data=(X_test_encoded_padded, y_test), sample_weight=sample_weights_train)


---------------Tensorflow magic------------------

gpu PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
memory growth: True
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          4672      
_________________________________________________________________
lstm (LSTM)                  (None, 512)               1181696   
_________________________________________________________________
dense (Dense)                (None, 512)               262656    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 513       
Total params: 1,449,537
Trainable params: 1,449,537
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

In [24]:
# Some custom evaluation
# Evaluate at three thresholds: 0.5, the mean prediction returned by the first call,
# and a threshold derived from oversampling_rate (equals 0.5 when oversampling_rate is 1).
mean_prediction = evaluate_nn_model(X_test_encoded_padded, y_test, threshold=0.5)
mean_prediction = evaluate_nn_model(X_test_encoded_padded, y_test, threshold=mean_prediction)
evaluate_nn_model(X_test_encoded_padded, y_test, threshold=1-(1/(oversampling_rate+1)))


# Making a prediction on a url using the model:
print()
# NOTE: show_top_n is reassigned here; it controls how many test samples are shown below.
show_top_n = 10
print(f"Predicting the first {show_top_n} examples from the test data:")

first_n_predictions = model.predict(X_test_encoded_padded[:show_top_n])
print(first_n_predictions.flatten())

# Side-by-side view of host, raw model score and true label for the same first samples.
prediction_df = pd.DataFrame(data={"domain_names": X_test[:show_top_n], "predictions": first_n_predictions.flatten(), "truth": y_test[:show_top_n]})
print(prediction_df)

def predict_url(url):
    """Encode a single host/url string, pad it to max_seq_len, and print the model's score for it."""
    single_sample_batch = [text_to_int(url)]
    padded_batch = sequence.pad_sequences(single_sample_batch, max_seq_len)
    scores = model.predict(padded_batch)
    print("Prediction on url:", url, scores[0][0])

# Hand-picked hosts scored one by one; the heading strings are printed exactly as before.
phishing_examples = (
    "frgcxtmjzfjpdcusge.top",
    "evilmadeupurl.phish",
    "evil.madeupurl.phish",
)
safe_examples = (
    "sharelatex.cryptobro.eu",
    "sharelatex.cryptobro.eu:5000",
    "google.com",
    "www.google.com",
    "gmail.google.com",
    "mail.google.com",
    "tudelft.nl",
    "brightspace.tudelft.nl",
    "colab.research.google.com",
    "00-gayrettepe-t3-8---00-gayrettepe-xrs-t2-1.statik.turktelekom.com.tr",
)

print("\nPhishing ULR examples:")
for example_host in phishing_examples:
    predict_url(example_host)

print("\nSafe URL examples:")
for example_host in safe_examples:
    predict_url(example_host)


Calculated 5237 predictions with a mean value of 0.5333784818649292
Evaluating using threshold 0.5
Cut-off threshold: 0.5
Evaluation counts: {'TN': 2114, 'FP': 496, 'FN': 249, 'TP': 2378}
+--------------------+--------------+--------------+
|                    | Is phishing  | Not phishing |
+--------------------+--------------+--------------+
| Predicted phishing |   TP: 2378   |   FP: 496    |
|                    | PPV: 82.742% | FDR: 17.258% |
|                    | TPR: 90.522% | FPR: 19.004% |
|   Predicted safe   |   FN: 249    |   TN: 2114   |
|                    | FOR: 10.537% | NPV: 89.463% |
|                    | FNR: 9.478%  | TNR: 80.996% |
+--------------------+--------------+--------------+

Calculated 5237 predictions with a mean value of 0.5333784818649292
Evaluating using threshold 0.5333784818649292
Cut-off threshold: 0.5333999991416931
Evaluation counts: {'TN': 2146, 'FP': 464, 'FN': 269, 'TP': 2358}
+--------------------+--------------+--------------+
|        

In [27]:
# Re-evaluate the test split with a stricter, hand-chosen cut-off of 0.65.
mean_prediction = evaluate_nn_model(X_test_encoded_padded, y_test, threshold=0.65)


Calculated 5237 predictions with a mean value of 0.5333784818649292
Evaluating using threshold 0.65
Cut-off threshold: 0.65
Evaluation counts: {'TN': 2244, 'FP': 366, 'FN': 376, 'TP': 2251}
+--------------------+--------------+--------------+
|                    | Is phishing  | Not phishing |
+--------------------+--------------+--------------+
| Predicted phishing |   TP: 2251   |   FP: 366    |
|                    | PPV: 86.015% | FDR: 13.985% |
|                    | TPR: 85.687% | FPR: 14.023% |
|   Predicted safe   |   FN: 376    |   TN: 2244   |
|                    | FOR: 14.351% | NPV: 85.649% |
|                    | FNR: 14.313% | TNR: 85.977% |
+--------------------+--------------+--------------+
