<a href="https://colab.research.google.com/github/roshsoftco/phishing-website-detector/blob/1.0/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

---



The dataset contains 30 attributes. Each row has 31 values: the 30 attributes followed by the class.

The `labelSplitter` function splits the last value (the class) off each row and returns two lists: the attribute rows and the class labels.

In [1]:
import time

def labelSplitter(csv, rm=None):
  """
  Splits the last value from each row of the parsed csv dataset as class.

  Parameters
  ----------
  csv : read_csv value from pandas
    Parsed dataset; every row holds the attributes followed by the class.
  rm : iterable of int, optional
    Zero-based attribute positions to drop from every row. When omitted,
    a module-level variable named ``rm`` is used if one exists, so existing
    ``labelSplitter(csv)`` calls keep working; if there is none, nothing
    is dropped.

  Returns
  -------
  list
    ``[data, labels]`` where ``data`` is the list of attribute rows and
    ``labels`` is the list of class values, both in row order.
  """

  if rm is None:
    # Fall back to the notebook-level setting without raising NameError
    # when the cell that defines it has not been run yet.
    rm = globals().get("rm")

  # Computed once for all rows. Highest position first so earlier deletions do
  # not shift the positions still to be removed; duplicates are collapsed so a
  # repeated index cannot delete a neighbouring column.
  drop = sorted(set(rm), reverse = True) if rm else []

  data = []
  labels = []
  for rowArray in csv.values: # iterate through dataset row by row
    row = rowArray.tolist()
    labels.append(row[-1]) # extract last value as class [a1, a2, ..., class]

    features = row[:-1] # rest of the data as attributes
    for ele in drop:
      del features[ele] # remove unwanted attribute

    data.append(features)

  return [data, labels]

def timeDiff(start, end):
  """
  Formats the difference between two time stamps.

  Parameters
  ----------
  start : number
    earlier time stamp
  end : number
    later time stamp

  Returns
  -------
  str
    ``end - start`` with three decimals followed by " ms"; the value is not
    converted, so both arguments should already be in milliseconds
  """
  elapsed = end - start
  return "{t:.3f} ms".format(t = elapsed)

Read CSV files and assign to variables.

In [None]:
import pandas as pd

# Locations of the three dataset splits (raw CSV files).
training_dataset_url = "https://raw.githubusercontent.com/roshsoftco/phishing-website-detector/1.0/Dataset/training_dataset.csv"
validation_dataset_url = "https://raw.githubusercontent.com/roshsoftco/phishing-website-detector/1.0/Dataset/validation_dataset.csv"
test_dataset_url = "https://raw.githubusercontent.com/roshsoftco/phishing-website-detector/1.0/Dataset/test_dateset.csv"

# Zero-based positions of attributes to leave out; None keeps every attribute.
# rm = [3,5,6,9,12,13,15,18,20,21,23,25,26,27,28,29] # attributes to remove (experiment)
rm = None

# One name per attribute (30 in total).
attributes = [
    "having_IP_Address",
    "URL_Length",
    "Shortining_Service",
    "having_At_Symbol",
    "double_slash_redirecting",
    "Prefix_Suffix",
    "having_Sub_Domain",
    "SSLfinal_State",
    "Domain_registeration_length",
    "Favicon",
    "port",
    "HTTPS_token",
    "Request_URL",
    "URL_of_Anchor",
    "Links_in_tags",
    "SFH",
    "Submitting_to_email",
    "Abnormal_URL",
    "Redirect",
    "on_mouseover",
    "RightClick",
    "popUpWidnow",
    "Iframe",
    "age_of_domain",
    "DNSRecord",
    "web_traffic",
    "Page_Rank",
    "Google_Index",
    "Links_pointing_to_page",
    "Statistical_report",
]

# Drop the names at the positions listed in rm, highest position first so the
# positions still to be removed do not shift.
for ele in sorted(rm or [], reverse = True):
  del attributes[ele] # remove attribute name

# Display names for the two classes.
labels = ["Legitamate", "Phishing"]

## Load the three splits; labelSplitter returns [attribute rows, class labels].

## Training Data
training_data, training_data_labels = labelSplitter(pd.read_csv(training_dataset_url, header=None))

## Validation Data
validation_data, validation_data_labels = labelSplitter(pd.read_csv(validation_dataset_url, header=None))

## Test Data
test_data, test_data_labels = labelSplitter(pd.read_csv(test_dataset_url, header=None))

total = len(training_data_labels) + len(validation_data_labels) + len(test_data_labels)

# Report each split's size with its share of all loaded rows. The share is
# computed from the data, so it stays correct if the CSV files change.
for split_name, split_labels in (("Training", training_data_labels),
                                 ("Validation", validation_data_labels),
                                 ("Testing", test_data_labels)):
  share = 100 * len(split_labels) / total if total else 0 # avoid dividing by zero on empty data
  print(split_name + " dataset size: ", len(split_labels), "({:.0f}%)".format(share))

# Training (Decision Tree)

---



In [None]:
from sklearn import tree
# Fixed seed so that re-running the notebook builds the same tree; without it
# the classifier may break ties between equally good splits differently per run.
RANDOM_STATE = 42
clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=RANDOM_STATE)

# perf_counter is a monotonic high-resolution clock intended for measuring
# intervals; it returns seconds, hence the conversion to milliseconds.
start = time.perf_counter()*1000
clf = clf.fit(training_data, training_data_labels) # train
end = time.perf_counter()*1000

print("Training completed in", timeDiff(start, end))

In [4]:
# import graphviz
# dot_data = tree.export_graphviz(clf, out_file=None,
#                                 feature_names=attributes,
#                                 class_names=labels,
#                                 filled=True, rounded=True, special_characters=True)
# graph = graphviz.Source(dot_data)
# graph

# # dot -Tpng tree.dot -o tree.png # run in terminal

# Validating Predictions

---

In [None]:
# Score the trained classifier on the validation split; the bare name on the
# last line makes the notebook display the value as the cell output.
print("Accuracy: ")
validation_accuracy = clf.score(validation_data, validation_data_labels)
validation_accuracy

# Testing Model

---

Check the accuracy of the model on the test dataset.

In [None]:
# print("Accuracy: ")
# clf.score(test_data, test_data_labels)

Predict in real time.

In [None]:
# Interactive check: repeatedly ask for a row of the test split, predict its
# class with the trained tree, and compare against the stored class.
while True:
  print()
  print("There are", len(test_data_labels), "instances which can be tested.")
  print()

  try:
    row = int(input("Enter row index (-1 exit): "))
  except ValueError:
    # Non-numeric input would otherwise abort the cell with a traceback;
    # report it like an out-of-range index and ask again.
    print("Error: Enter a number between 0 and", (len(test_data_labels)-1))
    continue

  if row < 0: # any negative number ends the session
    break
  if row >= len(test_data_labels):
    print("Error: Enter a number between 0 and", (len(test_data_labels)-1))
    continue

  sample = [test_data[row]] # predict takes a list of rows, so wrap the single row

  # perf_counter is a monotonic clock meant for intervals; seconds -> milliseconds.
  start = time.perf_counter()*1000
  prediction = clf.predict(sample)[0]
  end = time.perf_counter()*1000

  # A class value of -1 maps to labels[0]; any other value maps to labels[1].
  testLbl = labels[0] if (test_data_labels[row] == -1) else labels[1]
  predLbl = labels[0] if (prediction == -1) else labels[1]

  print()
  print("Predicted answer:", prediction, predLbl)
  print("Correct answer:", test_data_labels[row], testLbl)
  print()

  state = "Correct!" if prediction == test_data_labels[row] else "Incorrect!"
  print("Prediction is", state, "(", timeDiff(start, end) , ")")
  print("***")
  print()

print()
print("Application terminated.")