<a href="https://colab.research.google.com/github/roshsoftco/phishing-website-detector/blob/0.2/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

---



The dataset contains 30 attributes. Each row has 31 values: the 30 attributes followed by the class label.

The `labelSplitter` function separates the last value (the class label) from each row and returns the attribute rows and the labels as two lists.

In [None]:
def labelSplitter(csv):
  """
  Separate every row of a parsed CSV dataset into attributes and class.

  The last value of each row is treated as the class; everything before
  it is treated as the attributes of that row.

  Parameters
  ----------
  csv : object returned by pandas read_csv
    Only its `.values` rows are read.

  Returns
  -------
  list
    Two-element list `[data, labels]`: `data` holds one attribute row per
    input row, `labels` holds the matching last value of each row.
  """

  rows = csv.values
  features = [record[:-1] for record in rows]  # all but the last column
  classes = [record[-1] for record in rows]    # last column only
  return [features, classes]

Read the training, validation, and test CSV files, and split each one into attribute rows and class labels.

In [None]:
import pandas as pd

# Locations of the three dataset splits.
# NOTE(review): these URLs embed access tokens in the query string. Tokens
# committed to a notebook are visible to every reader and may stop working;
# consider loading them from the environment instead.
training_dataset_url = "https://raw.githubusercontent.com/roshsoftco/phishing-website-detector/0.2/Dataset/training_dataset.csv?token=AFSK45UKKVPMD5X5QRAUAR3AILN54"
validation_dataset_url = "https://raw.githubusercontent.com/roshsoftco/phishing-website-detector/0.2/Dataset/validation_dataset.csv?token=AFSK45VZNMAQO4SVPQFPFELAILN3I"
test_dataset_url = "https://raw.githubusercontent.com/roshsoftco/phishing-website-detector/0.2/Dataset/test_dateset.csv?token=AFSK45USBWRLYLNNBBCJQMDAILN7U"

# Names of the 30 attribute columns, in column order.
attributes = [
    "having_IP_Address",
    "URL_Length",
    "Shortining_Service",
    "having_At_Symbol",
    "double_slash_redirecting",
    "Prefix_Suffix",
    "having_Sub_Domain",
    "SSLfinal_State",
    "Domain_registeration_length",
    "Favicon",
    "port",
    "HTTPS_token",
    "Request_URL",
    "URL_of_Anchor",
    "Links_in_tags",
    "SFH",
    "Submitting_to_email",
    "Abnormal_URL",
    "Redirect",
    "on_mouseover",
    "RightClick",
    "popUpWidnow",
    "Iframe",
    "age_of_domain",
    "DNSRecord",
    "web_traffic",
    "Page_Rank",
    "Google_Index",
    "Links_pointing_to_page",
    "Statistical_report",
]

# Human-readable class names.
labels = ["Legitamate", "Phishing"]

# The files carry no header row, hence header=None for every read.

# Training split
splitted = labelSplitter(pd.read_csv(training_dataset_url, header=None))
training_data, training_data_labels = splitted

# Validation split
splitted = labelSplitter(pd.read_csv(validation_dataset_url, header=None))
validation_data, validation_data_labels = splitted

# Test split
splitted = labelSplitter(pd.read_csv(test_dataset_url, header=None))
test_data, test_data_labels = splitted

# Drop the reference to the last split pair; it is only a scratch name.
splitted = None

print("Training dataset size: ", len(training_data_labels))
print("Validation dataset size: ", len(validation_data_labels))
print("Testing dataset size: ", len(test_data_labels))

Training dataset size:  8444
Validation dataset size:  2111
Testing dataset size:  500


# Classifier Training (Decision Tree)

---



In [None]:
from sklearn import tree
# A fixed random_state makes the fitted tree, and therefore every accuracy
# figure reported below, reproducible across "Restart & Run All". Without it
# the tree construction is randomized and results can differ between runs.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(training_data, training_data_labels) # train

In [None]:
# import graphviz
# dot_data = tree.export_graphviz(clf, out_file=None,
#                                 feature_names=attributes,
#                                 class_names=labels,
#                                 filled=True, rounded=True, special_characters=True)
# graph = graphviz.Source(dot_data)
# graph

# Validating Predictions

---

In [None]:
# Predict a class for every validation row and compare with the known labels.
predictions = clf.predict(validation_data)

# Count the positions where the prediction agrees with the true label;
# every other prediction is counted as incorrect.
correct = sum(
    1
    for predicted, actual in zip(predictions, validation_data_labels)
    if predicted == actual
)
incorrect = len(predictions) - correct

# Express both counts as a percentage of the validation set size.
correct_percentage = (correct / len(validation_data_labels)) * 100
incorrect_percentage = (incorrect / len(validation_data_labels)) * 100

print("Correct: ", correct, "(", correct_percentage, "%)")
print("Incorrect: ", incorrect, "(", incorrect_percentage, "%)")

Correct:  1883 ( 89.1994315490289 %)
Incorrect:  228 ( 10.800568450971104 %)


# Testing Model

---

In [None]:
# Interactive check of single test rows: the user picks a 1-based row number
# and the cell shows the true class, the predicted class and whether they match.
predictions = clf.predict(test_data)

# Class values as scikit-learn stores them (sorted ascending). A raw class
# value must not be used directly as an index into `labels`: that is only
# right when the classes are exactly 0 and 1, and a negative value such as -1
# silently wraps around to the last entry, so two different classes can print
# the same name. Looking up the value's position in `classes_` gives every
# class its own name.
# NOTE(review): this assumes `labels` is ordered by ascending class value
# (the order `class_names` expects in export_graphviz) - confirm against the
# dataset which class value actually means "phishing".
class_values = list(clf.classes_)

while True:
  print()
  print("There are", len(test_data_labels), "instances which can be tested.")
  print()

  try:
    row = int(input("Enter a row number (0 exit): "))
  except ValueError:
    # Non-numeric input: ask again instead of ending the cell with a traceback.
    print("Error: Enter a number between 1 and", len(test_data_labels))
    continue

  if row == 0:
    break
  elif row < 0 or row > len(test_data_labels):
    print("Error: Enter a number between 1 and", len(test_data_labels))
    continue

  # Row numbers shown to the user are 1-based; the lists are 0-based.
  actual = test_data_labels[row-1]
  predicted = predictions[row-1]

  print()
  print("Correct answer:", labels[class_values.index(actual)])
  print("Predicted answer:", labels[class_values.index(predicted)])
  print()

  state = "Incorrect!"
  if predicted == actual:
    state = "Correct!"
  print("Prediction is", state)
  print("***")
  print()

print()
print("Application terminated.")


There are 500 instances which can be tested.

Enter a row number (0 exit): 120

Correct answer: Phishing
Predicted answer: Phishing

Prediction is Correct!
***


There are 500 instances which can be tested.

Enter a row number (0 exit): 50

Correct answer: Phishing
Predicted answer: Phishing

Prediction is Correct!
***


There are 500 instances which can be tested.

Enter a row number (0 exit): 6

Correct answer: Phishing
Predicted answer: Phishing

Prediction is Incorrect!
***


There are 500 instances which can be tested.

Enter a row number (0 exit): 8

Correct answer: Phishing
Predicted answer: Phishing

Prediction is Correct!
***


There are 500 instances which can be tested.

Enter a row number (0 exit): 9

Correct answer: Phishing
Predicted answer: Phishing

Prediction is Correct!
***


There are 500 instances which can be tested.

Enter a row number (0 exit): 10

Correct answer: Phishing
Predicted answer: Phishing

Prediction is Correct!
***


There are 500 instances which can