In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
import tensorflow as tf
from tensorflow import keras
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import import_ipynb
from utils import generate_data_set
import sys

In [8]:
def load_data():
    '''
    Load data from excel file
    '''
    # Load the training data from the CSV file
    
    data=pd.read_excel("phisingdataset.xlsx",header=None)
    data.columns=["having_IP_Address","URL_Length","Shortining_Service","having_At_Symbol","double_slash_redirecting","Prefix_Suffix",
"having_Sub_Domain","SSLfinal_State","Domain_registeration_length","Favicon","port","HTTPS_token","Request_URL","URL_of_Anchor",
"Links_in_tags","SFH","Submitting_to_email","Abnormal_URL","Redirect","on_mouseover","RightClick","popUpWidnow","Iframe",
"age_of_domain","DNSRecord","web_traffic","Page_Rank","Google_Index","Links_pointing_to_page","Statistical_report","Result"]
    # Extract the inputs from the training data
    inputs=data[["having_IP_Address","URL_Length","Shortining_Service","having_At_Symbol","double_slash_redirecting","Prefix_Suffix",
"having_Sub_Domain","SSLfinal_State","Domain_registeration_length","Favicon","port","HTTPS_token","Request_URL","URL_of_Anchor",
"Links_in_tags","SFH","Submitting_to_email","Abnormal_URL","Redirect","on_mouseover","RightClick","popUpWidnow","Iframe",
"age_of_domain","DNSRecord","web_traffic","Page_Rank","Google_Index","Links_pointing_to_page","Statistical_report"]]
    # Extract the outputs from the training data
    outputs = data["Result"]
    # Split 70% for traning and 30% testing
    training_inputs, training_outputs, testing_inputs, testing_outputs = train_test_split(inputs, outputs, test_size=0.3)
    # Return the four arrays
    return training_inputs, training_outputs, testing_inputs, testing_outputs

In [21]:
def run(classifier, name):
    '''
    Run the classifier to calculate the accuracy score
    '''
    # Load the training data
    train_inputs, test_inputs,train_outputs, test_outputs = load_data()
    # Train the decision tree classifier
    classifier.fit(train_inputs, train_outputs)
    # Use the trained classifier to make predictions on the test data
    predictions = classifier.predict(test_inputs)
    # Print the accuracy (percentage of phishing websites correctly predicted)
    accuracy = 100.0 * accuracy_score(test_outputs, predictions)
    print ("Accuracy score using {} is: {}\n".format(name, accuracy))

In [23]:
if __name__ == '__main__':
    '''
    Main function -
    Following are several models trained to detect phishing webstes.
    Only the best and worst classifier outputs are displayed.
    '''
    
    # Random Forest
    forest=RandomForestClassifier(n_estimators=10000, max_depth=15, max_leaf_nodes=10000,random_state=0)
    run(forest,"Random Forest")
    
    # SVM
    sv=SVC(C=10,kernel='linear')
    run(sv,"SVM")
    
    # Ensemble Learning
    clf1=Perceptron()
    clf2=Pipeline([['SC',StandardScaler()],['PPN',clf1]])
    clf3=BaggingClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=100)
    clf4=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=100)
    clf5=VotingClassifier(estimators=[('PPN',clf1),("Pipe",clf2),("Bag",clf3),("Ada",clf4)]
                      ,voting='hard'
                      ,weights=[1,1,1,1])
    run(clf1,"Perceptron")
    run(clf2,"Pipleline Perceptron")
    run(clf3,"Bagging")
    run(clf4,"Adaboost")
    run(clf5,"Ensemble")
    
    
    # Logistic Regression
    Log_Regression = LogisticRegression(solver='liblinear', C=0.05,random_state=0)
    run(Log_Regression,"Logistic_Regression")
    
    # KNN
    nbrs = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree')
    run(nbrs, "K nearest neighbours")
    
    
    # Naive Bayes Classifier
    Naive_classifier = GaussianNB()
    run(Naive_classifier,"Naive Bayes classifier")
    
    # Decison Tree
    Decision_Tree = DecisionTreeClassifier()
    run(Decision_Tree,"Decision Tree")
    
#!python Capstron_code "https://www.pythoncentral.io/how-to-create-a-python-package/""


    # Take user input and check whether its phishing URL or not.
    if len(sys.argv) > 1:
        data_set = generate_data_set(sys.argv[1])

        # Reshape the array
        data_set = np.array(data_set).reshape(1, -1)

        # Load the date
        train_inputs, test_inputs,train_outputs, test_outputs = load_data()

        # Create and train the classifier
        classifier = RandomForestClassifier(n_estimators=500, max_depth=15, max_leaf_nodes=10000)
        classifier.fit(train_inputs, train_outputs)

        print (classifier.predict(data_set))

Accuracy score using Random Forest is: 97.10581851070245

Accuracy score using SVM is: 92.7042508290624

Accuracy score using Perceptron is: 92.67410310521555

Accuracy score using Pipleline Perceptron is: 90.02110340669279

Accuracy score using Bagging is: 96.17123907145012

Accuracy score using Adaboost is: 96.68375037684655

Accuracy score using Ensemble is: 96.50286403376545

Accuracy score using Logistic_Regression is: 92.04100090443171

Accuracy score using K nearest neighbours is: 93.94030750678324

Accuracy score using Naive Bayes classifier is: 61.95357250527586

Accuracy score using Decision Tree is: 95.990352728369



AttributeError: 'str' object has no attribute 'text'