In [None]:
#importing required packages for this module
import pandas as pd

#Downloading the phishing URLs file
# The download is stored as online-valid.csv in the working directory, which
# is the file name the next cell passes to pd.read_csv.
!wget http://data.phishtank.com/data/online-valid.csv

In [None]:
#loading the phishing URLs data to dataframe
# Reads the online-valid.csv file fetched by the wget cell and previews it.
data = pd.read_csv("online-valid.csv")
data.head()

In [None]:
# Draw a reproducible sample of 4000 phishing rows (seed 12) and renumber
# the sampled rows from 0.
phishingurl = (
    data.sample(n = 4000, random_state = 12)
        .reset_index(drop=True)
)
phishingurl.head(5)

> Legitimate URLs:

In [None]:
#Loading legitimate files 
# The frame's columns are replaced by the single name 'URLs', so the file is
# expected to have exactly one column.
# NOTE(review): read_csv treats the first row as a header by default; if
# legitimate.csv has no header row its first URL is consumed as the column
# name - confirm against the file.
ldata = pd.read_csv("legitimate.csv")
ldata.columns = ['URLs']
ldata.head(5)

In [None]:

# Draw a reproducible sample of 4000 legitimate rows (seed 13) and renumber
# the sampled rows from 0.
legiurl = (
    ldata.sample(n = 4000, random_state = 13)
         .reset_index(drop=True)
)
legiurl.head(3)

>  Feature Extraction for Address Bar based Features, Domain based Features and HTML & Javascript based Features

In [None]:
from urllib.parse import urlparse,urlencode
import ipaddress
import re
#Domain
def getDomain(url):
  """Return the network location (host) of ``url`` without a leading ``www.``.

  Only the ``www.`` prefix is removed; any later ``www.`` inside the host
  (for example ``www.example.www.com``) is kept.
  """
  domain = urlparse(url).netloc
  # Strip the prefix only: str.replace would delete every "www." in the host.
  if domain.startswith("www."):
    domain = domain[len("www."):]
  return domain
#ip
def havingIP(url):
  """Return 1 when the host part of ``url`` is an IP address, otherwise 0.

  A bare address without a scheme (e.g. ``"10.0.0.1"``) is accepted as well.
  """
  try:
    # ip_address() rejects a complete URL, so test the host component; fall
    # back to the raw string when urlparse finds no host in the input.
    host = urlparse(url).hostname or url
    ipaddress.ip_address(host)
  except (ValueError, TypeError, AttributeError):
    # Not an address, or an input urlparse cannot handle.
    return 0
  return 1
#"@" Symbol in URL
def haveAtSign(url):
  """Return 1 if ``url`` contains an "@" character, otherwise 0."""
  return 1 if "@" in url else 0
#Finding the length of URL and categorizing 
def getLength(url):
  """Categorise the URL length: 0 below 54 characters, 1 otherwise."""
  return int(len(url) >= 54)
#depth
def getDepth(url):
  """Count the non-empty "/"-separated segments in the path of ``url``."""
  segments = urlparse(url).path.split('/')
  return sum(1 for segment in segments if segment)
#redirection
def redirection(url):
  """Return 1 when the last "//" in ``url`` starts beyond index 7, else 0."""
  last_double_slash = url.rfind('//')
  return 1 if last_double_slash > 7 else 0
#https/http
def httpDomain(url):
  """Return 1 if the text "https" occurs in the host part of ``url``, else 0."""
  host = urlparse(url).netloc
  return int('https' in host)
#listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

# The alternation above is unanchored, so searching a whole URL with it also
# hits ordinary hosts ("t\.co" occurs inside "microsoft.com", "x\.co" inside
# "dropbox.com"). This compiled form only accepts a service name that is the
# complete host or a dot-separated suffix of it, ignoring letter case.
_SHORTENER_HOST_RE = re.compile(r"(?:^|\.)(?:" + shortening_services + r")$", re.IGNORECASE)

#checking shortening
def tinyURL(url):
    """Return 1 when the host of ``url`` is a listed URL-shortening service, else 0.

    URLs given without a scheme (``"bit.ly/abc"``) are handled by parsing
    them as scheme-relative (``"//bit.ly/abc"``).
    """
    has_authority = re.match(r"^(?:[A-Za-z][A-Za-z0-9+.-]*:)?//", url)
    target = url if has_authority else "//" + url
    try:
        host = urlparse(target).hostname or ""
    except ValueError:
        # urlparse rejects malformed authorities (e.g. a broken IPv6 literal).
        host = ""
    return 1 if _SHORTENER_HOST_RE.search(host) else 0
#prefix and suffix
def prefixSuffix(url):
    """Return 1 (phishing) if the host of ``url`` contains "-", else 0 (legitimate)."""
    return 1 if '-' in urlparse(url).netloc else 0

> HTML and JavaScript based Features

In [None]:
import requests
def iframe(response):
  """Flag whether the fetched page contains iframe markup.

  ``response`` is either the empty-string sentinel used when the request
  failed, or an object exposing the page source as ``.text``.

  Returns 1 for the sentinel or when no iframe markup is present, and 0 when
  an ``<iframe`` tag or a ``frameborder`` attribute is found.
  """
  if response == "":
      return 1
  # The previous pattern "[<iframe>|<frameBorder>]" was a character class, so
  # any single one of those characters (e.g. "<") counted as a match.
  if re.search(r"<iframe\b|frameborder", response.text, re.IGNORECASE):
      return 0
  return 1
    
def mouseOver(response):
  """Return 1 for the "" sentinel or when the page text matches the
  "<script>...onmouseover...</script>" pattern; otherwise 0."""
  if response == "":
    return 1
  hits = re.findall("<script>.+onmouseover.+</script>", response.text)
  return 1 if hits else 0
def forwarding(response):
  """Return 1 for the "" sentinel or when ``response.history`` holds more
  than two entries; otherwise 0."""
  if response == "":
    return 1
  return int(len(response.history) > 2)
def rightClick(response):
  """Return 1 for the "" sentinel or when the page text has no
  "event.button == 2" check; return 0 when such a check is found."""
  if response == "":
    return 1
  found = re.findall(r"event.button ?== ?2", response.text)
  return 0 if found else 1

> Domain Based Features

In [None]:
import re
from bs4 import BeautifulSoup
#import whois
import urllib
import urllib.request
from datetime import datetime

In [None]:
def web_traffic(url):
  """Look up the Alexa rank of ``url`` and bucket it.

  Returns 1 when the rank is below 100000 or when no rank could be obtained
  (request failed, no REACH element, missing or non-numeric RANK); returns 0
  for any other rank.
  """
  try:
    #Filling the whitespaces in the URL if any
    url = urllib.parse.quote(url)
    # The timeout keeps one unresponsive lookup from stalling the whole
    # feature-extraction loop.
    with urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url, timeout=10) as reply:
      payload = reply.read()
    rank = int(BeautifulSoup(payload, "xml").find("REACH")['RANK'])
  except (TypeError, ValueError, KeyError, OSError):
    # TypeError: find() returned None; KeyError/ValueError: RANK missing or
    # not numeric; OSError: URLError/HTTPError/timeouts from urlopen.
    # Previously only TypeError was handled, so any network failure aborted
    # the caller.
    return 1
  return 1 if rank < 100000 else 0
def domainEnd(domain_name):
  """Bucket the distance between the domain's expiration date and now.

  Reads ``domain_name.expiration_date``. A string is parsed as "%Y-%m-%d".
  Returns 1 when the date is missing, is a list, or cannot be parsed.
  Otherwise returns 0 when the absolute gap to the current time is under six
  30-day months, and 1 when it is longer.
  """
  expires = domain_name.expiration_date
  if isinstance(expires, str):
    try:
      expires = datetime.strptime(expires, "%Y-%m-%d")
    except:
      return 1
  if expires is None or type(expires) is list:
    return 1
  months_apart = abs((expires - datetime.now()).days) / 30
  return 0 if months_apart < 6 else 1
def domainAge(domain_name):
  """Bucket the span between the domain's creation and expiration dates.

  Reads ``domain_name.creation_date`` and ``domain_name.expiration_date``.
  String values are parsed as "%Y-%m-%d". Returns 1 when either date is
  missing, is a list, or cannot be parsed, and also when the span is under
  six 30-day months; returns 0 for longer spans.
  """
  creation_date = domain_name.creation_date
  expiration_date = domain_name.expiration_date
  try:
    # Parse each field on its own. Previously a string in either field sent
    # both through strptime, so a datetime paired with a string raised a
    # swallowed TypeError and the record was scored 1 regardless of its span.
    if isinstance(creation_date, str):
      creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
    if isinstance(expiration_date, str):
      expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
  except ValueError:
    return 1
  if creation_date is None or expiration_date is None:
    return 1
  if type(creation_date) is list or type(expiration_date) is list:
    return 1
  ageofdomain = abs((expiration_date - creation_date).days)
  return 1 if (ageofdomain / 30) < 6 else 0

 Extracting features from the URLs

In [None]:
# NOTE: featureExtraction is defined in a later cell of this notebook; that
# cell has to be executed before this one.
label = 0  # value stored in the Label field of every row built here

# Iterate over the URL column itself instead of a hard-coded range(0, 4000),
# so the loop cannot drift out of sync with the sample size used for legiurl.
legi_features = [featureExtraction(url, label) for url in legiurl['URLs']]

In [None]:

def featureExtraction(url,label):
  """Build the feature row for a single URL.

  Parameters
  ----------
  url : str
      The URL to analyse.
  label : int
      Class label appended unchanged as the last element of the row.

  Returns
  -------
  list
      18 values in this order: domain, nine... see below.
      Domain, the eight address-bar flags (havingIP .. prefixSuffix), the DNS
      flag, web traffic, domain age, domain end, iframe, mouse-over,
      right-click, forwarding, and finally ``label``.
  """
  features = []
  #Address bar based features (9)
  features.append(getDomain(url))
  features.append(havingIP(url))
  features.append(haveAtSign(url))
  features.append(getLength(url))
  features.append(getDepth(url))
  features.append(redirection(url))
  features.append(httpDomain(url))
  features.append(tinyURL(url))
  features.append(prefixSuffix(url))

  #Domain based features (4)
  dns = 0
  domain_name = None
  try:
    # NOTE(review): the `import whois` line in the imports cell is commented
    # out; unless whois is imported elsewhere this lookup raises NameError,
    # which is caught below and makes dns equal to 1 for every URL.
    domain_name = whois.whois(urlparse(url).netloc)
  except Exception:
    dns = 1

  features.append(dns)
  features.append(web_traffic(url))
  # Without a WHOIS record both date-based features default to 1.
  features.append(1 if dns == 1 else domainAge(domain_name))
  features.append(1 if dns == 1 else domainEnd(domain_name))

  # HTML & Javascript based features (4)
  try:
    # The timeout stops one unresponsive host from blocking the whole run.
    response = requests.get(url, timeout=10)
  except Exception:
    # "" is the sentinel the HTML/JS helpers test for when a fetch failed.
    response = ""
  features.append(iframe(response))
  features.append(mouseOver(response))
  features.append(rightClick(response))
  features.append(forwarding(response))
  features.append(label)

  return features

Perform feature extraction on the legitimate URLs

In [None]:
#converting the list to dataframe
# One name per element of a featureExtraction row, in the same order
# (17 features followed by 'Label').
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 
                      'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']

legitimate = pd.DataFrame(legi_features, columns= feature_names)
legitimate.head(3)

In [None]:
# Storing the extracted legitimate URLs features to csv file
# (written without the index column)
legitimate.to_csv('Final.csv', index= False)

Phishing URLs:

**ML implementation**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# NOTE(review): the extraction section writes 'Final.csv' (capital F) while
# this cell reads 'final.csv'; on a case-sensitive file system these are
# different files. The file written there also holds only the legitimate rows,
# so this presumably expects a combined legitimate + phishing dataset -
# confirm which file is intended.
data = pd.read_csv('final.csv')
data.head(4)

In [None]:
# Column names, dtypes and non-null counts of the loaded feature table.
data.info()

**Data  visualization**

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Box plot of the feature table columns, x tick labels rotated by 40 degrees.
boxplot = data.boxplot(figsize=(10,7), rot=40)

In [None]:
# One histogram per column, 70 bins each.
data.hist(bins = 70,figsize = (8,12))
plt.show()

In [None]:
plt.figure(figsize=(4,4))
# Restrict the correlation to numeric columns: 'Domain' holds strings, and
# DataFrame.corr raises on non-numeric data in recent pandas releases.
sns.heatmap(data.select_dtypes(include='number').corr())
plt.show()

**Data Preprocessing**

In [None]:
# Drop the 'Domain' column because it contains strings; `column` is the
# remaining table used for modelling.
column = data.drop(['Domain'], axis = 1).copy()
# Only the last expression of a cell is displayed, so the two inspection
# calls below produce no visible output here.
column.isnull().sum()
column.head(3)
boxplot = column.boxplot(figsize=(10,7), rot=30,color='yellow')

In [None]:
# First ten rows, transposed so that each feature is shown as a row.
column.head(10).T

**Test and train**

In [None]:
# Target vector: the 'Label' column; feature matrix: every other column.
y = column['Label']
X = column.drop('Label',axis=1)
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.3, random_state = 15)
X_train.shape, X_test.shape

ML algorithms

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier,RidgeClassifier
from sklearn.metrics import (precision_score, recall_score,f1_score)
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
# Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
# Models
from sklearn.linear_model import LogisticRegression    
from sklearn.naive_bayes import GaussianNB              
from sklearn.neighbors import KNeighborsClassifier      
from sklearn.svm import SVC                              
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, 
                             classification_report, f1_score, average_precision_score, precision_recall_fscore_support)



In [None]:
# Ridge Classifier
# Fit with default settings on the training split, predict the test split,
# and keep the test accuracy in rc_accuracy.
rc = RidgeClassifier()
rc_fit = rc.fit(X_train, y_train)
y_pred_rc = rc_fit.predict(X_test)
rc_accuracy = accuracy_score(y_test, y_pred_rc)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# printed precision and recall were swapped (accuracy and F1 are symmetric).
print ("Accuracy: " + str(accuracy_score(y_test, y_pred_rc)))
print ("Precision: " + str(precision_score(y_test, y_pred_rc)))
print ("Recall: " + str(recall_score(y_test, y_pred_rc)))
print ("F1: " + str(f1_score(y_test, y_pred_rc)))

In [None]:
# Confusion matrix of the Ridge predictions against the test labels.
confusion_matrix(y_test, y_pred_rc)

In [None]:
# Per-class precision / recall / F1 summary for the Ridge predictions.
print(classification_report(y_test, y_pred_rc))

In [None]:
# ROC AUC of the Ridge model, computed from the class predictions returned
# by predict() rather than from continuous decision scores.
rc_roc_auc_score = roc_auc_score(y_test, y_pred_rc)
rc_roc_auc_score

In [None]:
# ROC and precision-recall curve points for the Ridge classifier.
fpr_rc, tpr_rc, thresholds_rc = roc_curve(y_test, y_pred_rc)
# Fixed: the area was computed from fpr_lr / tpr_lr, names that are never
# defined in this notebook (NameError); the Ridge curve is the intended input.
roc_auc_rc = auc(fpr_rc, tpr_rc)
precision_rc, recall_rc, th_rc = precision_recall_curve(y_test, y_pred_rc)
# Model-specific name so later models do not overwrite this value.
rc_auprc_score = average_precision_score(y_test, y_pred_rc)
rc_auprc_score

**XGBoost**

In [None]:

from xgboost import XGBClassifier
# Gradient-boosted trees with learning rate 0.8 and maximum depth 9.
xgb = XGBClassifier(learning_rate=0.8,max_depth=9)
xgb.fit(X_train, y_train)
#predicting 
# Predictions on both splits so train and test accuracy can be compared.
y_test_xgb = xgb.predict(X_test)
y_train_xgb = xgb.predict(X_train)
acc_train_xgb = accuracy_score(y_train,y_train_xgb)
acc_test_xgb = accuracy_score(y_test,y_test_xgb)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# printed precision and recall were swapped (accuracy and F1 are symmetric).
print ("Accuracy: " + str(accuracy_score(y_test, y_test_xgb)))
print ("Precision: " + str(precision_score(y_test, y_test_xgb)))
print ("Recall: " + str(recall_score(y_test, y_test_xgb)))
print ("F1: " + str(f1_score(y_test, y_test_xgb)))

In [None]:
# Confusion matrix of the XGBoost predictions against the test labels.
confusion_matrix(y_test, y_test_xgb)

In [None]:
# Per-class precision / recall / F1 summary for the XGBoost predictions.
print(classification_report(y_test,y_test_xgb ))

In [None]:
# Stored under a model-specific name: this cell previously assigned to
# rc_roc_auc_score, overwriting the Ridge classifier's value.
xgb_roc_auc_score = roc_auc_score(y_test, y_test_xgb)
xgb_roc_auc_score

In [None]:
# Horizontal bar chart of the XGBoost feature importances, one bar per
# training column.
plt.figure(figsize=(5,5))
n_features = X_train.shape[1]
plt.barh(range(n_features), xgb.feature_importances_, align='center',color ='pink')
plt.yticks(np.arange(n_features), X_train.columns)
plt.xlabel("Importance Features")
plt.ylabel("Feature")
plt.show()

In [None]:
# ROC and precision-recall curve points for XGBoost.
fpr_xgb, tpr_xgb, thresholds_xgb = roc_curve(y_test, y_test_xgb)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)
precision_xgb, recall_xgb, th_xgb = precision_recall_curve(y_test, y_test_xgb)
# Model-specific name: the shared log_auprc_score was overwritten by every
# model section.
xgb_auprc_score = average_precision_score(y_test, y_test_xgb)
xgb_auprc_score

**Decision tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree limited to depth 5; predictions on both splits are kept so
# train and test accuracy can be compared.
dt = DecisionTreeClassifier(max_depth = 5)
dt.fit(X_train, y_train)
y_test_dt = dt.predict(X_test)
y_train_dt = dt.predict(X_train)
acc_train_dt = accuracy_score(y_train,y_train_dt)
acc_test_dt = accuracy_score(y_test,y_test_dt)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# printed precision and recall were swapped (accuracy and F1 are symmetric).
print ("Accuracy: " + str(accuracy_score(y_test, y_test_dt)))
print ("Precision: " + str(precision_score(y_test, y_test_dt)))
print ("Recall: " + str(recall_score(y_test, y_test_dt)))
print ("F1: " + str(f1_score(y_test, y_test_dt)))

In [None]:
# Confusion matrix of the decision-tree predictions against the test labels.
confusion_matrix(y_test, y_test_dt)

In [None]:
# Per-class precision / recall / F1 summary for the decision-tree predictions.
print(classification_report(y_test, y_test_dt))

In [None]:
# Stored under a model-specific name: this cell previously assigned to
# rc_roc_auc_score, overwriting the Ridge classifier's value.
dt_roc_auc_score = roc_auc_score(y_test, y_test_dt)
dt_roc_auc_score

In [None]:
# Horizontal bar chart of the decision-tree feature importances, one bar per
# training column.
plt.figure(figsize=(5,5))
n_features = X_train.shape[1]
plt.barh(range(n_features), dt.feature_importances_, align='center',color ='red') 
plt.yticks(np.arange(n_features), X_train.columns)
plt.xlabel("Importance features")
plt.ylabel("Feature")
plt.show()

In [None]:
# ROC and precision-recall curve points for the decision tree.
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, y_test_dt)
roc_auc_dt = auc(fpr_dt, tpr_dt)
precision_dt, recall_dt, th_dt = precision_recall_curve(y_test, y_test_dt)
# Model-specific name: the shared log_auprc_score was overwritten by every
# model section.
dt_auprc_score = average_precision_score(y_test, y_test_dt)
dt_auprc_score

**MLP**

In [None]:
# MLP
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

# Three hidden layers of 100 units. random_state fixes the weight
# initialisation and shuffling so the reported scores are reproducible
# between runs.
mlp = MLPClassifier(alpha=0.001, hidden_layer_sizes=([100,100,100]), random_state=12)
mlp.fit(X_train, y_train)
y_test_mlp = mlp.predict(X_test)
y_train_mlp = mlp.predict(X_train)

In [None]:
# Confusion matrix of the MLP predictions against the test labels.
confusion_matrix(y_test, y_test_mlp)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# printed precision and recall were swapped (accuracy and F1 are symmetric).
print ("Accuracy: " + str(accuracy_score(y_test, y_test_mlp)))
print ("Precision: " + str(precision_score(y_test, y_test_mlp)))
print ("Recall: " + str(recall_score(y_test, y_test_mlp)))
print ("F1: " + str(f1_score(y_test, y_test_mlp)))

In [None]:
# Per-class precision / recall / F1 summary for the MLP predictions.
print(classification_report(y_test, y_test_mlp))

In [None]:
# Stored under a model-specific name: this cell previously assigned to
# rc_roc_auc_score, overwriting the Ridge classifier's value.
mlp_roc_auc_score = roc_auc_score(y_test, y_test_mlp)
mlp_roc_auc_score

In [None]:
# ROC and precision-recall curve points for the MLP.
fpr_mlp, tpr_mlp, thresholds_mlp = roc_curve(y_test, y_test_mlp)
roc_auc_mlp = auc(fpr_mlp, tpr_mlp)
precision_mlp, recall_mlp, th_mlp = precision_recall_curve(y_test, y_test_mlp)
# Model-specific name: the shared log_auprc_score was overwritten by every
# model section.
mlp_auprc_score = average_precision_score(y_test, y_test_mlp)
mlp_auprc_score

**RandomForest**

In [None]:

from sklearn.ensemble import RandomForestClassifier
# Trees limited to depth 5. random_state fixes the bootstrap samples and
# feature subsets so the reported scores are reproducible between runs.
forest = RandomForestClassifier(max_depth=5, random_state=12)
forest.fit(X_train, y_train)
#predicting 
y_test_forest = forest.predict(X_test)
y_train_forest = forest.predict(X_train)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# printed precision and recall were swapped (accuracy and F1 are symmetric).
print ("Accuracy: " + str(accuracy_score(y_test, y_test_forest)))
print ("Precision: " + str(precision_score(y_test, y_test_forest)))
print ("Recall: " + str(recall_score(y_test, y_test_forest)))
print ("F1: " + str(f1_score(y_test, y_test_forest)))

In [None]:
# Per-class precision / recall / F1 summary for the random-forest predictions.
print(classification_report(y_test, y_test_forest))

In [None]:
# Confusion matrix of the random-forest predictions against the test labels.
confusion_matrix(y_test, y_test_forest)

In [None]:
# Stored under a model-specific name: this cell previously assigned to
# rc_roc_auc_score, overwriting the Ridge classifier's value.
forest_roc_auc_score = roc_auc_score(y_test, y_test_forest)
forest_roc_auc_score

In [None]:
# Horizontal bar chart of the random-forest feature importances, one bar per
# training column.
plt.figure(figsize=(5,5))
n_features = X_train.shape[1]
plt.barh(range(n_features), forest.feature_importances_, align='center', color ='yellow')
plt.yticks(np.arange(n_features), X_train.columns)
plt.xlabel("Importance Features")
plt.ylabel("Feature")
plt.show()

In [None]:
# ROC and precision-recall curve points for the random forest.
# The thresholds are kept under a forest-specific name instead of reusing
# thresholds_lr.
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_test, y_test_forest)
roc_auc_forest = auc(fpr_forest, tpr_forest)
precision_forest, recall_forest, th_forest = precision_recall_curve(y_test, y_test_forest)
# Model-specific name: the shared log_auprc_score was overwritten by every
# model section.
forest_auprc_score = average_precision_score(y_test, y_test_forest)
forest_auprc_score

**SVM**

In [None]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier with C = 1.0.
svm = SVC(kernel='linear', C=1.0, random_state=12)
svm.fit(X_train, y_train)
y_test_svm = svm.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# printed precision and recall were swapped (accuracy and F1 are symmetric).
print ("Accuracy: " + str(accuracy_score(y_test, y_test_svm)))
print ("Precision: " + str(precision_score(y_test, y_test_svm)))
print ("Recall: " + str(recall_score(y_test, y_test_svm)))
print ("F1: " + str(f1_score(y_test, y_test_svm)))

In [None]:
# Confusion matrix of the SVM predictions against the test labels.
confusion_matrix(y_test, y_test_svm)

In [None]:
# Per-class precision / recall / F1 summary for the SVM predictions.
print(classification_report(y_test, y_test_svm))

In [None]:
# Stored under a model-specific name: this cell previously assigned to
# rc_roc_auc_score, overwriting the Ridge classifier's value.
svm_roc_auc_score = roc_auc_score(y_test, y_test_svm)
svm_roc_auc_score

In [None]:
# ROC and precision-recall curve points for the SVM.
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, y_test_svm)
roc_auc_svm = auc(fpr_svm, tpr_svm)
precision_svm, recall_svm, th_svm = precision_recall_curve(y_test, y_test_svm)
# Model-specific name: the shared log_auprc_score was overwritten by every
# model section.
svm_auprc_score = average_precision_score(y_test, y_test_svm)
svm_auprc_score

**Compare all**

In [None]:
# Test accuracy (in percent) per model, computed from the predictions made in
# the sections above instead of hand-copied figures that go stale whenever a
# model or the data changes.
model_predictions = {
    'Decision tree': y_test_dt,
    'Ridge classifier': y_pred_rc,
    'XGB': y_test_xgb,
    'MLP': y_test_mlp,
    'Random forest': y_test_forest,
    'SVM': y_test_svm,
}
pd.DataFrame(
    {'Accuracy': [100 * accuracy_score(y_test, pred) for pred in model_predictions.values()]},
    index=list(model_predictions),
).plot(kind = 'bar')

In [None]:
# Plot ROC curve
# The dashed diagonal is the chance-level reference line.
plt.plot([0, 1], [0, 1], 'k--')

plt.plot(fpr_rc, tpr_rc, label='Ridge Classifier (area = %0.3f)' % roc_auc_rc)
plt.plot(fpr_dt, tpr_dt, label='Decision tree (area = %0.3f)' % roc_auc_dt)
plt.plot(fpr_forest, tpr_forest, label='RF (area = %0.3f)' % roc_auc_forest)
plt.plot(fpr_svm, tpr_svm, label=' SVM (area = %0.3f)' % roc_auc_svm)
plt.plot(fpr_mlp, tpr_mlp, label='MLP (area = %0.3f)' % roc_auc_mlp)
# XGBoost was computed in its own section but left out of this comparison.
plt.plot(fpr_xgb, tpr_xgb, label='XGB (area = %0.3f)' % roc_auc_xgb)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curves ')
plt.legend(loc='best')
plt.show()

In [None]:
# Precision-recall curves of all models on one set of axes.
plt.plot([1, 0], [0, 1], 'k--')
plt.plot(recall_xgb, precision_xgb, label='XGB')
plt.plot(recall_rc, precision_rc, label='Ridge Classifier')
plt.plot(recall_dt, precision_dt, label='Decision Tree')
plt.plot(recall_mlp, precision_mlp, label='MLP')
plt.plot(recall_forest, precision_forest, label='RF')
# The SVM curve was computed in its own section but left out of this plot.
plt.plot(recall_svm, precision_svm, label='SVM')
plt.title('Precision vs. Recall')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='best')
plt.show()