# URLs Phishing Detecting 

## A. Data Pre-processing & feature extraction 
In this part i will perform actions on the data to prepare the data to the Machine learning models
 

In [1]:
# imorting required packages
import pandas as pd
import numpy as np 

## 1.Data collection - 
in order to train Machine Learning model i collected relevant data of URLs : 
- Phishing URLs:  https://www.phishtank.com/developer_info.php 
- Legitimate URLs: 

In [2]:
# loading the phishing url's data 
phi_data = pd.read_csv(r"C:\Users\moshi\Downloads\notebooks\perceptionpoint\data\phistank_phishing_url.csv")
phi_data.head()

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,7406838,https://1mailerbt.weebly.com/,http://www.phishtank.com/phish_detail.php?phis...,2022-01-09T08:25:36+00:00,yes,2022-01-09T08:32:39+00:00,yes,Other
1,7406836,https://kuzewenoxsa.online/dbs/,http://www.phishtank.com/phish_detail.php?phis...,2022-01-09T08:17:25+00:00,yes,2022-01-09T08:23:59+00:00,yes,Development Bank of Singapore
2,7406835,http://codashopcz3.duckdns.org/,http://www.phishtank.com/phish_detail.php?phis...,2022-01-09T08:10:42+00:00,yes,2022-01-09T08:23:59+00:00,yes,Other
3,7406829,https://cp21672.tmweb.ru/,http://www.phishtank.com/phish_detail.php?phis...,2022-01-09T08:04:55+00:00,yes,2022-01-09T08:13:21+00:00,yes,Other
4,7406828,https://cp21672.tmweb.ru//,http://www.phishtank.com/phish_detail.php?phis...,2022-01-09T08:04:51+00:00,yes,2022-01-09T08:13:21+00:00,yes,Other


In [3]:
# taking random 4.5k samples from legitimate dataset
phi_data = phi_data.sample(n = 4500, random_state = 10).copy().reset_index(drop=True)

In [4]:
# check shape and nulls
phi_data.shape
phi_data.isnull().sum()


phish_id             0
url                  0
phish_detail_url     0
submission_time      0
verified             0
verification_time    0
online               0
target               0
dtype: int64

In [5]:
# loading the legitimate url's data
leg_data = pd.read_csv(r"C:\Users\moshi\Downloads\notebooks\perceptionpoint\data\unb_legitimate_url.csv")
leg_data.columns=['url']
leg_data.head()

Unnamed: 0,url
0,http://1337x.to/torrent/1110018/Blackhat-2015-...
1,http://1337x.to/torrent/1122940/Blackhat-2015-...
2,http://1337x.to/torrent/1124395/Fast-and-Furio...
3,http://1337x.to/torrent/1145504/Avengers-Age-o...
4,http://1337x.to/torrent/1160078/Avengers-age-o...


In [6]:
# taking random 4.5k samples from legitimate dataset
leg_data = leg_data.sample(n = 4500, random_state = 10).copy().reset_index(drop=True)

In [7]:
# new shape and nulls checking
leg_data.shape
leg_data.isnull().sum()

url    0
dtype: int64

## 2.Feature Extraction - 
The next step will be to create relevant features from the dataset to help train the ML model 

1. **Lexical Features.**
2. **Content Features.**
3. **Host-Based Features.**



### 2.1 Lexical Features - 
I will create some functions to extract features from the literal URL strings 
<br>
**References:** 
* https://www.hindawi.com/journals/scn/2019/2595794/tab2/
* https://hcis-journal.springeropen.com/articles/10.1186/s13673-016-0064-3

In [8]:
# load require libs
from urllib.parse import urlparse,urlencode

import re
import requests as req

### The features we wish to extract are:

1. **getDomainLen - length of domain name** 
2. **getDot - amount of dots**
3. **getSlash - amount of slashes**
4. **getIp - if the url contain ip**
5. **getAt - if the url contain "@"** 
6. **getDomainSign - if the domain name contain "-"** 
7. **getSubdAmount - the amount of subdriections** 
8. **getKeyWord - if there is sensitive keyword in the url** 

In [9]:
key_word_list = r"login|Login|signin|Signin|server|admin|pay|payment|client"

In [10]:
# 1.check domain lentgh if grater than 25
def getDomainLen(url):
    if len(urlparse(url).netloc) >= 25:
        return 1
    else:
        return 0
# 2.check amount of '.' in url
def getDot(url):
    if url.count('.') > 4:
        return 1
    else: 
        return 0
# 3.check amount of '/' in the url        
def getSlash(url):
    if url.count('/') > 5:
        return 1
    else: 
        return 0
# 4.check if the url contain ip adress
def getIp(url):
    if re.match(r"https?://(\d+\.){3}\d+/", url):
        return 1
    else:
        return 0 
# 5.check for "@" sign in the url  
def getAt(url):
    if "@" in url:
        return 1
    else:
        return 0
# 6.check for "-" sign in the domain name    
def getDomainSign(url):
    domain=urlparse(url).netloc
    if "-" in domain:
        return 1
    else:
        return 0

# 7.check the deapth of the url
def getSubdAmount(url):
        sub_len = urlparse(url).path.split('/')
        return len(sub_len)
    
# 8.check for sensitive keywords in the url
def getKeyWords(url):
    match=re.search(key_word_list,url)
    if match:
        return 1
    else:
        return 0

### 2.2 Content Features - 
now we will try to get some fetures related to the html/scrpits body 
<br>
**Referecnces:**
* https://thesai.org/Downloads/Volume11No4/Paper_77-Feature_Selection_for_Phishing_Website.pdf
* https://d-nb.info/1181271673/34


In [11]:
# help function that pull url
def getHtml(url):
    try:
        html = req.get(url, timeout=5)
        html = html.text if html else None
    except:
        html = None
    return html

### The feature we want to extract from the html:
if the html not found(any 400,402,406 etc..) assign 1 for phishing

1. **getUniqeWords - the amount of unique words per html page**
2. **getHtmlTags - the amount of html tags in a page**
3. **getScriptsTags - the amount of scripts tags per page**
4. **getIframeTags - the amount of iframe tags in a page**

In [12]:
# 1. get the amount of unique words per page
def getUniqeWords(url):
    html=getHtml(url)
    if html is None:
        return -1
    else:
        words = set(html.lower().split())  
    return len(words)

# 2.get the amount of html tags per page
def getHtmlTags(url):
    html=getHtml(url)
    if html is None:
        return -1
    else:
        html_tg = len(re.findall(r"</",html)) #find closer of tags
    return html_tg

# 3.get the amount of scripts tags per page
def getScriptsTags(url):
    html=getHtml(url)
    if html is None:
        return -1
    else:
        sc_tg = len(re.findall(r"<script>",html))
    return sc_tg
    
# 4.get iframe tags  
def getIframeTags(url):
    html=getHtml(url)
    if html is None:
        return 1
    else:
        if re.findall(r"[<iframe>|<frameBorder>]", html):
            return 0
        else:
            return 1
    


### 2.3 Host-based Features - 
Features that related to information about the host of the webpage. 
<br>
**Referecnces:**
* http://eprints.hud.ac.uk/id/eprint/24330/6/MohammadPhishing14July2015.pdf

In [13]:
# load relevant packeges 
import whois
from datetime import datetime 

In [14]:
# function that get the parameter from domain data(whois) and url and return parsed date 
def parseDate(keydate,url):
    date = whois.whois(urlparse(url).netloc).get(keydate, None)
    if date:#if date is not none
        if isinstance(date, str) and 'before' in date: # if its like 'before Aug-1990'
            y_m = date.split()[-1] #extract year and month
            tmp_d = '01-{}'.format(y_m) #reset the day to 01
            date = dt.strptime(tmp_d, '%d-%b-%Y')
        elif isinstance(date, list):# if its a list of date take the first
            date = date[0] 
    else:
        date= None
    return date    

### The feature we want to extract from the host name:
if the domain not fount or None mark as phishing 

1. **getDomainAge - the amount of unique words per html page**

In [15]:
import datetime

In [16]:
# 1.get the domain 
def getDomainAge(url):
    try:
        age=abs((datetime.datetime.now()-parseDate('creation_date',url)).days)
        if ((age/30)<12):
            age=1 #if its less than 1 year old
        else:
            age=0
    except:
        return 1
    return age

## 3.Extractions and Data frame building
In this step i will:

1. **extract the features from the urls**
2. **build a new data frame with features and labels for the ml model**

In [17]:
#initialize a list of the features


### 3.1 Extract the phishing and legit url features

In [18]:
#in this function i will extract all the feauters to a list from all the function i build
def featuresFromUrl(url,label):
    feat_list = [] 
    #append the url to list just for recognitions
    feat_list.append(url)
    #Lexical feauters
    feat_list.append(getDomainLen(url))
    feat_list.append(getDot(url))
    feat_list.append(getSlash(url))
    feat_list.append(getIp(url))
    feat_list.append(getAt(url))
    feat_list.append(getDomainSign(url))
    feat_list.append(getSubdAmount(url))
    feat_list.append(getKeyWords(url))

    #content features
    feat_list.append(getUniqeWords(url))
    feat_list.append(getHtmlTags(url))
    feat_list.append(getScriptsTags(url))
    feat_list.append(getIframeTags(url))
  
    #host based features
    feat_list.append(getDomainAge(url))
    
    #append the label of the url (phishing=1:legit=0)
    feat_list.append(label)
  
    return feat_list

in this part we will build a list with all the features for each row of url

In [20]:
#build list for bengin urls
def buildFeatList(data,label):
    feat_l=[]
    for i in range(0, len(data)):
        feat_l.append(featuresFromUrl(data.url[i],label))
        print(len(feat_l))
    return feat_l       

In [None]:
legit_feature_list=buildFeatList(leg_data,0)#feuters for legitimate urls with label 0
phish_feature_list=buildFeatList(phi_data,1)#feuters for phishing urls with label 0

### 3.2 build a new data frames from the lists

In [22]:
# list of features
column_list=['url','domain_len','have_dot','have_slash','have_ip','have_at','have_domain_sign',
             'have_sub_directions','have_key_word','unique_words','html_tags','scripts_tags',
             'frame_tags','domain_age','label']

In [23]:
#save legitimate urls with the features to csv
legitimate_new = pd.DataFrame(legit_feature_list, columns= column_list)
legitimate_new.to_csv('legit_with_feat.csv', index= False)
#save phishing urls with the features to csv
phishing_new = pd.DataFrame(phish_feature_list, columns= column_list)
phishing_new.to_csv('phishi_with_feat.csv', index= False)

In [24]:
#Concatenating the dataframes into one 
full_data = pd.concat([legitimate_new, phishing_new]).reset_index(drop=True)
full_data

Unnamed: 0,url,domain_len,have_dot,have_slash,have_ip,have_at,have_domain_sign,have_sub_directions,have_key_word,unique_words,html_tags,scripts_tags,frame_tags,domain_age,label
0,http://lifehacker.com/5900260/how-can-stop-wor...,0,0,0,0,0,0,3,0,4406,642,13,0,0,0
1,http://cookpad.com/recipe/list/212659?utf8=%E2...,0,0,0,0,0,0,4,0,1571,1166,12,0,0,0
2,http://conservativetribune.com/civil-rights-le...,0,0,0,0,0,0,3,0,-1,-1,-1,1,0,0
3,http://distractify.com/igor-feng/28-photos-tha...,0,0,0,0,0,0,4,0,-1,-1,-1,1,0,0
4,http://motthegioi.vn/mao-trach-dong-qua-sach-b...,0,0,0,0,0,0,3,0,-1,-1,-1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,https://docsharex-authorize.firebaseapp.com/co...,1,0,0,0,0,1,4,0,46,10,0,0,1,1
8996,https://zxass.jbkyj0o.cn/,0,0,0,0,0,0,2,0,-1,-1,-1,1,1,1
8997,https://auth-task1-m.web.app/,0,0,0,0,0,1,2,0,209,24,1,0,0,1
8998,https://servervalidationcheck935.web.app/,1,0,0,0,0,0,2,1,-1,-1,-1,1,0,1


In [26]:
full_data.isnull().sum()
full_data.to_csv('full_data.csv',index=False)