In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from colorama import Fore
from urllib.parse import urlparse
from tld import get_tld, is_tld

In [2]:
df = pd.read_csv('../dataset/clean_malicious_phish.csv')

In [3]:
df.head()

Unnamed: 0,url,type,Category
0,br-icloud.com.br,phishing,2
1,mp3raid.com/music/krizz_kaliko.html,benign,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1


## Extract the primary domain

In [4]:
df['url_len'] = df['url'].apply(lambda x: len(str(x)))

In [5]:
def process_tld(url):
    try:
        res = get_tld(url, as_object = True, fail_silently=False,fix_protocol=True)
        pri_domain= res.parsed_url.netloc
    except :
        pri_domain= None
    return pri_domain

In [6]:
df['domain'] = df['url'].apply(lambda i: process_tld(i))

In [7]:
df.head(10)

Unnamed: 0,url,type,Category,url_len,domain
0,br-icloud.com.br,phishing,2,16,br-icloud.com.br
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign,0,118,buzzfil.net
6,espn.go.com/nba/player/_/id/3457/brandon-rush,benign,0,45,espn.go.com
7,yourbittorrent.com/?q=anthony-hamilton-soulife,benign,0,46,yourbittorrent.com
8,http://pashminaonline.com/pure-pashminas,defacement,1,40,pashminaonline.com
9,allmusic.com/album/crazy-from-the-heat-r16990,benign,0,45,allmusic.com


In [8]:
feature = ['@','?','-','=','.','#','%','+','$','!','*',',','//']
for a in feature:
    df[a] = df['url'].apply(lambda i: i.count(a))

In [9]:
df.tail(10)

Unnamed: 0,url,type,Category,url_len,domain,@,?,-,=,.,#,%,+,$,!,*,",",//
651181,1up.com/do/gameOverview?cId=3159391,phishing,2,35,1up.com,0,1,0,1,1,0,0,0,0,0,0,0,0
651182,psx.ign.com/articles/131/131835p1.html,phishing,2,38,psx.ign.com,0,0,0,0,3,0,0,0,0,0,0,0,0
651183,wii.gamespy.com/wii/cursed-mountain/,phishing,2,36,wii.gamespy.com,0,0,1,0,2,0,0,0,0,0,0,0,0
651184,wii.ign.com/objects/142/14270799.html,phishing,2,37,wii.ign.com,0,0,0,0,3,0,0,0,0,0,0,0,0
651185,xbox360.gamespy.com/xbox-360/dead-space/,phishing,2,40,xbox360.gamespy.com,0,0,2,0,2,0,0,0,0,0,0,0,0
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39,xbox360.ign.com,0,0,0,0,3,0,0,0,0,0,0,0,0
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44,games.teamxbox.com,0,0,2,0,2,0,0,0,0,0,0,0,0
651188,gamespot.com/xbox360/action/deadspace/,phishing,2,38,gamespot.com,0,0,0,0,1,0,0,0,0,0,0,0,0
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45,en.wikipedia.org,0,0,0,0,2,0,0,0,0,0,0,0,0
651190,angelfire.com/goth/devilmaycrytonite/,phishing,2,37,angelfire.com,0,0,0,0,1,0,0,0,0,0,0,0,0


In [10]:
def abnormal_url(url):
    hostname = urlparse(url).hostname
    hostname = str(hostname)
    match = re.search(hostname, url)
    if match:
        return 1
    else:
        return 0

In [11]:
df['abnormal_url'] = df['url'].apply(lambda i: abnormal_url(i))

In [13]:
df.value_counts('abnormal_url')

abnormal_url
0    463185
1    188006
Name: count, dtype: int64

Let's check https secure...

In [14]:
def httpSecure(url):
    htp = urlparse(url).scheme
    match = str(htp)
    if match=='https':
        return 1
    else:
        return 0

In [15]:
df['https'] = df['url'].apply(lambda i: httpSecure(i))

In [16]:
df.value_counts('https')

https
0    635511
1     15680
Name: count, dtype: int64

Let's count digit char

In [17]:
def digit_count(url):
    digits = 0
    for i in url:
        if i.isnumeric():
            digits = digits + 1
    return digits

In [18]:
df['digits']= df['url'].apply(lambda i: digit_count(i))

Let's count letter char

In [21]:
def letter_count(url):
    letters = 0
    for i in url:
        if i.isalpha():
            letters = letters + 1
    return letters

In [22]:
df['letters']= df['url'].apply(lambda i: letter_count(i))

## Now check if shortener service urls used

In [23]:
def Shortining_Service(url):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    if match:
        return 1
    else:
        return 0

In [24]:
df['Shortining_Service'] = df['url'].apply(lambda i: Shortining_Service(i))

In [25]:
df.value_counts('Shortining_Service')

Shortining_Service
0    611436
1     39755
Name: count, dtype: int64