In [1]:
import pandas as pd
from urllib.parse import urlparse
import re

In [2]:
# Load the raw phishing URL list. Assigning a one-element column list only
# succeeds when the CSV has exactly one column; the shape shown below confirms that.
dataphish = pd.read_csv('phishing_dataset.csv')
dataphish.columns = ['URLs']
dataphish.shape

(9964, 1)

In [3]:
# Preview the first rows of the raw phishing URLs.
dataphish.head()

Unnamed: 0,URLs
0,http://bid.openx.net/json?amp;amp;amp;amp;cid;...
1,http://webmail2.centurytel.net/hwebmail/servic...
2,http://www.google.com.ng/imgres?imgurl=http://...
3,http://webmail2.centurytel.net/hwebmail/servic...
4,http://www.liceonuzzi.it/cmd=_Inf/connectionSt...


In [4]:
# Draw a reproducible 5000-row sample of phishing URLs (fixed random_state)
# and renumber the rows from 0 so later cells can address them positionally.
dp = (
    dataphish
    .sample(n=5000, random_state=12)
    .copy()
    .reset_index(drop=True)
)
dp.head()

Unnamed: 0,URLs
0,http://www.mylivingreef.com/cdirecroot/upx/lee...
1,http://us.battle.net.account-com.net/battle_ne...
2,http://www.refriautoartiles.com/administrator/...
3,http://www.banmarianna.hu/xmlrpc/includes/Brad...
4,http://www.cpiano.com/js/?us.battle.net/login/...


In [5]:
# Confirm the phishing sample size (rows, columns).
dp.shape

(5000, 1)

In [6]:
# Load the raw legitimate URL list and name its column 'URLs'.
# The one-element column assignment only succeeds for a single-column CSV.
datalegitimate = pd.read_csv("legitimate_dataset.csv")
datalegitimate.columns = ['URLs']
datalegitimate.head()

Unnamed: 0,URLs
0,http://1337x.to/torrent/1110018/Blackhat-2015-...
1,http://1337x.to/torrent/1122940/Blackhat-2015-...
2,http://1337x.to/torrent/1124395/Fast-and-Furio...
3,http://1337x.to/torrent/1145504/Avengers-Age-o...
4,http://1337x.to/torrent/1160078/Avengers-age-o...


In [7]:
# Draw a reproducible 5000-row sample of legitimate URLs (fixed random_state)
# and renumber the rows from 0 so later cells can address them positionally.
dl = (
    datalegitimate
    .sample(n=5000, random_state=12)
    .copy()
    .reset_index(drop=True)
)
dl.head()

Unnamed: 0,URLs
0,http://graphicriver.net/search?date=this-month...
1,http://ecnavi.jp/redirect/?url=http://www.cros...
2,https://hubpages.com/signin?explain=follow+Hub...
3,http://extratorrent.cc/torrent/4190536/AOMEI+B...
4,http://icicibank.com/Personal-Banking/offers/o...


In [8]:
# Confirm the legitimate sample size (rows, columns).
dl.shape

(5000, 1)

In [9]:
def getDomain(url):
  """Return the network location of ``url`` without a leading ``www.``.

  Args:
    url: URL string to parse.

  Returns:
    The ``netloc`` component reported by ``urlparse``, with a single literal
    ``"www."`` prefix removed when present. Any other part of the host is
    left untouched.
  """
  domain = urlparse(url).netloc
  # Strip only a literal "www." prefix. The earlier implementation tested
  # r"^www." (dot matches any character) and then used str.replace, which
  # removes every "www." occurrence and corrupted hosts like "www.foowww.com".
  if domain.startswith("www."):
    domain = domain[len("www."):]
  return domain

In [10]:
def getDepth(url):
  """Return how many non-empty '/'-separated segments the URL path contains.

  Only the path component from ``urlparse`` is inspected, so the query
  string and fragment do not contribute.
  """
  segments = urlparse(url).path.split('/')
  # Empty strings come from leading, trailing, or doubled slashes.
  return sum(1 for segment in segments if len(segment) != 0)

In [11]:
def haveAtSign(url):
  """Return 1 when ``url`` contains an '@' character anywhere, otherwise -1."""
  return 1 if "@" in url else -1

In [12]:
def length(url):
  """Return -1 for URLs shorter than 54 characters and 1 otherwise."""
  # 54 is the cut-off used by this notebook; a URL of exactly 54 characters yields 1.
  return -1 if len(url) < 54 else 1

In [13]:
def redirection(url):
  """Return 1 when the last '//' in ``url`` starts beyond index 7, else -1.

  The scheme separator of "http://" sits at index 5 and that of "https://"
  at index 6, so a later '//' indicates an additional double slash in the URL.
  """
  last_double_slash = url.rfind('//')
  # rfind yields -1 when '//' is absent, which also falls on the -1 side.
  return 1 if last_double_slash > 7 else -1

In [14]:
def httpDomain(url):
  """Return 1 when the text 'https' appears in the URL's netloc, else -1.

  Only the network-location part is checked; the scheme and path are ignored.
  """
  netloc = urlparse(url).netloc
  return 1 if 'https' in netloc else -1

In [15]:
# Alternation of known URL-shortening service host names (regex fragments).
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

# Anchor the alternatives to a complete host name (optionally preceded by
# subdomains). Searching the raw URL let fragments such as the "t.co" inside
# "microsoft.com" or "pinterest.com" count as a shortener. Host names are
# case-insensitive and urlparse lowercases them, hence IGNORECASE.
_shortener_host_re = re.compile(r"(?:^|\.)(?:" + shortening_services + r")$", re.IGNORECASE)

def tinyURL(url):
    """Return 1 when ``url`` is hosted on a known shortening service, else -1.

    Only the host part of the URL is examined; a shortener name that merely
    appears in the path, query, or inside a longer unrelated domain does not
    count.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        1 if the host equals, or is a subdomain of, one of the services in
        ``shortening_services``; -1 otherwise (including unparseable URLs).
    """
    # urlparse only populates the host when the URL has a scheme or starts
    # with "//", so scheme-less input such as "bit.ly/abc" gets a "//" prefix.
    has_authority = url.startswith("//") or re.match(r"[A-Za-z][A-Za-z0-9+.\-]*://", url)
    try:
        host = urlparse(url if has_authority else "//" + url).hostname
    except ValueError:
        # Malformed authority (e.g. an unbalanced IPv6 bracket): not a shortener.
        host = None
    # rstrip drops the trailing dot of a fully-qualified host name.
    if host and _shortener_host_re.search(host.rstrip(".")):
        return 1
    return -1

In [16]:
def prefixSuffix(url):
    """Return 1 when the URL's netloc contains a hyphen, otherwise -1.

    Hyphens elsewhere in the URL (path, query) are ignored.
    """
    netloc = urlparse(url).netloc
    return 1 if '-' in netloc else -1

In [17]:
def featureExtraction(url,result):
  """Build one feature row for ``url`` followed by its class label.

  Args:
    url: URL string passed to each feature function.
    result: Label value stored as the last element of the row.

  Returns:
    A list of nine items: domain, path depth, '@' flag, length flag,
    redirection flag, 'https'-in-domain flag, shortener flag,
    hyphen-in-domain flag, and ``result``.
  """
  # The element order defines the column order of the frames built from these rows.
  return [
    getDomain(url),
    getDepth(url),
    haveAtSign(url),
    length(url),
    redirection(url),
    httpDomain(url),
    tinyURL(url),
    prefixSuffix(url),
    result,
  ]

In [18]:
# Extract features for every sampled legitimate URL; -1 is the label stored
# for this class.
legitimate_features = []
result = -1
# Iterate the column itself instead of a hard-coded range(0, 5000) with label
# lookups, so the loop follows whatever sample size and index `dl` has.
for url in dl['URLs']:
  legitimate_features.append(featureExtraction(url,result))

In [19]:
# Column labels, in the same order in which featureExtraction emits its values.
feature_names = [
    'Domain_Name',
    'URL_Depth',
    'Have_At',
    'URL_Length',
    'Redirection',
    'https_Domain',
    'TinyURL',
    'Prefix/Suffix',
    'Result',
]

# Tabulate the legitimate feature rows and preview them.
legitimate = pd.DataFrame(data=legitimate_features, columns=feature_names)
legitimate.head()

Unnamed: 0,Domain_Name,URL_Depth,Have_At,URL_Length,Redirection,https_Domain,TinyURL,Prefix/Suffix,Result
0,graphicriver.net,1,-1,1,-1,-1,-1,-1,-1
1,ecnavi.jp,1,-1,1,1,-1,-1,-1,-1
2,hubpages.com,1,-1,1,-1,-1,-1,-1,-1
3,extratorrent.cc,3,-1,1,-1,-1,-1,-1,-1
4,icicibank.com,3,-1,1,-1,-1,-1,-1,-1


In [20]:
# Persist the legitimate feature table without the row index.
legitimate.to_csv('legitimate.csv', index= False)

In [21]:
# Extract features for every sampled phishing URL; 1 is the label stored
# for this class.
phishing_features = []
result = 1
# Iterate the column itself instead of a hard-coded range(0, 5000) with label
# lookups, so the loop follows whatever sample size and index `dp` has.
for url in dp['URLs']:
  phishing_features.append(featureExtraction(url,result))

In [22]:
# Tabulate the phishing feature rows under the shared column labels and preview them.
phishing = pd.DataFrame(phishing_features, columns = feature_names)
phishing.head()

Unnamed: 0,Domain_Name,URL_Depth,Have_At,URL_Length,Redirection,https_Domain,TinyURL,Prefix/Suffix,Result
0,mylivingreef.com,4,-1,1,-1,-1,-1,-1,1
1,us.battle.net.account-com.net,1,-1,1,-1,-1,-1,1,1
2,refriautoartiles.com,6,-1,1,-1,-1,-1,-1,1
3,banmarianna.hu,4,-1,1,-1,-1,-1,-1,1
4,cpiano.com,1,-1,1,1,-1,-1,-1,1


In [23]:
# Persist the phishing feature table without the row index.
phishing.to_csv('phishing.csv', index= False)

In [24]:
# Stack legitimate rows on top of phishing rows and renumber the combined
# index from 0 so row labels are unique.
urls = pd.concat([legitimate, phishing], ignore_index=True)
urls.head()

Unnamed: 0,Domain_Name,URL_Depth,Have_At,URL_Length,Redirection,https_Domain,TinyURL,Prefix/Suffix,Result
0,graphicriver.net,1,-1,1,-1,-1,-1,-1,-1
1,ecnavi.jp,1,-1,1,1,-1,-1,-1,-1
2,hubpages.com,1,-1,1,-1,-1,-1,-1,-1
3,extratorrent.cc,3,-1,1,-1,-1,-1,-1,-1
4,icicibank.com,3,-1,1,-1,-1,-1,-1,-1


In [25]:
# Confirm the combined table size (rows, columns).
urls.shape

(10000, 9)

In [26]:
# Persist the combined, labelled feature table without the row index.
urls.to_csv('Classified_Dataset.csv', index=False)