# **2.B FEATURE EXTRACTION**
Phishing URLs only

#### The objective of this notebook is to collect data and save it as a CSV file for Feature Extraction.

* Lexical Features
* Whois Features
* Popularity Features

#### This project is worked on Jupyter Notebook 

In [1]:
import pandas as pd
from urllib.parse import urlparse
import re
from bs4 import BeautifulSoup
import whois
import urllib.request
import time
import socket
from urllib.error import HTTPError
from datetime import datetime

In [2]:
 
legitimate = pd.read_csv("/home/jovyan/Datasets/Dataset1/Extract-Data-Benign/url_orignal.csv")



In [3]:
legitimate

Unnamed: 0,http://1337x.to/torrent/1048648/American-Sniper-2014-MD-iTALiAN-DVDSCR-X264-BST-MT/
0,http://1337x.to/torrent/1110018/Blackhat-2015-...
1,http://1337x.to/torrent/1122940/Blackhat-2015-...
2,http://1337x.to/torrent/1124395/Fast-and-Furio...
3,http://1337x.to/torrent/1145504/Avengers-Age-o...
4,http://1337x.to/torrent/1160078/Avengers-age-o...
...,...
35372,https://lastpass.com/signup2.php?ac=1&from_uri...
35373,https://lastpass.com/signup2.php?ac=1&from_uri...
35374,https://lastpass.com/signup2.php?ac=1&from_uri...
35375,https://lastpass.com/signup2.php?ac=1&from_uri...


In [4]:

legit = legitimate.sample(n = 6000, random_state = 12).copy()

legitimate_urls = legit.reset_index(drop=True)
legitimate_urls


Unnamed: 0,http://1337x.to/torrent/1048648/American-Sniper-2014-MD-iTALiAN-DVDSCR-X264-BST-MT/
0,http://graphicriver.net/search?date=this-month...
1,http://ecnavi.jp/redirect/?url=http://www.cros...
2,https://hubpages.com/signin?explain=follow+Hub...
3,http://extratorrent.cc/torrent/4190536/AOMEI+B...
4,http://icicibank.com/Personal-Banking/offers/o...
...,...
5995,http://mylust.com/videos/26185/extreme-pussy-i...
5996,http://onedio.com/haber/irak-savunma-bakanligi...
5997,http://io9.com/this-eerie-chandelier-is-contro...
5998,https://www.gov.uk/government/policies/local-e...


In [5]:
legitimate_urls.columns = ['url']
legitimate_urls.head()

Unnamed: 0,url
0,http://graphicriver.net/search?date=this-month...
1,http://ecnavi.jp/redirect/?url=http://www.cros...
2,https://hubpages.com/signin?explain=follow+Hub...
3,http://extratorrent.cc/torrent/4190536/AOMEI+B...
4,http://icicibank.com/Personal-Banking/offers/o...


## 2.1 Lexical Features

* URL Length 
* URL Shortening Services “TinyURL”
* URL Presence of "@" Symbol
* URL Presence of special characters : _ ? = & etc
* URL Suspicious words (security sensitive words)
* URL Digit Count
* URL Protocol Count (http / https)
* URL Dot Count
* URL Hyphen Count
* Domain presence of IP Address
* Domain presence of hyphen / prefix or Suffix
* Sub Domain and Multi Sub Domains Count
* Redirecting "//" in URL (// position)
* URL presence of EXE


In [6]:
#class FeatureExtraction:
#    def __init__(url):
#        pass

# 1.Extracts domain from the given URL
def getDomain(url):
    domain = urlparse(url).netloc
    if re.match(r"^www.",domain):
        domain = domain.replace("www.","")
    return domain
    
# 2.Checks for IP address in URL (Have_IP)
def ip_address(url):
    try:
        ipaddress.ip_address(url)
        ip = 1
    except:
        ip = 0
    return ip
    
# 3.Checks the presence of @ in URL (Have_At)
def have_at_symbol(url):
    if "@" in url:
        at = 1 
    else:
        at = 0   
    return at
    
# 4.Finding the length of URL and categorizing (URL_Length)
def long_url(url):
    if len(url) < 54:
        length = 0    
    else:
        length = 1    
    return length

# 5.Gives number of '/' in URL (URL_Depth)
def getDepth(url):
    s = urlparse(url).path.split('/')
    depth = 0
    for j in range(len(s)):
        if len(s[j]) != 0:
            depth = depth+1
    return depth
        
# 6.Checking for redirection '//' in the url (Redirection)
def redirection(url):
    pos = url.rfind('//')
    if pos > 6:
        if pos > 7:
            return 1
        else:
            return 0
    else:
        return 0
    
# 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
def httpDomain(url):
    domain = urlparse(url).netloc
    if 'https://|http://' in domain:
        return 1
    else:
        return 0

    
# 8. Checking for Shortening Services in URL (Tiny_URL) 
def shortening_service(url):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                    'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                    'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                    'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                    'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                    'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                    'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                    'tr\.im|link\.zip\.net', url)
    if match:
        return 1               # phishing
    else:
        return 0               # legitimate
    
    
    
    
# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)     
def prefix_suffix_separation(url):
    if "-" in urlparse(url).netloc:
        return 1            # phishing
    else:
        return 0            # legitimate
    
# 10. DNS Record 

    
# 11.Web traffic (Web_Traffic)
def web_traffic(url):
    try:
        url = urllib.parse.quote(url)
        rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find(
        "REACH")['RANK']
        rank = int(rank)
    except TypeError:
        return 1
    if rank <100000:
        return 1
    else:
        return 0
        
# 12.Survival time of domain: The difference between termination time and creation time (Domain_Age)  
def domainAge(domain_name):
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date
    if (isinstance(creation_date,str) or isinstance(expiration_date,str)):
        try:
            creation_date = datetime.strptime(creation_date,'%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
        except:
            return 1
    if ((expiration_date is None) or (creation_date is None)):
        return 1
    elif ((type(expiration_date) is list) or (type(creation_date) is list)):
        return 1
    else:
        ageofdomain = abs((expiration_date - creation_date).days)
        if ((ageofdomain/30) < 6):
            age = 1
        else:
            age = 0
    return age

# 13.End time of domain: The difference between termination time and current time (Domain_End) 
def domainEnd(domain_name):
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date,str):
        try:
            expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
        except:
            return 1
    if (expiration_date is None):
        return 1
    elif (type(expiration_date) is list):
        return 1
    else:
        today = datetime.now()
        end = abs((expiration_date - today).days)
    if ((end/30) < 6):
        end = 0
    else:
        end = 1
    return end

# 14. Dot count
def dot_count(url):
    if url.count(".") < 3:
        return 0            # legitimate
    elif url.count(".") == 3:
        return 1            # suspicious
    else:
        return 1            # phishing
        
    
# 14. Special characters count
def specialcharCount(url):
    cnt = 0
    special_characters = [';','+=','_','?','=','&','[',']','/',':']
    for each_letter in url:
        if each_letter in special_characters:
            cnt = cnt + 1
    return cnt


# 15. 
def subdomCount(url):

    # separate protocol and domain then count the number of dots in domain
    
    domain = url.split("//")[-1].split("/")[0].split("www.")[-1]
    if(domain.count('.')<=1):
        return 0
    else:
        return 1

In [7]:
#Function to extract features
def featureExtraction(url,label):
    
    features = []
    
    features.append(getDomain(url))
    features.append(ip_address(url))
    features.append(have_at_symbol(url))
    features.append(long_url(url))
    features.append(getDepth(url))
    features.append(redirection(url))
    features.append(httpDomain(url))
    features.append(shortening_service(url))
    features.append(prefix_suffix_separation(url))
  
    dns = 0
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except:
        dns = 1
        
    features.append(dns)
    features.append(web_traffic(url))
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))
    
    features.append(dot_count(url))
    features.append(specialcharCount(url))
    features.append(subdomCount(url))
    

    features.append(label)
    
    
    return features

In [8]:
feature_names = ['domain', 'ip_present', 'at_present', 'url_length', 'url_depth','redirection', 
                      'https_domain', 'short_url', 'prefix/suffix', 'dns_record', 'web_traffic', 
                      'domain_age', 'domain_end', 'dot_count', 'specialchar_count','subdom_count', 'label']

label = 0

In [None]:
# Extracting the features & storing them in a list
# Lexical Features

# starting time
start_time = time.time()
print('\n')
print('Begin feature extraction for benign dataset.... \n')

##===================================##


#Extracting the feautres & storing them in a list
legit_features = []
rows = len(legitimate_urls['url'])
label = 0

for i in range(0, rows):
    url = legitimate_urls['url'][i]
    print(i), print(url)
    
    
    legit_features.append(featureExtraction(url,label))

    
##===================================##

elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
print('\n')
print(f"Runtime: Feature Extraction for legitimate dataset took:  {elapsed}")


print('\n\n\n\n')
print("***Legitimate Features")




Begin feature extraction for benign dataset.... 

0
http://graphicriver.net/search?date=this-month&length_max=&length_min=&price_max=&price_min=&rating_min=&sales=&sort=sales&term=&view=list
1
http://ecnavi.jp/redirect/?url=http://www.cross-a.net/x.php?id=1845_3212_22061_26563&m=1004&pid=%user_id%
2
https://hubpages.com/signin?explain=follow+Hubs&url=%2Fhub%2FComfort-Theories-of-Religion
3
http://extratorrent.cc/torrent/4190536/AOMEI+Backupper+Technician+%2B+Server+Edition+2.8.0+%2B+Patch+%2B+Key+%2B+100%25+Working.html
4
http://icicibank.com/Personal-Banking/offers/offer-detail.page?id=offer-ezeego-domestic-airtravel-20141407112611060
5
http://nypost.com/2015/05/07/us-indifference-leaves-saudis-partnering-with-terrorists/
6
http://kienthuc.net.vn/diem-thi/diem-chuan-dh-cong-nghe-giao-thong-van-tai-nam-2014-482407.html
7
http://thenextweb.com/in/2015/04/16/india-wants-a-neutral-web-and-facebooks-internet-org-cant-be-a-part-of-it/gtm.js
8
http://tobogo.net/cdsb/board.php?board=greet&b

73
http://torcache.net/torrent/047D47DFF4DC5CD9BEA6D0F4C57D68F2F2D71205.torrent?title=[kickass.to]night.at.the.museum.secret.of.the.tomb.2014.1080p.brrip.x264.yify
74
http://thenextweb.com/asia/2014/09/26/myanmars-mobile-revolution-kicks-telenor-prepares-launch-service/gtm.start
75
http://extratorrent.cc/torrent/4189616/Jedi+Mind.Tricks.The.Thief.and.the.Fallen.2015.mp3.vbr.NOiR.html
76
http://genius.com/2990084/Us-general-services-administration-gsa-mentor-protege-program-subpart-51970/Incentives-for-prime-contractors
77
http://thenextweb.com/apps/2012/04/19/500px-launches-android-app-and-overhauls-its-ipad-version-too/
78
http://kienthuc.net.vn/tin-tuc-tuyen-sinh/ty-le-choi-cua-dai-hoc-fpt-nam-2014-487408.html
79
http://sourceforge.net/directory/development/add_facet_filter?facet=license&constraint=OSI-Approved+Open+Source+%3A%3A+PHP+License
80
http://nypost.com/2015/01/28/tepper-loses-fight-to-restructure-caesars-in-delaware/
81
http://ap.org/Content/Press-Release/2013/NFL-celebrate

141
http://seekingalpha.com/article/2471615-mexico-economy-to-move-with-energy-reforms-part-1
142
http://web.tv/liveCategory/6/language/1/index/changeLanguage/newshared/searchAutoComplete
143
http://indianexpress.com/article/sports/football/chelsea-go-past-leicester-city-3-1-eye-premier-league-title/
144
http://web.de/magazine/sport/fussball/champions-league/wunder-fc-schalke-04-real-madrid-champions-league-17321480
145
http://olx.co.id/i2/elektronik-gadget/komputer/keyboard-mouse/elektronik-gadget/komputer/keyboard-mouse
146
http://mylust.com/videos/171589/my-lewd-chinese-wife-is-really-into-having-sex-in-missionary-position/
147
http://babal.net/books/view/397/%D9%82%D8%A7%D9%86%D9%88%D9%86-%D8%BA%D8%B1%D9%81-%D8%A7%D9%84%D8%B5%D9%86%D8%A7%D8%B9%D8%A9-%D8%A7%D9%84%D8%A7%D8%B1%D8%AF%D9%86%D9%8A
148
http://techcrunch.com/2014/03/18/hurry-this-is-your-last-chance-to-get-tickets-to-tonights-washington-d-c-meetup/
149
http://distractify.com/post/related/id/54b985434a0c4b136ee6813d/skip/30

212
http://nguyentandung.org/dieu-gi-xay-ra-neu-chung-ta-khong-xu-ly-kheo-vu-gian-khoan-981.html
213
http://buzzfil.net/article/5466/animaux/georges-le-chat-qui-se-tient-debout-tout-le-temps-3.html?href=inner_website
214
http://thenextweb.com/insider/2014/10/29/unbabel-integrates-mailchimp-offer-translation-service-promotional-emails/gtm.js/
215
http://fanpage.gr/must-watch/%ce%b4%ce%b5%ce%af%cf%84%ce%b5-%cf%84%ce%b9%cf%82-%ce%ba%ce%b1%ce%bb%cf%8d%cf%84%ce%b5%cf%81%ce%b5%cf%82-%cf%80%cf%81%ce%bf%cf%84%ce%ac%cf%83%ce%b5%ce%b9%cf%82-%ce%b3%ce%ac%ce%bc%ce%bf%cf%85/
216
http://extratorrent.cc/torrent_download/4189419/Ken+Follett+-+Notte+SullAcqua+%5BEbook+-+Ita%5D+%5B+%5D.torrent
217
http://mylust.com/videos/89242/three-insatiable-girlfriend-share-one-juicy-stiff-cock/
218
http://nguyentandung.org/can-ro-muc-dich-ban-co-phan-theo-lo-tai-doanh-nghiep-nha-nuoc.html
219
http://babal.net/books/view/1394/%D9%88%D9%84%D8%AF%D8%AA-%D9%87%D9%86%D8%A7%D9%83%D8%8C-%D9%88%D9%84%D8%AF%D8%AA-%D9%87%D9%

275
http://grantland.com/hollywood-prospectus/three-potential-replacements-for-the-ousted-piers-morgan-hint-theyre-all-jay-leno/
276
https://twitter.com/home?status=%E3%83%8C%E3%81%91%E3%82%8B%EF%BC%81%E3%80%90%E5%B7%A8%E4%B9%B3%E3%80%91+http%3A%2F%2Fero-video.net%2Ft%2FInoAwxva2pEyzD9f+%E7%BE%8E%E5%92%B2%E7%B5%90%E8%A1%A3+%23ero+%23douga+%23agesage
277
http://haberler.com/gaziantep-te-karsit-goruslu-ogrenciler-arasinda-7302901-haberi/gaziantep-te-karsit-goruslu-ogrenciler-arasinda-7302003
278
http://techcrunch.com/2015/03/18/health-insurer-premera-blue-breached-11m-customers-information-exposed-including-medical-records/
279
http://buzzfil.net/m/show-art/quand-jamel-debbouze-retourne-le-plateau-de-tf1-wow-quel-jt-9.html
280
http://twitter.com/home?status=%E3%83%8C%E3%81%91%E3%82%8B%EF%BC%81%E3%80%90%E5%B7%A8%E4%B9%B3%E3%80%91+http%3A%2F%2Fero-video.net%2Ft%2FInoAwxva2pEyzD9f+%E7%BE%8E%E5%92%B2%E7%B5%90%E8%A1%A3+%23ero+%23douga+%23agesage
281
http://olx.co.id/i2/hobi-olahraga/olahraga/

343
http://correios.com.br/para-voce/consultas-e-solicitacoes/precos-e-prazos/servicos-nacionais_pasta/cecograma
344
http://extratorrent.cc/torrent/4189651/Zatoichis+Pilgrimage.1966.480p.BluRay.x264.mSD.html
345
http://motthegioi.vn/tags/IHBob25nIHRo4buneSBiw6BuIGzDoG0gdmnhu4dj/phong-thuy-ban-lam-viec.html
346
http://caixa.gov.br/poder-publico/apoio-poder-publico/servicos-caixa/servicos-judiciarios
347
http://digg.com/video/billy-eichner-performs-glitter-and-ribs-his-song-for-taylor-swift
348
http://allegro.pl/listing/listing.php?id=10&order=m&string=%7Bstring%7D&bmatch=seng-ps-mp-p-sm-isqm-2-e-0402
349
http://fazenda.gov.br/divulgacao/agenda/agenda_dia_view?year:int=2015&month:int=3&day:int=13
350
http://bdnews24.com/environment/2015/03/08/winged-poppy-thieves-eat-into-farmers-yields
351
http://nesn.com/2015/05/champions-league-semifinal-wrap-juventus-barcelona-out-last-real-madrid-bayern-munich/
352
http://tobogo.net/cdsb/board.php?board=funnyvideo&bm=view&no=11589&category=&auth=&pa

415
http://io9.com/5936202/are-these-horrific-characters-the-new-villains-in-mad-max-fury-road
416
http://gtbank.com/media-centre/gtbank-in-the-news/14-media/press-releases/532-gtbank-adjudged-2014-african-bank-of-the-year-wins-for-the-2nd-time-in-a-row
417
http://olx.co.id/bali/q-%7Bq%7D/?utm_source=google&utm_medium=search&utm_campaign=search_organic
418
http://sourceforge.net/directory/audio-video/add_facet_filter?facet=natlanguage&constraint=Dutch
419
http://allrecipes.com/video/4986/oatmeal-cookie-apple-crisp/detail.aspx?prop24=VH_Newest
420
http://rt.com/in-vision/venezuela-drill-artillery-2015/minister-state-general-2015/
421
http://icicibank.com/Personal-Banking/offers/offer-detail.page?id=offer-yatra-hotel-offer-20150604134542408
422
http://extratorrent.cc/torrent/4189367/Martin+Gardner+-+I+Misteri+della+Magia+Matematica%2C+%5BPdf+-+Ita%5D+%5B+%5D.html
423
http://tunein.com/radio/Vibration-Country-s117836/15480783/ca-pub-1542925551861702/StationLeader
424
http://atwiki.jp/wiki

481
http://extratorrent.cc/torrent_download/4189651/Zatoichis+Pilgrimage.1966.480p.BluRay.x264.mSD.torrent
482
http://zozo.jp/shop/abahouse/?price=proper&p_ssy=2015&p_ssm=5&p_ssd=13&p_sey=2015&p_sem=5&p_sed=13&dstk=2
483
http://olx.ua/uk/list/q-%D0%B2%D0%B5%D0%BB%D0%BE%D1%81%D0%B8%D0%BF%D0%B5%D0%B4%D1%8B/gtm.js
484
http://cheezburger.com/8492094976/superheroes-joker-dc-is-a-60-year-old-woman?ref=leftarrow&siteId=91
485
http://www.nhs.uk/news/2015/05May/Pages/Smartphone-app-used-to-scan-blood-for-parasites.aspx
486
http://jalopnik.com/5959102/watch-hurricane-sandys-floods-destroy-these-nypd-police-cars-and-192-others
487
http://kakaku.com/kaden/av-selector/ranking_2072/pricedown/div-gpt-ad-k/header_text
488
http://tunein.com/radio/SportsJuice---Campbell-River-Storm-s181916/15480783/ca-pub-1542925551861702/StationLeader
489
http://allegro.pl/listing/listing.php?bmatch=seng-v10-p-sm-pers-isqm-chl-moda-0414&id=4&order=m&string=%7Bstring%7D
490
http://sourceforge.net/directory/communication

554
http://bdnews24.com/environment/2015/03/22/climate-change-threatens-world-s-iconic-ecosystems
555
http://kenh14.vn/star/sau-hop-bao-demi-lovato-dai-nao-trung-tam-mua-sam-20150508053151327.chn
556
http://emgn.com/entertainment/portugal-player-goes-ballistic-and-gets-red-carded-for-head-butting-opponent1/
557
http://patch.com/florida/tarponsprings/slain-tarpon-springs-officer-remembered-washington-dc
558
http://udn.com/news/story/7318/901242-%E9%80%A3%E7%BA%8C%E5%81%B7%E5%BD%A9%E5%88%B8-%E5%A5%B3%E8%B3%8A%E8%A2%AB%E9%80%AE
559
http://buzzfil.net/article/5674/c-mignon/la-petite-n-a-que-10-ans-et-elle-a-deja-une-voix-d-or-tres-impressionnante-9.html?href=inner_website
560
http://olx.co.id/all-results/q-%7Bq%7D/?utm_source=google&utm_medium=search&utm_campaign=search_organic&view=galleryBig
561
http://seekingalpha.com/article/3179826-international-stock-etfs-one-way-or-another
562
http://searchengineland.com/pay-seo-hourly-monthly-project-based-full-answer-219903
563
http://tunein.com/r

625
http://tinnhanh360.net/con-duong-sa-chan-lam-gai-ban-d-am-cua-sinh-vien-that-nghiep.html
626
http://olx.ro/i2/anunturi-agricole/seminte-plante-pomi/cereale-furaje-fan/anunturi-agricole/seminte-plante-pomi/cereale-furaje-fan
627
http://tobogo.net/cdsb/board.php?board=storyani&bm=view&no=82&category=&auth=&page=1&search=&keyword=&recom=
628
http://torcache.net/torrent/B328659007CE94161E7A46A5C0397318FDBADB99.torrent?title=[kickass.to]horriblesubs.yamada.kun.and.the.seven.witches.05.720p.mkv
629
http://udn.com/news/story/7323/898767-%E5%8C%97%E5%B8%82%E9%87%8D%E9%99%BD%E7%AF%80%E6%95%AC%E8%80%81%E9%87%91-%E4%BB%8A%E5%B9%B4%E6%94%B9%E6%8E%A1%E5%8C%AF%E6%AC%BE
630
https://medium.com/backchannel/this-video-game-solved-the-problem-of-learning-guitar-d0fcea6f0d3b
631
http://torrentdn.com/bbs/s.php?bo_table=torrent_variety&wr_id=113366&k=%ED%81%AC%EB%9D%BC%EC%9E%84%EC%94%AC&page=1
632
http://ap.org/Content/AP-In-The-News/2012/AP-photojournalism-in-focus-with-OPC-Award-wins
633
http://tobogo

689
http://superuser.com/questions/868334/my-headphones-are-now-recognized-as-side-left-and-side-right-rather-than-le
690
http://irecommend.ru/content/skrab-dlya-tela-banka-agafi-kamchatskii-skrab-dlya-tela
691
http://yourlust.com/videos/mad-reverse-gang-bang-action-with-spoiled-ladies-and-one-big-dick.html
692
http://livestream.com/streamingcafekelowna/LovecoastPaperboyandTheMessengersmay23rd
693
http://thenextweb.com/insider/2015/05/09/mass-data-collection-is-back-on-the-cards-in-the-uk/gtm.start
694
http://mylust.com/videos/60150/hidden-cam-in-the-guest-room-caught-my-wife-masturbating/
695
http://olx.co.id/i2/elektronik-gadget/aksesoris-hp-tablet/elektronik-gadget/aksesoris-hp-tablet
696
http://correios.com.br/para-sua-empresa/comunicacao/comunicacao/imagens/comunicacao.jpg
697
http://abc.go.com/shows/the-middle/episode-guide/season-06/22-while-you-were-sleeping
698
http://elitedaily.com/dating/all-couples-fall-in-love-only-some-can-build-a-true-relationship/1018474/
699
http://qz.

759
http://fanpage.gr/family/antras/6-%cf%84%cf%8d%cf%80%ce%bf%ce%b9-%ce%b3%cf%85%ce%bd%ce%b1%ce%b9%ce%ba%cf%8e%ce%bd-%cf%80%ce%bf%cf%85-%ce%b1%cf%81%ce%ad%cf%83%ce%bf%cf%85%ce%bd-%cf%83%cf%84%ce%bf%cf%85%cf%82-%ce%ac%ce%bd%cf%84%cf%81/
760
http://diply.com/different-solutions/rock-covers-way-better-than-their-originals/125386
761
http://udn.com/news/story/7321/900996-%E7%84%A1%E7%85%A7%E7%9C%8B%E8%A8%BA-%E5%88%A4%E5%88%911%E5%B9%B44%E5%80%8B%E6%9C%88
762
http://torcache.net/torrent/6BD5D60102398970EA19007F62642CD8330A879E.torrent?title=[kickass.to]kingsman.the.secret.service.2015.hdcam.h264.aac.2ch.blitzcrieg
763
http://extratorrent.cc/torrent/4189343/Paul+Anka+-+His+All+Time+Greatest+Hits+%2830th+Anniversary%29+%281989%29+%7BFLAC%7D+vtwin88cube.html
764
http://udn.com/news/story/8031/868380-%E5%A0%B1%E7%A8%85%E2%80%A6%E7%94%A8%E8%87%AA%E7%84%B6%E4%BA%BA%E6%86%91%E8%AD%89%E6%9C%80%E7%9C%81%E4%BA%8B
765
http://jezebel.com/kris-jenner-goes-ham-on-intouch-over-transphobic-bruce-167991392

828
http://thenextweb.com/shareables/2015/05/07/watch-this-artist-draw-famous-logos-by-hand/gtm.js
829
http://tunein.com/radio/News-957-s60243/15480783/ca-pub-1542925551861702/StationLeader
830
http://espn.go.com/nhl/story/_/id/12757345/nhl-hiring-peter-chiarelli-signals-big-culture-change-edmonton-oilers
831
http://genius.com/2992858/Us-general-services-administration-gsa-mentor-protege-program-subpart-51970/Contracting-officers
832
http://variety.com/2015/tv/news/constantine-arrow-season-4-crossover-possibility-1201492462/?replytocom=1294200
833
http://cox.com/residential/support/tv/articles.cox?catId=p2_order_programs&catName=T3JkZXIgUHJvZ3JhbXM7r2o0c9o5x&prodId=p1_tv
834
http://1337x.to/torrent/1160602/Apowersoft-Streaming-Audio-Recorder-3-4-5-Keygen-100-Working/
835
http://cox.com/residential/support/internet/article.cox?articleId=43027480-462a-11e0-f270-000000000000
836
http://persianblog.ir/tags/42383/1/%d8%af%d8%a7%d8%b3%d8%aa%d8%a7%d9%86_%da%a9%d9%88%d8%aa%d8%a7%d9%87/
837
htt

896
http://torcache.net/torrent/233C92B7D475B9B739902F66F6CBCF56CA063CED.torrent?title=[kickass.to]the.walking.dead.seizoen3.afl.16.hdtv.xvid.nl.subs.dmt
897
http://thenextweb.com/insider/2014/07/26/online-payments-prostitution-internet-transforming-oldest-profession/
898
http://kakaku.com/kaden/digital-cooking-scale/ranking_2197/pricedown/div-gpt-ad-k/header_text
899
http://censor.net.ua/resonance/333305/tranzitnyyi_vopros_isportil_ih_sila_i_slabost_ukrainy_pered_litsom_gazproma
900
http://fishki.net/demotivation/1526201-pjatnichnaja-podborka--demotivatorov.html?mode=recent
901
http://onedio.com/haber/-turkiye-de-insan-hayati-ne-kadar-ucuz-dedirten-12-olay-507256
902
http://indianexpress.com/article/trending/ground-was-shaking-scary-video-of-avalanche-hitting-mt-everest-base-camp-goes-viral/
903
http://ringring.vn/doi-song/thay-giao-dam-o-hoc-sinh-lop-2-ngay-tren-buc-giang/118302.html?ref=xt
904
http://udn.com/news/story/6816/887931-%E9%87%91%E9%8C%A2%E6%98%AF%E5%B8%8C%E6%8B%89%E8%95%

958
http://askubuntu.com/questions/232569/deskbar-applet-function-for-ubuntu-12-10-non-unity
959
http://ap.org/Content/Press-Release/2014/SNTV-partners-with-6Medias-to-target-the-French-digital-market
960
http://censor.net.ua/resonance/335554/reportaj_iz_prigranichya_s_krymom_kto_i_pochemu_stoit_na_straje_ukrainy
961
http://distractify.com/post/related/id/55467a164a0c4b201e6b25f1/skip/30/limit/10/back/0
962
https://www.gov.uk/government/organisations/social-mobility-and-child-poverty-commission
963
http://comicbook.com/blog/category/batman-vs-superman?utm_campaign=rbm&utm_medium=popularcat3&utm_source=home
964
http://kakaku.com/bicycle/kids-bicycle/div-gpt-ad-k/sports/bicycle/kids-bicycle/728x90
965
http://syosetu.com/searchuser/search/index.php?name1st=%E3%81%B8&all=1&all2=1&all3=1&all4=1&p=11
966
http://stackexchange.com/newsletters/newsletter?site=windowsphone.stackexchange.com
967
http://twitter.com/home?status=%E3%83%8C%E3%81%91%E3%82%8B%EF%BC%81%E3%80%90%E5%B7%A8%E4%B9%B3%E3%80%9

1026
http://jalopnik.com/is-it-okay-to-leave-your-kid-in-a-car-if-youre-not-an-i-1702237113/templates/closure/
1027
http://caixa.gov.br/empresa/credito-financiamento/imoveis/financiamento-para-producao-imoveis/alocacao-de-recursos
1028
https://medium.com/human-parts/life-is-one-giant-game-of-metroid-cf5b0716e5e8?source=latest
1029
http://deadspin.com/the-nbas-christmas-uniforms-are-actually-great-this-yea-1704269847
1030
http://hollywoodlife.com/pics/hottest-celebrity-pics-this-week-dianna-agron-may-11-18/
1031
http://depositphotos.com/login.html?url=%2F58418321%2Fstock-photo-bearded-man-with-vintage-straight.html
1032
http://grantland.com/the-triangle/how-a-handful-of-mocked-and-minimized-signings-propelled-bostons-worst-to-first-comeback/
1033
http://bleacherreport.com/articles/2447086-2015-fantasy-football-outlook-for-new-york-giants-stars
1034
http://digg.com/video/the-alchemists-letter-is-an-animated-short-narrated-by-john-hurt
1035
http://rt.com/in-vision/ukraine-marks-chernobyl-

1091
http://olx.ro/i2/moda-frumusete/incaltaminte-barbati/bocanci/moda-frumusete/incaltaminte-barbati/bocanci
1092
http://deadspin.com/5738218/the-case-against-the-case-against-lance-armstrong/1696100504/+marchman
1093
http://otomoto.pl/oferta/case-ih-mdw-524-claas-fendt-new-holland-case-john-deere-ID6y3M61.html
1094
http://motthegioi.vn/cau-vong-luc-sac/hieu-ve-lgbt/7-dieu-ma-ban-thuong-lam-tuong-ve-chuyen-doi-gioi-tinh-188387.html
1095
http://gawker.com/kerry-says-the-administration-has-evidence-assad-used-s-1236159661/1246633641
1096
http://plarium.com/en/strategy-games/sparta-war-of-empires/news/besiege-other-player-s-cities/
1097
http://uproxx.com/sports/2015/05/the-pay-per-view-numbers-are-in-for-mayweather-pacquiao-and-everyones-super-rich/
1098
http://torcache.net/torrent/E2C1D0541CD20BCD57A9976FF4E21CBB1ABCA1F5.torrent?title=[kickass.to]the.big.bang.theory.s08e08.hdtv.x264.lol.eztv
1099
http://wikiwiki.jp/sample/?%A5%C7%A5%B6%A5%A4%A5%F3%A5%C6%A5%F3%A5%D7%A5%EC%A1%BC%A5%C8%2Fo

1161
http://hdfcbank.com/personal/products/accounts-and-deposits/current-accounts/merchant-advantage-current-account
1162
http://kenh14.vn/doi-song/le-roi-bat-ngo-duoc-lam-khach-moi-tren-1-chuong-trinh-cua-vtv1-20141227085627553.chn
1163
http://codecanyon.net/item/envato-user-support-ticket-system/full_screen_preview/11433331
1164
http://wpengine.com/2013/10/25/announcing-automated-upgrade-protection-wordpress-3-7/
1165
http://kakaku.com/camera/compact-flash/ranking_0051/pricedown/div-gpt-ad-k/header_text
1166
http://udn.com/news/story/6655/901381-%E5%A4%A7%E5%B7%A8%E8%9B%8B%E6%A1%88-%E6%9F%AF%E6%96%87%E5%93%B2%EF%BC%9A%E8%A9%B2%E6%80%8E%E9%BA%BC%E8%BE%A6%E5%B0%B1%E6%80%8E%E9%BA%BC%E8%BE%A6
1167
https://elevenia.co.id/login.do?returnURL=%2Fproduct%2FProductQnaForm%2FgetProductQnaDetail.do%3Fflag%3DprdQna&isSSL=Y
1168
http://distractify.com/post/related/id/541831754a0c4bd3048b6a53/skip/10/limit/10/back/0
1169
http://tunein.com/radio/Mike-Lupica-p321555/15480783/ca-pub-1542925551861702/P

In [10]:
#Converting the list to dataframe

legitimate = pd.DataFrame(legit_features, columns= feature_names)
legitimate.head()

Unnamed: 0,domain,ip_present,at_present,url_length,url_depth,redirection,https_domain,short_url,prefix/suffix,dns_record,web_traffic,domain_age,domain_end,dot_count,specialchar_count,subdom_count,label
0,graphicriver.net,0,0,1,1,0,0,0,0,1,1,1,1,0,29,0,0
1,ecnavi.jp,0,0,1,1,1,0,0,0,1,1,1,1,1,21,0,0
2,hubpages.com,0,0,1,1,0,0,0,0,1,1,1,1,0,8,0,0
3,extratorrent.cc,0,0,1,3,0,0,0,0,1,0,1,1,1,6,0,0
4,icicibank.com,0,0,1,3,0,0,0,0,1,1,1,1,0,8,0,0


In [11]:
# Storing the extracted legitimate URLs fatures to csv file

legitimate.to_csv('/home/jovyan/Datasets/Dataset1/Creating-data/benign_updated.csv', index= False)

