In [2]:
import time
import whois
import urllib.request, re
import xmltodict, json
from datetime import datetime
import ssl
import OpenSSL
import favicon
import ipaddress
from bs4 import BeautifulSoup
import socket
import requests
from googlesearch import search
from dateutil.parser import parse as date_parse
import sys
import dns.resolver
from patterns import *
import whois.parser
# Wall-clock timestamp taken when this module/cell starts executing; it is
# subtracted from a second timestamp at the bottom of the file to report runtime.
start = time.time()

def prepare_url(url):
    """Return *url* with an ``http://`` scheme prepended when it has none.

    Only an explicit ``http://`` or ``https://`` prefix (case-insensitive)
    counts as an existing scheme, so a host name that merely starts with the
    letters "http" (for example ``httpbin.org``) still gets the prefix.
    """
    if not re.match(r"https?://", url, re.IGNORECASE):
        url = "http://" + url
    return url


def get_soup(url):
    """Download *url* and parse the response body as HTML.

    Returns the ``BeautifulSoup`` document built with ``html.parser``, or the
    sentinel ``-999`` when the request or the parsing raises any
    ``Exception`` (the exception class is printed in that case).
    """
    try:
        page = requests.get(url, timeout=15)
        return BeautifulSoup(page.text, 'html.parser')
    except Exception as err:
        print("Oops!", err.__class__, "occurred .(soup error)")
        # Callers compare against this sentinel to detect a failed fetch.
        return -999


def find_domain(url):
    """Return the host part of *url* with one leading ``www.`` removed.

    The host is the text between the first ``://`` and the next ``/``; any
    port (``host:8080``) is kept as part of the result.

    Raises:
        IndexError: if *url* contains no ``://`` followed by a host.
    """
    domain = re.findall(r"://([^/]+)/?", url)[0]
    # Strip only a genuine leading "www." label; a blanket replace() would
    # also mangle hosts such as "awww.example.com" or "mail.www.example.com".
    if domain.startswith("www."):
        domain = domain[len("www."):]
    return domain



def whois_response(domain):
    """Look up the WHOIS record for *domain*.

    Returns whatever ``whois.whois`` returns, or the sentinel ``-1`` when the
    lookup raises ``whois.parser.PywhoisError`` (the exception class is
    printed). Other exceptions propagate to the caller.
    """
    try:
        record = whois.whois(domain)
    except whois.parser.PywhoisError as err:
        print(err.__class__, "occurred")
        return -1
    return record
    
def exp_date(w):
    """Return the expiration date stored on a WHOIS response.

    Args:
        w: An object exposing ``expiration_date``, or the sentinel ``-1``
            that signals a failed WHOIS lookup.

    Returns:
        ``-1`` when *w* is ``-1``. When the attribute is a list, its first
        element (``None`` for an empty list, i.e. "no date"); otherwise the
        attribute value unchanged.
    """
    if w == -1:
        return -1

    expire_date = w.expiration_date
    if isinstance(expire_date, list):
        # An empty list carries no date; report it like a missing value
        # instead of raising IndexError.
        return expire_date[0] if expire_date else None
    return expire_date
    
def updatd_date(w):
    """Return the last-updated date stored on a WHOIS response.

    Args:
        w: An object exposing ``updated_date``, or the sentinel ``-1`` that
            signals a failed WHOIS lookup.

    Returns:
        ``-1`` when *w* is ``-1``. When the attribute is a list, its first
        element (``None`` for an empty list, i.e. "no date"); otherwise the
        attribute value unchanged.
    """
    if w == -1:
        return -1

    up_date = w.updated_date
    if isinstance(up_date, list):
        # An empty list carries no date; report it like a missing value
        # instead of raising IndexError.
        return up_date[0] if up_date else None
    return up_date

def create_date(w):
    """Return the creation (registration) date stored on a WHOIS response.

    Args:
        w: An object exposing ``creation_date``, or the sentinel ``-1`` that
            signals a failed WHOIS lookup.

    Returns:
        ``-1`` when *w* is ``-1``. When the attribute is a list, its first
        element (``None`` for an empty list, i.e. "no date"); otherwise the
        attribute value unchanged.
    """
    if w == -1:
        return -1

    regis_date = w.creation_date
    if isinstance(regis_date, list):
        # An empty list carries no date; report it like a missing value
        # instead of raising IndexError.
        return regis_date[0] if regis_date else None
    return regis_date


def find_global_rank(url):
    """Query the Alexa data endpoint for the global popularity rank of *url*.

    Returns:
        The rank as an ``int``, or ``-1`` when the request, the XML parsing
        or the lookup of the rank in the parsed document fails (the exception
        class is printed).

    NOTE(review): the data.alexa.com service is reported to have been
    retired; if so this always returns -1 - confirm and replace the source.
    """
    # Local import: only needed here to escape the URL for the query string.
    from urllib.parse import quote

    try:
        # Percent-encode the target so its own "?", "&" and "#" characters
        # cannot be mistaken for parameters of the Alexa request.
        endpoint = 'http://data.alexa.com/data?cli=10&dat=s&url={}'.format(quote(url, safe=''))
        # Bounded wait and deterministic close of the HTTP response.
        with urllib.request.urlopen(endpoint, timeout=15) as response:
            xml = response.read()
        result = xmltodict.parse(xml)

        # Round-trip through JSON to drop the "@" characters from the parsed
        # document, so the keys below can be used without that prefix.
        data = json.dumps(result).replace("@", "")
        data_tojson = json.loads(data)
        global_rank = int(data_tojson["ALEXA"]["SD"][1]["POPULARITY"]["TEXT"])
    except Exception as e:
        print("Oops!", e.__class__, "occurred.")
        global_rank = -1

    return global_rank


def get_certificate(host, port=443, timeout=10):
    """Fetch the TLS certificate presented by ``host:port`` as PEM text.

    Args:
        host: Server name to connect to; also used for SNI and verification.
        port: TCP port of the TLS service.
        timeout: Seconds allowed for the TCP connect and for each subsequent
            socket operation, including the TLS handshake.

    Returns:
        The peer certificate as a PEM string, or ``-1`` when connecting, the
        handshake or certificate verification fails ("SSL error" is printed).
    """
    try:
        context = ssl.create_default_context()
        # Passing the timeout to create_connection bounds the connect and the
        # handshake too; the context managers close both sockets on any path.
        with socket.create_connection((host, port), timeout=timeout) as conn:
            with context.wrap_socket(conn, server_hostname=host) as sock:
                der_cert = sock.getpeercert(True)
        return ssl.DER_cert_to_PEM_cert(der_cert)
    except Exception:
        print("SSL error")
        return -1

    
def domain_age(regis_date):
    """Score a domain by the number of days since *regis_date*.

    Args:
        regis_date: Registration ``datetime`` (naive or timezone-aware), or
            the sentinel ``-1`` for "unknown".

    Returns:
        ``1`` when the domain is more than 5000 days old; ``-1`` when it is
        5000 days old or younger, when *regis_date* is ``-1``, or when it is
        not a usable ``datetime`` (e.g. ``None`` or a string).
    """
    if regis_date == -1:
        return -1

    try:
        # Build "now" with the same awareness as the input: subtracting a
        # naive and an aware datetime raises TypeError.
        today = datetime.now(tz=regis_date.tzinfo)
        val = (today - regis_date).days
    except (TypeError, AttributeError):
        return -1

    print("Domain age:", val)
    return -1 if val <= 5000 else 1

    
#Features
def having_ip_address(url):
    """Report whether the host of *url* is a literal IP address.

    The scheme, userinfo, port, path, query and fragment are stripped before
    the check, so both ``"1.2.3.4"`` and ``"http://1.2.3.4:8080/login"`` are
    recognised. Bracketed IPv6 hosts (``http://[::1]/``) are supported.

    Returns:
        ``-1`` when the host is an IPv4/IPv6 literal, ``1`` otherwise.
    """
    candidate = url
    if isinstance(url, str):
        # Reduce a full URL to its bare host; ip_address() rejects anything
        # that carries a scheme, port or path.
        rest = re.sub(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://", "", url.strip())
        netloc = re.split(r"[/?#]", rest, maxsplit=1)[0]
        host = netloc.rsplit("@", 1)[-1]
        if host.startswith("["):
            # Bracketed IPv6 literal, optionally followed by ":port".
            host = host[1:].split("]", 1)[0]
        elif host.count(":") == 1:
            # Exactly one colon means "host:port"; more means bare IPv6.
            host = host.split(":", 1)[0]
        candidate = host

    try:
        ipaddress.ip_address(candidate)
    except ValueError:
        return 1
    return -1



def url_length(url):
    """Score *url* by its length.

    Returns ``1`` for fewer than 54 characters, ``0`` for 54 to 75
    characters inclusive, and ``-1`` for anything longer.
    """
    size = len(url)
    if size > 75:
        return -1
    return 1 if size < 54 else 0



# Registered domains of known URL-shortening services (lower case).
_SHORTENER_DOMAINS = frozenset({
    'bit.ly', 'goo.gl', 'shorte.st', 'go2l.ink', 'x.co', 'ow.ly', 't.co',
    'tinyurl.com', 'tr.im', 'is.gd', 'cli.gs', 'yfrog.com', 'migre.me',
    'ff.im', 'tiny.cc', 'url4.eu', 'twit.ac', 'su.pr', 'twurl.nl',
    'snipurl.com', 'short.to', 'budurl.com', 'ping.fm', 'post.ly', 'just.as',
    'bkite.com', 'snipr.com', 'fic.kr', 'loopt.us', 'doiop.com', 'short.ie',
    'kl.am', 'wp.me', 'rubyurl.com', 'om.ly', 'to.ly', 'bit.do', 'lnkd.in',
    'db.tt', 'qr.ae', 'adf.ly', 'bitly.com', 'cur.lv', 'ity.im', 'q.gs',
    'po.st', 'bc.vc', 'twitthis.com', 'u.to', 'j.mp', 'buzurl.com', 'cutt.us',
    'u.bb', 'yourls.org', 'prettylinkpro.com', 'scrnch.me', 'filoops.info',
    'vzturl.com', 'qr.net', '1url.com', 'tweez.me', 'v.gd', 'link.zip.net',
})


def shortening_services(url):
    """Report whether *url* is hosted on a known URL-shortening service.

    The host of the URL must equal one of the known shortener domains or be
    a sub-domain of one. Matching whole host labels avoids the false
    positives of a substring search (``t.co`` is a substring of
    ``microsoft.com``).

    Returns:
        ``-1`` when the host belongs to a shortening service, ``1`` otherwise.
    """
    # Reduce the URL to a bare lower-case host: drop scheme, path/query/
    # fragment, userinfo, port and a trailing root dot.
    rest = re.sub(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://", "", url.strip())
    netloc = re.split(r"[/?#]", rest, maxsplit=1)[0]
    host = netloc.rsplit("@", 1)[-1].split(":", 1)[0].lower().rstrip(".")

    is_shortener = any(
        host == name or host.endswith("." + name) for name in _SHORTENER_DOMAINS
    )
    return -1 if is_shortener else 1



def having_at_symbol(url):
    """Return ``-1`` when *url* contains an ``@`` character, else ``1``."""
    return -1 if "@" in url else 1



def double_slash_redirecting(url):
    """Flag a ``//`` that occurs after the scheme separator.

    Returns ``-1`` when the last ``//`` in *url* starts beyond index 6 (the
    position it has in ``https://``), otherwise ``1``.
    """
    if url.rfind('//') > 6:
        return -1
    return 1


def prefix_suffix(url):
    """Report whether the host of *url* contains a hyphen.

    Only the host is inspected: hyphens in the path, query or fragment do
    not count, and the URL does not need a trailing slash.

    Returns:
        ``-1`` when the host contains ``-``, ``1`` otherwise.
    """
    # Reduce the URL to its host: drop scheme, then path/query/fragment,
    # then any "user:password@" prefix.
    rest = re.sub(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://", "", url)
    netloc = re.split(r"[/?#]", rest, maxsplit=1)[0]
    host = netloc.rsplit("@", 1)[-1]
    return -1 if "-" in host else 1



def having_sub_domain(url):
    """Score *url* by the number of dots it contains.

    When ``having_ip_address`` reports an IP literal, everything up to the
    end of the first dotted-quad IPv4 or full (8-group) IPv6 match is removed
    first so the address's own dots are not counted. The remaining dots are
    counted over the whole string, path and query included.

    Returns:
        ``1`` for at most 3 dots, ``0`` for exactly 4, ``-1`` for more.
    """
    if having_ip_address(url) == -1:
        match = re.search(
            r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
            r'([01]?\d\d?|2[0-4]\d|25[0-5]))|(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}',
            url)
        # The pattern does not cover every address form that counts as an IP
        # (e.g. compressed IPv6 such as "::1"), so the search can come back
        # empty; in that case count the dots of the unmodified string.
        if match is not None:
            url = url[match.end():]

    num_dots = url.count('.')
    if num_dots <= 3:
        return 1
    if num_dots == 4:
        return 0
    return -1

    
    
def ssl_final_state(certificate, dom_age):
    """Score a site's TLS certificate by its validation level and domain age.

    Args:
        certificate: PEM-encoded certificate, or the sentinel ``-1`` when no
            certificate could be fetched.
        dom_age: Domain-age score; ``1`` is the value treated as trustworthy.

    Returns:
        * ``1`` for an extended-validation policy (OID ``2.23.140.1.1``);
        * for organization validation (``2.23.140.1.2.2``): ``1`` when
          ``dom_age == 1``, else ``0``;
        * for domain validation (``2.23.140.1.2.1``): ``1`` when
          ``dom_age == 1``, else ``-1``;
        * ``-1`` when *certificate* is ``-1``, cannot be parsed, has no
          ``certificatePolicies`` extension, or lists none of these OIDs.

    The certificate's validity window is printed as a side effect.
    """
    dv_oid, ov_oid, ev_oid = '2.23.140.1.2.1', '2.23.140.1.2.2', '2.23.140.1.1'

    if certificate == -1:
        return -1

    try:
        x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM, certificate)
        extensions = (x509.get_extension(i) for i in range(x509.get_extension_count()))
        extension_data = {e.get_short_name().decode('ascii'): str(e) for e in extensions}

        start_date = datetime.strptime(x509.get_notBefore().decode('utf-8'), '%Y%m%d%H%M%SZ')
        print("\nssl start:", start_date)
        end_date = datetime.strptime(x509.get_notAfter().decode('utf-8'), '%Y%m%d%H%M%SZ')
        print("ssl end:", end_date)

        policies = extension_data['certificatePolicies']
    except Exception:
        # Unparseable certificate, unreadable extension or missing policy
        # extension: treat as untrusted.
        return -1

    # When several OIDs are listed the strongest level wins: EV, then OV,
    # then DV.
    if ev_oid in policies:  # extended_v
        print("policy is EV", ev_oid)
        return 1

    if ov_oid in policies:  # organization_v
        print("policy is OV", ov_oid)
        return 1 if dom_age == 1 else 0

    if dv_oid in policies:  # domain_v
        print("policy is DV", dv_oid)
        return 1 if dom_age == 1 else -1

    # None of the known validation policies is present.
    return -1


    
def domain_reg_length(expire_date):
    """Score a domain by how long its registration still has to run.

    Args:
        expire_date: Expiration ``datetime`` (naive or timezone-aware), or
            the sentinel ``-1`` for "unknown".

    Returns:
        ``1`` when more than 365 days remain; ``-1`` when 365 days or fewer
        remain, when *expire_date* is ``-1``, or when it is not a usable
        ``datetime`` (e.g. ``None`` or a string).
    """
    if expire_date == -1:
        return -1

    try:
        # Build "now" with the same awareness as the input: subtracting a
        # naive and an aware datetime raises TypeError.
        today = datetime.now(tz=expire_date.tzinfo)
        val = (expire_date - today).days
    except (TypeError, AttributeError):
        return -1

    print("days to expire: " + str(val))
    return -1 if val / 365 <= 1 else 1
    
        

def fav_icon(url):
    """Report whether the page at *url* advertises an ``.ico`` favicon.

    Returns:
        ``1`` when ``favicon.get(url)`` yields at least one icon whose
        ``format`` is ``'ico'``; ``-1`` when it yields none or when the
        lookup raises. "has favicon" / "no favicon" is printed for the two
        non-error outcomes.
    """
    try:
        icons = favicon.get(url)
        # any() stops at the first matching icon.
        has_ico = any(icon.format == 'ico' for icon in icons)
    except Exception:
        return -1

    if has_ico:
        print("has favicon")
        return 1

    print("no favicon")
    return -1

        

def port(domain):
    """Report whether *domain* carries an explicit port number.

    Args:
        domain: Network location such as ``"example.com"``,
            ``"example.com:8080"``, ``"user:pw@example.com"`` or
            ``"[::1]:8443"``.

    Returns:
        ``-1`` when a non-empty port follows the host, ``1`` otherwise
        (including when *domain* is not a string).
    """
    if not isinstance(domain, str):
        return 1

    # Drop "user:password@" so the colon inside userinfo is not mistaken
    # for a port separator.
    hostport = domain.rsplit("@", 1)[-1]

    if hostport.startswith("["):
        # Bracketed IPv6 literal: the port, if any, follows the "]".
        after_bracket = hostport.partition("]")[2]
        port_text = after_bracket[1:] if after_bracket.startswith(":") else ""
    else:
        port_text = hostport.partition(":")[2]

    return -1 if port_text else 1



def https_token(url):
    """Return ``1`` when *url* begins with ``https://``, otherwise ``-1``."""
    return 1 if url.startswith("https://") else -1


def request_url(url, soup, domain):
    """Score the page by where its embedded resources are loaded from.

    Every ``<img>``, ``<audio>``, ``<embed>`` and ``<iframe>`` that has a
    ``src`` attribute is inspected. A source counts as a "success" when it
    contains *url* or *domain*, or contains exactly one dot.

    Args:
        url: Page URL.
        soup: Parsed document exposing ``find_all``, or the sentinel
            ``-999`` when the page could not be fetched.
        domain: Host of the page.

    Returns:
        ``-1`` when *soup* is ``-999``; ``1`` when no such tags exist.
        Otherwise, by the percentage of "success" sources: below 22 -> ``1``,
        22 up to (not including) 61 -> ``0``, 61 or more -> ``-1``.

    NOTE(review): "success" counts same-site sources, yet a LOW share maps
    to 1 - confirm this orientation is the intended one.
    """
    if soup == -999:
        return -1

    total = 0
    success = 0
    # 'iframe' is the real HTML tag name; 'i_frame' never matches anything.
    for tag_name in ('img', 'audio', 'embed', 'iframe'):
        for tag in soup.find_all(tag_name, src=True):
            src = tag['src']
            if url in src or domain in src or src.count('.') == 1:
                success += 1
            total += 1

    if total == 0:
        return 1

    percentage = success / float(total) * 100
    if percentage < 22.0:
        return 1
    if percentage < 61.0:
        return 0
    return -1



def url_of_anchor(url, soup, domain):
    """Score the page by the share of its anchors that look unsafe.

    An ``<a href=...>`` is counted as unsafe when its target contains ``#``,
    contains "javascript" or "mailto" (case-insensitive), or contains
    neither *url* nor *domain*.

    Returns:
        ``-1`` when *soup* is the sentinel ``-999``; ``1`` when the page has
        no anchors. Otherwise, by the percentage of unsafe anchors: below
        31 -> ``1``, 31 up to (not including) 67 -> ``0``, 67 or more ->
        ``-1``.
    """
    if soup == -999:
        return -1

    total = 0
    flagged = 0
    for anchor in soup.find_all('a', href=True):
        target = anchor['href']
        if (
            "#" in target
            or "javascript" in target.lower()
            or "mailto" in target.lower()
            or (url not in target and domain not in target)
        ):
            flagged += 1
        total += 1

    # No anchors at all: nothing to judge, treated as safe.
    if total == 0:
        return 1

    share = flagged / float(total) * 100
    if share < 31.0:
        return 1
    if share < 67.0:
        return 0
    return -1



def links_in_tags(url, soup, domain):
    """Score the page by where its ``<link>`` and ``<script>`` tags point.

    ``<link href=...>`` and ``<script src=...>`` targets count as a "success"
    when they contain *url* or *domain*, or contain exactly one dot.

    Returns:
        ``-1`` when *soup* is the sentinel ``-999``; ``1`` when no such tags
        exist. Otherwise, by the percentage of "success" targets: below
        17 -> ``1``, 17 up to (not including) 81 -> ``0``, 81 or more ->
        ``-1``.
    """
    if soup == -999:
        return -1

    seen = 0
    matched = 0
    # Same test for both tag kinds; only the attribute holding the target differs.
    for tag_name, attr in (('link', 'href'), ('script', 'src')):
        for node in soup.find_all(tag_name, **{attr: True}):
            target = node[attr]
            single_dot = len(re.findall(r'\.', target)) == 1
            if url in target or domain in target or single_dot:
                matched += 1
            seen += 1

    # No tags at all: nothing to judge.
    if seen == 0:
        return 1

    share = matched / float(seen) * 100
    if share < 17.0:
        return 1
    if share < 81.0:
        return 0
    return -1



def sfh(url, soup, domain):
    """Score the page by its server form handlers (``<form action=...>``).

    Every form with an ``action`` attribute is examined and the most severe
    finding wins.

    Returns:
        * ``-1`` when *soup* is the sentinel ``-999`` or any form has an
          empty or ``about:blank`` action;
        * ``0`` when no form is blank but at least one action contains
          neither *url* nor *domain*;
        * ``1`` otherwise, including when the page has no forms.

    NOTE(review): relative actions such as ``/login`` contain neither *url*
    nor *domain* and therefore score 0 - confirm that is intended.
    """
    if soup == -999:
        return -1

    verdict = 1
    for form in soup.find_all('form', action=True):
        action = form['action']
        if action == "" or action == "about:blank":
            # Worst possible finding; no later form can change it.
            return -1
        if url not in action and domain not in action:
            # Keep scanning: a later form may still be blank.
            verdict = 0
    return verdict




def submitting_to_email(soup):
    """Report whether any form on the page submits to a ``mailto:`` address.

    Every ``<form>`` with an ``action`` attribute is examined, not only the
    first one.

    Returns:
        ``-1`` when *soup* is the sentinel ``-999`` or any form action
        contains ``mailto:`` (case-insensitive); ``1`` otherwise, including
        when the page has no forms.
    """
    if soup == -999:
        return -1

    for form in soup.find_all('form', action=True):
        if "mailto:" in form['action'].lower():
            return -1
    return 1




def abnormal_url(domain, who):
    """Compare the URL's domain with the domain name in the WHOIS record.

    Args:
        domain: Host extracted from the URL.
        who: WHOIS response exposing ``domain_name`` (a string or a list of
            strings), or the sentinel ``-1`` for a failed lookup.

    Returns:
        ``1`` when *domain* equals the WHOIS domain name (first entry if it
        is a list) ignoring case; ``-1`` when they differ, when *who* is
        ``-1``, or when anything goes wrong during the comparison. The
        values being compared are printed.
    """
    if who == -1:
        return -1

    try:
        print(who.domain_name)
        registered = who.domain_name
        # A WHOIS record may list several spellings; only the first is used.
        w_dom = registered[0] if type(registered) is list else registered

        url_side = domain.lower()
        print("\nURL Domain ", url_side)
        whois_side = w_dom.lower()
        print("Whois_domain ", whois_side)
        return 1 if url_side == whois_side else -1
    except:
        return -1




def redirect(url):
    """Score *url* by the number of HTTP redirects followed to reach it.

    Returns:
        ``-1`` when the request fails or at most one redirect occurred,
        ``0`` for two to four redirects, ``1`` for more than four.

    NOTE(review): few redirects map to -1 and many to 1 - confirm that this
    orientation is the intended one.
    """
    try:
        # Bounded wait so an unresponsive server cannot hang the run.
        response = requests.get(url, timeout=15)
    except Exception:
        return -1

    hops = len(response.history)
    if hops <= 1:
        return -1
    if hops <= 4:
        return 0
    return 1




def popup_window(url):
    """Report whether the page source at *url* calls ``prompt(``.

    Returns:
        ``-1`` when the request fails or the response text contains
        ``prompt(``; ``1`` otherwise.
    """
    try:
        # Bounded wait so an unresponsive server cannot hang the run.
        response = requests.get(url, timeout=15)
        body = response.text
    except Exception:
        return -1

    return -1 if re.search(r"prompt\(", body) else 1


    
    
def age_of_domain(expire_date, regis_date):
    """Score a domain by the span between its registration and expiration.

    Args:
        expire_date: Expiration ``datetime``, or the sentinel ``-1``.
        regis_date: Registration ``datetime``, or the sentinel ``-1``.

    Returns:
        ``1`` when the span is longer than 365 days; ``-1`` when it is 365
        days or shorter, when either argument is ``-1``, or when the two
        values cannot be subtracted (e.g. ``None`` or strings).

    NOTE(review): this measures the registration span, not the time elapsed
    since registration - confirm that is the intended meaning of "age".
    """
    if expire_date == -1 or regis_date == -1:
        return -1

    try:
        ageofdomain = (expire_date - regis_date).days
    except (TypeError, AttributeError):
        return -1

    return -1 if ageofdomain <= 365 else 1
    


def dns_record(domain, w):
    """Report whether *domain* has both a WHOIS record and a valid A record.

    Args:
        domain: Host name to resolve.
        w: WHOIS response, or the sentinel ``-1`` for a failed lookup.

    Returns:
        ``1`` when *w* is not ``-1`` and at least one resolved A record is a
        well-formed dotted-quad IPv4 address; ``-1`` otherwise, including
        when resolution raises.
    """
    # Without a WHOIS record the answer is -1 whatever DNS says, so skip the
    # network round-trip.
    if w == -1:
        return -1

    ipv4 = re.compile(
        r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}"
        r"([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$"
    )
    try:
        my_resolver = dns.resolver.Resolver()
        # Finding A record
        result = my_resolver.resolve(domain, 'A')
        has_valid_record = any(ipv4.match(str(val)) for val in result)
    except Exception:
        return -1

    return 1 if has_valid_record else -1
    

def web_traffic(url):
    """Score *url* by the reach rank reported by the Alexa data endpoint.

    Returns:
        ``1`` when the rank is below 100000, ``0`` when it is 100000 or
        more, and ``-1`` when the rank cannot be obtained (network error,
        unparseable response, or no ``REACH`` element).

    NOTE(review): the data.alexa.com service is reported to have been
    retired; if so this always returns -1 - confirm and replace the source.
    """
    # Local import: only needed here to escape the URL for the query string.
    from urllib.parse import quote

    try:
        endpoint = "http://data.alexa.com/data?cli=10&dat=s&url=" + quote(url, safe='')
        # Bounded wait and deterministic close of the HTTP response.
        with urllib.request.urlopen(endpoint, timeout=15) as response:
            payload = response.read()
        rank = int(BeautifulSoup(payload, "xml").find("REACH")['RANK'])
    except Exception:
        # Covers network failures as well as a missing REACH element
        # (TypeError) and a non-numeric rank (ValueError).
        return -1

    return 1 if rank < 100000 else 0




def links_pointing_to_page(url, soup):
    """Score the page by the number of distinct external links it contains.

    An anchor counts as external when its ``href`` starts with ``http`` or
    ``www`` and does not contain *url*. The check uses plain string
    operations, so characters in *url* that are special in regular
    expressions (``?``, ``(``, ``+`` ...) cannot distort it.

    Returns:
        ``-1`` when *soup* is the sentinel ``-999``, when the anchors cannot
        be read, or when there are no external links; ``0`` for one to five
        distinct external links; ``1`` for more than five. The count is
        printed.
    """
    if soup == -999:
        return -1

    try:
        external_links = set()
        for link in soup.find_all('a', href=True):
            href = link.attrs['href']
            if href is None:
                continue
            if href.startswith(('http', 'www')) and url not in href:
                external_links.add(href)
    except Exception:
        return -1

    number_of_links = len(external_links)
    print("no. of links: ", number_of_links)
    if number_of_links == 0:
        return -1
    if number_of_links <= 5:
        return 0
    return 1

def generate_dataset(url):
    """Compute the feature vector for *url*.

    The URL is normalised, the page, WHOIS record, Alexa rank and TLS
    certificate are fetched once, and the individual feature functions are
    then evaluated in a fixed order.

    Returns:
        A tuple ``(data_set, global_rank, ss_l)`` where ``data_set`` is the
        list of 23 feature scores, ``global_rank`` is the value returned by
        ``find_global_rank`` and ``ss_l`` is the certificate score (also
        present as the 8th entry of ``data_set``).
    """
    url = prepare_url(url)
    soup = get_soup(url)
    domain = find_domain(url)

    # Shared lookups reused by several features below.
    who_resp = whois_response(domain)
    global_rank = find_global_rank(url)

    expire_date = exp_date(who_resp)
    regis_date = create_date(who_resp)
    updat_date = updatd_date(who_resp)

    dom_age = domain_age(regis_date)
    certificate = get_certificate(domain)
    ss_l = ssl_final_state(certificate, dom_age)

    # A list display evaluates its items left to right, so the feature
    # functions run (and print) in exactly this order.
    data_set = [
        having_ip_address(url),                    # 1
        url_length(url),                           # 2
        shortening_services(url),                  # 3
        having_at_symbol(url),                     # 4
        double_slash_redirecting(url),             # 5
        prefix_suffix(url),                        # 6
        having_sub_domain(url),                    # 7
        ss_l,                                      # 8
        domain_reg_length(expire_date),            # 9
        fav_icon(url),                             # 10
        port(domain),                              # 11
        https_token(url),                          # 12
        request_url(url, soup, domain),            # 13
        url_of_anchor(url, soup, domain),          # 14
        sfh(url, soup, domain),                    # 15
        submitting_to_email(soup),                 # 16
        abnormal_url(domain, who_resp),            # 17
        redirect(url),                             # 18
        popup_window(url),                         # 19
        age_of_domain(expire_date, regis_date),    # 20
        dns_record(domain, who_resp),              # 21
        web_traffic(url),                          # 22
        links_pointing_to_page(url, soup),         # 23
    ]
    return data_set, global_rank, ss_l

#url="https://www.axisbank.com/"
#url="https://www.hdfcbank.com/"
#url="https://www.onlinesbi.com/"
#url="https://www.ssl.com/article/dv-ov-and-ev-certificates/"    
#url="https://www.amazon.in/"
#url="https://www.paypal.com/in/signin"
#url="https://www.airtel.in/bank/"
#url="https://www.jiopaymentsbank.com/"
#url="https://www.amazon.in/amazonpay/home?ref_=apay_logo_APayDashboard"

#c,r,s=generate_dataset(url)
#print("RANK",r)
#print(c) 
# Second wall-clock timestamp; the difference to `start` is the elapsed time.
end = time.time()
# total time taken
# NOTE: with the generate_dataset() call above commented out, this measures
# only the time needed to execute the definitions in this file.
print("Runtime of the program is", end - start)

Runtime of the program is 0.004986286163330078
