In [1]:
import re
import os
import socket
import urllib
from datetime import datetime
from urllib.parse import urlparse, urlencode
import dns.resolver
from bs4 import BeautifulSoup
import requests
import whois
import json
import time

In [2]:
class URLFeatureExtractor:
    
    def __init__(self, url):
        self.url = url
        self.parsed_url = urlparse(url)
        self.path = self.parsed_url.path
        self.hostname_parts = self.parsed_url.hostname.split('.')

        self.suspicious_tlds = ['fit','tk', 'gp', 'ga', 'work', 'ml', 'date', 'wang', 'men', 'icu', 'online', 'click', # Spamhaus
                                'country', 'stream', 'download', 'xin', 'racing', 'jetzt',
                                'ren', 'mom', 'party', 'review', 'trade', 'accountants', 
                                'science', 'work', 'ninja', 'xyz', 'faith', 'zip', 'cricket', 'win',
                                'accountant', 'realtor', 'top', 'christmas', 'gdn', # Shady Top-Level Domains
                                'link', # Blue Coat Systems
                                'asia', 'club', 'la', 'ae', 'exposed', 'pe', 'go.id', 'rs', 'k12.pa.us', 'or.kr',
                                'ce.ke', 'audio', 'gob.pe', 'gov.az', 'website', 'bj', 'mx', 'media', 'sa.gov.au' # statistics
                               ]
        # Load brand list
        with open("allbrands.txt", "r") as f:
            self.allbrand = [line.strip() for line in f]
            
    # URL
    def get_url(self):
        return self.url
    
    # HOSTNAME
    def get_hostname(self):
        return self.parsed_url.hostname
    
    # 2 and 3----------------------------------------------------------------
    def get_url_parts_lengths(self):
        """
        Returns a tuple containing the lengths of two parts of the URL:
        the full URL and the parsed network location.
        """
        return len(self.url), len(self.parsed_url.netloc) 
    #------------------------------------------------------------------------
    
    # 4 IP ------------------------------------------------------------------
    def get_ip(self):
        """
        Checks if the URL contains an IP address.

        Returns:
        - 1 if the URL contains an IP address
        - 0 otherwise
        """
        ip_regex = r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\/)|' + \
               '((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\/)|' + \
               '(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}|' + \
               '[0-9a-fA-F]{7}'
        match = re.search(ip_regex, self.url)
        if match:
            return 1
        return 0
    #------------------------------------------------------------------------
    
    # 5 - 21 - special characters--------------------------------------------
    def count_special_characters(self):
        """
        Counts the occurrence of specific special characters in the URL and returns a dictionary with their counts.
        The special characters that are counted include: '.', '-', '@', '?', '&', '|', '=', '_', '~', '%', '/', '*', ':', ',', ';', '$'
        The count of space characters (represented by '%20') is also included in the dictionary.
        """
        special_chars = {'.': 0, '-': 0, '@': 0, '?': 0, '&': 0, '|': 0, '=': 0, '_': 0, '~': 0, '%': 0, '/': 0, '*': 0, ':': 0, ',': 0, ';': 0, '$': 0}
        for char in special_chars:
            special_chars[char] = self.url.count(char)
        special_chars['%20'] = self.count_space()  # add count for %20
        return special_chars

    # count space in URL (represented by %20)
    def count_space(self):
        return self.url.count('%20')
    #------------------------------------------------------------------------
    
    # 22 - 25 - common terms-------------------------------------------------
    def check_url_words(self, words_raw, word_to_check):
        """
        Counts the number of times a given word appears in a list of words.

        Parameters:
        - words_raw: list of words to search for the target word
        - word_to_check: the word to search for in the list of words

        Returns:
        - count: the number of times the target word appears in the list of words
        """
        count = 0
        for word in words_raw:
            if word == word_to_check:
                count += 1
        return count
    
    def check_www(self):
        words_raw = self.url.split('.')
        count = 0
        for word in words_raw:
            if not word.find('www') == -1:
                count += 1
        return count

    def check_com(self):
        """
        Checks if the domain ends with '.com'.

        Returns:
        - 1 if the domain ends with '.com', 0 otherwise.
        """
        return self.check_url_words(self.parsed_url.netloc.split('.'), 'com') # if show only 1 .com should be 0 if 
    
    def count_double_slash(self):
        """
        Counts the number of double slashes ('//') in the URL.

        Returns:
        - 1 if there is more than one double slash in the URL, 0 otherwise.
        """
        list=[x.start(0) for x in re.finditer('//', self.url)]
        if list[len(list)-1]>6:
            return 1
        else:
            return 0
    
    def count_http_token(self):
        """
        Counts the number of 'http' and 'https' tokens in the URL.

        Returns:
        - the difference between the total number of 'http' and 'https' tokens and 1, which is the expected number of tokens in a valid URL.
        """
        http_pattern = re.compile(r'https?')
        matches = http_pattern.findall(self.url)
        if len(matches) <= 1:
            return 0
        else:
            return len([m for m in matches if m == 'http' or m == 'https']) - 1
    #------------------------------------------------------------------------
    
    # 26 --------------------------------------------------------------------
    def https_token(self):
        if self.parsed_url.scheme == 'https':
            return 0
        else:
            return 1
    #------------------------------------------------------------------------

    # 27 and 28--------------------------------------------------------------
    def get_ratio_digits_url(self):
        return len(re.sub("[^0-9]", "", self.url))/len(self.url)
    
    def get_ratio_digits_host(self):
        hostname = self.parsed_url.hostname
        if not hostname:
            return 0
        return len(re.sub("[^0-9]", "", hostname)) / len(hostname)
    #------------------------------------------------------------------------

    # 29 --------------------------------------------------------------------
    def punycode(self):
        if self.parsed_url.hostname.startswith("xn--"):
            return 1
        else:
            return 0
    #------------------------------------------------------------------------

    # 30 --------------------------------------------------------------------
    def get_port(self):
        if re.search("^[a-z][a-z0-9+\-.]*://([a-z0-9\-._~%!$&'()*+,;=]+@)?([a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\]):([0-9]+)", self.url):
            return 1
        return 0
    #------------------------------------------------------------------------

    # 31 and 32 -------------------------------------------------------------
    # TLD's can be null values
    #------------------------------------------------------------------------

    # 33 --------------------------------------------------------------------
    def abnormal_subdomain(self):
        if re.search('(http[s]?://(w[w]?|\d))([w]?(\d|-))', self.url):
            return 1
        return 0
    #------------------------------------------------------------------------
    
    # 34 --------------------------------------------------------------------
    def count_subdomain(self):
        if len(re.findall("\.", self.url)) == 1:
            return 1
        elif len(re.findall("\.", self.url)) == 2:
            return 2
        else:
            return 3
    #------------------------------------------------------------------------

    # 35 --------------------------------------------------------------------
    def prefix_suffix(self):
        if re.findall(r"https?://[^\-]+-[^\-]+/", self.url):
            return 1
        else:
            return 0 
    #------------------------------------------------------------------------

    # 36 --------------------------------------------------------------------
    # random domain can be null value
    #------------------------------------------------------------------------

    # 37 --------------------------------------------------------------------
    def shortening_service(self):
        shorting = r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|' \
                'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|' \
                'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|' \
                'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|ity\.im|' \
                'q\.gs|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|' \
                'prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|' \
                'tr\.im|link\.zip\.net'
        match = re.search(shorting, self.parsed_url.netloc, re.IGNORECASE)
        if match:
            return 1
        else:
            return 0
    #------------------------------------------------------------------------

    # 38 -------------------------------------------------------------------    
    def path_extension(self):
        if self.path.endswith(('.txt', '.exe', '.js')):
            return 1
        return 0
    #------------------------------------------------------------------------
    
    # 39 and 40 -------------------------------------------------------------
    # COUNT REDIRECTION
    def count_redirection(self, page):
        return len(page.history)

    # COUNT EXTERNAL REDIRECTION
    def count_external_redirection(self, page, domain):
        count = 0
        if len(page.history) == 0:
            return 0
        else:
            for i, response in enumerate(page.history,1):
                if domain.lower() not in response.url.lower():
                    count+=1          
            return count

    #------------------------------------------------------------------------
    
    # 41 to 51 --------------------------------------------------------------
    # length_words_raw

    # char_repeat
    # null value 

    # shortest
    def extract_shortest_word(self, text):
        words = re.findall(r'[a-zA-Z0-9]+', text)
        if words:
            return min(words, key=len)
        else:
            return ''
        
    def get_shortest_word_lengths(self):
        # shortest_words_raw
        shortest_word_raw = len(self.extract_shortest_word(self.url))
        # shortest_word_hostanme
        shortest_word_host = len(self.extract_shortest_word(self.get_hostname()))
        # shortest_word_path
        shortest_word_path = len(self.extract_shortest_word(self.parsed_url.path))
        return shortest_word_raw, shortest_word_host, shortest_word_path

    
    # longest 
    def extract_longest_word(self, text):
        words = re.findall(r'[a-zA-Z0-9]+', text)
        if words:
            return max(words, key=len)
        else:
            return ''
        
    def get_longest_word_lengths(self):
        # longest_words_raw
        longest_word_raw = len(self.extract_longest_word(self.url))
        # longest_word_hostanme
        longest_word_host = len(self.extract_longest_word(self.get_hostname()))
        # longest_word_path
        longest_word_path = len(self.extract_longest_word(self.parsed_url.path))
        return longest_word_raw, longest_word_host, longest_word_path
    
    # average 
    # null values 
    #------------------------------------------------------------------------
    
    # 52 --------------------------------------------------------------------
    def phish_hints(self):
        count = 0
        HINTS = ['wp', 'login', 'includes', 'admin', 'content', 'site', 'images', 'js', 'alibaba', 'css', 'myaccount', 'dropbox', 'themes', 'plugins', 'signin', 'view']
        for hint in HINTS:
            count += self.path.lower().count(hint)
        return count
    #------------------------------------------------------------------------
    
    # 53 to 55 --------------------------------------------------------------
    def domain_in_brand(self):
        hostname = self.get_hostname()
        words = hostname.split('.')
        for word in words:
            if word in self.allbrand:
                return 1
        return 0
        
    # null value
    #------------------------------------------------------------------------
    
    # 56 --------------------------------------------------------------------
    def check_suspicious_tld(self):
        tld = self.parsed_url.hostname.split(".")[-1]
        if tld in self.suspicious_tlds:
            return 1
        return 0
    #------------------------------------------------------------------------ 
    
    # 57 --------------------------------------------------------------------
    def statistical_report(self):
        url_match = re.search('at\.ua|usa\.cc|baltazarpresentes\.com\.br|pe\.hu|esy\.es|hol\.es|sweddy\.com|myjino\.ru|96\.lt|ow\.ly', self.url)
        try:
            ip_address = socket.gethostbyname(self.get_hostname())
            ip_match = re.search('146\.112\.61\.108|213\.174\.157\.151|121\.50\.168\.88|192\.185\.217\.116|78\.46\.211\.158|181\.174\.165\.13|46\.242\.145\.103|121\.50\.168\.40|83\.125\.22\.219|46\.242\.145\.98|'
                                '107\.151\.148\.44|107\.151\.148\.107|64\.70\.19\.203|199\.184\.144\.27|107\.151\.148\.108|107\.151\.148\.109|119\.28\.52\.61|54\.83\.43\.69|52\.69\.166\.231|216\.58\.192\.225|'
                                '118\.184\.25\.86|67\.208\.74\.71|23\.253\.126\.58|104\.239\.157\.210|175\.126\.123\.219|141\.8\.224\.221|10\.10\.10\.10|43\.229\.108\.32|103\.232\.215\.140|69\.172\.201\.153|'
                                '216\.218\.185\.162|54\.225\.104\.146|103\.243\.24\.98|199\.59\.243\.120|31\.170\.160\.61|213\.19\.128\.77|62\.113\.226\.131|208\.100\.26\.234|195\.16\.127\.102|195\.16\.127\.157|'
                                '34\.196\.13\.28|103\.224\.212\.222|172\.217\.4\.225|54\.72\.9\.51|192\.64\.147\.141|198\.200\.56\.183|23\.253\.164\.103|52\.48\.191\.26|52\.214\.197\.72|87\.98\.255\.18|209\.99\.17\.27|'
                                '216\.38\.62\.18|104\.130\.124\.96|47\.89\.58\.141|78\.46\.211\.158|54\.86\.225\.156|54\.82\.156\.19|37\.157\.192\.102|204\.11\.56\.48|110\.34\.231\.42', ip_address)
            if url_match or ip_match:
                return 1
            else:
                return 0
        except:
            return 2
    #------------------------------------------------------------------------ 
    
    # 82 --------------------------------------------------------------------
    def whois_registered_domain(self):
        """
        Checks whether the WHOIS record's domain name appears in the URL hostname.

        Returns:
        - 0 if a domain name from the WHOIS record (lower-cased) is a substring
          of the hostname
        - 1 if none matches, the record has no domain name, or the lookup fails
        """
        domain = self.parsed_url.hostname
        try:
            registered = whois.whois(domain).domain_name
        except Exception:
            # Lookup problems are reported as "not registered" rather than raised.
            return 1
        if not domain or not registered:
            return 1
        # The record may hold a single name or a list of names.
        names = registered if isinstance(registered, list) else [registered]
        for name in names:
            # Plain substring test: the WHOIS value is data, not a regex, so its
            # dots must not act as wildcards.
            if isinstance(name, str) and name and name.lower() in domain:
                return 0
        return 1
    #------------------------------------------------------------------------ 
    
    # 83 --------------------------------------------------------------------
    def get_domain_registration_length(self):
        """
        Number of days between now and the domain's WHOIS expiration date.

        Returns:
        - the absolute day difference between the (earliest) expiration date
          and the current time
        - 0 if the WHOIS record has no expiration date
        - -1 if the lookup or the date arithmetic fails
        """
        domain = self.get_hostname()
        try:
            expiration_date = whois.whois(domain).expiration_date
            # Some domains do not have expiration dates; that is not an error.
            if not expiration_date:
                return 0
            if isinstance(expiration_date, list):
                expiration_date = min(expiration_date)
            # Use "now" in the same timezone-awareness as the WHOIS value so the
            # subtraction works for naive and aware datetimes alike.
            today = datetime.now(expiration_date.tzinfo)
            # NOTE(review): abs() makes an already-expired domain look the same as
            # one expiring equally far in the future; kept to preserve the feature.
            return abs((expiration_date - today).days)
        except Exception:
            return -1
    #------------------------------------------------------------------------ 
    
    # 84 --------------------------------------------------------------------   
    def get_domain_age(self):
        """
        Age of the domain in days, derived from its WHOIS creation date.

        The earlier implementation pinged the host and scaled the round-trip
        time, which has no relation to how long a domain has existed (and
        spawned a shell to do so).

        Returns:
        - the number of days since the (earliest) WHOIS creation date
        - -2 if the WHOIS record has no creation date
        - -1 if the lookup or the date arithmetic fails
        """
        # NOTE(review): the unit is days, matching get_domain_registration_length;
        # confirm against whatever consumes the 'domain_age' feature.
        domain = self.get_hostname()
        try:
            creation_date = whois.whois(domain).creation_date
            if not creation_date:
                return -2
            if isinstance(creation_date, list):
                # Several dates may be reported; the earliest one is the registration.
                creation_date = min(creation_date)
            # Match the timezone-awareness of the WHOIS value for the subtraction.
            now = datetime.now(creation_date.tzinfo)
            return (now - creation_date).days
        except Exception:
            return -1
    #------------------------------------------------------------------------ 
    
    # 85 --------------------------------------------------------------------    
    def get_web_traffic(self):
        """
        Sums the result counts that three search engines report for a
        'site:<hostname>' query.

        Each engine's page is fetched and the first '<number> results' text is
        parsed. An engine that cannot be reached, answers with a non-200 status,
        or shows no such text contributes nothing.

        Returns:
        - the summed result count (0 when nothing could be parsed)
        """
        query = f"site:{self.get_hostname()}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
        results = 0
        for search_engine in ("https://www.google.com/search", "https://www.bing.com/search", "https://search.yahoo.com/search"):
            try:
                # Bounded wait: without a timeout one stalled engine blocks the
                # whole feature extraction indefinitely.
                response = requests.get(search_engine, headers=headers, params={"q": query}, timeout=10)
            except requests.RequestException:
                # Best effort: an unreachable engine must not abort the others.
                continue
            if response.status_code != 200:
                continue
            results_match = re.search(r"([0-9,]+)\sresults", response.text)
            if results_match:
                digits = results_match.group(1).replace(",", "")
                if digits:  # the group can consist of commas only
                    results += int(digits)
        return results
    #------------------------------------------------------------------------ 
    
    # 86 --------------------------------------------------------------------   
    def dns_record(self):
        """
        Checks whether the hostname has NS records.

        Returns:
        - 0 if at least one NS record is returned
        - 1 if none is returned or the lookup fails
        """
        try:
            # dnspython 2.x renamed query() to resolve() and deprecated the old
            # name; fall back to query() where resolve() does not exist.
            lookup = getattr(dns.resolver, 'resolve', None) or dns.resolver.query
            nameservers = lookup(self.get_hostname(), 'NS')
        except Exception:
            return 1
        return 0 if len(nameservers) > 0 else 1
    #------------------------------------------------------------------------ 
    
    # 87 --------------------------------------------------------------------    
    def google_index(self):
        """
        Checks whether Google lists any result for a 'site:<url>' query.

        Returns:
        - 0 if the result page contains a first result link
        - 1 if no result link is found
        - -1 if the request fails or Google answers with its
          "unusual traffic" block page
        """
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
        headers = {'User-Agent': user_agent}
        query = {'q': 'site:' + self.url}
        google = "https://www.google.com/search?" + urlencode(query)
        try:
            data = requests.get(google, headers=headers, timeout=10)
        except requests.RequestException:
            # Could not determine the index state; same code as the block page.
            return -1
        data.encoding = 'ISO-8859-1'
        # Parse the decoded text; str(data.content) would hand the parser the
        # repr of a bytes object (b'...' with escaped characters).
        soup = BeautifulSoup(data.text, "html.parser")
        if 'Our systems have detected unusual traffic from your computer network.' in str(soup):
            return -1
        try:
            check = soup.find(id="rso").find("div").find("div").find("a")
        except AttributeError:
            # One of the nested lookups returned None: no result container.
            return 1
        # .get() avoids a KeyError for an anchor without href.
        if check and check.get('href'):
            return 0
        return 1
    #------------------------------------------------------------------------ 
    
    # 88 --------------------------------------------------------------------
    # NOTE(review): the default api_key is a credential committed to source; it is
    # kept only so existing callers keep working - rotate it and pass the key in
    # explicitly instead.
    def get_page_rank(self, api_key = "4kcw4ss88cs84ggckcwg4c4cwoo0g0ko4wows8sg"):
        """
        Fetches the Open PageRank integer score for the URL's hostname.

        Parameters:
        - api_key: value sent in the 'API-OPR' request header

        Returns:
        - the 'page_rank_integer' of the first response entry
        - 0 if that value is missing or falsy
        - -1 if the request fails or the response has an unexpected shape
        """
        domain = self.parsed_url.hostname
        try:
            response = requests.get(
                "https://openpagerank.com/api/v1.0/getPageRank",
                # Passing the domain via params lets requests URL-encode it
                # (the key is sent as 'domains%5B0%5D').
                params={"domains[0]": domain},
                headers={"API-OPR": api_key},
                timeout=10,
            )
            page_rank = response.json()["response"][0]["page_rank_integer"]
        except Exception:
            return -1
        return page_rank if page_rank else 0
    #------------------------------------------------------------------------ 
    
    def get_url_features(self):
        """
        Run every extractor and collect the results in a single dictionary.

        Features without an extractor in this class are filled with the constant
        0. Keys are inserted in a fixed order: URL-based features, then
        content-based placeholders, then external (WHOIS / DNS / search) features.

        Returns:
        - a dict mapping feature name to value
        """
        features = {}

        # ---- URL-based features ------------------------------------------------
        features['length_url'], features['length_hostname'] = self.get_url_parts_lengths()
        features['ip'] = self.get_ip()
        # one entry per special character, keyed by the character itself
        features.update(self.count_special_characters())
        features.update({
            'nb_www': self.check_www(),
            'nb_com': self.check_com(),
            'nb_dslash': self.count_double_slash(),
            'http_in_path': self.count_http_token(),
            'https_token': self.https_token(),
            'ratio_digits_url': self.get_ratio_digits_url(),
            'ratio_digits_host': self.get_ratio_digits_host(),
            'punycode': self.punycode(),
            'port': self.get_port(),
            'tld_in_path': 0,
            'tld_in_subdomain': 0,
            'abnormal_subdomain': self.abnormal_subdomain(),
            'nb_subdomains': self.count_subdomain(),
            'prefix_suffix': self.prefix_suffix(),
            'random_domain': 0,
            'shortening_service': self.shortening_service(),
            'path_extension': self.path_extension(),
            'nb_redirection': 0,
            'nb_external_redirection': 0,
            'length_words_raw': 0,
            'char_repeat': 0,
        })

        # shortest / longest token lengths for raw URL, hostname and path
        (features['shortest_word_raw'],
         features['shortest_word_host'],
         features['shortest_word_path']) = self.get_shortest_word_lengths()
        (features['longest_word_raw'],
         features['longest_word_host'],
         features['longest_word_path']) = self.get_longest_word_lengths()

        features.update(dict.fromkeys(('avg_words_raw', 'avg_words_host', 'avg_words_path'), 0))

        features['phish_hints'] = self.phish_hints()
        features['domain_in_brand'] = self.domain_in_brand()
        features['brand_in_subdomain'] = 0
        features['brand_in_path'] = 0
        features['suspecious_tld'] = self.check_suspicious_tld()
        features['statistical_report'] = self.statistical_report()

        # ---- Content-based features (not extracted here, constant 0) -----------
        content_placeholders = (
            'nb_hyperlinks',
            'ratio_intHyperlinks', 'ratio_extHyperlinks',
            'ratio_nullHyperlinks',
            'nb_extCSS',
            'ratio_intRedirection', 'ratio_extRedirection',
            'ratio_intErrors', 'ratio_extErrors',
            'login_form',
            'external_favicon',
            'links_in_tags',
            'submit_email',
            'ratio_intMedia', 'ratio_extMedia',
            'sfh',
            'iframe',
            'popup_window',
            'safe_anchor',
            'onmouseover',
            'right_clic',
            'empty_title',
            'domian_in_title',
            'domian_with_copy_right',
        )
        features.update(dict.fromkeys(content_placeholders, 0))

        # ---- External features -------------------------------------------------
        external_extractors = (
            ('whois_registered_domain', self.whois_registered_domain),
            ('domain_registration_length', self.get_domain_registration_length),
            ('domain_age', self.get_domain_age),
            ('web_traffic', self.get_web_traffic),
            ('dns_record', self.dns_record),
            ('google_index', self.google_index),
            ('page_rank', self.get_page_rank),
        )
        for feature_name, extractor in external_extractors:
            features[feature_name] = extractor()

        return features
