### **PHISHING VECTOR GENERATOR** 🐟

In [1]:
import pandas as pd
import numpy as np
import json
from copy import deepcopy
import os
import time
import requests
import re
from user_browsing import user_browsing
from xml.etree import ElementTree as ET
from urllib.parse import urlparse
from os import path
from collections import Counter
from bs4 import BeautifulSoup
import urllib

from phishing_utils import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\patri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\patri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
class PHISH_FVG:

    def __init__(self, url, fichero = 'html_dump'):
        """
        Downloads the page at ``url`` and prepares the state read by
        the feature setters.

        url: absolute URL of the website to analyse.
        fichero: path of the file the raw response body is dumped to.
        """

        self.url = url
        parsed = urlparse(url)
        self.base = parsed.netloc
        # Host plus the directory part of the URL path (last segment
        # dropped). The original split the imported os.path module's
        # argument '/' instead of the URL path, which always produced '/'.
        self.path = self.base + '/'.join(parsed.path.split('/')[:-1])

        # 19 feature slots; -1 marks a feature that has not been set yet.
        self.fv = np.full(19, -1)
        self.fichero = fichero

        self.user = user_browsing()
        self.user.set_standard_header(self.base)

        response_content = self.get_bin_source_code()
        # Pages are not guaranteed to be valid UTF-8; undecodable bytes
        # are replaced instead of aborting the whole analysis.
        self.html = response_content.decode("utf-8", errors="replace")
        self.title = re.findall('(?:<title>)([^<]*)(?:</title>)', self.html)
        # Parser named explicitly (the same one get_popular_words uses) so
        # the result does not depend on which parsers happen to be installed.
        self.soup = BeautifulSoup(response_content, "lxml")

        self.hyperlinks = self.find_hyperlinks()
        

    def get_bin_source_code(self):
        """
        Downloads the page at ``self.url`` and returns its raw body.

        Unless the server answers with status 400, the body is also
        written to ``self.fichero``.

        Returns the response body as bytes.
        """

        # A timeout keeps an unresponsive host from blocking forever.
        response = requests.get(
            self.url,
            headers=self.user.get_simple_user_header_agent(),
            timeout=30,
        )

        # NOTE(review): only status 400 skips the dump; other error codes
        # (404, 500, ...) are still written to disk - confirm this is intended.
        if response.status_code != 400:
            # The with-block closes the file on exit.
            with open(self.fichero, 'wb') as f:
                f.write(response.content)

        return response.content

        
    def set_f1(self):
        """
        Sets F1.
        F1 = 1, if dots in url >= 4
        F1 = 0, otherwise
        """

        if self.url.count('.') >= 4:
            self.fv[0] = 1

        else:
            self.fv[0] = 0


    def set_f2(self):
        """
        Sets F2.
        F2 = 1, if URL contains '@' or '-' symbols
        F2 = 0, otherwise
        """

        if '@' in self.url or '-' in self.url:
            self.fv[1] = 1

        else:
            self.fv[1] = 0


    def set_f3(self):
        """
        Sets F3.
        F3 = 1, if URL length >= 74
        F3 = 0, otherwise
        """

        if len(self.url) >= 74:
            self.fv[2] = 1

        else:
            self.fv[2] = 0


    def set_f4(self):
        """
        Computes F4 and stores it in slot 3 of the feature vector.
        F4 = 1 when some word of the URL is a suspicious keyword
        F4 = 0 in any other case
        """

        url_words = get_splitted_url(self.url)
        keywords = get_suspicious_keywords()

        # Each word goes through translate_leet_to_letters before the
        # comparison (the author's own design decision); its result is
        # intersected with the keyword set, stopping at the first hit.
        hit = any(
            bool(keywords & translate_leet_to_letters(word))
            for word in url_words
        )
        self.fv[3] = 1 if hit else 0


    def set_f5(self):
        """
        Computes F5 and stores it in slot 4 of the feature vector.
        F5 = 1 when more than one distinct TLD appears among the URL words
        F5 = 0 in any other case

        # TODO: review compound TLDs
        """

        url_tokens = set(get_splitted_url(self.url))
        known_tlds = get_tlds_set()

        n_tlds_in_url = len(url_tokens & known_tlds)
        self.fv[4] = 1 if n_tlds_in_url > 1 else 0


    def set_f6(self):
        """
        Sets F6.
        F6 = 1, if http count in URL > 1
        F6 = 0, otherwise
        """

        if len(re.findall('http', self.url)) > 1:
            self.fv[5] = 1

        else:
            self.fv[5] = 0


    def set_f7(self):
        """
        Computes F7 and stores it in slot 6 of the feature vector.
        F7 = 1 when a known brand shows up in an incorrect position
        F7 = 0 in any other case
        # TODO: unit tests (presumably still pending)
        """

        brands = get_phishing_targets_set()
        pieces = urlparse(self.url.lower())
        # remove_tld is applied twice to the host before the brand lookup;
        # the URL path is searched as it is.
        host = remove_tld(remove_tld(pieces.netloc))
        url_path = pieces.path

        found = any(brand in host or brand in url_path for brand in brands)
        self.fv[6] = 1 if found else 0


    def set_f8(self):
        """
        Sets F8.
        F8 = 1, if data URI present in website.
        F8 = 0, otherwise

        Syntax: data:[<mime type>][;charset=<charset>][;base64],<encoded data>
        """

        matches = re.findall('data:(?:[^;,]*)?(?:;charset=[^;,]*)?(?:;base64)?,[^)"\';>]*[^)"\';>]', self.html)
        print(matches)
        
        if len(matches) > 0:
            self.fv[7] = 1

        else:
            self.fv[7] = 0
            

    def set_f9(self):
        """
        Sets F9.
        F9 = 1, if action field is blank or javascript:void(0)
        F9 = 1, if action field is <name>.php
        F9 = 1, if action field contains foreign base domain
        F9 = 0, otherwise
        """

        forms_found = re.findall("<form[^>]+>", self.html)

        if len(forms_found) > 0:

            for i in range(len(forms_found)):
                form_found = forms_found[i]
                action_content = re.findall('(?:action=\")([^"]*)(?:\")', form_found)

                if len(action_content) > 0:

                    if is_empty(action_content[0]):
                        self.fv[8] = 1
                        return

                    elif is_simple_php_file(action_content[0]):
                        self.fv[8] = 1
                        return

                    elif is_foreign(self.url, action_content[0]):
                        self.fv[8] = 1
                        return
                        
        self.fv[8] = 0


    def set_f10_f11(self):
        """
        Sets F10 and F11.

        F10 = number of hyperlinks in source code.

        F11 = 1, if no hyperlinks found in source.
        F11 = 0, otherwise
        """

        n_hyperlinks_found = len(self.hyperlinks)
        self.fv[9] = n_hyperlinks_found

        if n_hyperlinks_found == 0:
            self.fv[10] = 1

        else:
            self.fv[10] = 0


    def set_f12(self):
        """
        Sets F12.

        ratio = |n_foreign_hyp| / |n_hyp|

        F12 = 1 if ratio > 0.5 and n_hyp > 0
        F12 = 0 otherwise

        REVISAR
        """

        if len(self.hyperlinks) < 1:
            self.fv[11] = 1 # Debería ser 0 pero es mejor 1 ya que es phishing clarísimamente
            return
            
        n_foreigns = self.get_number_foreign_hyperlinks() 
        ratio = (n_foreigns / len(self.hyperlinks))

        if ratio > 0.5:
            self.fv[11] = 1

        else:
            self.fv[11] = 0


    def set_f13(self):
        """
        Sets F13.

        ratio = |n_empty_hyp| / |n_hyp|

        F13 = 1 if ratio > 0.34 and n_hyp > 0
        F13 = 0 otherwise

        REVISAR
        """

        if len(self.hyperlinks) < 1:
            self.fv[12] = 1 # Debería ser 0 pero es mejor 1 ya que es phishing clarísimamente
            return
            
        n_empty = self.get_number_empty_hyperlinks()
        ratio = (n_empty / len(self.hyperlinks))

        if ratio > 0.34:
            self.fv[12] = 1

        else:
            self.fv[12] = 0


    #------------------------------------UNDER REVIEW: BEING MOVED TO THE UTILS MODULE-----------------------------------------------------

    def set_f14(self):

        if len(self.hyperlinks) < 1:
            self.fv[13] = 1
            return
            
        n_errors = self.get_number_errors()
            
        ratio = (n_errors / len(self.hyperlinks))

        if ratio > 0.3:
            self.fv[13] = 1

        else:
            self.fv[13] = 0

        print(ratio)

    
    def set_f15(self):

        if len(self.hyperlinks) < 1:
            self.fv[14] = 1
            return
            
        n_redirects = self.get_number_redirects()
            
        ratio = (n_redirects / len(self.hyperlinks))

        if ratio > 0.3:
            self.fv[14] = 1

        else:
            self.fv[14] = 0

        print(ratio)


    def set_f16(self):
        """
        Sets F16.
        F16 = 1, if CSS file is external and contains foreign domain name
        F16 = 0, otherwise
        """

        for css in self.soup.find_all("link", rel="stylesheet"):

            # Read the href straight from the tag; a <link> without href
            # has nothing to compare and is skipped.
            link = css.get("href")

            # is_foreign takes the page URL first, as in set_f9.
            if link and is_foreign(self.url, link):
                self.fv[15] = 1
                # Return, not break: falling through to the assignment
                # below would overwrite the 1 with 0.
                return

        self.fv[15] = 0


    def set_f17(self):

        copyright_clues = ['©', '& copy', 'copy', 'copyright', 'copyright', 'all right reserved', 'rights', 'right'] #'@', 

        base_domain = self.base.split(".")

        for clue in copyright_clues:

            regex = '(?:{})([^"]*)(?:[\.\"])'.format(clue)
            copy_contents = re.findall(regex, self.html)

            for copy_content in copy_contents:

                copy_content = copy_content.replace(" ", "")

                for base in base_domain:
                    if re.search(base, copy_content, re.IGNORECASE):
                        self.fv[16] = 0
                        return
        
        self.fv[16] = 1


    def set_f18(self):
        """
        Sets F18.
        F18 = 0, if any site keyword (from get_site_keywords) appears in
                 the domain of the URL
        F18 = 1, otherwise
        """

        domain = self.base.lower()

        for keyword in self.get_site_keywords():

            # Plain substring test: keywords come from page text and must
            # not be interpreted as regular expressions. Empty keywords
            # are skipped because they would match any domain.
            if keyword and keyword.lower() in domain:
                self.fv[17] = 0
                # Return, not break: falling through to the assignment
                # below would overwrite the 0 with 1.
                return

        self.fv[17] = 1


    def set_f19(self):
        """
        Sets F19.
        F19 = 1, if foreign domain found in favicon link
        F19 = 0, otherwise
        """

        icons = self.soup.find_all("link", rel="icon") + self.soup.find_all("link", rel="shortcut icon")

        for icon in icons:

            # Read the href straight from the tag; icons without href
            # are skipped.
            link = icon.get("href")

            # is_foreign is the module-level helper and takes the page URL
            # first, as in set_f9.
            if link and is_foreign(self.url, link):
                self.fv[18] = 1
                # Return, not break: falling through to the assignment
                # below would overwrite the 1 with 0.
                return

        self.fv[18] = 0


# -----------------------------------------------------------------------------------------------------------


    def get_meta(self):

        keywords = []
        found = re.findall('(?:<meta)([^>]*)(?:>)', self.html) #(?:<meta.*content=")([^"]*)(?:")

        for content in found:
            match = re.findall('(?:content=")([^"]*)(?:")', content)

            if len(match) > 0:
                keywords.append(match[0])

        return keywords


    def get_meta_title_words(self):
        """
        Joins the page title(s) and the <meta> contents with spaces and
        returns whatever preprocess() yields for that text.
        """

        sources = self.title + self.get_meta()

        return preprocess(' '.join(sources))


    def find_hyperlinks(self):
        """
        Finds number of pages in a website extracting them
        from the src attribute and href attribute of anchor
        tags.
        """
        return ( re.findall('(?:src\b*=\b*\")([^"]*)(?:\")', self.html) + re.findall('(?:href\b*=\b*\")([^"]*)(?:\")', self.html) )
    
    def url_validator(self, url):
        parsed = urlparse(url)
        return parsed.netloc != ''


    def get_response_code(self, url, timeout=30):
        """
        Requests ``url`` with GET and returns the HTTP status code of the
        response (requests follows redirects by default, so this is the
        status of the final response).

        timeout: seconds to wait for the server before requests gives up,
        so an unresponsive host cannot block the analysis forever.
        """
        return requests.get(url, timeout=timeout).status_code


    def get_number_foreign_hyperlinks(self):
        """
        Returns the number of foreign hyperlinks
        """

        n_foreigns = 0

        for h in self.hyperlinks:
            if is_foreign(h):
                n_foreigns += 1

        return n_foreigns

    def get_number_empty_hyperlinks(self):

        n_empty = 0

        for h in self.hyperlinks:
            if self.is_empty(h):
                n_empty += 1

        return n_empty
        
    
    def get_number_errors(self):

        n_errors = 0

        for h in self.hyperlinks:

            if not self.is_empty(h) and not self.is_relative_in_local(h):
                code = self.get_response_code(h)

                if code == 404 or code == 403:
                    n_errors += 1

        return n_errors


    def get_number_redirects(self):

        n_redirects = 0

        for h in self.hyperlinks:

            if not self.is_empty(h) and not self.is_relative_in_local(h):
                code = self.get_response_code(h)

                if code == 302 or code == 301:
                    n_redirects += 1

        return n_redirects


    def get_popular_words(self, k=10):
        """
        Returns the k most frequent tokens of the text extracted from the
        page, as (token, count) pairs ordered from most to least frequent.

        k: how many tokens to return.
        """

        page_text = BeautifulSoup(self.html, "lxml").text
        tokens = preprocess(page_text)

        # The previous loop computed a per-token term frequency that was
        # never used (unfinished TF-IDF); only the raw counts matter.
        return Counter(tokens).most_common(k)

    def get_site_keywords(self):
        """
        Returns the set of site keywords: the words obtained from the
        title and <meta> contents plus the most frequent words of the page.
        """

        title_and_meta = set(self.get_meta_title_words())
        frequent = {entry[0] for entry in self.get_popular_words()}

        return title_and_meta | frequent

In [3]:
# ('https://www.naturaselection.com/es/') 
# ('https://www.bershka.com/es/h-woman.html') 
# ('https://ubuvirtual.ubu.es/') 
# ('https://fdeageadfahgeafeahg.azurewebsites.net/renner/inicio/login.php')
# ('https://banrural.herokuapp.com/')
# ('http://w3.unpocodetodo.info/canvas/data_uri.php')
# ph_entity = PHISH_FVG('https://www.facebook.com/login/')
# ph_entity.set_f4()

In [4]:
# ph_entity.set_f9()

# ph_entity.set_f10_f11()


# print(ph_entity.is_foreign('https://www.ubuvirtual.com/image.jpg'))
# print(ph_entity.is_foreign('www.fdeageadfahgeafeahg.azurewebsites.net/renner/inicio/login.php'))

# ph_entity.set_f12()
# ph_entity.set_f13()
#ph_entity.set_f17()


In [5]:
# Build the feature generator for a live page; the constructor downloads it over HTTP.
ph_entity = PHISH_FVG('https://www.bershka.com/es/h-woman.html') 

# Compute F8 (data URI presence) for that page.
ph_entity.set_f8()

['data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAGCAQAAACh8pzAAAAADklEQVR42mNkgANG8pgAASkAB1mKT2kAAAAASUVORK5CYII=', 'data:[{},{}],fetch:[],error:e,state:{']


# **TESTS**

In [6]:
import unittest

# class RealFV(unittest.TestCase):
    
#     def setUp(self):
#         self.ph_entity = PHISH_FVG('https://ubuvirtual.ubu.es/')

#     def test_correct_initialize(self):
#         self.assertTrue(np.array(self.ph_entity.fv).sum() == -19)

#     def test_f1(self):
#         self.assertTrue(self.ph_entity.fv[0] == -1)
#         self.ph_entity.set_f1()
#         self.assertTrue(self.ph_entity.fv[0] == 0)

#     def test_f2(self):
#         self.assertTrue(self.ph_entity.fv[1] == -1)
#         self.ph_entity.set_f2()
#         self.assertTrue(self.ph_entity.fv[1] == 0)

#     def test_f3(self):
#         self.assertTrue(self.ph_entity.fv[2] == -1)
#         self.ph_entity.set_f3()
#         self.assertTrue(self.ph_entity.fv[2] == 0)
        

class phishingUtilsMethods(unittest.TestCase):
    """
    Unit tests for the helper functions brought in by
    ``from phishing_utils import *``.

    Specific assertions (assertEqual, assertIn) are used so a failure
    reports the values involved; subTest identifies the failing input.
    """

    def test_translate_leet(self):
        """Each leet-speak word must list its plain spelling among its translations."""

        phishing_words = ['l0g1n', '13urg05', '5h0pp1ng', '4maz0n', 'm1crosoft']
        real_words = ['login', 'burgos', 'shopping', 'amazon', 'microsoft']

        for phish, real in zip(phishing_words, real_words):
            with self.subTest(word=phish):
                self.assertIn(real, translate_leet_to_letters(phish))


    def test_split_url(self):
        """URLs are split into their words, dropping separators."""

        urls = ['https://ubuvirtual.ubu.es/', 'www.ubu-virtual.ubu.es/ruta/archivo.php']
        splitted_urls = [['https', 'ubuvirtual', 'ubu', 'es'], ['www', 'ubu', 'virtual', 'ubu', 'es', 'ruta', 'archivo', 'php']]

        for input_test, output_test in zip(urls, splitted_urls):
            with self.subTest(url=input_test):
                self.assertEqual(get_splitted_url(input_test), output_test)


    def test_tlds_set(self):
        """The TLD set has the expected size and holds some common TLDs."""

        tlds = get_tlds_set()
        self.assertEqual(len(tlds), 150)

        for tld in ('com', 'es', 'edu', 'fr', 'org'):
            with self.subTest(tld=tld):
                self.assertIn(tld, tlds)


    def test_targets_set(self):
        """The phishing-target set holds some well-known brands."""

        targets = get_phishing_targets_set()

        for brand in ('amazon', 'dropbox', 'azure', 'linkedin', 'correos'):
            with self.subTest(brand=brand):
                self.assertIn(brand, targets)


    def test_remove_tld(self):
        """remove_tld drops exactly the last dot-separated label."""

        urls = ['ubuvirtual.ubu.es.org.uk', 'ubuvirtual.ubu.es.org', 'ubuvirtual.ubu.es']
        without_tlds = ['ubuvirtual.ubu.es.org', 'ubuvirtual.ubu.es', 'ubuvirtual.ubu']

        for input_test, output_test in zip(urls, without_tlds):
            with self.subTest(url=input_test):
                self.assertEqual(remove_tld(input_test), output_test)


    def test_empty_content(self):
        """'#', 'javascript:void(0)' and '' count as empty; other values do not."""

        empty = ['#', 'javascript:void(0)', '']
        not_empty = ['something', '/unexpected']

        for input_test in empty:
            with self.subTest(value=input_test):
                self.assertTrue(is_empty(input_test))

        for input_test in not_empty:
            with self.subTest(value=input_test):
                self.assertFalse(is_empty(input_test))


    def test_simple_php_file(self):
        """Only bare <name>.php values count as simple PHP files."""

        simple = ['index.php', 'login.php', 'mail.php']
        not_simple = ['/index.php', 'something.something.php']

        for input_test in simple:
            with self.subTest(value=input_test):
                self.assertTrue(is_simple_php_file(input_test))

        for input_test in not_simple:
            with self.subTest(value=input_test):
                self.assertFalse(is_simple_php_file(input_test))


    def test_domains(self):
        """Absolute URLs on other hosts are foreign; relative ones are local."""

        base = 'https://ubuvirtual.ubu.es/'
        absolute = ['https://pwr.edu.pl/', 'https://www.uc3m.es/Inicio', 'https://estudios.uoc.edu/es/estudiar-online']
        relative = ['/mail.php', '/image/ruta/inventada.jpg', 'hola.html']

        for input_test in absolute:
            with self.subTest(url=input_test):
                self.assertTrue(is_absolute(input_test))
                self.assertTrue(is_foreign(base, input_test))

        for input_test in relative:
            with self.subTest(url=input_test):
                self.assertFalse(is_absolute(input_test))
                self.assertTrue(is_relative_in_local(input_test))

# Run the tests in-process. argv is overridden so unittest does not parse the
# host process's own command-line arguments (presumably the notebook kernel's),
# and exit=False keeps unittest.main from calling sys.exit() when it finishes.
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

........
----------------------------------------------------------------------
Ran 8 tests in 0.007s

OK
