### **PHISHING VECTOR GENERATOR** 🐟

In [199]:
import pandas as pd
import numpy as np
import json
from copy import deepcopy
import os
import time
import requests
import re
from user_browsing import user_browsing
from xml.etree import ElementTree as ET
from urllib.parse import urlparse
from os import path
from collections import Counter
from bs4 import BeautifulSoup
import urllib

from phishing_utils import Utils

In [200]:
class PHISH_FVG:

    def __init__(self, url, fichero = 'html_dump'):
        """
        Downloads the page at `url` and prepares the state read by the feature setters.

        Parameters:
            url: address of the page to analyse.
            fichero: path of the file where the raw response body is dumped.
        """

        self.url = url
        self.base, self.path = self.process_url(self.url)

        # 19-slot feature vector; presumably -1 marks a feature that has not been computed yet.
        self.fv = np.array([-1] * 19)
        self.fichero = fichero

        self.user = user_browsing()
        self.user.set_standard_header(self.base)

        response_content = self.get_bin_source_code()
        # Arbitrary pages are not guaranteed to be valid UTF-8: replace undecodable
        # bytes instead of aborting the whole analysis with UnicodeDecodeError.
        self.html = response_content.decode("utf-8", errors="replace")
        # Name the parser explicitly (the same one get_popular_words uses) so the
        # result does not depend on which parsers happen to be installed.
        self.soup = BeautifulSoup(response_content, "lxml")

        self.hyperlinks = self.find_hyperlinks()
        

    def process_url(self, url):

        parsed = urlparse(url)
        base = parsed.netloc
        return base, base + '/'.join(path.split('/')[:-1])


    def get_bin_source_code(self):
        """
        Requests self.url and returns the raw response body as bytes.

        Unless the server answered with status 400, the body is also written
        to the dump file named by self.fichero.
        """

        agent_headers = self.user.get_simple_user_header_agent()
        response = requests.get(self.url, headers=agent_headers)

        if response.status_code != 400:
            # The context manager closes the file on exit.
            with open(self.fichero, 'wb') as dump:
                dump.write(response.content)

        return response.content

    def get_title(self):
        return re.findall('(?:<title>)([^<]*)(?:</title>)', self.html)

    def get_meta(self):

        keywords = []
        found = re.findall('(?:<meta)([^>]*)(?:>)', self.html) #(?:<meta.*content=")([^"]*)(?:")

        for content in found:
            match = re.findall('(?:content=")([^"]*)(?:")', content)

            if len(match) > 0:
                keywords.append(match[0])

        return keywords


    def get_meta_title_words(self):
        """
        Joins the title and meta contents with spaces and returns the result of
        running Utils().preprocess on that text.
        """

        fragments = self.get_title() + self.get_meta()
        joined_text = ' '.join(fragments)

        return Utils().preprocess(joined_text)


    def find_hyperlinks(self):
        return ( re.findall('(?:src\b*=\b*\")([^"]*)(?:\")', self.html) + re.findall('(?:href\b*=\b*\")([^"]*)(?:\")', self.html) )
    
    def url_validator(self, url):
        parsed = urlparse(url)
        return parsed.netloc != ''


    def set_f9(self):

        try:

            forms_found = re.findall("<form[^>]+>", self.html)

            if len(forms_found) > 0:

                for i in range(len(forms_found)):

                    form_found = forms_found[i]
                    action_content = re.findall('(?:action=\")([^"]*)(?:\")', form_found)
                    print(action_content[0])

                    if (len(action_content[0]) <1) or (len(re.findall('javascript:void\(0\)', action_content[0])) > 0):
                        self.fv[8] = 1
                        return

                    elif len(re.findall('^(.*\.php)$', action_content[0])) > 0 and self.base != self.process_url(action_content[0])[0]:
                        self.fv[8] = 1
                        return

                    elif action_content[0][0] != '/' and self.base != self.process_url(action_content[0])[0]:
                        self.fv[8] = 1
                        return
                    
                    else:
                        self.fv[8] = 0
                    
            else:
                self.fv[8] = 0
        
        # No action found
        except:
            self.fv[8] = 0

    
    def set_f10_f11(self):

        try:
            n_hyperlinks_found = len(self.hyperlinks)
            self.fv[9] = n_hyperlinks_found

            if n_hyperlinks_found == 0:
                self.fv[10] = 1

            else:
                self.fv[10] = 0

        except:
            self.fv[9] = 0
            self.fv[10] = 0


    def get_number_foreign_hyperlinks(self):

        n_foreigns = 0

        for h in self.hyperlinks:
            if self.is_foreign(h):
                n_foreigns += 1

        return n_foreigns

        
    def set_f12(self):

        if len(self.hyperlinks) < 1:
            self.fv[11] = 1
            return 0
            
        n_foreigns = self.get_number_foreign_hyperlinks() 
            
        ratio = (n_foreigns / len(self.hyperlinks))

        if ratio < 0.5:
            self.fv[11] = 1

        else:
            self.fv[11] = 0

            # for h in hyperlinks_found:
            #     print("{} {} ".format(h, self.is_foreign(h)))

        print(ratio)


    def set_f13(self):

        if len(self.hyperlinks) < 1:
            self.fv[12] = 1
            return
            
        n_empty = self.get_number_empty_hyperlinks()
            
        ratio = (n_empty / len(self.hyperlinks))

        if ratio > 0.34:
            self.fv[12] = 1

        else:
            self.fv[12] = 0

        print(ratio)


    def set_f14(self):

        if len(self.hyperlinks) < 1:
            self.fv[13] = 1
            return
            
        n_errors = self.get_number_errors()
            
        ratio = (n_errors / len(self.hyperlinks))

        if ratio > 0.3:
            self.fv[13] = 1

        else:
            self.fv[13] = 0

        print(ratio)

    
    def set_f15(self):

        if len(self.hyperlinks) < 1:
            self.fv[14] = 1
            return
            
        n_redirects = self.get_number_redirects()
            
        ratio = (n_redirects / len(self.hyperlinks))

        if ratio > 0.3:
            self.fv[14] = 1

        else:
            self.fv[14] = 0

        print(ratio)


    def set_f17(self):

        copyright_clues = ['©', '& copy', 'copy', 'copyright', 'copyright', 'all right reserved', 'rights', 'right'] #'@', 

        base_domain = self.base.split(".")

        for clue in copyright_clues:

            regex = '(?:{})([^"]*)(?:[\.\"])'.format(clue)
            copy_contents = re.findall(regex, self.html)

            for copy_content in copy_contents:

                copy_content = copy_content.replace(" ", "")

                for base in base_domain:
                    if re.search(base, copy_content, re.IGNORECASE):
                        self.fv[16] = 0
                        return
        
        self.fv[16] = 1
    

    def get_response_code(self, url):
        """Performs a GET request to `url` and returns the status code of the response."""
        response = requests.get(url)
        return response.status_code



    def get_number_empty_hyperlinks(self):

        n_empty = 0

        for h in self.hyperlinks:
            if self.is_empty(h):
                n_empty += 1

        return n_empty



    def is_absolute(self, url):
        return bool(urlparse(url).netloc)

    def is_empty(self, url):
        return url[0] == '#' or bool(re.match('[Jj]ava[Ss]cript::?void\(0\)', url))

    def is_relative_in_local(self, url):

        if self.is_absolute(url):
            return False

        pattern = re.compile("^[/]?[A-z0-9_]")
        return bool (pattern.match(url))
        
    
    def get_number_errors(self):

        n_errors = 0

        for h in self.hyperlinks:

            if not self.is_empty(h) and not self.is_relative_in_local(h):
                code = self.get_response_code(h)

                if code == 404 or code == 403:
                    n_errors += 1

        return n_errors


    def get_number_redirects(self):

        n_redirects = 0

        for h in self.hyperlinks:

            if not self.is_empty(h) and not self.is_relative_in_local(h):
                code = self.get_response_code(h)

                if code == 302 or code == 301:
                    n_redirects += 1

        return n_redirects


    def is_foreign(self, url):
        return not self.is_relative_in_local(url) and not self.is_empty(url) and self.base != urlparse(url).netloc

    def get_popular_words(self, k=10):
        """
        Returns the k most frequent tokens of the page's visible text.

        The text is extracted with BeautifulSoup (lxml parser) and tokenised
        with Utils().preprocess.

        Parameters:
            k: how many tokens to return (default 10).

        Returns:
            A list of (token, count) pairs as produced by Counter.most_common(k).
        """

        cleaned = BeautifulSoup(self.html, "lxml").text
        tokens = Utils().preprocess(cleaned)

        # TODO: the ranking uses raw counts; TF-IDF weighting was sketched here
        # but needs a document-frequency source that does not exist yet.
        return Counter(tokens).most_common(k)


    def get_site_keywords(self):
        """
        Returns, as a set, the title/meta words together with the words
        reported by get_popular_words().
        """

        title_meta_words = set(self.get_meta_title_words())
        frequent_words = {entry[0] for entry in self.get_popular_words()}

        return title_meta_words | frequent_words


    def set_f18(self):
        
        keywords = self.get_site_keywords()
        #base = re.match('(:?(www.)?)([^.]*)(:?.[A-Za-z]{0,4})', self.base)[0]
        #base = base.split('.')

        for keyword in keywords:

            if re.findall(keyword, self.base):
                self.fv[17] = 0
                break
        
        self.fv[17] = 1


    def set_f19(self):
        """
        Sets F19.
        F19 = 1, if foreign domain found in favicon link
        F19 = 0, otherwise
        """

        icons = self.soup.findAll("link", rel="icon") + self.soup.findAll("link", rel="shortcut icon")
        print(icons)

        for icon in icons:

            link = re.findall('(?:href=")([^"]*)(?:")', str(icon))[0]

            if self.is_foreign(link):
                self.fv[18] = 1
                print(1)
                break

        self.fv[18] = 0

In [201]:
# Candidate pages for trying the generator by hand; pass one of them to PHISH_FVG below.
# ('https://www.naturaselection.com/es/') 
# ('https://www.bershka.com/es/h-woman.html') 
# ('https://ubuvirtual.ubu.es/') 
# ('https://fdeageadfahgeafeahg.azurewebsites.net/renner/inicio/login.php')
# ('https://banrural.herokuapp.com/')

# Building the object requests the page right away (the constructor calls get_bin_source_code).
ph_entity = PHISH_FVG('https://www.naturaselection.com/es/') 

In [202]:
# ph_entity.set_f9()

# ph_entity.set_f10_f11()


# print(ph_entity.is_foreign('https://www.ubuvirtual.com/image.jpg'))
# print(ph_entity.is_foreign('www.fdeageadfahgeafeahg.azurewebsites.net/renner/inicio/login.php'))

# ph_entity.set_f12()
# ph_entity.set_f13()
#ph_entity.set_f17()


In [203]:
# Compute the favicon feature; set_f19 also prints the list of <link> icon tags it found.
ph_entity.set_f19()

[]
