In [None]:
#!/usr/bin/env python
# -*-coding:utf-8 -*-
'''
@File    :   phishing_dataset_generator.ipynb
@Time    :   2023/03/30 21:02:17
@Author  :   Patricia Hernando Fernández 
@Version :   1.0
@Contact :   phf1001@alu.ubu.es
'''


### **PHISHING VECTOR GENERATOR** 🐟

In [13]:
import requests
import urllib.parse

In [14]:
# requests only accepts URLs that carry an explicit scheme, so this call is
# expected to fail. Catch the error and print it so the demonstration stays
# visible without aborting a Restart & Run All.
try:
    print(requests.get('www.google.com').text)
except requests.exceptions.MissingSchema as err:
    print(f"MissingSchema: {err}")

MissingSchema: Invalid URL 'www.google.com': No scheme supplied. Perhaps you meant http://www.google.com?

In [11]:
# Split a complete URL (scheme included) into its components; the trailing
# bare expression makes the notebook display the resulting ParseResult.
parsed = urllib.parse.urlparse(
    'https://www.google.com/'
)
parsed

ParseResult(scheme='https', netloc='www.google.com', path='/', params='', query='', fragment='')

In [None]:
from phishing_vector_generator import PHISH_FVG
from phishing_utils import *
from user_browsing import user_browsing

# **TESTS**

**Probando el comportamiento de usar proxy, no usarlo o tener la cadena vacía.**

In [None]:
import unittest
import os
import sys
import requests

# Make the parent of the current working directory importable.
# NOTE(review): the project imports in the earlier cell run before this path is
# added — confirm they resolve on a fresh kernel.
src_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guard the append so re-running this cell does not pile up duplicate entries.
if src_path not in sys.path:
    sys.path.append(src_path)


class RealFV(unittest.TestCase):
    def setUp(self):
        # Tor proxy uploaded in file
        self.ph_entity = PHISH_FVG("https://ubuvirtual.ubu.es/", None, get_proxy_from_file=True)

        # No proxy
        self.ph_entity_two = PHISH_FVG("https://ubuvirtual.ubu.es/", None, get_proxy_from_file=False)

    def test_proxy_working(self):
        ip_one = requests.get(
            "http://ipinfo.io/ip", proxies=self.ph_entity.user.proxies
        ).text
        ip_two = requests.get("http://ipinfo.io/ip").text
        ip_three = requests.get(
            "http://ipinfo.io/ip", proxies=self.ph_entity_two.user.proxies
        ).text
        self.assertTrue(ip_one != ip_two)
        self.assertTrue(ip_one != ip_three)
        self.assertTrue(ip_two == ip_three)


# Run the tests from inside the notebook: a placeholder argv is supplied in
# place of the process's own command line, and exit=False stops unittest from
# calling sys.exit() once the run finishes.
if __name__ == "__main__":
    unittest.main(
        exit=False,
        argv=["first-arg-is-ignored"],
    )


# **TF-IDF**

In [None]:
# user = user_browsing()
# urls = get_csv_data(get_data_path() + os.sep + "alexa_filtered.csv")[:3]
# corpus = get_tfidf_corpus(urls, user.get_simple_user_header_agent(), user.proxies)
# tfidf = get_tfidf(corpus)


# Inputs for TF-IDF: a browsing profile, the first 200 entries returned for
# alexa_filtered.csv in the data directory, and the corpus derived from them
# using the profile's header and proxy settings.
user = user_browsing()
urls = get_csv_data(os.sep.join((get_data_path(), "alexa_filtered.csv")))[:200]
corpus = get_tfidf_corpus(
    urls,
    user.get_simple_user_header_agent(),
    user.proxies,
)


In [None]:
import pickle

# Compute the TF-IDF object from the corpus and persist it with pickle so it
# can be reloaded later; the file is written to the current working directory.
tfidf = get_tfidf(corpus)
with open("tfidftocho.pkl", mode="wb") as f:
    pickle.dump(tfidf, file=f)

# **GENERATION OF VECTORS**

**Tor proxy used**

In [None]:
# reales = ["https://www.naturaselection.com/"]
# fvs_real = []

# for real in reales:
#     try:
#         ph_entity = PHISH_FVG(real, tfidf)
#         ph_entity.set_feature_vector()
#         fvs_real.append(np.append(ph_entity.fv, [0]))
#         print(ph_entity.fv)
#         print(ph_entity.extra_information)

#     except:
#         print(f"Error en {real}")


**No proxy used**

In [None]:
# reales = ["https://www.naturaselection.com/"]
# fvs_real = []

# for real in reales:
#     try:
#         ph_entity = PHISH_FVG(real, tfidf, get_proxy_from_file=False)
#         ph_entity.set_feature_vector()
#         fvs_real.append(np.append(ph_entity.fv, [0]))
#         print(ph_entity.fv)
#         print(ph_entity.extra_information)

#     except:
#         print(f"Error en {real}")


## **REALS**

In [None]:
# reales = get_alexa_sites()
# fvs_real = []

# for real in reales:

#     try:
#         ph_entity = PHISH_FVG(real, tfidf)
#         ph_entity.set_feature_vector()
#         fvs_real.append(np.append(ph_entity.fv, [0]))

#     except:
#         print(f"Error en {real}")

# output_file = get_fv_path() + os.path.sep + 'genuine_fv.csv'

# with open(output_file, mode='w') as f:

#     writer = csv.writer(f, delimiter=',', quotechar='"',
#                         quoting=csv.QUOTE_MINIMAL)
#     writer.writerow([f"f{i}" for i in range(1,20)] + ['tag'])

#     for url in fvs_real:
#         writer.writerow(url)

# f.close()


## **PHISHING**

In [None]:
# phishings = get_phish_tank_urls_csv(10000000).union(get_open_fish_urls())


In [None]:
# fvs_phishing = []

# for phishing in phishings:
#     try:
#         ph_entity = PHISH_FVG(phishing, tfidf)
#         ph_entity.set_feature_vector()
#         fvs_phishing.append(np.append(ph_entity.fv, [1]))

#         if len(fvs_phishing) == 1100:
#             break

#     except:  # (ConnectionError, requests.exceptions.SSLError, requests.exceptions.ConnectTimeout):
#         print(phishing)

# output_file = get_fv_path() + os.path.sep + "phishing_fv.csv"

# with open(output_file, mode="w") as f:
#     writer = csv.writer(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
#     writer.writerow([f"f{i}" for i in range(1, 20)] + ["tag"])

#     for url in fvs_phishing:
#         writer.writerow(url)

# f.close()
