In [None]:
#!/usr/bin/env python
# -*-coding:utf-8 -*-
'''
@File    :   phishing_dataset_generator.ipynb
@Time    :   2023/04/28 21:02:17
@Author  :   Patricia Hernando Fernández 
@Version :   5.0
@Contact :   phf1001@alu.ubu.es
'''


### **PHISHING VECTOR GENERATOR** 🐟

In [None]:
import requests
import urllib.parse
import pandas as pd
import pickle

In [None]:
from phishing_vector_generator import PhishingFVG
from phishing_utils import *
from user_browsing import UserBrowsing

# **TESTS**

**Testing the behaviour when using a proxy, not using one, or having an empty proxy string.**

In [None]:
import unittest
import os
import sys
import requests

# Make the parent of the current working directory importable.
# NOTE(review): the project imports (phishing_vector_generator, phishing_utils,
# user_browsing) are executed in an earlier cell; if those modules live in this
# parent directory, this path setup has to run before them on a fresh kernel.
src_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if src_path not in sys.path:  # guard keeps the cell idempotent when it is re-run
    sys.path.append(src_path)


class RealFV(unittest.TestCase):
    """Live check that requests sent through the configured proxy use another IP.

    Two ``PhishingFVG`` entities are built for the same URL, one with
    ``get_proxy_from_file=True`` and one with ``get_proxy_from_file=False``.
    The test then asks an IP echo service for the caller's address three
    times (entity one's proxies, no proxies at all, entity two's proxies) and
    compares the answers. It needs network access and a working proxy.
    """

    # Service queried for the caller's address; its response body is compared verbatim.
    IP_ECHO_URL = "http://ipinfo.io/ip"
    # Seconds to wait before giving up; without a timeout ``requests`` may block forever.
    REQUEST_TIMEOUT = 30

    def setUp(self):
        # Proxy settings read from file (the Tor proxy, per the original note).
        self.ph_entity = PhishingFVG("https://ubuvirtual.ubu.es/", None, get_proxy_from_file=True)

        # No proxy
        self.ph_entity_two = PhishingFVG("https://ubuvirtual.ubu.es/", None, get_proxy_from_file=False)

    def _fetch_public_ip(self, **request_kwargs):
        """Return the echo service's response text for a GET with the given kwargs."""
        response = requests.get(
            self.IP_ECHO_URL, timeout=self.REQUEST_TIMEOUT, **request_kwargs
        )
        return response.text

    def test_proxy_working(self):
        """The proxied IP must differ from the direct one; both direct paths must match."""
        ip_one = self._fetch_public_ip(proxies=self.ph_entity.user.proxies)
        ip_two = self._fetch_public_ip()
        ip_three = self._fetch_public_ip(proxies=self.ph_entity_two.user.proxies)

        # assertEqual / assertNotEqual report both values when they fail,
        # unlike assertTrue on a pre-evaluated comparison.
        self.assertNotEqual(ip_one, ip_two)
        self.assertNotEqual(ip_one, ip_three)
        self.assertEqual(ip_two, ip_three)


# Run the tests in-process. A dummy argv is passed so unittest does not try to
# parse the host process's own command-line arguments, and exit=False stops
# unittest from calling sys.exit() once the run finishes.
if __name__ == "__main__":
    unittest.main(argv=["first-arg-is-ignored"], exit=False)


# **TF-IDF**

In [None]:
# Build the TF-IDF corpus from the URLs listed in alexa_filtered.csv, fetching
# with the user's header agent and proxy settings.
user = UserBrowsing()
alexa_csv_path = os.sep.join([get_data_path(), "alexa_filtered.csv"])
urls = get_csv_data(alexa_csv_path)
corpus = get_tfidf_corpus(
    urls,
    user.get_simple_user_header_agent(),
    user.proxies,
)

In [None]:
# Build the TF-IDF object from the corpus and persist it with pickle.
# NOTE(review): the next cell loads "tfidf-full-alexa.pkl", not this file —
# confirm which of the two names is the intended one.
tfidf = get_tfidf(corpus)
with open("tfidftocho.pkl", "wb") as pickle_out:
    pickle.dump(tfidf, pickle_out)

In [None]:
# Load the previously pickled TF-IDF object.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load a file that was produced by this project.
# The context manager closes the handle, which the original left open.
with open("tfidf-full-alexa.pkl", "rb") as tfidf_pickle_in:
    tfidf = pickle.load(tfidf_pickle_in)
print(tfidf)

# **GENERATION OF VECTORS**

**Tor proxy used**

In [None]:
# Generate the feature vector of each URL with the default PhishingFVG
# settings (the heading above states that the Tor proxy is used here).
reales = ["https://microsoft.com"]
fvs_real = []

for real in reales:
    try:
        ph_entity = PhishingFVG(real, tfidf)
        ph_entity.set_feature_vector()
        # The stored row is the feature vector with a trailing 0 appended
        # (presumably the "genuine" label — confirm against the consumers).
        fvs_real.append(np.append(ph_entity.fv, [0]))
        print(ph_entity.fv)
        print(ph_entity.extra_information)

    except Exception as error:
        # `Exception` instead of a bare except, so Ctrl-C / kernel interrupt
        # still stops the loop; the cause is reported instead of being hidden.
        print(f"Error en {real}")
        print(f"  {type(error).__name__}: {error}")


**No proxy used**

In [None]:
# Generate the feature vector of each URL with get_proxy_from_file=False,
# i.e. without loading the proxy settings from file.
reales = ["https://microsoft.com"]
fvs_real = []

for real in reales:
    try:
        ph_entity = PhishingFVG(real, tfidf, get_proxy_from_file=False)
        ph_entity.set_feature_vector()
        # The stored row is the feature vector with a trailing 0 appended
        # (presumably the "genuine" label — confirm against the consumers).
        fvs_real.append(np.append(ph_entity.fv, [0]))
        print(ph_entity.fv)
        # Printed once; the original repeated this line by copy/paste.
        print(ph_entity.extra_information)

    except Exception as error:
        # `Exception` instead of a bare except, so Ctrl-C / kernel interrupt
        # still stops the loop; the cause is reported instead of being hidden.
        print(f"Error en {real}")
        print(f"  {type(error).__name__}: {error}")


## **REALS -> CSV and Pandas**

In [None]:
# Build feature vectors for the payment-gateway and banking URLs and store
# them in two files: a ';'-separated pandas dump (url, fv, tag) and a plain
# ','-separated CSV with one column per feature plus the tag.
reales = get_payment_gateways().union(get_banking_sites()) #get_alexa_sites()
fvs_real = []
df = pd.DataFrame(columns=["url", "fv", "tag"])

output_file = get_fv_path() + os.path.sep + 'genuine_bank_pd.csv'
output_file_2 = get_fv_path() + os.path.sep + 'genuine_bank_csv.csv'

for real in reales:

    try:
        ph_entity = PhishingFVG(real, tfidf)
        ph_entity.set_feature_vector()
        fvs_real.append(np.append(ph_entity.fv, [0]))
        df.loc[len(df.index)] = [real, list(ph_entity.fv), 0]

        # Checkpoint: both files are rewritten in full after every URL, so an
        # interrupted run still leaves the vectors computed so far on disk.
        # newline='' is what the csv module and pandas require for text-mode
        # handles; without it rows get doubled line endings on Windows.
        # The `with` blocks close the handles, so no explicit close() is needed.
        with open(output_file, mode='w', newline='') as f:
            df.to_csv(f, sep=";", index=False)

        with open(output_file_2, mode="w", newline='') as f2:
            writer = csv.writer(f2, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
            # Header f1..f19 + tag — assumes 19 features per vector; TODO confirm.
            writer.writerow([f"f{i}" for i in range(1, 20)] + ["tag"])
            writer.writerows(fvs_real)

    except Exception as error:
        # `Exception` instead of a bare except, so Ctrl-C / kernel interrupt
        # still stops the loop; the cause is reported instead of being hidden.
        print(f"Error en {real}")
        print(f"  {type(error).__name__}: {error}")


## **PHISHING -> FV and Pandas**

In [None]:
# Merge the PhishTank URLs with the OpenPhish ones into a single collection.
# NOTE(review): 10_000_000 looks like an upper bound on the number of URLs
# read from the PhishTank CSV — confirm against get_phish_tank_urls_csv.
phish_tank_urls = get_phish_tank_urls_csv(10_000_000)
open_fish_urls = get_open_fish_urls()
phishings = phish_tank_urls.union(open_fish_urls)

In [None]:
# Build feature vectors for the phishing URLs and store them in two files:
# a ';'-separated pandas dump (url, fv, tag) and a plain ','-separated CSV
# with one column per feature plus the tag.
df = pd.DataFrame(columns=["url", "fv", "tag"])
fvs_phishing = []
output_file = get_fv_path() + os.path.sep + 'phishing_pd.csv'
output_file_2 = get_fv_path() + os.path.sep + 'phishing_pd_2.csv'

for phishing in phishings:
    try:
        ph_entity = PhishingFVG(phishing, tfidf)
        ph_entity.set_feature_vector()
        fvs_phishing.append(np.append(ph_entity.fv, [1]))
        df.loc[len(df.index)] = [phishing, list(ph_entity.fv), 1]

        # Checkpoint: both files are rewritten in full after every URL, so an
        # interrupted run still leaves the vectors computed so far on disk.
        # newline='' is what the csv module and pandas require for text-mode
        # handles; without it rows get doubled line endings on Windows.
        # The `with` blocks close the handles, so no explicit close() is needed.
        with open(output_file, mode='w', newline='') as f:
            df.to_csv(f, sep=";", index=False)

        with open(output_file_2, mode="w", newline='') as f2:
            writer = csv.writer(f2, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
            # Header f1..f19 + tag — assumes 19 features per vector; TODO confirm.
            writer.writerow([f"f{i}" for i in range(1, 20)] + ["tag"])
            writer.writerows(fvs_phishing)

    except Exception as error:
        # `Exception` instead of a bare except, so Ctrl-C / kernel interrupt
        # still stops the loop. The failing URL is printed as before, followed
        # by the reason it was skipped (connection, SSL, timeout, ...).
        print(phishing)
        print(f"  {type(error).__name__}: {error}")