In [1]:
from datetime import datetime
import pandas as pd

In [2]:
def select_ids(main, verified):
    """Return the rows of ``main`` whose ``id`` value appears in ``verified``.

    Args:
        main: DataFrame that has an ``id`` column.
        verified: collection of id values to keep (anything accepted by
            ``Series.isin``).

    Returns:
        A DataFrame with the matching rows; the original index labels and
        row order are kept.
    """
    is_verified = main['id'].isin(verified)
    return main.loc[is_verified]

In [3]:
def domain_age_lessThanOne(create_date, update_date, reference_date="2024-04-23"):
    """Report whether a domain is less than one year (365 days) old.

    Args:
        create_date: creation timestamp as a string whose first ten characters
            are ``YYYY-MM-DD``, the sentinel ``"expired"``, an empty string,
            or a missing value (``None`` / ``NaN`` / ``pd.NA``).
        update_date: last-update timestamp in the same string format. It is
            only consulted when ``create_date`` is the empty string, and may
            itself be empty or missing.
        reference_date: ``YYYY-MM-DD`` day against which the age is measured.
            Defaults to ``"2024-04-23"``.

    Returns:
        ``True`` when the domain was created fewer than 365 days before the
        reference date, or when ``create_date`` is ``"expired"``.
        ``False`` when the creation date (or, failing that, the update date)
        lies at least 365 days before the reference date.
        ``None`` when the age cannot be decided: the creation date is missing,
        or it is blank and the update date is blank, missing, or more recent
        than 365 days.

    Raises:
        ValueError: if a date string that has to be parsed does not start
            with a valid ``YYYY-MM-DD`` prefix.
    """
    # A missing creation value carries no information at all.
    if pd.isna(create_date):
        return None

    if create_date == "expired":
        return True

    reference = datetime.strptime(reference_date, '%Y-%m-%d')

    if create_date == "":
        # Fall back to the update date. Empty CSV cells are loaded by pandas
        # as NaN, so a missing value must be rejected before slicing it.
        if pd.isna(update_date) or update_date == "":
            return None
        age = reference - datetime.strptime(update_date[:10], '%Y-%m-%d')
        # A recent update does not reveal when the domain was created, while
        # an update at least 365 days ago implies the domain is at least
        # that old.
        return None if age.days < 365 else False

    # Only the date part is parsed; any time-of-day suffix is ignored.
    age = reference - datetime.strptime(create_date[:10], '%Y-%m-%d')
    return age.days < 365

In [4]:
def weighted_selection_without_subdomain(item):
    """Classify a sample by the hosting hypothesis with the highest score.

    Each category's score is the sum of the hypotheses it satisfies divided
    by the number of hypotheses checked for that category.

    Args:
        item: mapping-like record (for example a DataFrame row) providing
            ``creation_date``, ``updated_date``, ``domain_indexed``,
            ``is_archived``, ``between_archives_similarity`` and
            ``phish_archives_similarity``.

    Returns:
        ``'shared_domain'``, ``'compromised_domain'`` or
        ``'attackers_domain'``. On equal scores the earlier category in that
        order wins.
    """
    is_new = domain_age_lessThanOne(item['creation_date'], item['updated_date'])

    # Age evidence: an undecidable age is worth half a point to every
    # category; otherwise a young domain counts for the attacker hypothesis
    # and an older one for the other two.
    if is_new is None:
        young_vote = 0.5
        established_vote = 0.5
    elif is_new:
        young_vote = 1
        established_vote = 0
    else:
        young_vote = 0
        established_vote = 1

    indexed = item['domain_indexed']
    archived = item['is_archived']
    between_similarity = item['between_archives_similarity']
    phish_similarity = item['phish_archives_similarity']

    shared_score = (
        established_vote
        + indexed
        + archived
        + (between_similarity == -2)
        + (phish_similarity == -2)
    ) / 5

    # Third attacker hypothesis: no archive exists, or the phishing page's
    # archive similarity is one of 1, 2 or 0.
    archive_supports_attacker = (
        not archived
        or phish_similarity == 1
        or phish_similarity == 2
        or phish_similarity == 0
    )
    attacker_score = (
        young_vote
        + (not indexed)
        + archive_supports_attacker
    ) / 3

    compromised_score = (
        established_vote
        + indexed
        + archived
        + (between_similarity == 2)
        + (phish_similarity == -2)
    ) / 5

    # Highest score wins; ties favour shared, then compromised.
    if shared_score >= max(compromised_score, attacker_score):
        return 'shared_domain'
    if compromised_score >= attacker_score:
        return 'compromised_domain'
    return 'attackers_domain'

In [5]:
def weighted_selection_with_subdomain(item):
    """Classify a subdomain sample by the hosting hypothesis with the highest score.

    Each category's score is the sum of the hypotheses it satisfies divided
    by the number of hypotheses checked for that category, so all three
    scores lie in ``[0, 1]`` and can be compared directly. The shared-hosting
    score additionally uses the two subdomain-specific signals
    ``control_over_dns`` and ``control_over_ssl``.

    Args:
        item: mapping-like record (for example a DataFrame row) providing
            ``creation_date``, ``updated_date``, ``domain_indexed``,
            ``is_archived``, ``between_archives_similarity``,
            ``phish_archives_similarity``, ``control_over_dns`` and
            ``control_over_ssl`` (which may be missing/NaN).

    Returns:
        ``'shared_domain'``, ``'compromised_domain'`` or
        ``'attackers_domain'``. On equal scores the earlier category in that
        order wins.
    """
    new_domain = domain_age_lessThanOne(item['creation_date'], item['updated_date'])

    # Age evidence: an undecidable age is worth half a point to every
    # category; otherwise a young domain counts for the attacker hypothesis
    # and an older one for the other two.
    if new_domain is None:
        young_vote = established_vote = 0.5
    elif new_domain:
        young_vote, established_vote = 1, 0
    else:
        young_vote, established_vote = 0, 1

    # Unknown SSL control is neutral; otherwise lack of control supports the
    # shared-hosting hypothesis.
    if pd.isna(item['control_over_ssl']):
        no_ssl_control_vote = 0.5
    else:
        no_ssl_control_vote = 0 if item['control_over_ssl'] else 1

    # Seven hypotheses are scored here, so the sum is normalised by 7.
    # Dividing by 5 (the count used when the two subdomain signals are
    # absent) would let this score reach 1.4 and outweigh the other scores.
    shared_hosting_count = (
        established_vote
        + item['domain_indexed']
        + item['is_archived']
        + (item['between_archives_similarity'] == -2)
        + (item['phish_archives_similarity'] == -2)
        + (not item['control_over_dns'])
        + no_ssl_control_vote
    ) / 7

    attacker_domain_count = (
        young_vote
        + (not item['domain_indexed'])
        + (
            not item['is_archived']
            or item['phish_archives_similarity'] == 1
            or item['phish_archives_similarity'] == 2
            or item['phish_archives_similarity'] == 0
        )
    ) / 3

    compromised_host_count = (
        established_vote
        + item['domain_indexed']
        + item['is_archived']
        + (item['between_archives_similarity'] == 2)
        + (item['phish_archives_similarity'] == -2)
    ) / 5

    # Highest score wins; ties favour shared, then compromised.
    if shared_hosting_count >= max(compromised_host_count, attacker_domain_count):
        return 'shared_domain'
    elif compromised_host_count >= attacker_domain_count:
        return 'compromised_domain'
    else:
        return 'attackers_domain'

In [6]:
def _taxonomy_category(sample):
    """Return the predicted category for a single row of the dataset."""
    if sample['known_hosting']:  # sample hosted on a shared platform
        return 'shared_domain'
    if sample['is_subdomain']:  # sample is a subdomain
        return weighted_selection_with_subdomain(sample)
    if sample['is_on_root']:  # not a subdomain and served from the site root
        return 'attackers_domain'
    return weighted_selection_without_subdomain(sample)


def taxonomy_analysis(data):
    """Fill ``data['taxonomy_predicted_category']`` with a category per row.

    Decision order for each row:
      1. ``known_hosting`` is truthy      -> ``'shared_domain'``
      2. ``is_subdomain`` is truthy       -> ``weighted_selection_with_subdomain``
      3. ``is_on_root`` is truthy         -> ``'attackers_domain'``
      4. otherwise                        -> ``weighted_selection_without_subdomain``

    Args:
        data: DataFrame with ``known_hosting``, ``is_subdomain`` and
            ``is_on_root`` columns plus whatever columns the two weighted
            selection functions read. It is modified in place.

    Returns:
        None. The ``taxonomy_predicted_category`` column is created or
        overwritten on ``data``.
    """
    # Collect the labels in row order and assign the column once. Writing
    # through ``data.loc[index, ...]`` row by row would hit every row that
    # shares an index label, so duplicate labels would overwrite each other.
    data['taxonomy_predicted_category'] = [
        _taxonomy_category(sample) for _, sample in data.iterrows()
    ]

In [7]:
# Directory (relative to the notebook) that holds the input and output CSVs.
path_prefix = '../data/'

# Load the dataset that the taxonomy is applied to.
input_csv = path_prefix + 'validated_dataset_for_taxonomy.csv'
df = pd.read_csv(input_csv)


In [8]:
# Start from an empty prediction column, then let taxonomy_analysis fill it
# in place, one label per row.
df['taxonomy_predicted_category'] = None
taxonomy_analysis(df)

# Write the labelled dataset next to the input file, without the index column.
results_csv = path_prefix + 'taxonomy_results_for_validated_dataset.csv'
df.to_csv(results_csv, index=False)