In [1]:
import csv
import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler

In [3]:
# Create a DataFrame from the data
def create_dataframe(data):
    """Build and return a new pandas DataFrame from ``data``."""
    return pd.DataFrame(data)

In [4]:
def domain_age_lessThanOne(create_date, update_date, reference_date="2024-04-23"):
    """Decide whether a domain is younger than one year at ``reference_date``.

    Parameters
    ----------
    create_date : str or missing value
        Creation date whose first ten characters are ``YYYY-MM-DD``, the
        literal ``"expired"``, an empty string, or a missing value (NaN/None).
    update_date : str or missing value
        Last-update date in the same format; only consulted when
        ``create_date`` is an empty string.
    reference_date : str, optional
        ``YYYY-MM-DD`` date the age is measured against. Defaults to the
        snapshot date that was previously hard-coded in this function.

    Returns
    -------
    bool or None
        * ``True``  - created less than one year before the reference date,
          or ``create_date`` is ``"expired"``.
        * ``False`` - created at least one year before, or (creation date
          empty) last updated at least 365 days before.
        * ``None``  - undecidable: creation date is missing, or it is empty
          and the update date is missing, empty, or less than 365 days old.
    """
    date_format = '%Y-%m-%d'
    reference = datetime.strptime(reference_date, date_format)

    if create_date != "" and create_date != "expired" and not pd.isna(create_date):
        # Only the leading YYYY-MM-DD part is parsed; any time suffix is ignored.
        age = reference - datetime.strptime(create_date[:10], date_format)
        return (age.days // 365) < 1

    if create_date == "":
        # A missing update date (NaN/None) used to reach the slice below and
        # raise TypeError; treat it like an empty string instead.
        if pd.isna(update_date) or update_date == "":
            return None
        age = reference - datetime.strptime(update_date[:10], date_format)
        # A recent update says nothing about the creation date, whereas an
        # update at least a year old proves the domain is at least that old.
        return None if age.days < 365 else False

    if create_date == "expired":
        return True

    return None

In [5]:
def binary_to_numeric(value):
    """Map a truthy/falsy value to ``1``/``0``, keeping missing values as ``None``.

    The previous implementation could never reach its ``return None`` branch,
    so ``None`` was reported as ``0`` and NaN (which is truthy) as ``1``.

    Parameters
    ----------
    value : object
        Value to convert. ``None`` and scalar NaN/NA values count as missing.

    Returns
    -------
    int or None
        ``1`` for truthy input, ``0`` for falsy input, ``None`` when missing.
    """
    # is_scalar guards pd.isna, which returns an array for list-like input.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return None
    return 1 if value else 0

In [6]:
def preprocess_data(data, features):
    """Select the model features and turn them into a numeric frame.

    Steps:
    1. Copy the ``features`` columns out of ``data`` (``data`` is not modified).
    2. Derive ``new_domain`` from ``creation_date`` / ``updated_date`` with
       ``domain_age_lessThanOne`` and drop the two date columns.
    3. Cast every boolean flag column to float (True -> 1.0, False -> 0.0,
       missing -> NaN).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw dataset.
    features : list of str
        Columns to keep. Must include ``creation_date``, ``updated_date`` and
        every flag column listed in ``binary_columns`` below.

    Returns
    -------
    pandas.DataFrame
        The selected columns without the date columns, plus ``new_domain``
        appended as the last column.
    """
    preprocessed_data = data[features].copy()

    # Pair the two date columns positionally. The earlier iterrows() loop
    # wrote back with .loc[index], which overwrites every row sharing a label
    # when the index has duplicates, and was slow on top of that.
    preprocessed_data['new_domain'] = [
        domain_age_lessThanOne(created, updated)
        for created, updated in zip(preprocessed_data['creation_date'],
                                    preprocessed_data['updated_date'])
    ]
    preprocessed_data = preprocessed_data.drop(['creation_date', 'updated_date'], axis=1)

    # Transform binary values to numerical. astype(float) alone already maps
    # True/False/None to 1.0/0.0/NaN, so no further replace step is needed.
    binary_columns = [
        'control_over_dns',
        'control_over_ssl',
        'domain_indexed',
        'is_archived',
        'known_hosting',
        'new_domain',
        'is_on_root',
        'is_subdomain',
    ]
    for column in binary_columns:
        preprocessed_data[column] = preprocessed_data[column].astype(float)

    return preprocessed_data

In [7]:
def count_missing_values(data, file_name):
    """Write the number of null entries in each column of ``data`` to ``file_name`` as CSV."""
    data.isnull().sum().to_csv(file_name)

In [8]:
# Scale numerical features with max-abs scaling (MaxAbsScaler)
def scale_numerical_features(data, num_features):
    """Rescale the ``num_features`` columns of ``data`` with a freshly fitted ``MaxAbsScaler``.

    The scaler is fitted on the same columns it transforms. ``data`` is
    modified in place and the same object is returned.
    """
    max_abs_scaler = MaxAbsScaler()
    scaled_values = max_abs_scaler.fit_transform(data[num_features])
    data[num_features] = scaled_values
    return data

In [9]:
def handle_missing_values(data, num_features):
    """Impute missing values column by column with grid-searched random forests.

    Columns containing nulls are processed from fewest to most missing values.
    For each one, a ``GridSearchCV`` (5-fold) is fitted on the rows of ``data``
    that currently have no nulls at all, using a ``RandomForestRegressor``
    (scored by negative MSE) when the column is in ``num_features`` and a
    ``RandomForestClassifier`` (scored by accuracy) otherwise. The best
    estimator's predictions fill the column's null cells.

    ``data`` is modified in place and returned. The best hyper-parameters per
    column are printed and dumped to ``path_prefix + 'best_params_dict.pkl'``
    (``path_prefix`` is a module-level variable).
    """
    null_counts = data.isnull().sum()
    features_with_missing_values = sorted(
        data.columns[data.isnull().any()].tolist(),
        key=lambda column: null_counts[column],
    )

    # Same search space for every imputed column.
    param_grid = {
        "n_estimators": [100, 200, 500],
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 5, 10]
    }

    best_params_dict = {}  # best hyper-parameters keyed by column name

    for feature in features_with_missing_values:
        # Recomputed every iteration: earlier imputations make more rows complete.
        complete_rows = data.dropna()

        missing_mask = data[feature].isnull()
        # Predictors of the rows whose [feature] value has to be predicted.
        rows_to_impute = data[missing_mask].drop(feature, axis=1)

        X_train = complete_rows.drop(feature, axis=1)
        y_train = complete_rows[feature]

        if feature in num_features:
            estimator, scoring = RandomForestRegressor(), 'neg_mean_squared_error'
        else:
            estimator, scoring = RandomForestClassifier(), 'accuracy'

        grid_search = GridSearchCV(estimator, param_grid, cv=5, scoring=scoring)
        grid_search.fit(X_train, y_train)

        best_params_dict[feature] = grid_search.best_params_
        # Print the best hyperparameters for imputing missing values
        print("Best Parameters:", grid_search.best_params_)

        data.loc[missing_mask, feature] = grid_search.best_estimator_.predict(rows_to_impute)

        # Mean cross-validated score of the winning parameter combination
        print(f"Feature: {feature}, Best Score: {grid_search.best_score_}")

    # Save the best parameters dictionary to a file
    joblib.dump(best_params_dict, path_prefix + 'best_params_dict.pkl')

    return data

In [10]:
import pickle
from sklearn.base import clone

def handle_missing_values_load_bestparams(data, num_features):
    """Impute missing values with random forests using previously saved hyper-parameters.

    Works like ``handle_missing_values`` but skips the grid search: the
    per-column parameters are read from ``path_prefix + 'best_params_dict.pkl'``
    (``path_prefix`` is a module-level variable). Columns absent from that
    dictionary are imputed with the estimator's default parameters.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with missing values; modified in place.
    num_features : list of str
        Columns imputed with a regressor; all others use a classifier.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with its null cells filled.
    """
    features_with_missing_values = data.columns[data.isnull().any()].tolist()
    features_with_missing_values.sort(key=lambda x: data[x].isnull().sum())

    # The file is written with joblib.dump, so read it back with joblib.load
    # rather than the raw pickle module.
    # NOTE: like pickle, joblib.load can execute arbitrary code - only load
    # parameter files produced by this notebook.
    best_params_dict = joblib.load(path_prefix + 'best_params_dict.pkl')

    for feature in features_with_missing_values:
        complete_data = data.dropna()  # Rows without any missing value

        missing_mask = data[feature].isnull()
        data_pred = data[missing_mask].drop(feature, axis=1)

        X_train = complete_data.drop(feature, axis=1)
        y_train = complete_data[feature]

        if feature in num_features:
            RF_model = RandomForestRegressor()
        else:
            RF_model = RandomForestClassifier()

        # An empty dict leaves the estimator on its defaults, which is what the
        # former clone()/get_params() round trip amounted to.
        RF_model.set_params(**best_params_dict.get(feature, {}))

        RF_model.fit(X_train, y_train)
        imputed_values = RF_model.predict(data_pred)
        data.loc[missing_mask, feature] = imputed_values.astype(float)

        # NOTE: this is the score on the training rows themselves (accuracy for
        # classifiers, R^2 for regressors), not a cross-validated score.
        best_scoring = RF_model.score(X_train, y_train)
        print(f"Feature: {feature}, Best Score: {best_scoring}")

    return data

In [35]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

def perform_classification(data, labels, sample_ids):
    """Evaluate a grid-searched random forest with nested stratified cross-validation.

    For each of five outer folds, a ``GridSearchCV`` over
    ``RandomForestClassifier`` (with its own five-fold inner split) is fitted
    on the outer-training rows and used to predict the outer-test rows. The
    confusion matrix and per-class precision/recall/F1 of every fold are
    printed.

    Files written under the module-level ``path_prefix``:
    * ``random_forest_predictions_fold<k>.csv`` - predictions of outer fold k
    * ``random_forest_predictions_all.csv``     - predictions of all folds
    * ``random_forest_model.pkl``               - the fitted search object
    * ``random_forest_model_params.txt``        - its ``get_params()`` dump

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix; copied, then indexed positionally.
    labels : pandas.Series
        Class label of every row of ``data``.
    sample_ids : pandas.Series
        Identifier of every row of ``data``, stored next to the predictions.

    Returns
    -------
    None
    """
    # Use only the arguments; the previous version also built an unused
    # mapping from the global ``target`` instead of ``labels``.
    X = data.copy()
    y = labels

    param_grid = {
                "n_estimators": [100, 200, 500],
                "max_depth": [None, 5, 10],
                "min_samples_split": [2, 5, 10]
    }
    model_to_tune = RandomForestClassifier()

    # Declare the inner and outer cross-validation strategies
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Inner cross-validation for parameter search
    model = GridSearchCV(estimator=model_to_tune, param_grid=param_grid, cv=inner_cv)

    # Outer cross-validation to compute the testing score
    outer_confusion_matrices = []
    outer_precision_recall_f1 = []
    y_true_list = []
    y_pred_list = []
    sample_id_list = []

    for i, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
        X_outer_train, X_outer_test = X.iloc[outer_train_index], X.iloc[outer_test_index]
        y_outer_train, y_outer_test = y.iloc[outer_train_index], y.iloc[outer_test_index]
        sample_ids_outer_test = sample_ids.iloc[outer_test_index]

        # Refits the whole grid search on this fold's training rows.
        model.fit(X_outer_train, y_outer_train)
        y_pred = model.predict(X_outer_test)

        confusion_matrix_values = confusion_matrix(y_outer_test, y_pred)
        outer_confusion_matrices.append(confusion_matrix_values)

        precision, recall, f1, _ = precision_recall_fscore_support(y_outer_test, y_pred, average=None)
        outer_precision_recall_f1.append((precision, recall, f1))

        y_true_list.extend(y_outer_test)
        y_pred_list.extend(y_pred)
        sample_id_list.extend(sample_ids_outer_test)

        print(f"Outer Fold {i+1} Confusion Matrix:\n{outer_confusion_matrices[-1]}")
        # Print the precision, recall, and F1-score for each class
        for j, (p, r, f) in enumerate(zip(precision, recall, f1)):
            print(f"Outer Fold {i+1} Class {j} Precision: {p:.3f}, Recall: {r:.3f}, F1-score: {f:.3f}")

        # Save the predicted values and sample IDs for the current outer fold
        fold_data = pd.DataFrame({'sample_id': sample_ids_outer_test, 'actual': y_outer_test, 'predicted': y_pred})
        fold_data.to_csv(path_prefix + f'random_forest_predictions_fold{i+1}.csv', index=False)

    # Print the best hyperparameters for classification.
    # NOTE: ``model`` holds the search fitted on the LAST outer fold only.
    print("Best Parameters:", model.best_params_)

    # Save the model (the search object from the last outer fold)
    joblib.dump(model, path_prefix + 'random_forest_model.pkl')

    # Save the model parameters
    model_params = model.get_params()
    with open(path_prefix + 'random_forest_model_params.txt', 'w') as f:
        for param, value in model_params.items():
            f.write(f"{param}: {value}\n")

    # Save the predicted values for all folds. The sample ids were collected
    # but never written before; include them so rows can be traced back, as in
    # the per-fold files.
    fold_data_all = pd.DataFrame({
        'sample_id': sample_id_list,
        'actual': y_true_list,
        'predicted': y_pred_list,
    })
    fold_data_all.to_csv(path_prefix + 'random_forest_predictions_all.csv', index=False)


In [36]:
# Columns taken from the raw dataset as model inputs
selected_features = [
    # date columns, replaced by the derived `new_domain` flag during preprocessing
    'creation_date', 'updated_date',
    # flag columns, cast to float during preprocessing
    'control_over_ssl', 'control_over_dns',
    'domain_indexed', 'known_hosting', 'is_archived',
    'is_on_root', 'is_subdomain',
    # columns listed in `numerical_features`
    'between_archives_distance', 'phish_archives_distance',
]

In [37]:
# Columns that get scaled, and that are imputed with a regressor rather than a classifier
numerical_features = [
    'between_archives_distance',
    'phish_archives_distance',
]

In [45]:
# Load the validated dataset, then pull out the label and identifier columns
path_prefix = '../data/'
df = pd.read_csv(path_prefix + 'validated_dataset_for_classification.csv')
target, ids = df['verified_category'], df['id']

In [46]:
# Keep the selected columns and convert them to numeric form
transformed_data = preprocess_data(data=df, features=selected_features)

In [47]:
# Rescale the numerical columns (this also modifies `transformed_data` in place)
scaled_data = scale_numerical_features(
    data=transformed_data,
    num_features=numerical_features,
)

In [48]:
# count_missing_values(transformed_data, path_prefix + 'missing_value_count.csv')

In [49]:
# Impute the remaining missing values with random-forest models
handled_missed = handle_missing_values(
    data=scaled_data,
    num_features=numerical_features,
)

Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Feature: control_over_ssl, Best Score: 0.8718886965014331
Best Parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100}
Feature: new_domain, Best Score: 0.9798148148148149


In [50]:
import numpy as np

# handled_missed.to_csv(path_prefix + 'handled_missed.csv', index=True)
# Distinct labels and how many samples carry each of them
class_distribution = np.unique(target, return_counts=True)
print(class_distribution)

(array(['attackers_domain', 'compromised_domain', 'shared_domain'],
      dtype=object), array([1376,  106, 3954]))


In [51]:
# Nested cross-validation of the random-forest classifier on the imputed data
perform_classification(data=handled_missed, labels=target, sample_ids=ids)

Outer Fold 1 Confusion Matrix:
[[274   1   1]
 [  3  14   4]
 [  4   3 784]]
Outer Fold 1 Class 0 Precision: 0.975, Recall: 0.993, F1-score: 0.984
Outer Fold 1 Class 1 Precision: 0.778, Recall: 0.667, F1-score: 0.718
Outer Fold 1 Class 2 Precision: 0.994, Recall: 0.991, F1-score: 0.992
Outer Fold 2 Confusion Matrix:
[[272   2   1]
 [  3  14   5]
 [  1   3 786]]
Outer Fold 2 Class 0 Precision: 0.986, Recall: 0.989, F1-score: 0.987
Outer Fold 2 Class 1 Precision: 0.737, Recall: 0.636, F1-score: 0.683
Outer Fold 2 Class 2 Precision: 0.992, Recall: 0.995, F1-score: 0.994
Outer Fold 3 Confusion Matrix:
[[273   0   2]
 [  2  17   2]
 [  0   1 790]]
Outer Fold 3 Class 0 Precision: 0.993, Recall: 0.993, F1-score: 0.993
Outer Fold 3 Class 1 Precision: 0.944, Recall: 0.810, F1-score: 0.872
Outer Fold 3 Class 2 Precision: 0.995, Recall: 0.999, F1-score: 0.997
Outer Fold 4 Confusion Matrix:
[[271   2   2]
 [  2  19   0]
 [  2   4 785]]
Outer Fold 4 Class 0 Precision: 0.985, Recall: 0.985, F1-score