In [1]:
import csv
import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler

In [3]:
def create_dataframe(data):
    """Build a pandas DataFrame from ``data``.

    Args:
        data: Any object accepted by the ``pd.DataFrame`` constructor.

    Returns:
        The newly constructed ``pd.DataFrame``.
    """
    return pd.DataFrame(data)

In [4]:
def domain_age_lessThanOne(create_date, update_date, reference_date="2024-04-23"):
    """Tell whether a domain was created less than one year before a reference date.

    Args:
        create_date: Creation date as a string starting with ``YYYY-MM-DD``,
            the literal ``"expired"``, an empty string, or a missing value
            (``None``/NaN).
        update_date: Last-update date as a string starting with ``YYYY-MM-DD``,
            an empty string, or a missing value. It is only consulted when
            ``create_date`` is the empty string.
        reference_date: ``YYYY-MM-DD`` date the age is measured against.
            Defaults to the date that used to be hard-coded in the function.

    Returns:
        ``True`` when the creation date is less than 365 days before the
        reference date or when ``create_date`` is ``"expired"``; ``False``
        when the domain is known to be at least 365 days old; ``None`` when
        the age cannot be determined.

    Raises:
        ValueError: If a date string does not start with a valid ``YYYY-MM-DD``.
    """
    date_format = '%Y-%m-%d'
    reference = datetime.strptime(reference_date, date_format)

    def _age_in_days(date_text):
        # Only the first 10 characters are parsed, so a trailing time part is ignored.
        return (reference - datetime.strptime(date_text[:10], date_format)).days

    if pd.isna(create_date):
        return None
    if create_date == "expired":
        return True
    if create_date != "":
        return _age_in_days(create_date) < 365

    # From here on create_date is the empty string: fall back to the update date.
    # NOTE(review): pd.read_csv turns empty cells into NaN by default, so frames
    # loaded that way never reach this fallback - confirm whether NaN creation
    # dates should use it as well.
    if update_date == "" or pd.isna(update_date):
        # A missing update date used to crash on `update_date[:10]`.
        return None
    if _age_in_days(update_date) >= 365:
        # Updated more than a year ago, so the domain itself must be older than that.
        return False
    # A recent update says nothing about when the domain was created.
    return None

In [5]:
def binary_to_numeric(value):
    """Map a truthy/falsy value to ``1``/``0`` while keeping missing values missing.

    Args:
        value: The value to convert. ``None``, ``pd.NA`` and float NaN are
            treated as missing.

    Returns:
        ``None`` for a missing value, otherwise ``1`` if ``value`` is truthy
        and ``0`` if it is falsy.
    """
    # Missing values must be detected before the truthiness test: `if value` /
    # `if not value` cover every input, so a trailing `else` can never run
    # (None would become 0 and NaN would become 1).
    is_missing = (
        value is None
        or value is pd.NA
        or (isinstance(value, float) and value != value)  # NaN is never equal to itself
    )
    if is_missing:
        return None
    return 1 if value else 0

In [6]:
def preprocess_data(data, features):
    """Select the model features and turn them into numeric columns.

    A ``new_domain`` flag is derived from ``creation_date`` / ``updated_date``
    with ``domain_age_lessThanOne``, the two date columns are dropped, and
    every boolean column is cast to float (True -> 1.0, False -> 0.0,
    missing -> NaN).

    Args:
        data: Source DataFrame; it is not modified.
        features: Column names to keep. Must include ``creation_date``,
            ``updated_date`` and every boolean column listed below.

    Returns:
        A new DataFrame holding the selected columns (minus the two date
        columns) followed by ``new_domain``.

    Raises:
        KeyError: If a required column is missing from ``data`` or ``features``.
    """
    binary_columns = [
        'control_over_dns',
        'control_over_ssl',
        'domain_indexed',
        'is_archived',
        'known_hosting',
        'new_domain',
        'is_on_root',
        'is_subdomain',
    ]

    preprocessed_data = data[features].copy()

    # Build the flag positionally in one pass. Writing row by row through
    # `.loc[index, ...]` is slow and, when index labels repeat, overwrites
    # every row that shares the label.
    preprocessed_data['new_domain'] = [
        domain_age_lessThanOne(created, updated)
        for created, updated in zip(preprocessed_data['creation_date'],
                                    preprocessed_data['updated_date'])
    ]
    preprocessed_data = preprocessed_data.drop(columns=['creation_date', 'updated_date'])

    # astype(float) already yields 1.0 / 0.0 / NaN, so no further value
    # replacement is needed afterwards.
    for column in binary_columns:
        preprocessed_data[column] = preprocessed_data[column].astype(float)

    return preprocessed_data

In [7]:
def count_missing_values(data, file_name):
    """Write the number of missing values in each column of ``data`` to a CSV file.

    Args:
        data: DataFrame to inspect.
        file_name: Destination passed to ``Series.to_csv``.
    """
    data.isna().sum().to_csv(file_name)

In [8]:
def scale_numerical_features(data, num_features):
    """Rescale the given columns with ``MaxAbsScaler`` (max-abs scaling, not standardization).

    A new scaler is fitted on ``data`` itself on every call, and the scaled
    values are written back into ``data`` in place.

    Args:
        data: DataFrame containing the columns to scale; it is modified in place.
        num_features: Names of the numerical columns to scale.

    Returns:
        The same ``data`` object, with ``num_features`` rescaled.
    """
    max_abs_scaled = MaxAbsScaler().fit_transform(data[num_features])
    data[num_features] = max_abs_scaled
    return data

In [9]:
def handle_missing_values(data, num_features, random_state=None, params_path=None):
    """Impute missing values column by column with grid-searched random forests.

    Columns containing nulls are processed from the fewest to the most missing
    values. For each one a random forest (regressor when the column is in
    ``num_features``, classifier otherwise) is tuned with a 5-fold
    ``GridSearchCV`` on the rows that currently have no missing value at all,
    and the best estimator predicts the missing entries. Already imputed
    columns therefore enlarge the training set of the following ones.

    Args:
        data: DataFrame to impute; it is modified in place.
        num_features: Names of the columns to treat as numerical (regression).
        random_state: Optional seed forwarded to the forests so a run can be
            reproduced. ``None`` keeps the unseeded behaviour.
        params_path: Where the per-feature best parameters are saved with
            ``joblib.dump``. Defaults to
            ``path_prefix + 'best_params_dict.pkl'``, where ``path_prefix`` is
            the notebook-level global.

    Returns:
        The same ``data`` object with the missing values filled in.

    Raises:
        ValueError: If no row is free of missing values, so there is nothing
            to train on.
    """
    if params_path is None:
        # Resolve the output path up front so a missing `path_prefix` global
        # fails before the expensive grid search instead of after it.
        params_path = path_prefix + 'best_params_dict.pkl'

    param_grid = {
        "n_estimators": [100, 200, 500],
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 5, 10]
    }

    # Fewest missing values first (stable sort, ties keep their column order).
    features_with_missing_values = sorted(
        data.columns[data.isnull().any()],
        key=lambda column: data[column].isnull().sum(),
    )

    best_params_dict = {}  # Best parameters found for each imputed feature

    for feature in features_with_missing_values:
        missing_mask = data[feature].isnull()

        # Recomputed on every iteration: rows completed by earlier imputations count.
        complete_data = data.dropna()
        if complete_data.empty:
            raise ValueError(
                f"Cannot impute '{feature}': no row without missing values to train on"
            )

        X_train = complete_data.drop(columns=feature)
        y_train = complete_data[feature]

        # NOTE(review): these rows may still hold nulls in columns that have not
        # been imputed yet; whether predict() accepts them depends on the
        # installed scikit-learn version - confirm.
        rows_to_impute = data.loc[missing_mask].drop(columns=feature)

        if feature in num_features:
            estimator = RandomForestRegressor(random_state=random_state)
            scoring = 'neg_mean_squared_error'
        else:
            estimator = RandomForestClassifier(random_state=random_state)
            scoring = 'accuracy'

        grid_search = GridSearchCV(estimator, param_grid, cv=5, scoring=scoring)
        grid_search.fit(X_train, y_train)

        best_params_dict[feature] = grid_search.best_params_
        print("Best Parameters:", grid_search.best_params_)

        data.loc[missing_mask, feature] = grid_search.best_estimator_.predict(rows_to_impute)

        # Cross-validated score of the best candidate: negative MSE for
        # numerical features, accuracy otherwise.
        best_scoring = grid_search.best_score_
        print(f"Feature: {feature}, Best Score: {best_scoring}")

    # Persist the tuned parameters so later runs can skip the grid search.
    joblib.dump(best_params_dict, params_path)

    return data

In [10]:
import pickle
from sklearn.base import clone

def handle_missing_values_load_bestparams(data, num_features, random_state=None, params_path=None):
    """Impute missing values with random forests using previously saved parameters.

    Same column-by-column procedure as ``handle_missing_values`` but without
    the grid search: the hyper-parameters of each forest are read from the
    saved parameter dictionary, and features absent from it use the estimator
    defaults.

    Args:
        data: DataFrame to impute; it is modified in place.
        num_features: Names of the columns to treat as numerical (regression).
        random_state: Optional seed forwarded to the forests so a run can be
            reproduced. ``None`` keeps the unseeded behaviour.
        params_path: File holding the per-feature parameter dictionary.
            Defaults to ``path_prefix + 'best_params_dict.pkl'``, where
            ``path_prefix`` is the notebook-level global.

    Returns:
        The same ``data`` object with the missing values filled in.

    Raises:
        ValueError: If no row is free of missing values, so there is nothing
            to train on.
    """
    if params_path is None:
        params_path = path_prefix + 'best_params_dict.pkl'

    # The file is written with joblib.dump, so read it back with joblib.load
    # rather than a raw pickle.load.
    # Security: joblib files are pickle-based - only load files you trust.
    best_params_dict = joblib.load(params_path)

    # Fewest missing values first (stable sort, ties keep their column order).
    features_with_missing_values = sorted(
        data.columns[data.isnull().any()],
        key=lambda column: data[column].isnull().sum(),
    )

    for feature in features_with_missing_values:
        missing_mask = data[feature].isnull()

        # Recomputed on every iteration: rows completed by earlier imputations count.
        complete_data = data.dropna()
        if complete_data.empty:
            raise ValueError(
                f"Cannot impute '{feature}': no row without missing values to train on"
            )

        X_train = complete_data.drop(columns=feature)
        y_train = complete_data[feature]
        rows_to_impute = data.loc[missing_mask].drop(columns=feature)

        if feature in num_features:
            RF_model = RandomForestRegressor(random_state=random_state)
        else:
            RF_model = RandomForestClassifier(random_state=random_state)

        # An empty dict leaves the estimator on its default parameters.
        RF_model.set_params(**best_params_dict.get(feature, {}))

        RF_model.fit(X_train, y_train)
        data.loc[missing_mask, feature] = RF_model.predict(rows_to_impute).astype(float)

        # In-sample score on the training rows (accuracy for the classifier,
        # R^2 for the regressor); it is not a cross-validated estimate.
        best_scoring = RF_model.score(X_train, y_train)
        print(f"Feature: {feature}, Best Score: {best_scoring}")

    return data

In [27]:
def perform_classification(data, model, sample_ids):
    """Predict a label for every row of ``data`` and save the result as CSV.

    The predictions are written, next to their sample ids, to
    ``random_forest_predictions_whole_dataset.csv`` under the notebook-level
    ``path_prefix``.

    Args:
        data: Feature DataFrame handed to ``model.predict``.
        model: Fitted estimator exposing ``predict``.
        sample_ids: Identifiers stored in the ``sample_id`` column of the output.

    Returns:
        The ``model`` that was passed in.
    """
    # The model is given a copy of the features, never the caller's frame.
    predicted_labels = model.predict(data.copy())

    predictions_table = pd.DataFrame({'sample_id': sample_ids, 'predicted': predicted_labels})
    predictions_table.to_csv(path_prefix + 'random_forest_predictions_whole_dataset.csv', index=False)
    return model


In [28]:
# Columns taken from the raw dataset, in the order handed to preprocess_data
selected_features = (
    # Date columns: used to derive `new_domain`, then dropped by preprocess_data
    ['creation_date', 'updated_date']
    # Boolean flags: cast to float by preprocess_data
    + ['control_over_ssl', 'control_over_dns', 'domain_indexed', 'known_hosting',
       'is_archived', 'is_on_root', 'is_subdomain']
    # Numerical columns (also listed in `numerical_features`)
    + ['between_archives_distance', 'phish_archives_distance']
)

In [29]:
# Columns that are scaled and, when incomplete, imputed by regression
numerical_features = [
    'between_archives_distance',
    'phish_archives_distance',
]

In [30]:
# Directory prefix for the input dataset and every file the notebook reads or writes
path_prefix = '../data/'

# Dataset to classify; the `id` column is kept aside to label the predictions
df = pd.read_csv(f'{path_prefix}entire_dataset_for_classification.csv')
ids = df['id']

# Model used for prediction (joblib files are pickle-based: load only trusted files)
learnt_model = joblib.load(f'{path_prefix}random_forest_model.pkl')

In [31]:
# Keep the selected columns, derive `new_domain` and cast the boolean flags to float
transformed_data = preprocess_data(data=df, features=selected_features)

In [32]:
# Max-abs scale the numerical columns (this also modifies `transformed_data` in place)
scaled_data = scale_numerical_features(data=transformed_data, num_features=numerical_features)

In [33]:
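# Optional diagnostic, disabled by default: uncomment to write the per-column missing-value counts to CSV.
# count_missing_values(transformed_data, path_prefix + 'missing_value_count.csv')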

In [34]:
# Impute the missing values with random forests tuned by grid search
handled_missed = handle_missing_values(data=scaled_data, num_features=numerical_features)

Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 500}
Feature: control_over_ssl, Best Score: 0.9134868187781441
Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Feature: new_domain, Best Score: 0.9568822947795204


In [35]:
# Classify the imputed dataset and save the predictions
perform_classification(data=handled_missed, model=learnt_model, sample_ids=ids)