In [2]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


from tqdm import tqdm

# %% Ensure folder existence
def ensure_folder_exists(folder_path):
    os.makedirs(folder_path, exist_ok=True)

folder_name = '../data/preprocessed_phishing'

ensure_folder_exists(folder_name)


In [3]:
# Load dataset
data_path = '../data/raw_data/phishing/out.csv'  # adjust the path as necessary
df = pd.read_csv(data_path)
df.head()

# drop null values
df_clean = df.dropna()

df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1749311 entries, 0 to 2499998
Data columns (total 18 columns):
 #   Column              Dtype  
---  ------              -----  
 0   url                 object 
 1   source              object 
 2   label               object 
 3   url_length          int64  
 4   starts_with_ip      bool   
 5   url_entropy         float64
 6   has_punycode        bool   
 7   digit_letter_ratio  float64
 8   dot_count           int64  
 9   at_count            int64  
 10  dash_count          int64  
 11  tld_count           int64  
 12  domain_has_digits   bool   
 13  subdomain_count     int64  
 14  nan_char_entropy    float64
 15  has_internal_links  bool   
 16  whois_data          object 
 17  domain_age_days     float64
dtypes: bool(4), float64(4), int64(6), object(4)
memory usage: 206.9+ MB


In [4]:
print(df_clean['label'].value_counts())

# Using a subset
target_size = 50000
target_size1 = 5000 #for hierarchical clustering

legitimate_sample = df_clean[df_clean['label'] == 'legitimate'].sample(n=target_size, random_state=42)
phishing_sample = df_clean[df_clean['label'] == 'phishing'].sample(n=target_size, random_state=42)
legitimate_sample1 = df_clean[df_clean['label'] == 'legitimate'].sample(n=target_size1, random_state=42)
phishing_sample1 = df_clean[df_clean['label'] == 'phishing'].sample(n=target_size1, random_state=42)


# Combining the samples
balanced_df = pd.concat([legitimate_sample, phishing_sample])
small_balanced_df = pd.concat([legitimate_sample1, phishing_sample1])

# Shuffling
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
small_balanced_df = small_balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(balanced_df['label'].value_counts())
print(small_balanced_df['label'].value_counts())


label
legitimate    1096403
phishing       652908
Name: count, dtype: int64
label
phishing      50000
legitimate    50000
Name: count, dtype: int64
label
phishing      5000
legitimate    5000
Name: count, dtype: int64


In [5]:
# Selecting relevant columns
numerical_features = ['url_length', 'url_entropy', 'digit_letter_ratio',
                     'dot_count', 'at_count', 'dash_count', 'tld_count',
                     'subdomain_count', 'nan_char_entropy', 'domain_age_days']
# Add boolean features to the list of numerical features
boolean_features = ['starts_with_ip', 'has_punycode', 'domain_has_digits', 'has_internal_links']
categorical_features = ['url', 'source', 'whois_data']

# Combine all features
all_features = numerical_features + boolean_features + categorical_features
target_features = ['label']

In [6]:
# Save the balanced_df subset to a new CSV file
file_path = os.path.join(folder_name, 'subset.csv')
balanced_df.to_csv(file_path, index=False)

file_path1 = os.path.join(folder_name, 'smaller_subset.csv')
small_balanced_df.to_csv(file_path1, index=False)

**Below we divide the data into test and train sets for supervisedd learning pre-processings**
- Clustering does not require this because it is an unsupervised learning where labels arent required.

**Scaling required for**
- Clustering
- Perceptron
- K-Nearest Neighbors	
- Multi-Layer Perceptron

**PCA required for**
- Clustering

In [10]:
# %% Train-Test Split
# Split the balanced dataset into train (80%) and test (20%)
train_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42, stratify=balanced_df['label'])

# Save train and test datasets
train_df.to_csv(os.path.join(folder_name, 'train.csv'), index=False)
test_df.to_csv(os.path.join(folder_name, 'test.csv'), index=False)

# Print class distributions
print(f"Training set distribution:\n{train_df['label'].value_counts()}")
print(f"Testing set distribution:\n{test_df['label'].value_counts()}")

#for small
train_df1, test_df1 = train_test_split(small_balanced_df, test_size=0.2, random_state=42, stratify=small_balanced_df['label'])
train_df1.to_csv(os.path.join(folder_name, 'train_small.csv'), index=False)
test_df1.to_csv(os.path.join(folder_name, 'test_small.csv'), index=False)
# print(f"Training set distribution:\n{train_df1['label'].value_counts()}")
# print(f"Testing set distribution:\n{test_df1['label'].value_counts()}")

Training set distribution:
label
legitimate    40000
phishing      40000
Name: count, dtype: int64
Testing set distribution:
label
phishing      10000
legitimate    10000
Name: count, dtype: int64


In [None]:

# Creating 3 dataframes (full data, train, test)
# all_df = balanced_df[numerical_features + target_features]
all_df = balanced_df[all_features + target_features].copy()
train_df = train_df[all_features + target_features].copy()
test_df = test_df[all_features + target_features].copy()

# for small
all_df1 = small_balanced_df[all_features + target_features].copy()
train_df1 = train_df1[all_features + target_features].copy()
test_df1 = test_df1[all_features + target_features].copy()

In [12]:
# Apply Label Encoding for simplicity in this example
for cat_feature in categorical_features:
    label_encoder = LabelEncoder()
    all_df[cat_feature] = label_encoder.fit_transform(all_df[cat_feature])
    train_df[cat_feature] = label_encoder.transform(train_df[cat_feature])
    test_df[cat_feature] = label_encoder.transform(test_df[cat_feature])

    # For small subset
    all_df1[cat_feature] = label_encoder.transform(all_df1[cat_feature])
    train_df1[cat_feature] = label_encoder.transform(train_df1[cat_feature])
    test_df1[cat_feature] = label_encoder.transform(test_df1[cat_feature])

# Encode target labels
all_df['label_encoded'] = label_encoder.fit_transform(all_df['label'])
train_df['label_encoded'] = label_encoder.transform(train_df['label'])
test_df['label_encoded'] = label_encoder.transform(test_df['label'])

# For small subset
all_df1['label_encoded'] = label_encoder.transform(all_df1['label'])
train_df1['label_encoded'] = label_encoder.transform(train_df1['label'])
test_df1['label_encoded'] = label_encoder.transform(test_df1['label'])

# Clustering

1. Remove highly correlated features.
2. Remove outliers using z-scores.
3. Scale the data.
4. Perform PCA for dimensionality reduction.
5. Visualize the PCA-transformed data.
6. Apply and visualize clustering results.


In [27]:
# Preprocessing for clustering
# clustering_df = all_df.drop(columns=['label', 'label_encoded'] + categorical_features)
clustering_df = all_df.drop(columns=['label', 'label_encoded'] + categorical_features)
clustering_df_with_label = all_df.drop(columns=categorical_features)
# Convert boolean features to numeric if needed
# clustering_df = clustering_df.astype(float)

print(clustering_df.info())

#for small
clustering_df1 = all_df1.drop(columns=['label', 'label_encoded'] + categorical_features)
clustering_df1_with_label = all_df.drop(columns=categorical_features)
# print(clustering_df1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   url_length          100000 non-null  int64  
 1   url_entropy         100000 non-null  float64
 2   digit_letter_ratio  100000 non-null  float64
 3   dot_count           100000 non-null  int64  
 4   at_count            100000 non-null  int64  
 5   dash_count          100000 non-null  int64  
 6   tld_count           100000 non-null  int64  
 7   subdomain_count     100000 non-null  int64  
 8   nan_char_entropy    100000 non-null  float64
 9   domain_age_days     100000 non-null  float64
 10  starts_with_ip      100000 non-null  bool   
 11  has_punycode        100000 non-null  bool   
 12  domain_has_digits   100000 non-null  bool   
 13  has_internal_links  100000 non-null  bool   
dtypes: bool(4), float64(4), int64(6)
memory usage: 8.0 MB
None


In [28]:
# Remove highly correlated features.
correlation_matrix = clustering_df.corr()
upper_tri = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column].abs() > 0.9)]
clustering_df = clustering_df.drop(columns=to_drop)

#for small
correlation_matrix1 = clustering_df1.corr()
upper_tri1 = correlation_matrix1.where(np.triu(np.ones(correlation_matrix1.shape), k=1).astype(bool))
to_drop1 = [column for column in upper_tri1.columns if any(upper_tri1[column].abs() > 0.9)]
clustering_df1 = clustering_df1.drop(columns=to_drop1)


In [20]:
# # Remove outliers using z-scores.
# # Convert boolean features to numeric if needed
# clustering_df = clustering_df.astype(float)
# clustering_df = clustering_df[(np.abs(zscore(clustering_df)) < 3).all(axis=1)]
# print(clustering_df.info())

# #for small
# clustering_df1 = clustering_df1.astype(float)
# clustering_df1 = clustering_df1[(np.abs(zscore(clustering_df1)) < 3).all(axis=1)]
# # print(clustering_df1.info())

In [29]:
# Scale features
scaler = StandardScaler()
clustering_df_scaled = scaler.fit_transform(clustering_df)

#for small
clustering_df_scaled1 = scaler.fit_transform(clustering_df1)

In [30]:
# Perform PCA without specifying the number of components
pca = PCA()
pca_transformed = pca.fit_transform(clustering_df_scaled)

# Calculate cumulative variance explained by each component
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Find the number of components that retain at least 90% variance
n_components_90 = np.argmax(cumulative_variance >= 0.9) + 1
print(f"Number of components to retain 90% variance: {n_components_90}")

# Apply PCA with the required number of components
pca = PCA(n_components=n_components_90)
pca.fit_transform(clustering_df_scaled)

# Verify the explained variance
print(f"Explained variance with {n_components_90} components: {np.cumsum(pca.explained_variance_ratio_)}")

Number of components to retain 90% variance: 10
Explained variance with 10 components: [0.22395213 0.34208606 0.44711621 0.53607682 0.62024986 0.69198608
 0.75518846 0.81315588 0.85979079 0.90341478]


###### Components needed to display atleast 90% data would be 10, but we shall use 2d clustering, hence our PCA shall be with 2 components

In [31]:
pca = PCA(n_components=2)
clustering_ready_df = pca.fit_transform(clustering_df_scaled)

#for small
clustering_ready_df1 = pca.fit_transform(clustering_df_scaled1)

In [32]:
# Final clustering-ready DataFrame
print(f"Shape after preprocessing: {clustering_ready_df.shape}")

#for small
print(f"Shape after preprocessing small: {clustering_ready_df1.shape}")

Shape after preprocessing: (100000, 2)
Shape after preprocessing small: (10000, 2)


In [33]:
folder = folder_name + '/clustering/'
ensure_folder_exists(folder)

# Save the clustering-ready DataFrame with PCA-reduced data
clustering_ready_df = pd.DataFrame(clustering_ready_df, columns=['PCA1', 'PCA2'])  # Ensure column names are present
clustering_ready_df.to_csv(os.path.join(folder,'clustering.csv'), index=False)

#for small
clustering_ready_df1 = pd.DataFrame(clustering_ready_df1, columns=['PCA1', 'PCA2'])  # Ensure column names are present
clustering_ready_df1.to_csv(os.path.join(folder,'clustering_small.csv'), index=False)

# Decision Tree

1. Decision Trees are not sensitive to feature scaling or outliers.
2. Categorical labels must be encoded.
3. No need for dimensionality reduction (like PCA).
4. Handles missing values.


In [7]:

# Step 1: Load the dataset
df_dt = balanced_df.copy()

# Step 2: Drop irrelevant columns (e.g., 'url', 'whois_data')
columns_to_drop = ["url", "whois_data", "source"]  # Adjust as necessary
df_dt = df_dt.drop(columns=columns_to_drop, errors="ignore")

# Step 3: Handle missing values
df_dt = df_dt.dropna()  # Drop rows with missing values

# Step 4: Encode target labels
label_encoder_dt = LabelEncoder()
df_dt["label_encoded"] = label_encoder_dt.fit_transform(df_dt["label"])  # Binary encoding

# Step 5: Convert Boolean columns to numeric
boolean_columns_dt = df_dt.select_dtypes(include=["bool"]).columns
df_dt[boolean_columns_dt] = df_dt[boolean_columns_dt].astype(int)

# Step 6: Separate features and target
X_dt = df_dt.drop(columns=["label", "label_encoded"])  # Features
y_dt = df_dt["label_encoded"]  # Target

# Step 7: Train-test split
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X_dt, y_dt, test_size=0.2, random_state=42)

folder = folder_name + '/decision_tree/'
ensure_folder_exists(folder)

# Step 8: Save preprocessed data
X_train_dt.to_csv(os.path.join(folder, 'dt_X_train.csv'), index=False)
X_test_dt.to_csv(os.path.join(folder, 'dt_X_test.csv'), index=False)
y_train_dt.to_csv(os.path.join(folder, 'dt_y_train.csv'), index=False)
y_test_dt.to_csv(os.path.join(folder, 'dt_y_test.csv'), index=False)

print("Preprocessing completed! Files saved as dt_X_train.csv, dt_X_test.csv, dt_y_train.csv, and dt_y_test.csv.")

Preprocessing completed! Files saved as dt_X_train.csv, dt_X_test.csv, dt_y_train.csv, and dt_y_test.csv.


In [22]:
# Check processed data
print(f"Shape of X_train: {X_train.shape}, Shape of X_test: {X_test.shape}")
print(f"Sample of y_train:\n{y_train.head()}")

folder = folder_name + '/decision_tree/'
ensure_folder_exists(folder)

# Save preprocessed Decision Tree data
X_train.to_csv(os.path.join(folder, 'dt_X_train.csv'), index=False)
X_test.to_csv(os.path.join(folder, 'dt_X_test.csv'), index=False)
y_train.to_csv(os.path.join(folder, 'dt_y_train.csv'), index=False)
y_test.to_csv(os.path.join(folder, 'dt_y_test.csv'), index=False)

Shape of X_train: (80000, 17), Shape of X_test: (20000, 17)
Sample of y_train:
9567     0
93297    0
31947    0
11663    1
33026    0
Name: label_encoded, dtype: int64


# Naive Bayes

1. Handle missing values
2. No strict need for feature scaling, but it can improve numerical stability
3. No need for dimensionality reduction (like PCA).
4. Ensure labels and all features are numerical.

In [92]:
# Training and testing data for Naive Bayes
nb_features = numerical_features  # Already defined earlier
X_train_nb = train_df[nb_features].copy()
y_train_nb = train_df['label_encoded'].copy()

X_test_nb = test_df[nb_features].copy()
y_test_nb = test_df['label_encoded'].copy()

In [93]:
# Scale features (optional, recommended for Gaussian Naive Bayes)
X_train_nb_scaled = pd.DataFrame(scaler.fit_transform(X_train_nb), columns=nb_features)
X_test_nb_scaled = pd.DataFrame(scaler.transform(X_test_nb), columns=nb_features)

# Check processed data
print(f"Shape of X_train_nb_scaled: {X_train_nb_scaled.shape}, Shape of X_test_nb_scaled: {X_test_nb_scaled.shape}")
print(f"Sample of y_train_nb:\n{y_train_nb.head()}")

Shape of X_train_nb_scaled: (80000, 10), Shape of X_test_nb_scaled: (20000, 10)
Sample of y_train_nb:
9567     0
93297    0
31947    0
11663    1
33026    0
Name: label_encoded, dtype: int64


In [None]:
# Save preprocessed Naive Bayes data
folder = folder_name + '/naive_bayes/'
ensure_folder_exists(folder)

X_train_nb_scaled.to_csv(os.path.join(folder, 'nb_X_train.csv'), index=False)
X_test_nb_scaled.to_csv(os.path.join(folder, 'nb_X_test.csv'), index=False)
y_train_nb.to_csv(os.path.join(folder, 'nb_y_train.csv'), index=False)
y_test_nb.to_csv(os.path.join(folder, 'nb_y_test.csv'), index=False)

# Perceptrons & Multi-Layer Perceptron (MLP)
1. Feature Scaling is Mandatory: MLP relies on gradient descent optimization, which performs better when the features are normalized.
2. Label Encoding is Required: The target variable must be numerical. Use LabelEncoder for this purpose.
3. No Specific Need for Dimensionality Reduction: However, if the dataset has very high dimensions, Principal Component Analysis (PCA) can be used for preprocessing.
4. Handles Nonlinear Relationships: MLP can capture complex, nonlinear relationships between features.
5. Sensitive to Hyper-parameters: Key parameters like learning rate, number of layers, and number of neurons need tuning for optimal performance.

In [23]:
print("Unique values in 'label' for subset:")
print(balanced_df['label'].unique())

folder = folder_name + '/perceptron/'
ensure_folder_exists(folder)

# Copy dataset for MLP preprocessing
balanced_df_perceptron = balanced_df.copy()
small_balanced_df_perceptron = small_balanced_df.copy()  # For potential smaller dataset preprocessing

# Drop unnecessary columns  
balanced_df_perceptron = balanced_df_perceptron.drop(columns=categorical_features)
small_balanced_df_perceptron = small_balanced_df_perceptron.drop(columns=categorical_features)

# Encode the target column
label_encoder = LabelEncoder()
label_encoder.fit(['phishing', 'legitimate'])  # Define a consistent order for labels

# Encode labels for each dataset
balanced_df_perceptron['label_encoded'] = label_encoder.transform(balanced_df_perceptron['label'])
small_balanced_df_perceptron['label_encoded'] = label_encoder.transform(small_balanced_df_perceptron['label'])

# Drop the original label column
balanced_df_perceptron = balanced_df_perceptron.drop(columns=['label'])
small_balanced_df_perceptron = small_balanced_df_perceptron.drop(columns=['label'])

# Verify the unique values after encoding
print("Unique values in 'label_encoded' for balanced_df_perceptron:")
print(balanced_df_perceptron['label_encoded'].unique())

print("Unique values in 'label_encoded' for small_balanced_df_perceptron:")
print(small_balanced_df_perceptron['label_encoded'].unique())

# Scale numerical features

# Separate numerical columns, excluding 'label_encoded'
numerical_columns = [col for col in balanced_df_perceptron.select_dtypes(include=['int64', 'float64']).columns if col != 'label_encoded']

scaler = StandardScaler()
balanced_df_perceptron[numerical_columns] = scaler.fit_transform(balanced_df_perceptron[numerical_columns])
small_balanced_df_perceptron[numerical_columns] = scaler.transform(small_balanced_df_perceptron[numerical_columns])

# Verify the 'label_encoded' column before saving
print("Final unique values in 'label_encoded' for balanced_df_perceptron:")
print(balanced_df_perceptron['label_encoded'].unique())
balanced_df_perceptron.info()

print("Final unique values in 'label_encoded' for small_balanced_df_perceptron:")
print(small_balanced_df_perceptron['label_encoded'].unique())

# # Save the processed datasets for later use
# os.makedirs(output_folder, exist_ok=True)
folder = folder_name + '/perceptron/'
ensure_folder_exists(folder)
# Save full dataset
output_file = os.path.join(folder, 'perceptron.csv')
balanced_df_perceptron.to_csv(output_file, index=False)

# Save small dataset
output_file_small = os.path.join(folder, 'small_perceptron.csv')
small_balanced_df_perceptron.to_csv(output_file_small, index=False)

print(f"Preprocessed full data saved to: {output_file}")
print(f"Preprocessed small data saved to: {output_file_small}")

# Reload the saved file to verify its contents
small_balanced_df = pd.read_csv(output_file_small)
print("Unique values in 'label_encoded' after reloading from CSV:")
print(small_balanced_df['label_encoded'].unique())



Unique values in 'label' for subset:
['phishing' 'legitimate']
Unique values in 'label_encoded' for balanced_df_perceptron:
[1 0]
Unique values in 'label_encoded' for small_balanced_df_perceptron:
[1 0]
Final unique values in 'label_encoded' for balanced_df_perceptron:
[1 0]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   url_length          100000 non-null  float64
 1   starts_with_ip      100000 non-null  bool   
 2   url_entropy         100000 non-null  float64
 3   has_punycode        100000 non-null  bool   
 4   digit_letter_ratio  100000 non-null  float64
 5   dot_count           100000 non-null  float64
 6   at_count            100000 non-null  float64
 7   dash_count          100000 non-null  float64
 8   tld_count           100000 non-null  float64
 9   domain_has_digits   100000 non-null  bool   
 10  subdomain

# K Nearest Neighbours
1. Use numerical and boolean features only; drop categorical features.
2. Encode boolean features as 0 and 1.
3. Encode the target labels (phishing and legitimate) using LabelEncoder.
4. Apply standard scaling (StandardScaler) to ensure all features are on a similar scale.
5. Split the dataset into training and testing sets.

In [5]:
# Define output folder for KNN
folder = folder_name + '/knn/'
ensure_folder_exists(folder)

balanced_df_knn = balanced_df.copy()

# Drop categorical features
balanced_df_knn = balanced_df_knn.drop(columns=categorical_features)

# Encode the target column
label_encoder = LabelEncoder()
label_encoder.fit(['phishing', 'legitimate'])  # Define a consistent order for labels
balanced_df_knn['label_encoded'] = label_encoder.transform(balanced_df_knn['label'])

# Drop the original label column
balanced_df_knn = balanced_df_knn.drop(columns=['label'])

# Scale numerical features
numerical_columns = [col for col in balanced_df_knn.columns if col != 'label_encoded']
scaler = StandardScaler()
balanced_df_knn[numerical_columns] = scaler.fit_transform(balanced_df_knn[numerical_columns])

# Train-test split
X_knn = balanced_df_knn.drop(columns=['label_encoded'])  # Features
y_knn = balanced_df_knn['label_encoded']  # Target

X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(
    X_knn, y_knn, test_size=0.2, random_state=42, stratify=y_knn
)

# Save datasets
X_train_knn.to_csv(os.path.join(folder, 'knn_X_train.csv'), index=False)
X_test_knn.to_csv(os.path.join(folder, 'knn_X_test.csv'), index=False)
y_train_knn.to_csv(os.path.join(folder, 'knn_y_train.csv'), index=False)
y_test_knn.to_csv(os.path.join(folder, 'knn_y_test.csv'), index=False)

# Summary
print(f"KNN datasets prepared and saved to: {folder}")
print(f"X_train shape: {X_train_knn.shape}, y_train shape: {y_train_knn.shape}")
print(f"X_test shape: {X_test_knn.shape}, y_test shape: {y_test_knn.shape}")

KNN datasets prepared and saved to: ../data/preprocessed_phishing/knn/
X_train shape: (80000, 14), y_train shape: (80000,)
X_test shape: (20000, 14), y_test shape: (20000,)


# Logistic Regression

1. **Handle Missing Values**:
   - Rows with missing values should be removed or imputed to ensure a clean dataset.

2. **Feature Scaling**:
   - Standardize features to have a mean of 0 and standard deviation of 1. Logistic regression assumes features are on a comparable scale for optimal performance.

3. **Label Encoding**:
   - Convert target labels to numeric values (`0` and `1`).
   - Ensure all categorical and Boolean features are also numeric.

4. **Train-Test Split**:
   - Divide the dataset into training and testing subsets to evaluate the model's performance on unseen data.

5. **No Dimensionality Reduction**:
   - Logistic regression does not strictly require techniques like PCA unless the dataset has a very high number of features.

In [8]:
# Define output folder for KNN
folder = folder_name + '/logistic_regression/'
ensure_folder_exists(folder)

df_lr = balanced_df.copy()


# Step 2: Drop irrelevant columns
columns_to_drop = ["url", "whois_data", "source"]  # Adjust as necessary
df_lr = df_lr.drop(columns=columns_to_drop, errors="ignore")

# Step 3: Handle missing values
df_lr = df_lr.dropna()  # Drop rows with missing values

# Step 4: Encode target labels
label_encoder_lr = LabelEncoder()
df_lr["label_encoded"] = label_encoder_lr.fit_transform(df_lr["label"])  # Binary encoding

# Step 5: Convert Boolean columns to numeric
boolean_columns_lr = df_lr.select_dtypes(include=["bool"]).columns
df_lr[boolean_columns_lr] = df_lr[boolean_columns_lr].astype(int)

# Step 6: Separate features and target
X_lr = df_lr.drop(columns=["label", "label_encoded"])  # Features
y_lr = df_lr["label_encoded"]  # Target

# Step 7: Feature scaling
scaler_lr = StandardScaler()
X_scaled_lr = scaler_lr.fit_transform(X_lr)

# Step 8: Train-test split
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_scaled_lr, y_lr, test_size=0.2, random_state=42)

# Step 9: Save preprocessed data
pd.DataFrame(X_train_lr).to_csv(os.path.join(folder, 'lr_X_train.csv'), index=False)
pd.DataFrame(X_test_lr).to_csv(os.path.join(folder, 'lr_X_test.csv'), index=False)
pd.DataFrame(y_train_lr).to_csv(os.path.join(folder, 'lr_y_train.csv'), index=False)
pd.DataFrame(y_test_lr).to_csv(os.path.join(folder, 'lr_y_test.csv'), index=False)

print("Preprocessing completed! Files saved as lr_X_train.csv, lr_X_test.csv, lr_y_train.csv, and lr_y_test.csv.")

Preprocessing completed! Files saved as lr_X_train.csv, lr_X_test.csv, lr_y_train.csv, and lr_y_test.csv.
