In [4]:
import cudf

# Load the dataset
df = cudf.read_csv('normalized_phishing_dataset.csv')

# Check unique values in the Label column
print(df['Label'].unique())

# Filter rows with valid labels
df = df[df['Label'].isin(['good', 'bad'])]


0    1
1    0
Name: Label, dtype: int64


In [14]:
import cudf

# Load the normalized dataset
df = cudf.read_csv('updated_phishing_dataset.csv')

# Print ranges (min and max values) for each numeric column
for column in df.columns:
    if df[column].dtype in ['float32', 'float64', 'int32', 'int64']:
        print(f"{column}: Min = {df[column].min()}, Max = {df[column].max()}")


Label: Min = 0, Max = 1
url_length: Min = 0.0, Max = 1.0
num_dots: Min = 0.0, Max = 1.0
has_ip: Min = 0.0, Max = 1.0
num_subdomains: Min = 0.0, Max = 1.0
path_length: Min = 0.0, Max = 1.0
https: Min = 0.0, Max = 1.0
special_chars: Min = 0.0, Max = 1.0
domain_length: Min = 0, Max = 151
path_length.1: Min = 0, Max = 2175
num_special_chars: Min = 0, Max = 205
has_https: Min = 0, Max = 1
contains_login: Min = 0, Max = 1
contains_secure: Min = 0, Max = 1


In [13]:
import pandas as pd
from urllib.parse import urlparse
import tldextract

# Load dataset
df = pd.read_csv('normalized_phishing_dataset.csv')

# Function to extract features from a URL
def extract_url_features(url):
    parsed = urlparse(url)
    ext = tldextract.extract(url)
    
    return {
        'domain_length': len(ext.domain),
        'path_length': len(parsed.path),
        'num_special_chars': sum(c in "@-_?=&" for c in url),
        'has_https': int(parsed.scheme == 'https'),
        'contains_login': int('login' in url.lower()),
        'contains_secure': int('secure' in url.lower())
    }

# Apply feature extraction to all rows
url_features = df['URL'].apply(extract_url_features).apply(pd.Series)

# Combine with existing dataset
df = pd.concat([df, url_features], axis=1)

# Save updated dataset
df.to_csv('updated_phishing_dataset.csv', index=False)
print("Updated dataset saved with URL-based features.")
df

Updated dataset saved with URL-based features.


Unnamed: 0,URL,Label,url_length,num_dots,has_ip,num_subdomains,path_length,https,special_chars,tld,domain_length,path_length.1,num_special_chars,has_https,contains_login,contains_secure
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1,0.097138,0.162162,0.0,0.0,0.061609,0.0,0.063725,it,6,134,16,0,1,0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,1,0.034692,0.135135,0.0,0.0,0.037241,0.0,0.024510,com,7,81,6,0,0,0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1,0.076323,0.189189,0.0,0.0,0.081379,0.0,0.004902,com,12,177,1,0,0,1
3,mail.printakid.com/www.online.americanexpress....,1,0.025585,0.162162,0.0,0.0,0.027586,0.0,0.000000,com,9,60,0,0,0,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,1,0.049870,0.027027,0.0,0.0,0.036322,0.0,0.009804,com,15,79,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549341,23.227.196.215/,1,0.006071,0.081081,0.0,0.0,0.006897,0.0,0.000000,,14,15,0,0,0,0
549342,apple-checker.org/,1,0.007372,0.027027,0.0,0.0,0.008276,0.0,0.004902,org,13,18,1,0,0,0
549343,apple-iclods.org/,1,0.006938,0.027027,0.0,0.0,0.007816,0.0,0.004902,org,12,17,1,0,0,0
549344,apple-uptoday.org/,1,0.007372,0.027027,0.0,0.0,0.008276,0.0,0.004902,org,13,18,1,0,0,0


In [15]:
import cudf
from sklearn.preprocessing import MinMaxScaler

# Load the updated dataset
df = cudf.read_csv('updated_phishing_dataset.csv')

# Identify numeric features to normalize (excluding 'Label' and 'URL')
features_to_normalize = ['url_length', 'num_dots', 'has_ip', 'num_subdomains',
                         'path_length', 'https', 'special_chars', 'domain_length',
                         'path_length.1', 'num_special_chars', 'has_https',
                         'contains_login', 'contains_secure']

# Initialize Min-Max Scaler
scaler = MinMaxScaler()

# Normalize selected features
df_normalized = df.copy()
df_normalized[features_to_normalize] = scaler.fit_transform(df[features_to_normalize].to_pandas())

# Save the normalized dataset to a new CSV file
df_normalized.to_csv('normalized_updated_phishing_dataset.csv', index=False)
print("Normalized dataset saved as 'normalized_updated_phishing_dataset.csv'")
df_normalized

Normalized dataset saved as 'normalized_updated_phishing_dataset.csv'


Unnamed: 0,URL,Label,url_length,num_dots,has_ip,num_subdomains,path_length,https,special_chars,tld,domain_length,path_length.1,num_special_chars,has_https,contains_login,contains_secure
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1,0.097138,0.162162,0.0,0.0,0.061609,0.0,0.063725,it,0.039735,0.061609,0.078049,0.0,1.0,0.0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,1,0.034692,0.135135,0.0,0.0,0.037241,0.0,0.024510,com,0.046358,0.037241,0.029268,0.0,0.0,0.0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1,0.076323,0.189189,0.0,0.0,0.081379,0.0,0.004902,com,0.079470,0.081379,0.004878,0.0,0.0,1.0
3,mail.printakid.com/www.online.americanexpress....,1,0.025585,0.162162,0.0,0.0,0.027586,0.0,0.000000,com,0.059603,0.027586,0.000000,0.0,0.0,0.0
4,thewhiskeydregs.com/wp-content/themes/widescre...,1,0.049870,0.027027,0.0,0.0,0.036322,0.0,0.009804,com,0.099338,0.036322,0.009756,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549341,23.227.196.215/,1,0.006071,0.081081,0.0,0.0,0.006897,0.0,0.000000,,0.092715,0.006897,0.000000,0.0,0.0,0.0
549342,apple-checker.org/,1,0.007372,0.027027,0.0,0.0,0.008276,0.0,0.004902,org,0.086093,0.008276,0.004878,0.0,0.0,0.0
549343,apple-iclods.org/,1,0.006938,0.027027,0.0,0.0,0.007816,0.0,0.004902,org,0.079470,0.007816,0.004878,0.0,0.0,0.0
549344,apple-uptoday.org/,1,0.007372,0.027027,0.0,0.0,0.008276,0.0,0.004902,org,0.086093,0.008276,0.004878,0.0,0.0,0.0


In [5]:
import cudf
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.metrics import accuracy_score, confusion_matrix
from cuml.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset using cuDF (GPU DataFrame)
df = cudf.read_csv('normalized_updated_phishing_dataset.csv')

# Select features and target
features = ['url_length', 'num_dots', 'has_ip', 'num_subdomains',
                         'path_length', 'https', 'special_chars', 'domain_length',
                          'num_special_chars', 'has_https',
                         'contains_login', 'contains_secure']
X = df[features]
y = df['Label']  # Label is already numeric (0 or 1)

# Split data into training and testing sets using RAPIDS
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define parameters for cuML Random Forest
cu_rf_params = {
    'n_estimators': 1500,
    'max_depth': 20,
    'max_features': 1.0,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'bootstrap': True
}

# Initialize and train the model on GPU
rf_gpu = cuRF(**cu_rf_params)
rf_gpu.fit(X_train, y_train)

# Predict on test data
y_pred = rf_gpu.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")
# Convert cuDF Series to NumPy arrays
# y_test_np = y_test.to_numpy()
# y_pred_np = y_pred.to_numpy()

# Calculate confusion matrix
# cm = confusion_matrix(y_test.to_array(), y_pred.to_array())


# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
#             xticklabels=['Legitimate', 'Phishing'],
#             yticklabels=['Legitimate', 'Phishing'])
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix (GPU)')
# plt.show()




Accuracy: 0.8529


In [24]:
del rf_gpu

In [3]:
import cudf
from cuml.svm import SVC
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score
import joblib

# Load the dataset
df = cudf.read_csv('normalized_updated_phishing_dataset.csv')

# Select features and target (Label is already 0/1)
features = ['url_length', 'num_dots', 'has_ip', 'num_subdomains',
            'path_length', 'https', 'special_chars', 'domain_length']
X = df[features]
y = df['Label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize SVM (remove probability=True)
svm_gpu = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_gpu.fit(X_train, y_train)

# Save the model
joblib.dump(svm_gpu, 'phishing_svm_model_gpu.pkl')

['phishing_svm_model_gpu.pkl']

In [4]:
y_pred = svm_gpu.predict(X_test)


In [5]:
print("\nAccuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.7886300683021545


In [9]:
import cudf
import cupy as cp
from cuml.svm import SVC
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score

# Load data and convert to cuPy
df = cudf.read_csv('normalized_updated_phishing_dataset.csv')
X = df[features].to_cupy()
y = df['Label'].to_cupy().ravel()  # Ensure y is 1D

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define hyperparameters to test
param_grid = {
    'kernel': ['rbf', 'linear'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.01, 0.1]
}

best_score = 0
best_params = {}

# Manual grid search
for kernel in param_grid['kernel']:
    for C in param_grid['C']:
        for gamma in param_grid['gamma']:
            if kernel == 'linear' and gamma != 'scale':  # Skip gamma for linear
                continue
            
            # Train SVM5
            model = SVC(kernel=kernel, C=C, gamma=gamma)
            model.fit(X_train, y_train)
            
            # Predict and evaluate
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            
            # Track best parameters
            if acc > best_score:
                best_score = acc
                best_params = {'kernel': kernel, 'C': C, 'gamma': gamma}

print("Best Parameters:", best_params)
print("Best Accuracy:", best_score)

Best Parameters: {'kernel': 'rbf', 'C': 10, 'gamma': 'scale'}
Best Accuracy: 0.7908509373664856
