In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import vstack



# Load the labelled URL dataset; the code below reads its 'url' and 'type' columns.
data = pd.read_csv("dataset/malicious_phish.csv")

# Replace the class labels in 'type' with integer ids. The fitted encoder keeps
# the id -> original label mapping in `label_encoder.classes_`.
label_encoder = LabelEncoder()
data['type'] = label_encoder.fit_transform(data['type'])

y = data['type']

# Decide which rows are train/test BEFORE learning the n-gram vocabulary, so
# held-out URLs cannot influence the feature space (no train/test leakage).
# Row positions are split instead of the feature matrix; seed and proportions
# are unchanged.
row_positions = pd.Series(range(len(data)))
train_rows, test_rows, y_train, y_test = train_test_split(
    row_positions, y, test_size=0.2, random_state=42
)
train_rows = train_rows.to_numpy()
test_rows = test_rows.to_numpy()

# Character n-grams (3 to 5 chars, padded at word boundaries); the vocabulary is
# learned from the training URLs only.
vectorizer = CountVectorizer(analyzer="char_wb", ngram_range=(3,5))
vectorizer.fit(data['url'].iloc[train_rows])

# Vectorise every URL in fixed-size chunks and stack the sparse pieces.
batch_size = 10000
sparse_matrices = []

for i in range(0, len(data), batch_size):
    print(f"Processing batch {i // batch_size + 1}...")
    # .iloc makes the positional slicing explicit.
    batch = data['url'].iloc[i:i + batch_size]
    sparse_matrices.append(vectorizer.transform(batch))

# CSR format so the row selection below is supported and efficient.
X = vstack(sparse_matrices, format="csr")

# Rows of X are in the same order as `data`, so the positions drawn above
# select the matching feature rows for y_train / y_test.
X_train = X[train_rows]
X_test = X[test_rows]

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

Processing batch 1...
Processing batch 2...
Processing batch 3...
Processing batch 4...
Processing batch 5...
Processing batch 6...
Processing batch 7...
Processing batch 8...
Processing batch 9...
Processing batch 10...
Processing batch 11...
Processing batch 12...
Processing batch 13...
Processing batch 14...
Processing batch 15...
Processing batch 16...
Processing batch 17...
Processing batch 18...
Processing batch 19...
Processing batch 20...
Processing batch 21...
Processing batch 22...
Processing batch 23...
Processing batch 24...
Processing batch 25...
Processing batch 26...
Processing batch 27...
Processing batch 28...
Processing batch 29...
Processing batch 30...
Processing batch 31...
Processing batch 32...
Processing batch 33...
Processing batch 34...
Processing batch 35...
Processing batch 36...
Processing batch 37...
Processing batch 38...
Processing batch 39...
Processing batch 40...
Processing batch 41...
Processing batch 42...
Processing batch 43...
Processing batch 44.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


# Seed the Python, NumPy and backend RNGs so weight initialisation, dropout
# and batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# One softmax unit per encoded class, taken from the fitted label encoder
# rather than a hard-coded count.
num_classes = len(label_encoder.classes_)

# Feed-forward classifier over the n-gram count features:
# 128 -> 64 hidden ReLU units with dropout, softmax over the classes.
model = Sequential([
    Dense(128, activation="relu", input_dim=X_train.shape[1]),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(num_classes, activation="softmax")
])

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer="adam", loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Hold out 20% of the training rows explicitly and pass them as
# `validation_data`. X_train is a SciPy sparse matrix, and `validation_split`
# is not reliably able to slice sparse inputs across Keras versions. Labels are
# passed as NumPy arrays rather than pandas Series for the same reason.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train.to_numpy(), test_size=0.2, random_state=42
)
model.fit(X_fit, y_fit, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on the untouched test split.
loss, accuracy = model.evaluate(X_test, y_test.to_numpy())
print(f"Test Accuracy: {accuracy:.2f}")

In [8]:

import numpy as np
# Sanity check: show the distinct label values present in each split.
for split_labels in (y_train, y_test):
    print(np.unique(split_labels))

# Any label found outside the expected range has to be handled before use.


[0 1 2 3]
[0 1 2 3]


In [None]:
import pickle

# Write the trained network to disk as an HDF5 file.
model.save('phishing_url_model.h5')

# Pickle the fitted vectorizer alongside the model.
# NOTE(review): `label_encoder` is not persisted here - confirm whether
# inference code needs it to map class ids back to the original labels.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
