In [27]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

In [28]:
combined_df = pd.read_csv('combined_dataset.csv')

In [29]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_df['URL'])
X = tokenizer.texts_to_sequences(combined_df['URL'])
X = pad_sequences(X)

In [30]:
X

array([[    0,     0,     0, ...,  1491,    12,  1236],
       [    0,     0,     0, ...,    15,    12,   649],
       [    0,     0,     0, ...,  2205,  2205,    31],
       ...,
       [    0,     0,     0, ...,   286,    12,   144],
       [    0,     0,     0, ..., 18356, 30623, 78520],
       [    0,     0,     0, ..., 22896,    48,  5603]], dtype=int32)

In [31]:
y = (combined_df['Type'] == 'Malicious').astype(int)

In [32]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=27)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=27)

In [33]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=64, input_length=X.shape[1]))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

In [34]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [35]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [36]:
model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val), callbacks=[early_stopping])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


<keras.src.callbacks.History at 0x7ff8dc618d90>

In [37]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')

Test loss: 0.00041197522659786046
Test accuracy: 0.9997173547744751


In [38]:
#model.save('raw_url_model.h5') 