<a href="https://colab.research.google.com/github/pandeeswar/url-phishing-detection-using-lstm-and-cnn/blob/main/oneurllstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import requests
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

In [None]:
# function to extract hyperlinks from a given text
def extract_hyperlinks(text):
    """Return every http(s) URL found in ``text``, in order of appearance.

    Args:
        text: String to scan (plain text or HTML source).

    Returns:
        A list of URL strings; empty when ``text`` contains no URL.

    A URL is taken to end at the first whitespace character, quote or angle
    bracket. Stopping only at whitespace would also capture the markup that
    follows an ``href`` value in HTML (e.g. ``...example">More``).
    """
    # Raw string so that ``\s`` reaches the regex engine as written instead
    # of being treated as an (invalid) string escape.
    return re.findall(r"https?://[^\s\"'<>]+", text)


In [None]:
# download the page whose text is used as the dataset
url = "https://www.example.com"
# A timeout keeps this cell from hanging forever on an unresponsive host, and
# raise_for_status() stops an HTTP error page from silently becoming the data.
response = requests.get(url, timeout=10)
response.raise_for_status()
data = response.text

In [None]:
# collect every URL that appears in the downloaded page text
hyperlinks = extract_hyperlinks(text=data)



In [None]:
# show a header line followed by one extracted URL per line
print("\n".join(["Hyperlinks extracted from the website:", *hyperlinks]))


Hyperlinks extracted from the website:
https://www.iana.org/domains/example">More


In [None]:
# Encode the page text as overlapping character windows for next-character
# prediction. Note that this works on the raw page text in `data`, not on the
# extracted `hyperlinks` list.
max_len = 1000
chars = sorted(set(data))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Every sample needs max_len input characters plus one target character.
if len(data) <= max_len:
    raise ValueError(
        "page text has %d characters; more than max_len=%d are needed "
        "to build at least one training sequence" % (len(data), max_len)
    )

# Map each character to its integer index once, then slice windows from that.
encoded = [char_indices[char] for char in data]
sequences = [encoded[i:i + max_len] for i in range(len(encoded) - max_len)]
# The target of the window starting at i is the character at i + max_len.
next_chars = encoded[max_len:]

# Shape (samples, timesteps, 1), with indices scaled by the vocabulary size.
X = np.reshape(sequences, (len(sequences), max_len, 1))
X = X / float(len(chars))
# One-hot targets with one column per entry of `chars`, so column j always
# means chars[j]. pd.get_dummies only creates columns for the values that
# actually occur, which left the columns out of step with char_indices.
y = np.eye(len(chars), dtype="float32")[next_chars]


In [None]:
# Build the network: two stacked 256-unit LSTM layers, each followed by
# dropout at rate 0.2, then a softmax layer with one unit per column of y.
timesteps, n_features = X.shape[1], X.shape[2]
model = Sequential([
    # return_sequences=True passes the full sequence on to the second LSTM
    LSTM(256, input_shape=(timesteps, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# fit for two epochs, with a weight update after every single sample
model.fit(x=X, y=y, batch_size=1, epochs=2)



Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f087bedcb50>

In [None]:
# Run the trained model over every row of X and print the result. Each output
# row is the softmax of the final Dense layer (one value per column of y).
# NOTE(review): despite the variable name, nothing visible here makes these
# per-hyperlink phishing probabilities -- confirm the intended meaning.
phishing_probs = model.predict(x=X)
print(phishing_probs)

[[0.02385678 0.086992   0.00970003 ... 0.01836065 0.01268108 0.01124205]
 [0.02384827 0.08707901 0.00965053 ... 0.01814    0.0125643  0.01116271]
 [0.0238202  0.08537435 0.00988228 ... 0.01825592 0.01288119 0.0115296 ]
 ...
 [0.02477172 0.08074263 0.01073053 ... 0.01882392 0.01327892 0.01233238]
 [0.02469291 0.08256759 0.01040801 ... 0.01850773 0.01289753 0.01189923]
 [0.02449393 0.08355957 0.01023662 ... 0.01840604 0.01280387 0.01174933]]


In [None]:
# Print each hyperlink next to one row of model output.
# NOTE(review): phishing_probs has one row per row of X, not one per
# hyperlink, so the pairing below is purely positional -- confirm the intended
# mapping. zip() stops at the shorter of the two sequences, which avoids the
# IndexError that indexing by hyperlink position raised whenever there were
# more hyperlinks than prediction rows.
print("Hyperlinks and their phishing probabilities:")
for hyperlink, probs in zip(hyperlinks, phishing_probs):
    print(hyperlink + ' : ' + str(probs))



Hyperlinks and their phishing probabilities:
https://www.iana.org/domains/example">More : [0.02385678 0.086992   0.00970003 0.02619178 0.04362282 0.00809447
 0.0401058  0.00633827 0.02995243 0.00833621 0.00675875 0.05093301
 0.00847787 0.01333284 0.02667139 0.0506923  0.02368897 0.01065412
 0.02429094 0.07387248 0.00706022 0.02392138 0.02812679 0.05056726
 0.06524196 0.0375847  0.04421218 0.03284357 0.04589787 0.03797008
 0.01172697 0.01836065 0.01268108 0.01124205]
