In [12]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SimpleRNN
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tldextract
import requests
import matplotlib.pyplot as plt

# Load the dataset; the code below reads its 'url' and 'label' columns
df = pd.read_csv('data/phishing_urls.csv')

# Split the dataset into features (raw URL strings) and labels
X = df['url'].values
y = df['label'].values

# Encode the labels as integers
# NOTE(review): the models below end in a single sigmoid unit with binary
# cross-entropy, so this presumably expects exactly two classes - confirm.
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the bag-of-words vectorizer on the training URLs.
# Its output is stored under separate names: overwriting X_train / X_test here
# would hand the Tokenizer below a sparse count matrix instead of the URL
# strings it needs. The count matrices are not consumed by the networks below.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Convert the raw URL strings to integer sequences.
# vocab_size is shared by the Tokenizer and both Embedding layers so the
# embedding table always covers every index the tokenizer can emit.
vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to ensure uniform length
max_len = 100
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)

# Build the RNN model: embedding -> SimpleRNN -> single sigmoid output
rnn_model = Sequential()
rnn_model.add(Embedding(vocab_size, 128, input_length=max_len))
rnn_model.add(SimpleRNN(128))
rnn_model.add(Dense(1, activation='sigmoid'))
rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the RNN model (the held-out test split doubles as validation data)
rnn_history = rnn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)

# Build the LSTM model: same layout as the RNN model with an LSTM layer instead
lstm_model = Sequential()
lstm_model.add(Embedding(vocab_size, 128, input_length=max_len))
lstm_model.add(LSTM(128))
lstm_model.add(Dense(1, activation='sigmoid'))
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the LSTM model (the held-out test split doubles as validation data)
lstm_history = lstm_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)


# Predict the label of a new piece of text with the trained LSTM model
def predict_email_label(email_text):
    """Return the LSTM model's predicted class (0 or 1) for ``email_text``.

    The text is converted to an integer sequence by the module-level
    ``tokenizer`` and padded to ``max_len`` before being passed to
    ``lstm_model``.

    NOTE(review): the tokenizer and model in this file are fitted on the
    'url' column of the dataset - confirm that e-mail text is a meaningful
    input here.

    Args:
        email_text: The text to classify, as a single string.

    Returns:
        1 if the model's sigmoid output is above 0.5, otherwise 0.
    """
    # Only the tokenizer path feeds the model; the bag-of-words vector the
    # original computed here was never used, so it is no longer built.
    sequences = tokenizer.texts_to_sequences([email_text])
    padded = pad_sequences(sequences, padding='post', maxlen=max_len)
    # predict() returns one row per input with a single sigmoid output each
    probability = lstm_model.predict(padded)[0][0]
    # Same explicit 0.5 threshold that is_phishing_url applies
    return int(probability > 0.5)


# Check whether a URL looks like a phishing URL
def is_phishing_url(url):
    """Return True if ``url`` is flagged as phishing by any of three checks.

    The checks run in order and the first positive one short-circuits:

    1. the domain extracted by ``tldextract`` is listed in ``phishing_domains``;
    2. ``lstm_model`` scores the tokenized URL above 0.5;
    3. a GET request to the URL (redirects not followed, 5 second timeout)
       answers with a status code listed in ``phishing_status_codes``.

    Note that check 3 performs a real network request to ``url``.

    Args:
        url: The URL to inspect, as a string.

    Returns:
        True if any check flags the URL, False otherwise (including when the
        HTTP request fails).
    """
    # Extract the domain from the URL
    # NOTE(review): ``.domain`` is the registrable label without the public
    # suffix (e.g. no ".com") - confirm phishing_domains stores that form.
    domain = tldextract.extract(url).domain
    # Check if the domain is a known phishing domain
    if domain in phishing_domains:
        return True

    # Score the URL with the LSTM model; the bag-of-words vector the original
    # computed here was never used, so it is no longer built.
    url_sequence = tokenizer.texts_to_sequences([url])
    url_padded = pad_sequences(url_sequence, padding='post', maxlen=max_len)
    label = lstm_model.predict(url_padded)[0][0]
    if label > 0.5:
        return True

    # Check if the URL is a phishing URL based on its response status code
    try:
        response = requests.get(url, allow_redirects=False, timeout=5)
    except (requests.RequestException, ValueError):
        # Best effort: an unreachable or malformed URL simply skips this
        # check. Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return False

    # If none of the above checks pass, the URL is not a phishing URL
    return response.status_code in phishing_status_codes


# Read the known phishing domains from disk and keep the 'domain' column as a plain list
phishing_domains = pd.read_csv('phishing_domains.csv').loc[:, 'domain'].to_list()

# HTTP status codes that is_phishing_url() treats as a phishing signal:
# the contiguous runs 400-418 and 422-426, plus 428.
phishing_status_codes = [*range(400, 419), *range(422, 427), 428]

ParserError: Error tokenizing data. C error: Expected 1 fields in line 28, saw 367
