<a href="https://colab.research.google.com/github/nosainwe/Machine-Learning-Projects/blob/main/Project%203%20-%20Implement%20URL%20phishing%20detection%20with%20Logistic%20Regression%20and%20TF-IDF%20vectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Importing necessary libraries
import re  # Regular expression operations
from sklearn.feature_extraction.text import TfidfVectorizer  # For vectorizing text data
from sklearn.linear_model import LogisticRegression  # For training the model
from sklearn.model_selection import train_test_split  # For splitting the data into training and test sets
import pandas as pd  # For handling data as DataFrame
import random  # For shuffling data randomly
import pickle  # For saving the model and vectorizer to files

# Define the url_cleanse function before using it in TfidfVectorizer
def url_cleanse(url):
    """
    Function to clean and preprocess the URL string:
    1. Converts URL to lowercase.
    2. Removes unwanted parts like http://, https://, www.
    3. Removes special characters while keeping alphanumeric, dots, and hyphens.
    4. Splits the URL into tokens based on dots, slashes, or hyphens for analysis.
    """
    url = url.lower()  # Convert URL to lowercase
    url = re.sub(r'http[s]?://', '', url)  # Remove 'http://' or 'https://'
    url = re.sub(r'www\.', '', url)  # Remove 'www.'
    url = re.sub(r'[^a-z0-9\s.-]', '', url)  # Remove all non-alphanumeric characters except dots and hyphens
    tokens = re.split(r'[./-]', url)  # Split the URL by dots, slashes, or hyphens
    return tokens  # Return the tokens for further analysis

# Correct the file path (adjust according to your setup)
input_url = '/content/phishing_site_urls.csv'

# Read the CSV file properly with the correct delimiter
data_csv = pd.read_csv(input_url, sep=',', on_bad_lines='skip')  # Load the dataset from the CSV file

# Shuffle the rows by converting the DataFrame to a list
data_list = data_csv.values.tolist()  # Convert the DataFrame to a list of lists
random.shuffle(data_list)  # Shuffle the data randomly to avoid any order bias

# Convert shuffled data back to DataFrame if needed (to preserve column names)
data_df = pd.DataFrame(data_list, columns=data_csv.columns)

# Split columns into 'y' (labels) and 'inputurls' (features)
y = [d[1] for d in data_list]  # Extract the label (e.g., 'bad' or 'good') assuming it’s in the second column
inputurls = [d[0] for d in data_list]  # Extract the URLs from the first column

# Optionally, you can reassign them back to pandas DataFrame if necessary
data_df = pd.DataFrame(data_list, columns=data_csv.columns)  # Recreate DataFrame with shuffled data

# Initialize the vectorizer and logistic regression model
url_vectorizer = TfidfVectorizer(tokenizer=url_cleanse)  # Use the custom 'url_cleanse' function for tokenization
l_regress = LogisticRegression()  # Logistic regression model to classify URLs

# Split the data into training and testing sets (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(inputurls, y, test_size=0.2, random_state=42)  # 80% train and 20% test data

# Fit the vectorizer on the training URLs
x_train_vectorized = url_vectorizer.fit_transform(x_train)  # Transform the training URLs into a matrix of TF-IDF features

# Train the logistic regression model on the vectorized training data
l_regress.fit(x_train_vectorized, y_train)  # Fit the model on the training data

# Evaluate the model on the test data
x_test_vectorized = url_vectorizer.transform(x_test)  # Transform the test URLs using the fitted vectorizer
l_score = l_regress.score(x_test_vectorized, y_test)  # Evaluate the accuracy of the model on the test data

# Print the score as a percentage
print("Score: {:.2f}%".format(100 * l_score))  # Output the accuracy percentage

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score: 92.18%


In [13]:
# Save the URL vectorizer and logistic regression model to files
file1 = "model.pkl"  # File name for the logistic regression model
with open(file1, 'wb') as f:
    pickle.dump(l_regress, f)  # Save the trained model

file2 = "vector.pkl"  # File name for the URL vectorizer
with open(file2, 'wb') as f2:
    pickle.dump(url_vectorizer, f2)  # Save the vectorizer

# To use the saved models for prediction
with open(file1, 'rb') as f1:
    lgr = pickle.load(f1)  # Load the logistic regression model

with open(file2, 'rb') as f2:
    url_vectorizer = pickle.load(f2)  # Load the URL vectorizer

# Example of predicting for new URLs
inputurls = ['hackthebox.eu', 'facebook.com']  # Example URLs for prediction (replace with actual test data)
x = url_vectorizer.transform(inputurls)  # Transform the input URLs using the loaded vectorizer
y_predict = l_regress.predict(x)  # Predict the legitimacy of the URLs

# Print the results
print(inputurls)  # Display the input URLs
print(y_predict)  # Display the predicted labels for the URLs

['yamleg.com', 'facebook.com']
['good' 'good']
