<a href="https://www.kaggle.com/code/madhumardoor/phishing-url?scriptVersionId=211853981" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [2]:
!pip install tldextract

Collecting tldextract
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.1.3-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.3


In [3]:
# Now import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import re
from urllib.parse import urlparse
from tldextract import extract  # Now it should work

# Feature extraction functions (same as previously defined)
def get_url_length(url):
    return len(url)

def count_special_characters(url):
    special_characters = re.findall(r'[^A-Za-z0-9]', url)
    return len(special_characters)

def is_https(url):
    return 1 if url.startswith('https://') else 0

def get_domain(url):
    parsed_url = urlparse(url)
    domain = extract(url)
    return domain.domain

def get_path_length(url):
    parsed_url = urlparse(url)
    return len(parsed_url.path)

def count_subdomains(url):
    parsed_url = urlparse(url)
    domain = extract(url)
    subdomains = domain.subdomain
    return len(subdomains.split('.')) if subdomains else 0

def extract_features(url):
    return [
        get_url_length(url),
        count_special_characters(url),
        is_https(url),
        get_domain(url),
        get_path_length(url),
        count_subdomains(url)
    ]

In [4]:

data = pd.read_csv("/kaggle/input/phishing-url-websites-dataset-phiusiil/PhiUSIIL_Phishing_URL_Dataset.csv")

# Identify numeric columns for scaling
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns

# Separate features (X) and labels (y)
X = data[numeric_features]
y = data['label']  # Assuming 'label' is your target variable

# Apply feature scaling only on numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [5]:
data

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.000000,1.000000,0.522907,...,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.000000,0.666667,0.032650,...,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.000000,0.866667,0.028555,...,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.000000,1.000000,0.522907,...,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.000000,1.000000,0.079963,...,1,0,1,244,15,34,72,1,85,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235790,660997.txt,https://www.skincareliving.com,29,www.skincareliving.com,22,0,com,100.000000,1.000000,0.522907,...,1,0,1,51,7,21,187,2,191,1
235791,77185.txt,https://www.winchester.gov.uk,28,www.winchester.gov.uk,21,0,uk,100.000000,0.785714,0.028555,...,1,0,0,50,1,7,88,0,31,1
235792,622132.txt,https://www.nononsensedesign.be,30,www.nononsensedesign.be,23,0,be,100.000000,1.000000,0.003319,...,0,0,1,27,10,30,58,2,67,1
235793,7503962.txt,https://patient-cell-40f5.updatedlogmylogin.wo...,55,patient-cell-40f5.updatedlogmylogin.workers.dev,47,0,dev,28.157537,0.465116,0.000961,...,0,0,0,0,0,3,0,0,0,0


In [6]:

# 1. **Random Forest Classifier**

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions with the Random Forest model
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Model Accuracy: {accuracy_rf * 100:.2f}%")

# Cross-validation
rf_scores = cross_val_score(rf_model, X, y, cv=5)
print(f'Cross-validation scores (Random Forest): {rf_scores}')
print(f'Mean cross-validation score (Random Forest): {rf_scores.mean()}')

# Confusion Matrix for Random Fore
cm_rf = confusion_matrix(y_test, y_pred_rf)
print('Confusion Matrix (Random Forest):\n', cm_rf)

Random Forest Model Accuracy: 100.00%
Cross-validation scores (Random Forest): [1. 1. 1. 1. 1.]
Mean cross-validation score (Random Forest): 1.0
Confusion Matrix (Random Forest):
 [[20124     0]
 [    0 27035]]


**Conclusion**
The Random Forest model has demonstrated exceptional predictive power. Assuming no data leakage or overfitting, it is an ideal candidate for deployment or further exploration in real-world scenarios.

In [None]:
# 2. **Support Vector Machine (SVM)**

# Import necessary libraries
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split

# Define and train the SVM model
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions with the SVM model
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Model Accuracy: {accuracy_svm * 100:.2f}%")

# Cross-validation
svm_scores = cross_val_score(svm_model, X, y, cv=5)
print(f'Cross-validation scores (SVM): {svm_scores}')
print(f'Mean cross-validation score (SVM): {svm_scores.mean()}')

# Confusion Matrix for SVM
cm_svm = confusion_matrix(y_test, y_pred_svm)
print('Confusion Matrix (SVM):\n', cm_svm)


SVM Model Accuracy: 99.99%


**Conclusion**
The SVM model is a strong alternative to Random Forest for this dataset. Its performance is nearly flawless, making it highly reliable for deployment or further exploration. However, consider testing both models in production-like environments to make an informed choice.






