# Phishing Domain Detection
## Testing Random Forest with 14 features obtained from RFE

[Dataset Link](https://data.mendeley.com/datasets/72ptz43s9v/1)<br>
[Dataset Description](https://www.sciencedirect.com/science/article/pii/S2352340920313202)

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import warnings
import os

In [3]:
df = pd.read_csv("data/78_features.csv")

In [4]:
df['phishing'].value_counts()

phishing
0    56706
1    30497
Name: count, dtype: int64

In [5]:
df.shape

(87203, 78)

In [42]:
# MY RFE
# selected_features = ['domain_length',
#  'qty_hyphen_directory',
#  'qty_slash_directory',
#  'directory_length',
#  'qty_dot_file',
#  'qty_exclamation_file',
#  'qty_space_file',
#  'qty_tilde_file',
#  'qty_percent_file',
#  'file_length',
#  'time_response',
#  'asn_ip',
#  'time_domain_activation',
#  'ttl_hostname']

# RESEARCH PAPER RFE
selected_features = [
    "qty_dot_domain",
    "qty_vowels_domain",
    "domain_length",
    "qty_dot_directory",
    "qty_slash_directory",
    "directory_length",
    "qty_dot_file",
    "file_length",
    "params_length",
    "time_response",
    "asn_ip",
    "time_domain_activation",
    "time_domain_expiration",
    "ttl_hostname"
]

In [43]:
X = df[selected_features]
X.head()

Unnamed: 0,qty_dot_domain,qty_vowels_domain,domain_length,qty_dot_directory,qty_slash_directory,directory_length,qty_dot_file,file_length,params_length,time_response,asn_ip,time_domain_activation,time_domain_expiration,ttl_hostname
0,2,4,17,1,1,8,1,7,-1,0.207316,60781,-1,-1,892
1,2,5,16,3,3,42,1,9,165,0.499566,36024,579,150,9540
2,2,3,14,0,1,1,0,0,-1,0.935901,4766,-1,-1,589
3,2,7,19,2,5,62,1,9,-1,0.410021,20454,-1,-1,292
4,2,5,19,-1,-1,-1,-1,-1,-1,0.410761,53831,6998,306,3597


In [44]:
X_cols = X.columns
X_cols

Index(['qty_dot_domain', 'qty_vowels_domain', 'domain_length',
       'qty_dot_directory', 'qty_slash_directory', 'directory_length',
       'qty_dot_file', 'file_length', 'params_length', 'time_response',
       'asn_ip', 'time_domain_activation', 'time_domain_expiration',
       'ttl_hostname'],
      dtype='object')

In [45]:
y = df['phishing']
y

0        1
1        1
2        0
3        1
4        0
        ..
87198    0
87199    0
87200    1
87201    1
87202    0
Name: phishing, Length: 87203, dtype: int64

In [46]:
y.values.ravel()

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)

In [47]:
# #Oversampling using SMOTE
# from imblearn.over_sampling import SMOTE

# X, y = SMOTE().fit_resample(X, y)

# # checking the sizes of the sample data
# print("Size of X:", X.shape)
# print("Size of y:", y.shape)

In [48]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
X.shape

(87203, 14)

In [49]:
X

array([[ 0.18437124, -0.57076808, -0.23987577, ..., -1.12987003,
        -0.59495048, -0.4655459 ],
       [ 0.18437124, -0.17699992, -0.3915019 , ..., -0.93909471,
        -0.34390372,  0.2843392 ],
       [ 0.18437124, -0.96453623, -0.69475416, ..., -1.12987003,
        -0.59495048, -0.49181962],
       ...,
       [-1.24083264,  0.21676823,  0.5182549 , ..., -0.52300715,
         0.59045575,  0.70497895],
       [ 0.18437124, -0.17699992,  1.27638556, ..., -1.12987003,
        -0.59495048, -0.53838395],
       [ 0.18437124, -0.57076808, -0.3915019 , ..., -1.03086421,
        -0.48688399, -0.51696609]])

In [50]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((69762, 14), (17441, 14), (69762,), (17441,))

In [51]:
# import pickle
# pickle.dump(scaler, open('scaling.pkl','wb'))

In [52]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted, average='weighted')
    recall = recall_score(true, predicted, average='weighted')
    f1 = f1_score(true, predicted, average='weighted')
    class_report = classification_report(predicted , true, target_names=["legitimate","malicious"])
    return accuracy, precision, recall, f1, class_report



In [53]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracies = []
train_precisions = []
train_recalls = []
train_f1_scores = []
test_accuracies = []
test_precisions = []
test_recalls = []
test_f1_scores = []

# Evaluate Train and Test dataset
# Evaluate Train dataset
model_train_accuracy, model_train_precision, model_train_recall, model_train_f1, model_classification_report_train = evaluate_model(y_train, y_train_pred)
# Evaluate Test dataset
model_test_accuracy, model_test_precision, model_test_recall, model_test_f1, model_classification_report_test = evaluate_model(y_test, y_test_pred)

print('-' * 20)
print('Train Accuracy:', model_train_accuracy)
print('Train Classification Report:\n', model_classification_report_train)
print('-' * 35)
print('Test Accuracy:', model_test_accuracy)
print('Test Classification Report:\n', model_classification_report_test)
print('=' * 35)
print('\n')

train_accuracies.append(model_train_accuracy)
train_precisions.append(model_train_precision)
train_recalls.append(model_train_recall)
train_f1_scores.append(model_train_f1)

test_accuracies.append(model_test_accuracy)
test_precisions.append(model_test_precision)
test_recalls.append(model_test_recall)
test_f1_scores.append(model_test_f1)

results_df = pd.DataFrame({
    'Test Accuracy': test_accuracies,
    'Test Precision': test_precisions,
    'Test Recall': test_recalls,
    'Test F1-score': test_f1_scores
})

results_df

--------------------
Train Accuracy: 0.9998853243886356
Train Classification Report:
               precision    recall  f1-score   support

  legitimate       1.00      1.00      1.00     45232
   malicious       1.00      1.00      1.00     24530

    accuracy                           1.00     69762
   macro avg       1.00      1.00      1.00     69762
weighted avg       1.00      1.00      1.00     69762

-----------------------------------
Test Accuracy: 0.9678917493263001
Test Classification Report:
               precision    recall  f1-score   support

  legitimate       0.97      0.98      0.98     11400
   malicious       0.96      0.95      0.95      6041

    accuracy                           0.97     17441
   macro avg       0.97      0.96      0.96     17441
weighted avg       0.97      0.97      0.97     17441





Unnamed: 0,Test Accuracy,Test Precision,Test Recall,Test F1-score
0,0.967892,0.968038,0.967892,0.967943


---------------------------------------------------------------------------------------------------------------------------

## Testing a URL

In [54]:
import dns.resolver
import whois
import ipwhois
import requests
import socket
import ssl
import time
from urllib.parse import urlparse, parse_qs

def extract_url_features(url):
    parsed_url = urlparse(url)
    domain = parsed_url.netloc.split(':')[0]
    path = parsed_url.path
    query = parsed_url.query
    params = parsed_url.params
    fragment = parsed_url.fragment
    
    def count_occurrences(string, chars):
        return {char: string.count(char) for char in chars}
    
    chars_to_count = ".-_/=?@&! ~,+*#$%"
    
    # URL FEATURES
    url_counts = count_occurrences(url, chars_to_count)
    tld = domain.split('.')[-1] if '.' in domain else ''

    # URL DOMAIN FEATURES
    domain_counts = count_occurrences(domain, chars_to_count)

    vowels = 'aeiou'
    qty_vowels_domain = sum(domain.lower().count(vowel) for vowel in vowels)

    def is_ip(domain):
        try:
            socket.inet_aton(domain)
            return True
        except socket.error:
            return False
        
    # URL DIRECTORY FEATURES
    path_counts = count_occurrences(path, chars_to_count)

    # URL FILE FEATURES
    query_counts = count_occurrences(query, chars_to_count)

    # URL PARAMETERS FEATURES
    params_counts = count_occurrences(params, chars_to_count)

    # ATTRIBUTES BASED ON RESOLVING URL AND EXTERNAL SERVICES
    
    def get_time_response(domain):
        try:
            start_time = time.time()
            requests.get(f"http://{domain}", timeout=5)
            return time.time() - start_time
        except:
            return None

    def domain_has_spf(domain):
        try:
            answers = dns.resolver.resolve(domain, 'TXT')
            for rdata in answers:
                if 'v=spf1' in str(rdata):
                    return 1
            return 0
        except:
            return -1
        
    def domain_to_ip(domain):
        try:
            ip_address = socket.gethostbyname(domain)
            return ip_address
        except socket.error:
            return -1
        
    def get_asn_ip(domain):
        try:
            # Convert domain to IP address
            ip_address = domain_to_ip(domain)
            if ip_address:
                # Perform WHOIS lookup using ipwhois library
                obj = ipwhois.IPWhois(ip_address)
                result = obj.lookup_rdap()
                # Extract ASN information if available
                asn = result.get('asn')
                if asn:
                    return int(asn.split(' ')[0])  # Extract ASN number
                else:
                    return -1  # Return -1 if ASN information not found
            else:
                return -1  # Return -1 if domain to IP conversion fails

        except Exception:
            return -1  # Return -1 on error

    def get_whois_info(domain):
        try:
            domain_info = whois.whois(domain)
            creation_date = domain_info.creation_date
            expiration_date = domain_info.expiration_date
            
            if isinstance(creation_date, list):
                creation_date = creation_date[0]
            if isinstance(expiration_date, list):
                expiration_date = expiration_date[0]
            
            time_domain_activation = (time.time() - creation_date.timestamp()) / (60 * 60 * 24) if creation_date else -1
            time_domain_expiration = (expiration_date.timestamp() - time.time()) / (60 * 60 * 24) if expiration_date else -1
            return time_domain_activation, time_domain_expiration
        except:
            return -1, -1
        
    def get_qty_ip_resolved(domain):
        try:
            ips = socket.gethostbyname_ex(domain)
            return len(ips[2])  # Return the number of resolved IPs
        except socket.gaierror:
            return -1  # Return -1 if there's an error resolving IPs
        
    def get_qty_nameservers(domain):
        try:
            answers = dns.resolver.resolve(domain, 'NS')
            return len(answers)  # Return the number of resolved name servers
        except dns.resolver.NoAnswer:
            return 0  # Return 0 if no name servers found
        except dns.resolver.NXDOMAIN:
            return -1  # Return -1 if domain does not exist
        except dns.resolver.Timeout:
            return -1  # Return -1 on timeout or other DNS resolution errors

    def get_qty_mx_servers(domain):
        try:
            answers = dns.resolver.resolve(domain, 'MX')
            qty_mx_servers = len(answers)
        except:
            qty_mx_servers = 0
        return qty_mx_servers
    
    def get_ttl_hostname(domain):
        try:
            answers = dns.resolver.resolve(domain, 'A')
            return answers.rrset.ttl  # Return TTL of the hostname
        except dns.resolver.NoAnswer:
            return -1  # Return -1 if no answer found
        except dns.resolver.NXDOMAIN:
            return -1  # Return -1 if domain does not exist
        except dns.resolver.Timeout:
            return -1  # Return -1 on timeout or other DNS resolution errors
        
    def check_tls_ssl_certificate(domain):
        try:
            context = ssl.create_default_context()
            with socket.create_connection((domain, 443)) as sock:
                with context.wrap_socket(sock, server_hostname=domain) as ssock:
                    cert = ssock.getpeercert()
                    return 1 if cert else 0  # Return True if valid certificate found
        except ssl.SSLError:
            return 0  # Return False if SSL error occurs or no certificate found
        except (socket.gaierror, socket.timeout):
            return 0  # Return False on connection or timeout errors

    # Function for qty_redirects (Number of Redirects)
    def get_qty_redirects(url):
        try:
            response = requests.head(url, allow_redirects=True)
            return len(response.history)  # Return the number of redirects followed
        except requests.RequestException:
            return -1  # Return -1 if there's an error in making the request

    # Function for url_google_index (Check if URL is Indexed on Google)
    def is_url_indexed_on_google(url):
        try:
            response = requests.get(f"https://www.google.com/search?q=info:{url}")
            return 1 if response.status_code == 200 and url in response.text else False
        except requests.RequestException:
            return 0  # Return False if there's an error in making the request

    # Function for domain_google_index (Check if Domain is Indexed on Google)
    def is_domain_indexed_on_google(domain):
        try:
            response = requests.get(f"https://www.google.com/search?q=site:{domain}")
            return 1 if response.status_code == 200 and domain in response.text else False
        except requests.RequestException:
            return 0  # Return False if there's an error in making the request
    
    
    time_response = get_time_response(domain)
    domain_spf = domain_has_spf(domain)
    time_domain_activation, time_domain_expiration = get_whois_info(domain)
    qty_mx_servers = get_qty_mx_servers(domain)
    qty_ip_resolved = get_qty_ip_resolved(domain)
    asn_ip = get_asn_ip(domain)
    qty_nameservers = get_qty_nameservers(domain)
    ttl_hostname = get_ttl_hostname(domain)
    tls_ssl_certificate = check_tls_ssl_certificate(domain)
    qty_redirects = get_qty_redirects(url)
    url_google_index = is_url_indexed_on_google(url)
    domain_google_index = is_domain_indexed_on_google(domain)
    
    features = {
        # URL features
        "qty_dot_url": url_counts['.'],
        "qty_hyphen_url": url_counts['-'],
        "qty_underline_url": url_counts['_'],
        "qty_slash_url": url_counts['/'],
        "qty_questionmark_url": url_counts['?'],
        "qty_equal_url": url_counts['='],
        "qty_at_url": url_counts['@'],
        "qty_and_url": url_counts['&'],
        "qty_exclamation_url": url_counts['!'],
        "qty_space_url": url_counts[' '],
        "qty_tilde_url": url_counts['~'],
        "qty_comma_url": url_counts[','],
        "qty_plus_url": url_counts['+'],
        "qty_asterisk_url": url_counts['*'],
        "qty_hashtag_url": url_counts['#'],
        "qty_dollar_url": url_counts['$'],
        "qty_percent_url": url_counts['%'],
        "qty_tld_url": len(tld),
        "length_url": len(url),
        
        # Domain features
        "qty_dot_domain": domain_counts['.'],
        "qty_hyphen_domain": domain_counts['-'],
        "qty_underline_domain": domain_counts['_'],
        "qty_slash_domain": domain_counts['/'],
        "qty_questionmark_domain": domain_counts['?'],
        "qty_equal_domain": domain_counts['='],
        "qty_at_domain": domain_counts['@'],
        "qty_and_domain": domain_counts['&'],
        "qty_exclamation_domain": domain_counts['!'],
        "qty_space_domain": domain_counts[' '],
        "qty_tilde_domain": domain_counts['~'],
        "qty_comma_domain": domain_counts[','],
        "qty_plus_domain": domain_counts['+'],
        "qty_asterisk_domain": domain_counts['*'],
        "qty_hashtag_domain": domain_counts['#'],
        "qty_dollar_domain": domain_counts['$'],
        "qty_percent_domain": domain_counts['%'],
        "qty_vowels_domain": qty_vowels_domain,
        "domain_length": len(domain),
        "domain_in_ip": 1 if is_ip(domain) else 0,
        "server_client_domain": 1 if "client" in domain or "server" in domain else 0,
        
        # Directory features
        "qty_dot_directory": path_counts['.'],
        "qty_hyphen_directory": path_counts['-'],
        "qty_underline_directory": path_counts['_'],
        "qty_slash_directory": path_counts['/'],
        "qty_questionmark_directory": path_counts['?'],
        "qty_equal_directory": path_counts['='],
        "qty_at_directory": path_counts['@'],
        "qty_and_directory": path_counts['&'],
        "qty_exclamation_directory": path_counts['!'],
        "qty_space_directory": path_counts[' '],
        "qty_tilde_directory": path_counts['~'],
        "qty_comma_directory": path_counts[','],
        "qty_plus_directory": path_counts['+'],
        "qty_asterisk_directory": path_counts['*'],
        "qty_hashtag_directory": path_counts['#'],
        "qty_dollar_directory": path_counts['$'],
        "qty_percent_directory": path_counts['%'],
        "directory_length": len(path),
        
        # File features
        "qty_dot_file": query_counts['.'],
        "qty_hyphen_file": query_counts['-'],
        "qty_underline_file": query_counts['_'],
        "qty_slash_file": query_counts['/'],
        "qty_questionmark_file": query_counts['?'],
        "qty_equal_file": query_counts['='],
        "qty_at_file": query_counts['@'],
        "qty_and_file": query_counts['&'],
        "qty_exclamation_file": query_counts['!'],
        "qty_space_file": query_counts[' '],
        "qty_tilde_file": query_counts['~'],
        "qty_comma_file": query_counts[','],
        "qty_plus_file": query_counts['+'],
        "qty_asterisk_file": query_counts['*'],
        "qty_hashtag_file": query_counts['#'],
        "qty_dollar_file": query_counts['$'],
        "qty_percent_file": query_counts['%'],
        "file_length": len(query),
        
        # Parameters features
        "qty_dot_params": params_counts['.'],
        "qty_hyphen_params": params_counts['-'],
        "qty_underline_params": params_counts['_'],
        "qty_slash_params": params_counts['/'],
        "qty_questionmark_params": params_counts['?'],
        "qty_equal_params": params_counts['='],
        "qty_at_params": params_counts['@'],
        "qty_and_params": params_counts['&'],
        "qty_exclamation_params": params_counts['!'],
        "qty_space_params": params_counts[' '],
        "qty_tilde_params": params_counts['~'],
        "qty_comma_params": params_counts[','],
        "qty_plus_params": params_counts['+'],
        "qty_asterisk_params": params_counts['*'],
        "qty_hashtag_params": params_counts['#'],
        "qty_dollar_params": params_counts['$'],
        "qty_percent_params": params_counts['%'],
        "params_length": len(params),
        "tld_present_params": 1 if tld != '' else 0,
        "qty_params": len(parse_qs(query)),
        "email_in_url": 1 if '@' in url else 0,
        
        # WHOIS and DNS features
        "time_response": time_response if time_response is not None else -1,
        "domain_spf": domain_spf if domain_spf is not None else -1,
        "asn_ip": asn_ip,
        "time_domain_activation": time_domain_activation if time_domain_activation is not None else -1,
        "time_domain_expiration": time_domain_expiration if time_domain_expiration is not None else -1,
        "qty_ip_resolved": qty_ip_resolved,  # Placeholder, requires DNS lookup
        "qty_nameservers": qty_nameservers,  # Placeholder, requires DNS lookup
        "qty_mx_servers": qty_mx_servers if qty_mx_servers  is not None else 0,
        "ttl_hostname": ttl_hostname,  # Placeholder, requires DNS lookup
        "tls_ssl_certificate": tls_ssl_certificate,  # Placeholder, requires SSL/TLS library
        "qty_redirects": qty_redirects,  # Placeholder, requires HTTP request handling
        "url_google_index": url_google_index,
        "domain_google_index": domain_google_index,
        "url_shortened": 1 if len(url) < 20 else 0,  # Example condition for shortened URL
    }
    
    return features

In [55]:
URL = "https://www.youtube.com/watch?v=H6988OpZKTU&ab_channel=RishabhMishra"
all_features = extract_url_features(URL)
all_features

{'qty_dot_url': 2,
 'qty_hyphen_url': 0,
 'qty_underline_url': 1,
 'qty_slash_url': 3,
 'qty_questionmark_url': 1,
 'qty_equal_url': 2,
 'qty_at_url': 0,
 'qty_and_url': 1,
 'qty_exclamation_url': 0,
 'qty_space_url': 0,
 'qty_tilde_url': 0,
 'qty_comma_url': 0,
 'qty_plus_url': 0,
 'qty_asterisk_url': 0,
 'qty_hashtag_url': 0,
 'qty_dollar_url': 0,
 'qty_percent_url': 0,
 'qty_tld_url': 3,
 'length_url': 68,
 'qty_dot_domain': 2,
 'qty_hyphen_domain': 0,
 'qty_underline_domain': 0,
 'qty_slash_domain': 0,
 'qty_questionmark_domain': 0,
 'qty_equal_domain': 0,
 'qty_at_domain': 0,
 'qty_and_domain': 0,
 'qty_exclamation_domain': 0,
 'qty_space_domain': 0,
 'qty_tilde_domain': 0,
 'qty_comma_domain': 0,
 'qty_plus_domain': 0,
 'qty_asterisk_domain': 0,
 'qty_hashtag_domain': 0,
 'qty_dollar_domain': 0,
 'qty_percent_domain': 0,
 'qty_vowels_domain': 5,
 'domain_length': 15,
 'domain_in_ip': 0,
 'server_client_domain': 0,
 'qty_dot_directory': 0,
 'qty_hyphen_directory': 0,
 'qty_underli

In [56]:
optimal_features = selected_features
optimal_features

['qty_dot_domain',
 'qty_vowels_domain',
 'domain_length',
 'qty_dot_directory',
 'qty_slash_directory',
 'directory_length',
 'qty_dot_file',
 'file_length',
 'params_length',
 'time_response',
 'asn_ip',
 'time_domain_activation',
 'time_domain_expiration',
 'ttl_hostname']

In [57]:
reqd_features = {key: all_features[key] for key in optimal_features}
reqd_features

{'qty_dot_domain': 2,
 'qty_vowels_domain': 5,
 'domain_length': 15,
 'qty_dot_directory': 0,
 'qty_slash_directory': 1,
 'directory_length': 6,
 'qty_dot_file': 0,
 'file_length': 38,
 'params_length': 0,
 'time_response': 1.232072114944458,
 'asn_ip': 15169,
 'time_domain_activation': -1,
 'time_domain_expiration': -1,
 'ttl_hostname': 182}

In [58]:
reqd_features.values()

dict_values([2, 5, 15, 0, 1, 6, 0, 38, 0, 1.232072114944458, 15169, -1, -1, 182])

In [59]:
reqd_features_array = np.array(list(reqd_features.values()))
reqd_features_array

array([ 2.00000000e+00,  5.00000000e+00,  1.50000000e+01,  0.00000000e+00,
        1.00000000e+00,  6.00000000e+00,  0.00000000e+00,  3.80000000e+01,
        0.00000000e+00,  1.23207211e+00,  1.51690000e+04, -1.00000000e+00,
       -1.00000000e+00,  1.82000000e+02])

In [60]:
model.predict(reqd_features_array.reshape(1,-1))

array([1], dtype=int64)