# **Phish in the Web**

### **Demo with ipywidgets**

In [None]:
#import the necessary libraries
import pandas as pd
import pickle
import ipywidgets as widgets
from IPython.display import display
import re
from urllib.parse import urlparse  #for URL feature extraction


### **Load our RF Demo Trained Model**

In [None]:
#load the saved model
#NOTE: pickle.load can execute arbitrary code while unpickling, so only point
#this at a model file that comes from a trusted source.
#rf_model is the module-level object that predict_phishing() calls .predict() on.
with open("phishing_rf_model.saved", "rb") as file:  #binary mode: pickle data is bytes
    rf_model = pickle.load(file)

### **Feature extraction for the DEMO**

In [None]:
#helper: turn a URL string into the row of numeric features passed to the model
def _extract_url_features(url):
    """Return the list of 25 feature values for *url*, in model column order.

    Raises:
        IndexError: if the URL has fewer than three '/'-separated parts
            (for example no ``scheme://`` prefix), so no host part can be
            picked out.
        ValueError: if ``urlparse`` rejects the URL (for example unmatched
            square brackets in the host part).
    """
    #third '/'-separated piece, i.e. whatever follows "scheme://"; the
    #IndexError raised here doubles as the "unexpected URL structure" check
    hostname = url.split('/')[2]
    parsed = urlparse(url)  #parse once instead of once per feature
    query = parsed.query

    num_dots = url.count('.')
    #NOTE(review): this counts '//' occurrences, not subdomain labels --
    #confirm it matches the feature definition the model was trained with
    subdomain_level = url.count('//')
    path_level = url.count('/') - 2  #subtract 2 for domain and protocol
    url_length = len(url)
    num_dash = url.count('-')
    num_dash_in_hostname = hostname.count('-')
    at_symbol = 1 if '@' in url else 0
    tilde_symbol = 1 if '~' in url else 0
    num_underscore = url.count('_')
    num_percent = url.count('%')
    #''.split('&') is [''], so an absent query must be special-cased to 0
    num_query_components = len(query.split('&')) if query else 0
    num_ampersand = url.count('&')
    num_hash = url.count('#')
    num_numeric_chars = sum(c.isdigit() for c in url)
    no_https = 0 if 'https' in url else 1
    random_string = 1 if re.search(r'[0-9]{5}', url) else 0
    #re.match only matches at position 0, so by itself it never sees the host
    #of a "scheme://1.2.3.4/..." URL; the parsed host is checked as well.
    #The start-anchored match is kept so input beginning with an IP still counts.
    parsed_host = parsed.hostname or ''
    starts_with_ip = re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', url)
    host_is_ip = re.fullmatch(r'\d{1,3}(?:\.\d{1,3}){3}', parsed_host)
    ip_address = 1 if (starts_with_ip or host_is_ip) else 0
    #NOTE(review): a plain "more than two dots" heuristic -- confirm intent
    domain_in_subdomains = 1 if num_dots > 2 else 0
    domain_in_paths = 1 if '/' + hostname.replace('.', '/') + '/' in url else 0
    https_in_hostname = 1 if 'https' in hostname else 0
    hostname_length = len(parsed.netloc)
    path_length = len(parsed.path)
    query_length = len(query)
    #look at the path only: the "//" after the scheme is present in every
    #URL that gets this far and would make the feature constant
    double_slash_in_path = 1 if '//' in parsed.path else 0
    num_sensitive_words = 1 if re.search(r'admin|login|password|user', url, re.IGNORECASE) else 0

    return [num_dots, subdomain_level, path_level, url_length, num_dash, num_dash_in_hostname, at_symbol,
            tilde_symbol, num_underscore, num_percent, num_query_components, num_ampersand, num_hash,
            num_numeric_chars, no_https, random_string, ip_address, domain_in_subdomains, domain_in_paths,
            https_in_hostname, hostname_length, path_length, query_length, double_slash_in_path,
            num_sensitive_words]


#function to extract features from URL and perform prediction
def predict_phishing(url):
    """Classify *url* with the module-level ``rf_model`` and return a message.

    Args:
        url: the URL string to check.

    Returns:
        "It's a PHISH! Proceed with caution..." when the model predicts 1,
        "No Phish! Swim freely :)" for any other prediction, or
        "URL structure is not as expected. Re-enter!" when the features
        cannot be extracted from the URL.
    """
    #only feature extraction is guarded, so an error raised by the model
    #itself is not misreported as a malformed URL
    try:
        features = _extract_url_features(url)
    except (IndexError, ValueError):
        return "URL structure is not as expected. Re-enter!"

    #perform prediction on a single-row frame
    input_data = pd.DataFrame([features])
    prediction = rf_model.predict(input_data)

    #return prediction result
    if prediction[0] == 1:
        return "It's a PHISH! Proceed with caution..."
    else:
        return "No Phish! Swim freely :)"

### **Text input & Output**

The UI for the demo

In [None]:
#build the three UI pieces first: URL text box, trigger button, result label
url_text_input = widgets.Text(description='URL:', placeholder='Enter URL')
predict_button = widgets.Button(description='Predict')
result_label = widgets.Label()

#render them top to bottom: input, then button, then the prediction output
display(url_text_input, predict_button, result_label)

#callback for the Predict button
def on_predict_button_clicked(b):
    """Show the prediction for the URL currently typed into the text box.

    The text is stripped of surrounding whitespace; if nothing is left the
    label asks for a URL instead of calling the predictor. ``b`` is unused.
    """
    entered_url = url_text_input.value.strip()
    if not entered_url:
        result_label.value = "Please enter a URL."
        return
    result_label.value = predict_phishing(entered_url)

#attach button click event: register on_predict_button_clicked as the handler
#that runs whenever predict_button is clicked
predict_button.on_click(on_predict_button_clicked)

Text(value='', description='URL:', placeholder='Enter URL')

Button(description='Predict', style=ButtonStyle())

Label(value='')

## **Examples:**

1. http:// t-info.mail.adobe.com/r/?id=hc43f43t4a,afd67070,affc7349&p1=t.mid.accor-mail.com/r/?id=159593f159593159593,hde43e13b13,ecdfafef,ee5cfa06&p1=filepmgklf.com/victimemail @domain.com

- this is phishing, and is detected as phishing

2. http://google.com-redirect@valimail.com

- this is phishing, and is detected as phishing

3. https://colab.research.google.com

- this is not phishing, and is detected as not phishing

4. https://chat.openai.com/c/2118426e-f3a8-4b6a-8200-b6a2fd5a8072

- this is not phishing, but is detected as phishing (a false positive)


**3 of 4 examples classified correctly (75%)**