In [17]:
import pandas as pd
import urllib
from urllib.parse import urlparse
from sklearn.preprocessing import StandardScaler
import pickle

def preprocess_and_extract_features(df):
    """Split every URL into its components and append lexical count features.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``url`` column holding URL strings. An empty frame is
        accepted and simply gains the (empty) feature columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns appended in this order:
        ``protocol``, ``domain``, ``path``, ``query``, ``fragment``, and then
        for each of ``url``, ``domain``, ``path``, ``query``, ``fragment``:
        ``<col>_length`` followed by one ``qty_<symbol>_<col>`` column per
        tracked symbol.
    """
    component_names = ('protocol', 'domain', 'path', 'query', 'fragment')

    # (feature label, character counted). The order here fixes the order of
    # the generated columns, so it must not be rearranged.
    tracked_symbols = (
        ('dot', '.'),
        ('hyphen', '-'),
        ('slash', '/'),
        ('questionmark', '?'),
        ('equal', '='),
        ('at', '@'),
        ('and', '&'),
        ('exclamation', '!'),
        ('space', ' '),
        ('tilde', '~'),
        ('comma', ','),
        ('plus', '+'),
        ('asterisk', '*'),
        ('hashtag', '#'),
        ('dollar', '$'),
        ('percent', '%'),
    )

    # Step 1: Extract URL components (protocol, domain, path, query, fragment) for each URL.
    # Building one column at a time (instead of unpacking zip(*...)) keeps this
    # working when the frame has no rows.
    split_urls = [urllib.parse.urlsplit(address) for address in df['url']]
    for position, name in enumerate(component_names):
        df[name] = pd.Series(
            [parts[position] for parts in split_urls],
            index=df.index,
            dtype=object,
        )

    # Step 2: Length and per-symbol counts for the full URL and each component.
    # Series.map replaces DataFrame.applymap, which pandas has deprecated.
    for col in ('url', 'domain', 'path', 'query', 'fragment'):
        df[f'{col}_length'] = df[col].str.len()
        for label, symbol in tracked_symbols:
            df[f'qty_{label}_{col}'] = df[col].map(
                lambda text, symbol=symbol: text.count(symbol)
            )

    return df

# Load the trained RandomForestClassifier model.
# The context manager closes the file handle once the model is deserialized.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
with open('E:/BTECH/cstk/Phishing-URL-Detection-main/code/rfc.pkl', 'rb') as model_file:
    rfc_model = pickle.load(model_file)

def predict_phishing(url, threshold=0.5):
    """Classify a single URL with the loaded model and print the verdict.

    Features are extracted with ``preprocess_and_extract_features``, reduced
    to the model's input columns, standardized, and passed to
    ``rfc_model.predict_proba``.

    Parameters
    ----------
    url : str
        The URL to classify.
    threshold : float, optional
        Minimum probability of class 1 at which the URL is reported as
        phishing. Defaults to 0.5.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Preprocess the input URL and extract features
    data = {
        'url': [url]
    }
    df = pd.DataFrame(data)
    df_with_features = preprocess_and_extract_features(df)

    # Remove the raw string columns and the count features that are not model inputs.
    df_with_features = df_with_features.drop(columns=[
        'url', 'protocol', 'domain', 'path', 'query', 'fragment',
        'qty_slash_domain', 'qty_questionmark_domain', 'qty_equal_domain',
        'qty_at_domain', 'qty_and_domain', 'qty_exclamation_domain',
        'qty_space_domain', 'qty_tilde_domain', 'qty_comma_domain',
        'qty_plus_domain', 'qty_asterisk_domain', 'qty_hashtag_domain',
        'qty_dollar_domain', 'qty_percent_domain', 'qty_questionmark_path',
        'qty_hashtag_path', 'qty_hashtag_query', 'qty_at_fragment',
        'qty_tilde_fragment', 'qty_plus_fragment',
    ])

    # Standardize the features with a scaler fitted on the training CSV.
    # NOTE(review): this refits a new StandardScaler on every call rather than
    # reusing the one from training; presumably the CSV reproduces the
    # training-time statistics - confirm. The unscaled variant of this function
    # later in the notebook classifies the same sample URL differently, so also
    # confirm whether the model was trained on scaled features at all.
    X_train = pd.read_csv('E:\\BTECH\\cstk\\Phishing-URL-Detection-main\\data\\url_updated.csv')
    X_train = X_train.drop(columns=['url', 'protocol', 'domain', 'path', 'query', 'fragment','phishing'])
    ss = StandardScaler()
    ss.fit(X_train)
    user_input_features_sc = ss.transform(df_with_features)

    # Make predictions using the trained RandomForestClassifier
    probability_scores = rfc_model.predict_proba(user_input_features_sc)

    # Probability of class 1 (treated as "phishing") for the single input row
    phishing_probability = probability_scores[0, 1]

    # Report the verdict against the caller-supplied threshold
    if phishing_probability >= threshold:
        print(f"The URL '{url}' is classified as phishing with probability {phishing_probability:.4f}.")
    else:
        print(f"The URL '{url}' is classified as non-phishing ")

# Example usage:
# Strip surrounding whitespace so a pasted link does not carry stray spaces,
# which would otherwise change the length and space-count features.
url = input("Enter Link:").strip()
# Sample URL for a non-interactive run (uncomment to use instead of the prompt):
#url = "http://news.co.global.prod.fastly.net/NjFmMmFjOWMwZjMxMzZkZjBhYWJhZQ==/?type=ist&orders=780653221&auth=ODZlYWQxZTYwOTk0N2Q5OTQwN2FkYQ=="
predict_phishing(url)


Enter Link:https://practice.geeksforgeeks.org/contest/megajob-a-thon-hiring-challenge-freshers
The URL 'https://practice.geeksforgeeks.org/contest/megajob-a-thon-hiring-challenge-freshers' is classified as phishing with probability 0.7754.


In [11]:
import os
# Show the kernel's current working directory.
print(os.getcwd())


C:\Users\rudra


In [1]:
import pandas as pd
import urllib
from urllib.parse import urlparse
import pickle

def preprocess_and_extract_features(df):
    """Split every URL into its components and append lexical count features.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``url`` column holding URL strings. An empty frame is
        accepted and simply gains the (empty) feature columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns appended in this order:
        ``protocol``, ``domain``, ``path``, ``query``, ``fragment``, and then
        for each of ``url``, ``domain``, ``path``, ``query``, ``fragment``:
        ``<col>_length`` followed by one ``qty_<symbol>_<col>`` column per
        tracked symbol.
    """
    component_names = ('protocol', 'domain', 'path', 'query', 'fragment')

    # (feature label, character counted). The order here fixes the order of
    # the generated columns, so it must not be rearranged.
    tracked_symbols = (
        ('dot', '.'),
        ('hyphen', '-'),
        ('slash', '/'),
        ('questionmark', '?'),
        ('equal', '='),
        ('at', '@'),
        ('and', '&'),
        ('exclamation', '!'),
        ('space', ' '),
        ('tilde', '~'),
        ('comma', ','),
        ('plus', '+'),
        ('asterisk', '*'),
        ('hashtag', '#'),
        ('dollar', '$'),
        ('percent', '%'),
    )

    # Step 1: Extract URL components (protocol, domain, path, query, fragment) for each URL.
    # Building one column at a time (instead of unpacking zip(*...)) keeps this
    # working when the frame has no rows.
    split_urls = [urllib.parse.urlsplit(address) for address in df['url']]
    for position, name in enumerate(component_names):
        df[name] = pd.Series(
            [parts[position] for parts in split_urls],
            index=df.index,
            dtype=object,
        )

    # Step 2: Length and per-symbol counts for the full URL and each component.
    # Series.map replaces DataFrame.applymap, which pandas has deprecated.
    for col in ('url', 'domain', 'path', 'query', 'fragment'):
        df[f'{col}_length'] = df[col].str.len()
        for label, symbol in tracked_symbols:
            df[f'qty_{label}_{col}'] = df[col].map(
                lambda text, symbol=symbol: text.count(symbol)
            )

    return df

# Load the trained RandomForestClassifier model.
# The context manager closes the file handle once the model is deserialized.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
with open('E:/BTECH/cstk/Phishing-URL-Detection-main/code/rfc.pkl', 'rb') as model_file:
    rfc_model = pickle.load(model_file)

def predict_phishing(url, threshold=0.5):
    """Classify a single URL with the loaded model and print the verdict.

    Features are extracted with ``preprocess_and_extract_features``, reduced
    to the model's input columns, and passed unscaled to
    ``rfc_model.predict_proba``.

    Parameters
    ----------
    url : str
        The URL to classify.
    threshold : float, optional
        Minimum probability of class 1 at which the URL is reported as
        phishing. Defaults to 0.5.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Preprocess the input URL and extract features
    data = {
        'url': [url]
    }
    df = pd.DataFrame(data)
    df_with_features = preprocess_and_extract_features(df)

    # Drop columns not used during training
    columns_to_drop = [
        'url', 'protocol', 'domain', 'path', 'query', 'fragment',
        'qty_slash_domain', 'qty_questionmark_domain', 'qty_equal_domain',
        'qty_at_domain', 'qty_and_domain', 'qty_exclamation_domain',
        'qty_space_domain', 'qty_tilde_domain', 'qty_comma_domain',
        'qty_plus_domain', 'qty_asterisk_domain', 'qty_hashtag_domain',
        'qty_dollar_domain', 'qty_percent_domain', 'qty_questionmark_path',
        'qty_hashtag_path', 'qty_hashtag_query', 'qty_at_fragment',
        'qty_tilde_fragment', 'qty_plus_fragment',
    ]
    df_with_features = df_with_features.drop(columns=columns_to_drop)

    # Make predictions using the trained RandomForestClassifier
    probability_scores = rfc_model.predict_proba(df_with_features)

    # Probability of class 1 (treated as "phishing") for the single input row
    phishing_probability = probability_scores[0, 1]

    # Report the verdict against the caller-supplied threshold
    if phishing_probability >= threshold:
        print(f"The URL '{url}' is classified as phishing with probability {phishing_probability:.4f}.")
    else:
        print(f"The URL '{url}' is classified as non-phishing ")

# Example usage:
# Strip surrounding whitespace so a pasted link does not carry stray spaces,
# which would otherwise change the length and space-count features.
url = input("Enter Link:").strip()
# Sample URL for a non-interactive run (uncomment to use instead of the prompt):
#url = "http://news.co.global.prod.fastly.net/NjFmMmFjOWMwZjMxMzZkZjBhYWJhZQ==/?type=ist&orders=780653221&auth=ODZlYWQxZTYwOTk0N2Q5OTQwN2FkYQ=="
predict_phishing(url)


Enter Link:https://practice.geeksforgeeks.org/contest/megajob-a-thon-hiring-challenge-freshers
The URL 'https://practice.geeksforgeeks.org/contest/megajob-a-thon-hiring-challenge-freshers' is classified as non-phishing 
