In [3]:
from urllib.parse import urlsplit

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [5]:
# Read the feature table from 'Dataset.csv' (path relative to the working directory).
data = pd.read_csv('Dataset.csv')

In [11]:
# Show the column index so the label column and the feature names can be inspected.
data.columns

Index(['Type', 'url_length', 'number_of_dots_in_url',
       'having_repeated_digits_in_url', 'number_of_digits_in_url',
       'number_of_special_char_in_url', 'number_of_hyphens_in_url',
       'number_of_underline_in_url', 'number_of_slash_in_url',
       'number_of_questionmark_in_url', 'number_of_equal_in_url',
       'number_of_at_in_url', 'number_of_dollar_in_url',
       'number_of_exclamation_in_url', 'number_of_hashtag_in_url',
       'number_of_percent_in_url', 'domain_length', 'number_of_dots_in_domain',
       'number_of_hyphens_in_domain', 'having_special_characters_in_domain',
       'number_of_special_characters_in_domain', 'having_digits_in_domain',
       'number_of_digits_in_domain', 'having_repeated_digits_in_domain',
       'number_of_subdomains', 'having_dot_in_subdomain',
       'having_hyphen_in_subdomain', 'average_subdomain_length',
       'average_number_of_dots_in_subdomain',
       'average_number_of_hyphens_in_subdomain',
       'having_special_characters_

In [None]:
# Random forest

In [7]:
import re
import tldextract


# Character classes shared by the URL, domain and subdomain feature groups.
_SPECIAL_CHAR_RE = re.compile(r'[!@#$%^&*(),.?":{}|<>]')
_DIGIT_RE = re.compile(r'\d')
# The same digit three or more times in a row, e.g. "111".
_REPEATED_DIGIT_RE = re.compile(r'(\d)\1{2,}')
# A URL scheme immediately followed by "://", e.g. "http://".
_SCHEME_RE = re.compile(r'[A-Za-z][A-Za-z0-9+.\-]*://')


def _shannon_entropy(text):
    """Return the Shannon entropy of ``text`` in bits per character (0.0 for '')."""
    if not text:
        return 0.0
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # floating-point summation order is deterministic for a given string.
    probabilities = [text.count(ch) / len(text) for ch in dict.fromkeys(text)]
    return -sum(p * np.log2(p) for p in probabilities)


def _url_path(url):
    """Return the path component of ``url`` ('' when absent or unparseable)."""
    # urlsplit only recognises a host when it follows "//"; without the prefix a
    # scheme-less input such as "example.com/a" would be returned entirely as path.
    if _SCHEME_RE.match(url) or url.startswith('//'):
        candidate = url
    else:
        candidate = '//' + url
    try:
        return urlsplit(candidate).path
    except ValueError:
        # urlsplit rejects some malformed hosts (e.g. an unterminated IPv6 literal).
        return ''


def extract_features(url):
    """Compute the lexical feature vector for a single URL string.

    Parameters
    ----------
    url : str
        The URL to analyse. It may be empty or lack a scheme.

    Returns
    -------
    list
        41 numeric values, in the order of the names in the ``return``
        statement: whole-URL counts, registered-domain features, subdomain
        features, path/query/fragment flags, and the Shannon entropy of the
        URL and of the domain.

    Notes
    -----
    * A URL without a subdomain yields ``number_of_subdomains == 0`` and 0.0
      for the three subdomain averages.
    * The dot and hyphen averages are computed from the subdomain text itself,
      not from the whole URL.
    * ``path_length`` measures only the URL path; the TLD, query string and
      fragment are excluded.
    * An empty domain no longer raises ``ValueError``.
    * NOTE(review): the order presumably mirrors the non-target columns of
      Dataset.csv; confirm these definitions match how that table was built
      before feeding the output to a model trained on it.
    """
    # --- Whole-URL counts ---------------------------------------------------
    url_length = len(url)
    number_of_dots_in_url = url.count('.')
    having_repeated_digits_in_url = 1 if _REPEATED_DIGIT_RE.search(url) else 0
    number_of_digits_in_url = len(_DIGIT_RE.findall(url))
    number_of_special_char_in_url = len(_SPECIAL_CHAR_RE.findall(url))
    number_of_hyphens_in_url = url.count('-')
    number_of_underline_in_url = url.count('_')
    number_of_slash_in_url = url.count('/')
    number_of_questionmark_in_url = url.count('?')
    number_of_equal_in_url = url.count('=')
    number_of_at_in_url = url.count('@')
    number_of_dollar_in_url = url.count('$')
    number_of_exclamation_in_url = url.count('!')
    number_of_hashtag_in_url = url.count('#')
    number_of_percent_in_url = url.count('%')

    # --- Registered domain --------------------------------------------------
    ext = tldextract.extract(url)
    domain = ext.domain
    subdomain = ext.subdomain

    domain_length = len(domain)
    number_of_dots_in_domain = domain.count('.')
    number_of_hyphens_in_domain = domain.count('-')
    number_of_special_characters_in_domain = len(_SPECIAL_CHAR_RE.findall(domain))
    having_special_characters_in_domain = 1 if number_of_special_characters_in_domain else 0
    number_of_digits_in_domain = len(_DIGIT_RE.findall(domain))
    having_digits_in_domain = 1 if number_of_digits_in_domain else 0
    having_repeated_digits_in_domain = 1 if _REPEATED_DIGIT_RE.search(domain) else 0

    # --- Subdomain ----------------------------------------------------------
    # ''.split('.') is [''], which would count a missing subdomain as one label.
    subdomain_labels = subdomain.split('.') if subdomain else []
    number_of_subdomains = len(subdomain_labels)
    having_dot_in_subdomain = 1 if '.' in subdomain else 0
    having_hyphen_in_subdomain = 1 if '-' in subdomain else 0

    if subdomain_labels:
        average_subdomain_length = np.mean([len(label) for label in subdomain_labels])
        average_number_of_dots_in_subdomain = subdomain.count('.') / number_of_subdomains
        average_number_of_hyphens_in_subdomain = subdomain.count('-') / number_of_subdomains
    else:
        # No labels to average over; avoid np.mean([]) returning NaN.
        average_subdomain_length = 0.0
        average_number_of_dots_in_subdomain = 0.0
        average_number_of_hyphens_in_subdomain = 0.0

    number_of_special_characters_in_subdomain = len(_SPECIAL_CHAR_RE.findall(subdomain))
    having_special_characters_in_subdomain = 1 if number_of_special_characters_in_subdomain else 0
    number_of_digits_in_subdomain = len(_DIGIT_RE.findall(subdomain))
    having_digits_in_subdomain = 1 if number_of_digits_in_subdomain else 0
    having_repeated_digits_in_subdomain = 1 if _REPEATED_DIGIT_RE.search(subdomain) else 0

    # --- Path, query, fragment ----------------------------------------------
    path = _url_path(url)
    having_path = 1 if path else 0
    path_length = len(path)
    having_query = 1 if '?' in url else 0
    # Fragment and anchor are both signalled by '#', so the two flags always agree.
    having_fragment = 1 if '#' in url else 0
    having_anchor = 1 if '#' in url else 0

    # --- Entropy ------------------------------------------------------------
    entropy_of_url = _shannon_entropy(url)
    entropy_of_domain = _shannon_entropy(domain)

    # Return features as a list
    return [
        url_length, number_of_dots_in_url, having_repeated_digits_in_url,
        number_of_digits_in_url, number_of_special_char_in_url, number_of_hyphens_in_url,
        number_of_underline_in_url, number_of_slash_in_url, number_of_questionmark_in_url,
        number_of_equal_in_url, number_of_at_in_url, number_of_dollar_in_url,
        number_of_exclamation_in_url, number_of_hashtag_in_url, number_of_percent_in_url,
        domain_length, number_of_dots_in_domain, number_of_hyphens_in_domain,
        having_special_characters_in_domain, number_of_special_characters_in_domain,
        having_digits_in_domain, number_of_digits_in_domain, having_repeated_digits_in_domain,
        number_of_subdomains, having_dot_in_subdomain, having_hyphen_in_subdomain,
        average_subdomain_length, average_number_of_dots_in_subdomain,
        average_number_of_hyphens_in_subdomain, having_special_characters_in_subdomain,
        number_of_special_characters_in_subdomain, having_digits_in_subdomain,
        number_of_digits_in_subdomain, having_repeated_digits_in_subdomain, having_path,
        path_length, having_query, having_fragment, having_anchor,
        entropy_of_url, entropy_of_domain
    ]


In [4]:


# Read the feature table; 'Type' is the label, every other column is a feature.
data = pd.read_csv('Dataset.csv')

# Separate the label from the features and replace missing feature values with 0.
y = data['Type']
X = data.drop(columns='Type').fillna(0)

# Hold out 30% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise the features using statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a 100-tree random forest with a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict the held-out rows and print per-class precision, recall and F1.
y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96     38569
           1       0.97      0.95      0.96     35816

    accuracy                           0.96     74385
   macro avg       0.96      0.96      0.96     74385
weighted avg       0.96      0.96      0.96     74385



In [6]:
# Remaining samples after outlier removal: 0
# All samples were removed by the outlier detection. Consider relaxing the filtering criteria.

# Hence no outlier treatment was applied.

In [7]:
import joblib

# Serialise the fitted estimator and the fitted scaler to the working directory,
# presumably so the same scaling can be reapplied before predicting later on.
# NOTE(review): this writes whatever `model` and `scaler` currently reference in
# the kernel; several cells rebind those names, so confirm the intended training
# cell was the last one executed before this one.
joblib.dump(model, 'phishing_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [None]:
###


###


#  LOGISTIC REGRESSION

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Read the feature table; 'Type' is the label, every other column is a feature.
data = pd.read_csv('Dataset.csv')

# Separate the label from the features and replace missing feature values with 0.
y = data['Type']
X = data.drop(columns='Type').fillna(0)

# Encode the label as integers when it is stored as text or as a categorical.
if y.dtype == 'object' or y.dtype == 'category':
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

# Hold out 30% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise the features using statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic-regression classifier on the standardised training rows.
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Predict the held-out rows, then report per-class metrics and overall accuracy.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}")


In [None]:
######

##########

#### DECISION TREE

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Read the feature table; 'Type' is the label, every other column is a feature.
data = pd.read_csv('Dataset.csv')

# Separate the label from the features and replace missing feature values with 0.
y = data['Type']
X = data.drop(columns='Type').fillna(0)

# Encode the label as integers when it is stored as text or as a categorical.
if y.dtype == 'object' or y.dtype == 'category':
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

# Hold out 30% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Scaling is not required for a decision tree; it is kept so this cell
# preprocesses the data the same way as the other model cells.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit an unconstrained decision tree with a fixed seed.
model = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict the held-out rows, then report per-class metrics and overall accuracy.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}")


Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     38569
           1       0.95      0.94      0.94     35816

    accuracy                           0.95     74385
   macro avg       0.95      0.95      0.95     74385
weighted avg       0.95      0.95      0.95     74385

Accuracy: 0.95


In [5]:
# Depth-limited variant: cap the tree at 5 levels and refit on the scaled
# training rows produced by the preceding decision-tree cell.
modelll = DecisionTreeClassifier(max_depth=5, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict the held-out rows, then report per-class metrics and overall accuracy.
y_pred = modelll.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}")



Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.90      0.84     38569
           1       0.87      0.73      0.79     35816

    accuracy                           0.82     74385
   macro avg       0.83      0.81      0.81     74385
weighted avg       0.82      0.82      0.82     74385

Accuracy: 0.82


In [7]:
#####################

#########SVM ################


############################################

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Read the feature table; 'Type' is the label, every other column is a feature.
data = pd.read_csv('Dataset.csv')

# Separate the label from the features and replace missing feature values with 0.
y = data['Type']
X = data.drop(columns='Type').fillna(0)

# Encode the label as integers when it is stored as text or as a categorical.
if y.dtype == 'object' or y.dtype == 'category':
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

# Hold out 30% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise the features; the SVM is fitted on the scaled values.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a support vector classifier with a linear kernel.
# NOTE(review): the other cells report 74385 held-out rows, so the training set
# is large; fitting SVC at this size may be very slow. Confirm this cell is
# practical to run as written.
model = SVC(kernel='linear', random_state=42).fit(X_train_scaled, y_train)

# Predict the held-out rows, then report per-class metrics and overall accuracy.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}")
