In [None]:
# Importing modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import *


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis


import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the dataset from a CSV file located in the working directory
csv_path = "phishing.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw frame
df.head(n=5)


In [None]:
# Checking for NA: count the missing values in each column. Displaying the raw
# boolean frame gets truncated and cannot show whether anything is missing.
df.isna().sum()


In [None]:
# Checking for NULL: isnull() is an alias of isna(), so instead of repeating a
# boolean frame this reports the total number of missing cells in the dataset.
df.isnull().sum().sum()


In [None]:
# Show the column names of the raw frame
column_index = df.columns
column_index

In [None]:
# Verifying that all values are not null: collapse the per-cell flags to a
# single boolean (True only when no cell in the frame is missing).
df.notnull().all().all()


In [None]:
# Build the input frame: every column whose name does not contain 'class'.
# NOTE: despite its name, `labels` holds the predictor columns, not the target.
is_class_column = df.columns.str.contains('class')
labels = df.loc[:, ~is_class_column]
labels


In [None]:

# Absolute pairwise correlations between the input columns
corr_matrix = labels.corr().abs()

# Mask that is True strictly above the diagonal, so each pair is kept once
# and self-correlations are dropped
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
print(pd.DataFrame(upper))

In [None]:
# Positional indices of the columns kept for modelling
relevant = [1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 13, 14, 15, 16, 17, 18, 23, 24, 25, 28, 30]

# Index into the full set of non-'class' columns taken from `df` rather than
# from `labels`: a later cell reassigns `labels` to the selected subset, after
# which `labels.columns` is too short for these indices and re-running this
# cell would raise an IndexError.
candidate_columns = df.columns[~df.columns.str.contains('class')]
features = np.array(candidate_columns)[relevant]
features

In [None]:
# Keep only the selected columns in the input frame
labels = labels.loc[:, features]


In [None]:
# Pull out the column the classifiers are trained to predict
target_column = 'class'
target = df[target_column]
target



In [None]:
# Split inputs and target into train/test partitions with a fixed seed so the
# partition is reproducible; the split proportions are scikit-learn's defaults.
split = train_test_split(labels, target, random_state=42)
training_labels, testing_labels, training_target, testing_target = split


In [None]:
# Opt-in hyperparameter search. The search is slow, so it only runs when this
# flag is set to True; by default the cell just defines the helper.
RUN_GRID_SEARCH = False


def run_grid_search(X_train, y_train, X_test, y_test, cv=5):
    """Grid-search each candidate classifier and print its scores.

    Classifiers that have a parameter grid are tuned with ``GridSearchCV``;
    the ones without a grid (GaussianNB, LDA, QDA) are fitted once as-is.
    The classifier classes are the ones imported in the notebook's import cell.

    Args:
        X_train: training inputs.
        y_train: training target.
        X_test: held-out inputs used for the reported test accuracy.
        y_test: held-out target used for the reported test accuracy.
        cv: number of cross-validation folds passed to ``GridSearchCV``.

    Returns:
        dict mapping each classifier's class name to the fitted
        ``GridSearchCV`` object, or to the fitted estimator itself when no
        parameter grid is defined for it.
    """
    # Imported here so the notebook's import cell does not need to change.
    from sklearn.model_selection import GridSearchCV

    # Hyperparameter ranges explored for each classifier
    param_grid_lr = {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 10],
        'solver': ['liblinear', 'saga']
    }
    param_grid_svc = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }
    param_grid_rf = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 20],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    param_grid_gb = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.05, 0.1, 0.2],
        'max_depth': [3, 5, 10]
    }
    param_grid_dt = {
        'max_depth': [5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    param_grid_knn = {
        'n_neighbors': [5, 10, 15],
        'weights': ['uniform', 'distance']
    }
    param_grid_ada = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.1, 0.5, 1.0]
    }

    # (estimator, grid) pairs; a grid of None means "fit once, no search"
    candidates = [
        (LogisticRegression(), param_grid_lr),
        (SVC(), param_grid_svc),
        (RandomForestClassifier(random_state=42), param_grid_rf),
        (GradientBoostingClassifier(random_state=42), param_grid_gb),
        (DecisionTreeClassifier(random_state=42), param_grid_dt),
        (KNeighborsClassifier(), param_grid_knn),
        (GaussianNB(), None),
        (AdaBoostClassifier(random_state=42), param_grid_ada),
        (LinearDiscriminantAnalysis(), None),
        (QuadraticDiscriminantAnalysis(), None)
    ]

    fitted = {}
    for clf, param_grid in candidates:
        name = clf.__class__.__name__
        print(name)
        if param_grid is not None:
            search = GridSearchCV(clf, param_grid=param_grid, cv=cv)
            search.fit(X_train, y_train)
            print("Best parameters:", search.best_params_)
            # best_score_ is the mean cross-validated score of the best
            # parameter setting, not an accuracy on the training set.
            print("Best CV accuracy:", search.best_score_)
            print("Test accuracy:", search.score(X_test, y_test))
            fitted[name] = search
        else:
            clf.fit(X_train, y_train)
            print("Training accuracy:", clf.score(X_train, y_train))
            print("Test accuracy:", clf.score(X_test, y_test))
            fitted[name] = clf
        print("---")
    return fitted


if RUN_GRID_SEARCH:
    grid_search_results = run_grid_search(
        training_labels, training_target, testing_labels, testing_target
    )

In [None]:
# Candidate models, each built with fixed hyperparameters.
# The order is significant: downstream cells plot in this order and read the
# fitted models back by position.
classifiers = [
    LogisticRegression(
        penalty='l1',
        C=10,
        solver='liblinear',
    ),
    SVC(
        kernel='rbf',
        C=1.0,
        gamma='scale',
    ),
    RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
    ),
    GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=100,
        max_depth=5,
        random_state=42,
    ),
    DecisionTreeClassifier(
        max_depth=10,
        random_state=42,
    ),
    KNeighborsClassifier(
        n_neighbors=5,
        weights='uniform',
    ),
    GaussianNB(),
    AdaBoostClassifier(
        n_estimators=100,
        learning_rate=1.0,
        random_state=42,
    ),
    LinearDiscriminantAnalysis(
        solver='svd',
    ),
    QuadraticDiscriminantAnalysis(),
]



In [None]:
# Test-set predictions, keyed by each classifier's string representation
predictions = {}

# Fitted estimators, in the same order as `classifiers`
models = []

# Fit every classifier, store its test-set predictions and plot its confusion matrix
for classifier in classifiers:
    # Computed once so the dict key and the plot title always agree
    clf_name = str(classifier)

    # Fit on the underlying arrays rather than on the pandas objects
    classifier.fit(training_labels.values, training_target.values)
    models.append(classifier)

    predictions[clf_name] = classifier.predict(testing_labels.values)

    cm = confusion_matrix(y_true=testing_target.values, y_pred=predictions[clf_name])

    # One explicit figure per classifier. fmt='d' prints the cell counts as
    # plain integers; the heatmap's default number format would render large
    # counts in scientific notation.
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues')
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(clf_name)
    plt.show()
    # Release the figure so figures do not pile up across iterations
    plt.close(fig)

  


In [None]:
# Score every classifier's test-set predictions: plain accuracy plus
# support-weighted F1, recall and precision. Each list follows the
# iteration order of `predictions`.
weighted_scorers = {
    'f1': f1_score,
    'recall': recall_score,
    'precision': precision_score,
}

metrics = {'accuracy': [], 'f1': [], 'recall': [], 'precision': []}
for predicted in predictions.values():
    metrics['accuracy'].append(accuracy_score(testing_target, predicted))
    for metric_name, scorer in weighted_scorers.items():
        metrics[metric_name].append(scorer(testing_target, predicted, average='weighted'))
    

In [None]:
# Examining the performance metrics using a grouped bar chart

# Tick labels come from the `predictions` keys: the metric lists were built by
# iterating `predictions`, so this keeps every group of bars aligned with its
# label and passes plain strings (not estimator objects) to matplotlib.
classifier_names = list(predictions.keys())

fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(classifier_names))
width = 0.2

# (offset in bar widths, key in `metrics`, legend label); four bars are
# centred on each tick
bar_layout = [
    (-1.5, 'accuracy', 'Accuracy'),
    (-0.5, 'precision', 'Precision'),
    (0.5, 'recall', 'Recall'),
    (1.5, 'f1', 'F1-score'),
]
for offset, metric_key, legend_label in bar_layout:
    ax.bar(x + width * offset, metrics[metric_key], width, label=legend_label)

ax.set_xticks(x)
ax.set_xticklabels(classifier_names, rotation=45, ha='right')
ax.legend()
ax.set_ylabel('Score')
ax.set_title('Performance metrics of classifiers')
plt.show()

In [None]:
# Tabulate the metrics with one row per classifier; the classifier name
# column is placed first for readability.
column_order = ['Classifier', 'accuracy', 'f1', 'recall', 'precision']
performance = (
    pd.DataFrame(metrics)
    .assign(Classifier=list(predictions.keys()))
    .loc[:, column_order]
)

# Show up to the first 11 rows
performance.head(11)

In [None]:
import joblib
from pathlib import Path

# Make sure the output directory exists before writing: the dump below opens
# the target path for writing and fails if the 'model' folder is missing.
Path('model').mkdir(parents=True, exist_ok=True)

# Persist each fitted model to its own file, named after the estimator class
# and written with maximum compression.
for model in models:
    joblib.dump(model, f'model/{model.__class__.__name__}.pkl', compress=9)


In [None]:
# Position 2 is the random forest when `models` follows the order of the
# `classifiers` list; print its class name and per-feature importances.
forest = models[2]
print(type(forest).__name__, forest.feature_importances_)

In [None]:
# Training-set predictions, keyed by each model's string representation
predictions_train = {}

# Reuse the already-fitted estimators in `models`. The list must not be reset
# or appended to here: resetting it makes the loop a no-op, and appending while
# iterating would never terminate.
for model in models:
    model_name = str(model)

    # Predict on the data the model was fitted on, to compare against the
    # test-set results
    predictions_train[model_name] = model.predict(training_labels.values)

    cm = confusion_matrix(y_true=training_target.values, y_pred=predictions_train[model_name])

    # One explicit figure per model; fmt='d' prints the counts as integers
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues')
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(f'{model_name} (training set)')
    plt.show()
    plt.close(fig)

  
