In [3]:
import pandas as pd

In [4]:
full_data = pd.read_csv('dataset_full.csv')
test_data = pd.read_csv('dataset_small.csv')

In [6]:
test_data.head()

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
0,2,0,0,0,0,0,0,0,0,0,...,1,4,2,3598,0,0,0,0,0,0
1,4,0,0,2,0,0,0,0,0,0,...,1,4,1,3977,1,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,1,2,1,10788,0,0,0,0,0,0
3,2,0,0,3,0,0,0,0,0,0,...,1,2,1,14339,1,0,0,0,0,1
4,1,1,0,4,0,0,0,0,0,0,...,1,2,1,389,1,1,0,0,0,1


In [7]:
test_data.shape

(58645, 112)

In [6]:
# use 0.1 percent of the data for training
train_data = full_data.sample(frac=1, random_state=1)


In [36]:
train_data.shape
    

(88647, 112)

In [13]:
train_data.columns

Index(['qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url',
       'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url',
       'qty_exclamation_url', 'qty_space_url',
       ...
       'qty_ip_resolved', 'qty_nameservers', 'qty_mx_servers', 'ttl_hostname',
       'tls_ssl_certificate', 'qty_redirects', 'url_google_index',
       'domain_google_index', 'url_shortened', 'phishing'],
      dtype='object', length=112)

In [7]:
X_train = train_data.drop('phishing', axis=1)
y_train = train_data['phishing']

In [25]:
y_train.shape

(886,)

In [22]:
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

In [23]:
# check for missing values

X_train.isnull().sum()

qty_dot_url             0
qty_hyphen_url          0
qty_underline_url       0
qty_slash_url           0
qty_questionmark_url    0
                       ..
tls_ssl_certificate     0
qty_redirects           0
url_google_index        0
domain_google_index     0
url_shortened           0
Length: 111, dtype: int64

In [None]:
# try to run 10 classification algorithms on the data
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


models = [
    ('LR', LogisticRegression()),
    ('CART', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('NN', MLPClassifier()),
    ('GB', GradientBoostingClassifier()),
    ('AB', AdaBoostClassifier()),
    ('BG', BaggingClassifier()),
    ('ET', ExtraTreesClassifier())
]

results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = StratifiedKFold(n_splits=10)
    pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', model)])
    cv_results = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

In [31]:
# use NN model for prediction
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

model = MLPClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(test_data.drop('phishing', axis=1))

accuracy_score(test_data['phishing'], y_pred)

print(classification_report(test_data['phishing'], y_pred))


              precision    recall  f1-score   support

           0       0.77      0.76      0.77     27998
           1       0.78      0.80      0.79     30647

    accuracy                           0.78     58645
   macro avg       0.78      0.78      0.78     58645
weighted avg       0.78      0.78      0.78     58645



In [33]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

model = MLPClassifier( activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
                      beta_2=0.999, early_stopping=False, epsilon=1e-08,
                      hidden_layer_sizes=(100,), learning_rate='constant',
                      learning_rate_init=0.001, max_iter=200, momentum=0.9,
                      nesterovs_momentum=True, power_t=0.5, random_state=1,
                      shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
                      verbose=False, warm_start=False)

model.fit(X_train, y_train)

y_pred = model.predict(test_data.drop('phishing', axis=1))

accuracy_score(test_data['phishing'], y_pred)

print(classification_report(test_data['phishing'], y_pred))

              precision    recall  f1-score   support

           0       0.91      0.52      0.66     27998
           1       0.68      0.95      0.80     30647

    accuracy                           0.74     58645
   macro avg       0.79      0.73      0.73     58645
weighted avg       0.79      0.74      0.73     58645



In [34]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

model = MLPClassifier( activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
                      beta_2=0.999, early_stopping=False, epsilon=1e-08,
                      hidden_layer_sizes=(100,), learning_rate='constant',
                      learning_rate_init=0.001, max_iter=200, momentum=0.9,
                      nesterovs_momentum=True, power_t=0.5, random_state=1,
                      shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
                      verbose=False, warm_start=False)

model.fit(X_train, y_train)

y_pred = model.predict(test_data.drop('phishing', axis=1))

accuracy_score(test_data['phishing'], y_pred)

print(classification_report(test_data['phishing'], y_pred))

              precision    recall  f1-score   support

           0       0.90      0.68      0.77     27998
           1       0.76      0.93      0.84     30647

    accuracy                           0.81     58645
   macro avg       0.83      0.81      0.81     58645
weighted avg       0.83      0.81      0.81     58645



In [37]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

model = MLPClassifier( activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
                      beta_2=0.999, early_stopping=False, epsilon=1e-08,
                      hidden_layer_sizes=(100,), learning_rate='constant',
                      learning_rate_init=0.001, max_iter=200, momentum=0.9,
                      nesterovs_momentum=True, power_t=0.5, random_state=1,
                      shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
                      verbose=False, warm_start=False)

model.fit(X_train, y_train)

y_pred = model.predict(test_data.drop('phishing', axis=1))

accuracy_score(test_data['phishing'], y_pred)

print(classification_report(test_data['phishing'], y_pred))

              precision    recall  f1-score   support

           0       0.90      0.68      0.77     27998
           1       0.76      0.93      0.84     30647

    accuracy                           0.81     58645
   macro avg       0.83      0.81      0.81     58645
weighted avg       0.83      0.81      0.81     58645



In [40]:
# use pycaret
from pycaret.classification import *

exp1 = setup(data = train_data, target = 'phishing', session_id=123)

best_model = compare_models()

best_model

Unnamed: 0,Description,Value
0,Session id,123
1,Target,phishing
2,Target type,Binary
3,Original data shape,"(88647, 112)"
4,Transformed data shape,"(88647, 112)"
5,Transformed train set shape,"(62052, 112)"
6,Transformed test set shape,"(26595, 112)"
7,Numeric features,111
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9705,0.9948,0.9601,0.9548,0.9575,0.9349,0.9349,1.179
et,Extra Trees Classifier,0.9698,0.9947,0.9588,0.9541,0.9564,0.9333,0.9333,1.431
xgboost,Extreme Gradient Boosting,0.9697,0.9949,0.9565,0.9558,0.9561,0.9329,0.933,0.391
lightgbm,Light Gradient Boosting Machine,0.9668,0.9944,0.9532,0.9509,0.9521,0.9267,0.9267,0.467
gbc,Gradient Boosting Classifier,0.9541,0.9898,0.9343,0.933,0.9336,0.8985,0.8985,2.776
dt,Decision Tree Classifier,0.9535,0.9481,0.9304,0.9347,0.9325,0.897,0.897,0.244
ada,Ada Boost Classifier,0.9383,0.9849,0.9104,0.9111,0.9107,0.8636,0.8636,1.316
ridge,Ridge Classifier,0.916,0.0,0.9525,0.8297,0.8869,0.8205,0.8255,0.13
lda,Linear Discriminant Analysis,0.9153,0.9768,0.9538,0.8275,0.8862,0.8193,0.8245,0.725
lr,Logistic Regression,0.9148,0.9697,0.9138,0.8509,0.8812,0.8149,0.8162,5.837


In [10]:
from sklearn.ensemble import RandomForestClassifier


rfc = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       monotonic_cst=None, n_estimators=100, n_jobs=-1,
                       oob_score=False, random_state=123, verbose=0,
                       warm_start=False)

rfc.fit(X_train, y_train)


In [12]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = rfc.predict(test_data.drop('phishing', axis=1))

accuracy_score(test_data['phishing'], y_pred)

print(classification_report(test_data['phishing'], y_pred))



              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27998
           1       1.00      1.00      1.00     30647

    accuracy                           1.00     58645
   macro avg       1.00      1.00      1.00     58645
weighted avg       1.00      1.00      1.00     58645



In [16]:
columns = X_train.columns.tolist()

print(columns)

['qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url', 'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url', 'qty_exclamation_url', 'qty_space_url', 'qty_tilde_url', 'qty_comma_url', 'qty_plus_url', 'qty_asterisk_url', 'qty_hashtag_url', 'qty_dollar_url', 'qty_percent_url', 'qty_tld_url', 'length_url', 'qty_dot_domain', 'qty_hyphen_domain', 'qty_underline_domain', 'qty_slash_domain', 'qty_questionmark_domain', 'qty_equal_domain', 'qty_at_domain', 'qty_and_domain', 'qty_exclamation_domain', 'qty_space_domain', 'qty_tilde_domain', 'qty_comma_domain', 'qty_plus_domain', 'qty_asterisk_domain', 'qty_hashtag_domain', 'qty_dollar_domain', 'qty_percent_domain', 'qty_vowels_domain', 'domain_length', 'domain_in_ip', 'server_client_domain', 'qty_dot_directory', 'qty_hyphen_directory', 'qty_underline_directory', 'qty_slash_directory', 'qty_questionmark_directory', 'qty_equal_directory', 'qty_at_directory', 'qty_and_directory', 'qty_exclamation_directory', 'qty_spac

In [14]:
# save the model
import joblib

joblib.dump(rfc, 'phishing_model.pkl')



['phishing_model.pkl']

In [15]:
# deploy the model using flask
from flask import Flask, request, jsonify
import joblib
import numpy as np

app = Flask(__name__)

model = joblib.load('phishing_model.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    prediction = model.predict([np.array(list(data.values()))])
    output = prediction[0]
    return jsonify(output)

#api server health check
@app.route('/', methods=['GET'])
def health():
    return jsonify("Healthy")

if __name__ == '__main__':
    app.run(port=5000, debug=True)


# sample request body with 111 features ['qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url', 'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url', 'qty_exclamation_url', 'qty_space_url', 'qty_tilde_url', 'qty_comma_url', 'qty_plus_url', 'qty_asterisk_url', 'qty_hashtag_url', 'qty_dollar_url', 'qty_percent_url', 'qty_tld_url', 'length_url', 'qty_dot_domain', 'qty_hyphen_domain', 'qty_underline_domain', 'qty_slash_domain', 'qty_questionmark_domain', 'qty_equal_domain', 'qty_at_domain', 'qty_and_domain', 'qty_exclamation_domain', 'qty_space_domain', 'qty_tilde_domain', 'qty_comma_domain', 'qty_plus_domain', 'qty_asterisk_domain', 'qty_hashtag_domain', 'qty_dollar_domain', 'qty_percent_domain', 'qty_vowels_domain', 'domain_length', 'domain_in_ip', 'server_client_domain', 'qty_dot_directory', 'qty_hyphen_directory', 'qty_underline_directory', 'qty_slash_directory', 'qty_questionmark_directory', 'qty_equal_directory', 'qty_at_directory', 'qty_and_directory', 'qty_exclamation_directory', 'qty_space_directory', 'qty_tilde_directory', 'qty_comma_directory', 'qty_plus_directory', 'qty_asterisk_directory', 'qty_hashtag_directory', 'qty_dollar_directory', 'qty_percent_directory', 'directory_length', 'qty_dot_file', 'qty_hyphen_file', 'qty_underline_file', 'qty_slash_file', 'qty_questionmark_file', 'qty_equal_file', 'qty_at_file', 'qty_and_file', 'qty_exclamation_file', 'qty_space_file', 'qty_tilde_file', 'qty_comma_file', 'qty_plus_file', 'qty_asterisk_file', 'qty_hashtag_file', 'qty_dollar_file', 'qty_percent_file', 'file_length', 'qty_dot_params', 'qty_hyphen_params', 'qty_underline_params', 'qty_slash_params', 'qty_questionmark_params', 'qty_equal_params', 'qty_at_params', 'qty_and_params', 'qty_exclamation_params', 'qty_space_params', 'qty_tilde_params', 'qty_comma_params', 'qty_plus_params', 'qty_asterisk_params', 'qty_hashtag_params', 'qty_dollar_params', 'qty_percent_params', 'params_length', 'tld_present_params', 'qty_params', 'email_in_url', 'time_response', 'domain_spf', 'asn_ip', 'time_domain_activation', 'time_domain_expiration', 'qty_ip_resolved', 'qty_nameservers', 'qty_mx_servers', 'ttl_hostname', 'tls_ssl_certificate', 'qty_redirects', 'url_google_index', 'domain_google_index', 'url_shortened']

body = {
    "qty_dot_url": 1,
    "qty_hyphen_url": 1,
    "qty_underline_url": 1,
    "qty_slash_url": 1,
    "qty_questionmark_url": 1,
    "qty_equal_url": 1,
    "qty_at_url": 1,
    "qty_and_url": 1,
    "qty_exclamation_url": 1,
    "qty_space_url": 1,
    "qty_tilde_url": 1,
    "qty_comma_url": 1,
    "qty_plus_url": 1,
    "qty_asterisk_url": 1,
    "qty_hashtag_url": 1,
    "qty_dollar_url": 1,
    "qty_percent_url": 1,
    "qty_tld_url": 1,
    "length_url": 1,
    "qty_dot_domain": 1,
    "qty_hyphen_domain": 1,
    "qty_underline_domain": 1,
    "qty_slash_domain": 1,
    "qty_questionmark_domain": 1,
    "qty_equal_domain": 1,
    "qty_at_domain": 1,
    "qty_and_domain": 1,
    "qty_exclamation_domain": 1,
    "qty_space_domain": 1,
    "qty_tilde_domain": 1,
    "qty_comma_domain": 1,
    "qty_plus_domain": 1,
    "qty_asterisk_domain": 1,
    "qty_hashtag_domain": 1,
    "qty_dollar_domain": 1,
    "qty_percent_domain": 1,
    "qty_vowels_domain": 1,
    "domain_length": 1,
    "domain_in_ip": 1,
    "server_client_domain": 1,
    "qty_dot_directory": 1,
    "qty_hyphen_directory": 1,
    "qty_underline_directory": 1,
    "qty_slash_directory": 1,
    "qty_questionmark_directory": 1,
    "qty_equal_directory": 1,
    "qty_at_directory": 1,
    "qty_and_directory": 1,
    "qty_exclamation_directory": 1,
    "qty_space_directory": 1,
    "qty_tilde_directory": 1,
    "qty_comma_directory": 1,
    "qty_plus_directory": 1,
    "qty_asterisk_directory": 1,
    "qty_hashtag_directory": 1,
    "qty_dollar_directory": 1,
    "qty_percent_directory": 1,
    "directory_length": 1,
    "qty_dot_file": 1,
    "qty_hyphen_file": 1,
    "qty_underline_file": 1,
    "qty_slash_file": 1,
    "qty_questionmark_file": 1,
    "qty_equal_file": 1,
    "qty_at_file": 1,
    "qty_and_file": 1,
    "qty_exclamation_file": 1,
    "qty_space_file": 1,
    "qty_tilde_file": 1,
    "qty_comma_file": 1,
    "qty_plus_file": 1,
    "qty_asterisk_file": 1,
    "qty_hashtag_file": 1,
    "qty_dollar_file": 1,
    "qty_percent_file": 1,
    "file_length": 1,
    "qty_dot_params": 1,
    
} 


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
