In [None]:
import pandas as pd

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from xgboost import XGBClassifier



In [None]:
# Load the dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='../data/dataset_small.csv')

In [None]:
# Show how many rows carry each value of the `phishing` label.
df.loc[:, "phishing"].value_counts()

## Prep dataset

In [None]:
# Knobs for this cell. A fixed seed makes both the per-class sampling and the
# train/test split repeatable under Restart Kernel -> Run All.
SAMPLES_PER_CLASS = 10000
RANDOM_STATE = 42

# Feature columns used by every model below.
cols = ['qty_dot_url','qty_hyphen_url','qty_underline_url','qty_slash_url','qty_questionmark_url',
            'qty_equal_url','qty_at_url','qty_and_url','qty_asterisk_url','qty_tld_url','length_url',
            'qty_dot_domain','qty_hyphen_domain','qty_vowels_domain','domain_length','qty_dot_directory',
            'qty_hyphen_directory','qty_underline_directory','qty_slash_directory','qty_equal_directory',
            'qty_at_directory','qty_and_directory','qty_asterisk_directory','qty_percent_directory',
            'directory_length','qty_dot_params','qty_hyphen_params','qty_underline_params','qty_slash_params',
            'qty_questionmark_params','qty_equal_params','qty_at_params','qty_and_params','qty_percent_params',
            'params_length','tld_present_params','qty_params','email_in_url']

# Keep at most SAMPLES_PER_CLASS rows for each value of the `phishing` column.
# The groups are iterated explicitly instead of going through `groupby.apply`:
# every group frame obtained this way still contains the `phishing` column,
# whereas newer pandas releases deprecate / exclude the grouping column inside
# `apply`, which would break the `df['phishing']` lookup below.
df = pd.concat(
    [
        group.sample(n=min(len(group), SAMPLES_PER_CLASS), random_state=RANDOM_STATE)
        for _, group in df.groupby('phishing')
    ]
)

# Target vector.
y = df['phishing']

# Feature matrix: select the model columns first, then replace every negative
# value with 0. Clipping only the selected columns avoids touching (and
# possibly failing on) columns that are discarded anyway. Non-negative inputs
# are needed because MultinomialNB, trained below, rejects negative features.
X = df[cols].clip(lower=0)

# NOTE: a SelectKBest(chi2, k=50) based feature selection used to be sketched
# here as commented-out code; the explicit `cols` list above is what is used.

# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions identical in both splits; `random_state` pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

## Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split.
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = clf.predict(X_test)

# Accuracy and F1 are different metrics, so each is printed under its own
# label (accuracy was previously printed under the "F1 score" label).
# f1_score defaults to the binary F1 of the class labelled 1 -- assumes
# `phishing` is coded 0/1 with 1 as the positive class; TODO confirm.
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("F1 score: ", f1_score(y_test, y_pred))

## SVC

In [None]:
# Fit a support vector classifier (default settings) on the training split.
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = clf.predict(X_test)

# Accuracy and F1 are different metrics, so each is printed under its own
# label (accuracy was previously printed under the "F1 score" label).
# f1_score defaults to the binary F1 of the class labelled 1 -- assumes
# `phishing` is coded 0/1 with 1 as the positive class; TODO confirm.
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("F1 score: ", f1_score(y_test, y_pred))

## Random Forest

In [None]:
# Fit a random forest on the training split. The forest is built from random
# bootstrap samples and feature subsets, so the seed is pinned to make the
# reported scores repeatable across kernel restarts.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = clf.predict(X_test)

# Accuracy and F1 are different metrics, so each is printed under its own
# label (accuracy was previously printed under the "F1 score" label).
# f1_score defaults to the binary F1 of the class labelled 1 -- assumes
# `phishing` is coded 0/1 with 1 as the positive class; TODO confirm.
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("F1 score: ", f1_score(y_test, y_pred))

## XGBoost

In [None]:
# Fit a gradient-boosted tree classifier (default settings) on the training
# split. The fitted model stays bound to `clf`, which the cross-validation and
# ROC cells further down read.
clf = XGBClassifier()
clf.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = clf.predict(X_test)

# Accuracy and F1 are different metrics, so each is printed under its own
# label (accuracy was previously printed under the "F1 score" label).
# f1_score defaults to the binary F1 of the class labelled 1 -- assumes
# `phishing` is coded 0/1 with 1 as the positive class; TODO confirm.
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("F1 score: ", f1_score(y_test, y_pred))

# Per-class precision / recall / F1 breakdown.
print(classification_report(y_test, y_pred))

In [None]:
# 5-fold cross-validation of `clf` over the full feature matrix and labels.
# When the notebook is run top to bottom, `clf` is the XGBoost model from the
# preceding cell.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)

# Summarise the folds as mean and twice the standard deviation.
mean_accuracy = scores.mean()
two_std = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, two_std))



In [None]:
# create a ROC curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Predicted probability of the positive class (second column of predict_proba)
# for every test row. `clf` is whichever estimator was fitted last -- the
# XGBoost model when the notebook is run top to bottom.
probs = clf.predict_proba(X_test)[:, 1]

# Area under the ROC curve. Stored as `roc_auc` so the `auc` function imported
# above is not overwritten by a float.
roc_auc = roc_auc_score(y_test, probs)
print('AUC: %.2f' % roc_auc)

# False/true positive rates at each decision threshold.
fpr, tpr, thresholds = roc_curve(y_test, probs)

# Explicit figure/axes so the plot carries its own labels and legend.
fig, ax = plt.subplots()

# Diagonal reference line: a classifier with no skill.
ax.plot([0, 1], [0, 1], linestyle='--', label='No skill')

# ROC curve of the model.
ax.plot(fpr, tpr, marker='.', label='Model (AUC = %.2f)' % roc_auc)

ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve')
ax.legend()
plt.show()