# Malicious Domain Dataset - Multi-layer Perceptron Classifier

In [4]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Read the preprocessed feature table from disk, then display how many rows
# carry each value of the `Class` target column.
datasource = "preprocessed.csv"
data = pd.read_csv(filepath_or_buffer=datasource)
data["Class"].value_counts()

0    32613
1     3815
Name: Class, dtype: int64

## Resampling

As the preprocessed dataset is imbalanced, we downsample (without replacement) so that both classes contain as many rows as the minority class, giving a balanced set for training and testing.

In [5]:
from sklearn.utils import resample

# Size of the rarer class; both classes are sampled down to this many rows.
sample_size = data["Class"].value_counts().min()

# Draw `sample_size` rows per class without replacement, with a fixed seed so
# the same rows are selected on every run.
# NOTE(review): the variable names assume Class == 0 is malicious and
# Class == 1 is benign; nothing in this notebook confirms that encoding —
# verify against the preprocessing step.
malicious = resample(data.loc[data["Class"] == 0], replace=False, n_samples=sample_size, random_state=42)
benign = resample(data.loc[data["Class"] == 1], replace=False, n_samples=sample_size, random_state=42)

balanced = pd.concat([malicious, benign])

# A bare expression in the middle of a cell is evaluated and thrown away, so
# check the balance explicitly instead of relying on a display that never shows.
class_counts = balanced["Class"].value_counts()
assert (class_counts == sample_size).all(), f"classes are not balanced: {class_counts.to_dict()}"

# Feature matrix (every column except the target) and target vector.
X = balanced.drop(columns=["Class"])
y = balanced["Class"]

## Cross-validation

Here we assess how well an MLP classifier generalizes to unseen samples from the dataset by performing 10-fold cross-validation.

In [6]:
from sklearn.model_selection import cross_validate

# Seed the network so weight initialisation and batch shuffling are repeatable;
# without a seed every run of this cell reports different scores.
clf = MLPClassifier(random_state=42)

folds = 10
metrics = ["accuracy", "f1", "precision", "recall"]

# NOTE(review): scikit-learn's binary "f1", "precision" and "recall" scorers
# treat label 1 as the positive class. If Class == 0 is the malicious label
# (as the resampling cell's variable names suggest), these three columns
# describe the benign class — confirm which class should be reported.
cv_results = cross_validate(clf, X, y, scoring=metrics, cv=folds)

# One row per fold, one column per timing/metric; `scores` is read by later cells.
scores = pd.DataFrame(cv_results)
scores

Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall
0,3.100808,0.004038,0.975098,0.975484,0.959391,0.992126
1,2.397105,0.003668,0.975098,0.975033,0.976316,0.973753
2,2.127884,0.003649,0.981651,0.981771,0.97416,0.989501
3,2.928346,0.003677,0.973788,0.974026,0.96401,0.984252
4,2.182321,0.003765,0.961992,0.960864,0.988889,0.934383
5,2.046045,0.00385,0.980341,0.980595,0.969309,0.992147
6,2.156228,0.003633,0.980341,0.980443,0.976623,0.984293
7,2.299502,0.003744,0.963303,0.964286,0.940299,0.989529
8,1.924057,0.004065,0.985583,0.985545,0.989446,0.981675
9,1.922176,0.004221,0.980341,0.98008,0.994609,0.965969


In [7]:
# Average each timing/metric column over the cross-validation folds.
scores.mean(axis="index")

fit_time          2.308447
score_time        0.003831
test_accuracy     0.975754
test_f1           0.975813
test_precision    0.973305
test_recall       0.978763
dtype: float64

## Finalized Model

In [8]:
# Seed the classifier as well as the split; with only the split seeded the
# reported score still changes from run to run.
clf = MLPClassifier(random_state=42)

# NOTE(review): train_size=0.4 fits on 40% of the rows and scores on the
# remaining 60%. If a 60/40 train/test split was intended, this should be
# test_size=0.4 — confirm before relying on the figure below.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.4, random_state=42)

clf.fit(X_train, y_train)
# Mean accuracy on the held-out rows.
clf.score(X_test, y_test)

0.9709480122324159

Note: For the above estimator, the MLP hyperparameters were left at their default values. The model could potentially be improved further with a hyperparameter search (e.g. grid search or randomized search).