# Malicious Domain Dataset - Multi-layer Perceptron Classifier

In [36]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Load preprocessed dataset
datasource = "preprocessed.csv"
data = pd.read_csv(datasource)
data.Class.value_counts()

0    32613
1     3815
Name: Class, dtype: int64

## Resampling

As the preprocessed dataset is unbalanced, we resample to create a balanced set for training and testing.

In [44]:
from sklearn.utils import resample

sample_size = min(data.Class.value_counts())

malicious = resample(data.loc[data["Class"] == 0], replace=False, n_samples=sample_size, random_state=42)
benign = resample(data.loc[data["Class"] == 1], replace=False, n_samples=sample_size, random_state=42)

balanced = pd.concat([malicious, benign])
balanced.Class.value_counts()

0    3815
1    3815
Name: Class, dtype: int64

## Cross-validation

Here we assess how well an MLP classifier will generalize the dataset by performing 10-fold cross-validation.

In [40]:
from sklearn.model_selection import cross_validate

clf = MLPClassifier()

folds = 10
metrics = ["accuracy", "f1", "precision", "recall"]
scores = cross_validate(clf, X, y, scoring=metrics, cv=folds)

scores = pd.DataFrame.from_dict(scores)
scores

Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall
0,5.720392,0.008197,0.968158,0.823708,0.981884,0.709424
1,10.252855,0.009758,0.975295,0.873596,0.942424,0.814136
2,5.527157,0.006407,0.993138,0.96749,0.96124,0.973822
3,6.588592,0.006818,0.996432,0.983139,0.974293,0.992147
4,5.883871,0.01061,0.991216,0.959184,0.935323,0.984293
5,7.539819,0.006787,0.940708,0.615658,0.955801,0.454068
6,4.109619,0.007353,0.987922,0.941333,0.95664,0.926509
7,6.147995,0.006213,0.988197,0.940361,0.997059,0.889764
8,7.343403,0.00624,0.992312,0.962366,0.986226,0.939633
9,4.493744,0.006175,0.991488,0.958831,0.97043,0.947507


In [41]:
scores.mean()

fit_time          6.360745
score_time        0.007456
test_accuracy     0.982487
test_f1           0.902567
test_precision    0.966132
test_recall       0.863130
dtype: float64

## Finalized Model

In [42]:
clf = MLPClassifier()

X = balanced.drop(columns=["Class"])
y = balanced["Class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.4, random_state=42)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9676714722586283

Note: For the above estimator the MLP hyperparameters were left as their default values. Using hyperparameter search methods the model could potentially be further improved.