# Malicious Domain Dataset - Multi-layer Perceptron Classifier

In [6]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Load preprocessed dataset
datasource = "preprocessed.csv"
data = pd.read_csv(datasource)
data.Class.value_counts()

0    32613
1     3815
Name: Class, dtype: int64

## Resampling

As the preprocessed dataset is unbalanced, we resample to create a balanced set for training and testing.

In [7]:
from sklearn.utils import resample

sample_size = min(data.Class.value_counts())

malicious = resample(data.loc[data["Class"] == 0], replace=False, n_samples=sample_size, random_state=42)
benign = resample(data.loc[data["Class"] == 1], replace=False, n_samples=sample_size, random_state=42)

balanced = pd.concat([malicious, benign])
balanced.Class.value_counts()

X = balanced.drop(columns=["Class"])
y = balanced["Class"]

## Cross-validation

Here we assess how well an MLP classifier will generalize the dataset by performing 10-fold cross-validation.

In [25]:
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

clf = MLPClassifier(max_iter=500)

stages = [("scaler", StandardScaler()), ("classifier", clf)]
pipeline = Pipeline(stages)

folds = 10
metrics = ["accuracy", "f1", "precision", "recall"]
scores = cross_validate(pipeline, X, y, scoring=metrics, cv=folds)

scores = pd.DataFrame.from_dict(scores)
scores

Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall
0,4.83152,0.004489,0.984273,0.984293,0.981723,0.986877
1,5.144829,0.003754,0.97772,0.977836,0.971503,0.984252
2,6.130333,0.003791,0.986894,0.986911,0.984334,0.989501
3,5.550243,0.004805,0.97903,0.978947,0.98153,0.976378
4,5.275499,0.004753,0.973788,0.973822,0.971279,0.976378
5,5.744339,0.004631,0.986894,0.986877,0.989474,0.984293
6,4.853648,0.004337,0.984273,0.984416,0.976804,0.992147
7,4.968927,0.004915,0.989515,0.989529,0.989529,0.989529
8,5.127678,0.004621,0.990826,0.99085,0.989556,0.992147
9,4.783918,0.004408,0.97772,0.977483,0.989276,0.965969


In [9]:
scores.mean()

fit_time          4.392021
score_time        0.004833
test_accuracy     0.983224
test_f1           0.983218
test_precision    0.983003
test_recall       0.983489
dtype: float64

## Finalized Model

In [30]:
clf = MLPClassifier(max_iter=500)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.4, random_state=42)

stages = [("scaler", StandardScaler()), ("classifier", clf)]
pipeline = Pipeline(stages)

pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.9803407601572739

Note: For the above estimator the MLP hyperparameters were left as their default values. Using hyperparameter search methods the model could potentially be further improved.