# Malicious Domain Dataset - Multi-layer Perceptron Classifier

In [21]:
# Load preprocessed dataset

import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold

datasource = "preprocessed.csv"
data = pd.read_csv(datasource)
data.Class.value_counts()

0    32613
1     3815
Name: Class, dtype: int64

In [18]:
# Simple model - no cross validation

clf = MLPClassifier()

X = data.drop(columns=["Class"])
y = data["Class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=8000, random_state=42)

y_train.value_counts()

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9862459546925566

## Resampling

As the preprocessed dataset is unbalanced, we resample to create a balanced set for training and testing.

In [20]:
from sklearn.utils import resample

sample_size = min(data.Class.value_counts())

malicious = resample(data.loc[data["Class"] == 0], replace=False, n_samples=sample_size, random_state=42)
benign = resample(data.loc[data["Class"] == 1], replace=False, n_samples=sample_size, random_state=42)

balanced = pd.concat([malicious, benign])
balanced.Class.value_counts()

0    3815
1    3815
Name: Class, dtype: int64

## Cross-validation

In [4]:
from sklearn.model_selection import cross_validate

folds = 10

metrics = ["accuracy", "f1", "precision", "recall"]

scores = cross_validate(clf, X, y, scoring=metrics, cv=folds)

scores = pd.DataFrame.from_dict(scores)
scores

Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall
0,2.1237,0.003812,0.97903,0.979275,0.966752,0.992126
1,1.992488,0.003989,0.975098,0.975033,0.976316,0.973753
2,3.229548,0.003792,0.975098,0.974768,0.986559,0.963255
3,2.40445,0.003647,0.980341,0.980443,0.974093,0.986877
4,2.417074,0.006196,0.967235,0.967148,0.968421,0.965879
5,2.307723,0.003796,0.980341,0.980289,0.984169,0.97644
6,2.664626,0.004671,0.97903,0.979058,0.979058,0.979058
7,2.698351,0.004075,0.976409,0.976316,0.981481,0.971204
8,3.121962,0.004215,0.976409,0.976,0.994565,0.958115
9,2.673412,0.004063,0.984273,0.984456,0.974359,0.994764


In [27]:
# Manual testing
all_X = data.drop(columns="Class")
all_y = data["Class"]
p = clf.predict(all_X)

results = pd.DataFrame.from_dict({"Class": all_y, "Prediction": p})
results.loc[results["Class"] != results["Prediction"]].value_counts()

#Class  Prediction
# 0      1             769
# 1      0              74

Class  Prediction
1      0             469
0      1              24
dtype: int64