In [1]:
import pandas as pd
from time import time
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
data01_ten_percent = pd.read_csv("kddcup_data_10_percent", header = None, names = col_names)

In [2]:
data01_ten_percent['label'].value_counts()

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: label, dtype: int64

In [3]:
num_features = [
    "duration","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate"
]
features = data01_ten_percent[num_features].astype(float)

In [4]:
from hmmlearn import hmm
labels = data01_ten_percent['label'].copy()
labels[labels!='normal.'] = 'attack.'
labels.value_counts()

attack.    396743
normal.     97278
Name: label, dtype: int64

In [5]:
hmmClassifier = hmm.GaussianHMM(n_components=3, covariance_type="full", n_iter=100)

In [6]:
hmmClassifier.fit(features)

GaussianHMM(algorithm='viterbi', covariance_type='full', covars_prior=0.01,
            covars_weight=1, init_params='stmc', means_prior=0, means_weight=0,
            min_covar=0.001, n_components=3, n_iter=100, params='stmc',
            random_state=None, startprob_prior=1.0, tol=0.01,
            transmat_prior=1.0, verbose=False)

In [7]:
data01_corrected = pd.read_csv("corrected", header = None, names = col_names)
data01_corrected['label'].value_counts()

smurf.              164091
normal.              60593
neptune.             58001
snmpgetattack.        7741
mailbomb.             5000
guess_passwd.         4367
snmpguess.            2406
satan.                1633
warezmaster.          1602
back.                 1098
mscan.                1053
apache2.               794
processtable.          759
saint.                 736
portsweep.             354
ipsweep.               306
httptunnel.            158
pod.                    87
nmap.                   84
buffer_overflow.        22
multihop.               18
sendmail.               17
named.                  17
ps.                     16
rootkit.                13
xterm.                  13
teardrop.               12
land.                    9
xlock.                   9
xsnoop.                  4
ftp_write.               3
loadmodule.              2
udpstorm.                2
phf.                     2
sqlattack.               2
worm.                    2
perl.                    2
i

In [8]:
data01_corrected['label'][data01_corrected['label']!= 'normal.'] = 'attack.'
data01_corrected['label'].value_counts()
#print(data01_corrected.columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


attack.    250436
normal.     60593
Name: label, dtype: int64

In [9]:
!pip install sklearn




In [24]:
from sklearn.model_selection import train_test_split
data01_corrected[num_features] = data01_corrected[num_features].astype(float)

In [25]:
feature_train, feature_test, labels_train, labels_test = train_test_split(data01_corrected[num_features], data01_corrected['label'], test_size = 0.1, random_state = 42)

In [26]:
pred = hmmClassifier.predict(feature_test)

In [27]:
print(pred)
print(set(pred))
print(type(pred[0]))
print(len(pred))

[0 0 0 ... 0 0 0]
{0, 2}
<class 'numpy.int32'>
31103


In [28]:
type(labels_test)
print(set(labels_test))
print(len(labels_test))

{'attack.', 'normal.'}
31103


In [29]:
print(labels_test)

220755    attack.
158047    attack.
25478     attack.
99780     normal.
71382     attack.
           ...   
11298     attack.
80850     normal.
170793    attack.
278032    attack.
277089    attack.
Name: label, Length: 31103, dtype: object


In [30]:
labels_test = labels_test.to_numpy()
for i in range(len(labels_test)):
    if labels_test[i] == "attack.":
        labels_test[i] = 2
    elif labels_test[i] == "normal.":
        labels_test[i] = 0
for i in labels_test:
    print(i)

2
2
2
0
2
0
0
2
0
2
2
2
2
2
2
0
2
2
0
2
2
2
0
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
0
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
0
0
2
2
0
2
2
0
2
2
2
2
2
2
0
2
0
2
2
2
2
0
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
2
2
2
0
0
2
2
2
0
2
2
2
2
2
0
2
2
2
0
0
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
0
0
0
2
2
2
0
2
2
2
2
2
2
2
0
0
2
2
2
2
2
0
0
2
2
2
0
2
2
2
2
2
0
0
2
2
0
2
2
2
0
2
2
2
2
0
2
2
0
2
2
2
0
2
2
2
2
2
2
0
2
0
0
2
2
2
2
2
2
2
0
2
2
0
2
0
0
0
2
0
2
0
2
2
2
2
2
2
2
2
2
0
2
2
0
0
0
2
0
2
2
2
2
0
2
2
2
2
0
2
0
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
0
2
2
2
2
0
2
0
2
2
0
2
2
0
2
0
2
2
0
2
2
2
2
2
2
2
0
2
2
2
2
0
0
0
2
2
0
2
2
2
2
0
2
0
0
0
2
2
2
0
2
2
2
2
2
2
0
2
2
2
0
2
0
0
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
0
0
0
0
0
0
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
2
2
2
0
0
2
0
2
2
2
2
2
2
2
2
0
2
2
2
0
0
2
2
2
2
0
0
0
2
2
2
2
2
0
0
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
0
0
0
0
0
2
0
2
2
2
2
2
2
2
0
2
2
2
0
2
2
2
2
2
0
2
0
2


2
2
2
2
2
2
2
2
2
2
0
2
0
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
2
0
2
2
2
2
2
0
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
2
2
2
0
2
0
0
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
0
0
2
2
2
2
2
2
2
2
0
0
2
2
0
2
2
0
2
2
2
2
2
2
0
0
2
2
2
2
0
2
2
2
2
0
2
2
2
0
2
2
2
2
2
0
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
0
0
2
2
2
2
2
0
0
2
2
2
2
2
2
2
0
2
0
2
2
2
0
0
0
2
0
2
2
2
2
2
2
0
2
0
2
0
2
0
2
2
0
2
2
2
2
2
2
2
2
0
2
2
0
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
0
0
2
2
2
2
2
0
0
2
2
0
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
2
0
2
2
0
2
0
2
0
0
2
2
2
0
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
0
2
0
2
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
0
0
0
2
2
2
2
0
2
2
2
0
2
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
0
0
0
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
0
2
0
2
2
0
2
2
2
0
2
0
2
2
0
0
0
2
0
2
2
2
2
2
2
0
2
2
2
2
2
0
2
2
2
0
0
0
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
0


2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
2
2
0
2
2
0
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
0
2
0
2
2
2
0
2
0
2
2
2
2
0
2
0
2
2
2
2
2
2
2
0
0
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
0
2
2
2
2
0
2
2
2
0
2
0
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
2
2
2
2
2
2
0
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
0
2
2
0
2
2
2
2
2
2
2
2
2
2
0
0
2
2
2
0
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
2
0
2
2
2
2
2
2
2
0
2
2
0
2
2
2
2
2
2
2
2
0
0
2
2
0
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
2
0
2
2
2
2
2
2
0
0
0
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
2
2
2
2
0
2
2
0
0
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
0
2
0
2
2
2
2
0
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2


0
2
2
0
2
2
2
2
0
2
2
2
2
2
0
2
0
0
2
0
2
2
0
2
2
2
0
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
0
2
2
0
2
0
0
0
2
2
2
2
2
2
2
0
2
2
2
2
2
2
0
0
0
2
0
2
2
2
0
2
2
0
2
2
0
2
2
0
2
2
2
2
0
0
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
0
0
2
2
2
0
0
2
0
0
2
0
2
0
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
2
2
0
2
2
0
2
2
0
2
2
2
0
2
2
2
0
2
2
2
2
2
2
2
2
0
2
0
2
2
0
2
2
2
2
2
2
0
2
2
0
2
2
0
2
2
0
0
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
0
2
2
2
2
2
0
0
2
2
0
2
2
2
2
0
2
2
2
2
2
0
2
0
2
2
2
0
2
2
2
2
0
2
2
2
2
2
2
0
2
0
2
2
2
2
0
0
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
2
2
0
2
0
2
2
2
2
2
2
2
2
2
2
0
0
2
2
2
2
2
0
2
2
0
2
2
2
2
2
2
2
2
2
0
0
0
2
2
2
0
2
2
2
2
2
0
2
0
2
0
0
2
2
2
2
0
2
0
2
2
2
2
2
0
2
2
2
0
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
0
0
2
0
2
2
2
2
2
2
2
0
2
2
0
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
2
2
2
2
2
0
2
0
0
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2


2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
0
2
2
2
2
2
2
0
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
2
0
2
2
2
2
0
2
0
2
2
0
2
2
2
2
0
2
2
0
2
2
0
2
0
0
2
2
2
0
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
2
2
2
2
2
2
2
0
2
0
0
2
0
2
0
2
2
2
0
2
2
2
2
0
0
2
2
2
2
2
0
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
0
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
0
2
2
0
2
2
0
2
2
0
2
0
0
2
2
2
2
0
2
0
0
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
0
2
2
0
2
0
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
0
2
2
2
2
0
2
2
2
2
0
2
2
2
0
2
0
2
2
2
2
2
0
0
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
0
2
2
2
0
0
2
2
2
2
2
2
2
2
0
0
0
0
2
2
2
2
2
2
2
0
0
2
2
2
0
0
2
2
2
2
2
0
0
0
0
2
2
0
2
2
2
0
2
0
2
2
0
2
2
0
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
0
2
2
2
0
0
2
2
2
2
2
2
2
2
2
2
2
0
0
0
2
2
2
2
0
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
0
0
2
2


2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
0
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
0
2
2
0
2
0
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
0
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
2
2
0
2
2
2
2
2
2
2
2
0
2
0
2
0
0
2
2
0
2
2
2
2
2
0
2
2
2
2
2
0
2
2
2
2
2
0
2
2
2
2
0
2
2
2
2
0
2
2
2
2
0
0
2
2
2
0
2
2
2
2
2
2
2
0
0
2
2
2
0
2
2
2
2
2
2
2
0
2
0
0
2
2
2
0
2
2
0
2
2
2
2
0
0
2
2
2
2
2
2
0
2
2
2
0
2
2
0
2
2
2
0
0
0
2
0
0
2
0
2
2
2
2
2
2
2
2
2
0
0
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
2
0
2
2
2
0
2
0
2
2
2
2
0
2
0
2
0
2
2
0
0
0
0
2
2
0
0
2
2
0
0
0
2
2
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
0
2
2
0
0
2
2
0
0
2
2
0
2
2
2
0
2
0
2
2
2
2
0
0
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
0
0
2
2
2
0
2
2
2
0
2
2
0
2
2
0
2
2
2
2
2
0
2
2
2
2
0
0
2
2
2
2
2
2
2
2
2
0
2
0
2
0
2
2
0
2
0
0
2
0
0
2
2
2
0
2
2
2
0
2
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
0
2
0
2
2
0
2
2
2
0
0
2
2
2
2
2
2
2
2
0
0
2
2
0
2
2
2
2
2


2
2
0
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
0
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
0
2
2
2
0
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
0
0
2
2
0
2
2
2
2
2
2
2
0
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
0
2
2
2
0
2
2
2
2
2
2
2
2
0
2
2
2
2
0
0
0
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
0
2
2
0
2
2
2
2
2
2
2
0
0
2
2
2
0
2
0
0
0
2
2
2
2
2
2
0
2
2
2
2
0
0
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
2
0
0
2
0
2
2
0
0
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
0
2
2
0
2
0
2
2
2
2
2
2
2
0
0
0
2
2
2
2
0
2
2
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
2
0
2
2
2
0
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
0
2
0
2
0
0
0
2
2
2
2
2
2
2
0
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
0
2
2
2
2
0
2
2
2
2
0
0
2
0
2
2
2
0
2
2
2
0
2
0
2
2
2
2
2
2
0
2
2
2
2
0
0
2
2
2
2
2
2
2
2
2
2
2


In [34]:
from sklearn.metrics import accuracy_score
cm_train = metrics.accuracy_score(pred, labels_test)
print(cm_train)

ValueError: Classification metrics can't handle a mix of binary and unknown targets