In [1]:
import pandas as pd
from time import time
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
data01_ten_percent = pd.read_csv("preprocessed_kddcup_data")

In [2]:
data01_ten_percent['label'].value_counts()

dos       391458
normal     97278
Name: label, dtype: int64

In [3]:
num_features = [
    "duration","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate"
]
features = data01_ten_percent[num_features].astype(float)

In [4]:
from hmmlearn import hmm
labels = data01_ten_percent['label'].copy()
labels[labels!='normal'] = 'attack'
labels.value_counts()

attack    391458
normal     97278
Name: label, dtype: int64

In [5]:
hmmClassifier = hmm.GaussianHMM(n_components=3, covariance_type="full", n_iter=100)

In [6]:
hmmClassifier.fit(features)

GaussianHMM(algorithm='viterbi', covariance_type='full', covars_prior=0.01,
            covars_weight=1, init_params='stmc', means_prior=0, means_weight=0,
            min_covar=0.001, n_components=3, n_iter=100, params='stmc',
            random_state=None, startprob_prior=1.0, tol=0.01,
            transmat_prior=1.0, verbose=False)

In [7]:
data01_corrected = pd.read_csv("corrected", header = None, names = col_names)
data01_corrected['label'].value_counts()

smurf.              164091
normal.              60593
neptune.             58001
snmpgetattack.        7741
mailbomb.             5000
guess_passwd.         4367
snmpguess.            2406
satan.                1633
warezmaster.          1602
back.                 1098
mscan.                1053
apache2.               794
processtable.          759
saint.                 736
portsweep.             354
ipsweep.               306
httptunnel.            158
pod.                    87
nmap.                   84
buffer_overflow.        22
multihop.               18
named.                  17
sendmail.               17
ps.                     16
rootkit.                13
xterm.                  13
teardrop.               12
xlock.                   9
land.                    9
xsnoop.                  4
ftp_write.               3
sqlattack.               2
phf.                     2
worm.                    2
perl.                    2
loadmodule.              2
udpstorm.                2
i

In [8]:
data01_corrected['label'][data01_corrected['label']!= 'normal.'] = 'attack.'
data01_corrected['label'].value_counts()
#print(data01_corrected.columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


attack.    250436
normal.     60593
Name: label, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
data01_corrected[num_features] = data01_corrected[num_features].astype(float)

In [10]:
feature_train, feature_test, labels_train, labels_test = train_test_split(data01_corrected[num_features], data01_corrected['label'], test_size = 0.1, random_state = 42)

In [11]:
pred = hmmClassifier.predict(feature_test)

In [12]:
print(pred)
print(set(pred))
print(type(pred[0]))
print(len(pred))

[0 0 0 ... 0 0 0]
{0, 2}
<class 'numpy.int32'>
31103


In [13]:
type(labels_test)
print(set(labels_test))
print(len(labels_test))

{'normal.', 'attack.'}
31103


In [14]:
print(labels_test)

220755    attack.
158047    attack.
25478     attack.
99780     normal.
71382     attack.
98481     normal.
212000    normal.
113616    attack.
147946    normal.
34016     attack.
235034    attack.
263966    attack.
177492    attack.
220302    attack.
264147    attack.
284136    normal.
14377     attack.
308291    attack.
133340    normal.
104772    attack.
250310    attack.
165252    attack.
91550     normal.
103447    attack.
292669    attack.
181482    attack.
159891    attack.
251812    attack.
287636    normal.
267725    attack.
           ...   
36858     normal.
260112    attack.
25488     attack.
200740    attack.
44450     attack.
290233    normal.
268573    attack.
297170    attack.
22060     attack.
205090    attack.
295193    attack.
100568    normal.
198193    attack.
129963    attack.
209011    attack.
204032    attack.
101366    attack.
37194     normal.
229378    attack.
296294    attack.
216924    attack.
121157    attack.
138680    normal.
214061    normal.
118743    

In [15]:
labels_test = labels_test.to_numpy()
for i in range(len(labels_test)):
    if labels_test[i] == "attack.":
        labels_test[i] = 2
    elif labels_test[i] == "normal.":
        labels_test[i] = 0

In [16]:
totalZero = 0
totalTwo = 0
for i in labels_test : 
    if i == 0 : 
        totalZero = totalZero + 1
    elif i == 2 : 
        totalTwo = totalTwo + 1
print("The total number of fraud packets in the testing dataset : ", totalZero)
print("The total number of normal packets in testing dataset : ", totalTwo)

The total number of fraud packets in the testing dataset :  6061
The total number of normal packets in testing dataset :  25042


In [17]:
totalZeroPred = 0
totalTwoPred = 0
for i in pred : 
    if i == 0 : 
        totalZeroPred = totalZeroPred + 1
    elif i == 2 : 
        totalTwoPred = totalTwoPred + 1
print("The total number of fraud packets in Predicted data : ", totalZeroPred)
print("The total number of normal packets in Predicted data : ", totalTwoPred)

The total number of fraud packets in Predicted data :  28650
The total number of normal packets in Predicted data :  2453


In [18]:
print("The True Positive Rate for the predicted dataset is : ", (totalTwoPred / (totalZeroPred + totalTwoPred)) * 100)

The True Positive Rate for the predicted dataset is :  7.8866990322476935


In [19]:
print("The True Negative Rate for the predicted dataset is : ", (totalZeroPred / (totalZeroPred + totalTwoPred)) * 100)

The True Negative Rate for the predicted dataset is :  92.11330096775231
