In [1]:
import pandas as pd
from time import time
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
data01_ten_percent = pd.read_csv("preprocessed_kddcup_data")

In [2]:
data01_ten_percent['label'].value_counts()

dos       391458
normal     97278
Name: label, dtype: int64

In [3]:
data01_ten_percent.drop(['dst_bytes', 'land', 'root_shell', 'su_attempted', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'serror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_rerror_rate'], axis = 1, inplace=True) 

In [4]:
data01_ten_percent.dtypes

duration                      int64
protocol_type                 int64
service                       int64
flag                          int64
src_bytes                     int64
wrong_fragment                int64
urgent                        int64
hot                           int64
num_failed_logins             int64
logged_in                     int64
num_compromised               int64
num_root                      int64
num_file_creations            int64
is_host_login                 int64
is_guest_login                int64
count                         int64
srv_count                     int64
srv_serror_rate             float64
rerror_rate                 float64
srv_diff_host_rate          float64
dst_host_same_srv_rate      float64
dst_host_srv_serror_rate    float64
dst_host_srv_rerror_rate    float64
label                        object
dtype: object

In [5]:
num_features = [
   "duration","protocol_type","service","flag","src_bytes","wrong_fragment","urgent","hot","num_failed_logins","logged_in","num_compromised","num_root","num_file_creations","is_host_login","is_guest_login","count","srv_count","srv_serror_rate","rerror_rate","srv_diff_host_rate","dst_host_same_srv_rate","dst_host_srv_serror_rate","dst_host_srv_rerror_rate"
]
features = data01_ten_percent[num_features].astype(float)

In [6]:
from hmmlearn import hmm
labels = data01_ten_percent['label'].copy()
labels[labels!='normal.'] = 'attack.'
labels.value_counts()

attack.    488736
Name: label, dtype: int64

In [7]:
hmmClassifier = hmm.GaussianHMM(n_components=3, covariance_type="full", n_iter=100)

In [8]:
hmmClassifier.fit(features)

GaussianHMM(algorithm='viterbi', covariance_type='full', covars_prior=0.01,
            covars_weight=1, init_params='stmc', means_prior=0, means_weight=0,
            min_covar=0.001, n_components=3, n_iter=100, params='stmc',
            random_state=None, startprob_prior=1.0, tol=0.01,
            transmat_prior=1.0, verbose=False)

In [9]:
data01_corrected = pd.read_csv("corrected_preprocessed_kddcup_data")
data01_corrected['label'].value_counts()

smurf              72535
normal             25953
neptune            20661
mailbomb            5000
snmpgetattack       4718
guess_passwd        1367
warezmaster         1288
satan               1243
mscan               1053
saint                736
processtable         506
apache2              398
portsweep            345
ipsweep              256
httptunnel           122
back                  99
pod                   45
nmap                  44
buffer_overflow       17
ps                    14
xterm                 13
rootkit               13
multihop              13
sendmail              12
named                 10
xlock                  9
xsnoop                 4
ftp_write              3
teardrop               3
udpstorm               2
loadmodule             2
perl                   1
imap                   1
phf                    1
worm                   1
Name: label, dtype: int64

In [10]:
data01_corrected.dtypes

duration                         int64
protocol_type                    int64
service                          int64
flag                             int64
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [11]:
data01_corrected.drop(['dst_bytes', 'land', 'root_shell', 'su_attempted', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'serror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_rerror_rate'], axis = 1, inplace=True) 

In [12]:
data01_corrected['label'][data01_corrected['label']!= 'normal.'] = 'attack.'
data01_corrected['label'].value_counts()
#print(data01_corrected.columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


attack.    136488
Name: label, dtype: int64

In [13]:
!pip install sklearn




In [14]:
data01_corrected.dtypes

duration                      int64
protocol_type                 int64
service                       int64
flag                          int64
src_bytes                     int64
wrong_fragment                int64
urgent                        int64
hot                           int64
num_failed_logins             int64
logged_in                     int64
num_compromised               int64
num_root                      int64
num_file_creations            int64
is_host_login                 int64
is_guest_login                int64
count                         int64
srv_count                     int64
srv_serror_rate             float64
rerror_rate                 float64
srv_diff_host_rate          float64
dst_host_same_srv_rate      float64
dst_host_srv_serror_rate    float64
dst_host_srv_rerror_rate    float64
label                        object
dtype: object

In [15]:
from sklearn.model_selection import train_test_split
data01_corrected[num_features] = data01_corrected[num_features].astype(float)

In [16]:
feature_train, feature_test, labels_train, labels_test = train_test_split(data01_corrected[num_features], data01_corrected['label'], test_size = 0.1, random_state = 42)

In [17]:
pred = hmmClassifier.predict(feature_test)

In [18]:
print(pred)
print(set(pred))
print(type(pred[0]))
print(len(pred))

[0 0 2 ... 0 0 0]
{0, 1, 2}
<class 'numpy.int32'>
13649


In [19]:
type(labels_test)
print(set(labels_test))
print(len(labels_test))

{'attack.'}
13649


In [20]:
print(labels_test)

5601      attack.
58101     attack.
80179     attack.
14967     attack.
17029     attack.
26759     attack.
14825     attack.
45880     attack.
115614    attack.
9801      attack.
103511    attack.
80761     attack.
92627     attack.
92466     attack.
133264    attack.
26779     attack.
52544     attack.
201       attack.
87587     attack.
60860     attack.
20061     attack.
47895     attack.
27022     attack.
62075     attack.
41695     attack.
81194     attack.
110446    attack.
104556    attack.
111740    attack.
125913    attack.
           ...   
120495    attack.
81513     attack.
75670     attack.
84598     attack.
30039     attack.
86193     attack.
94017     attack.
78450     attack.
19916     attack.
102553    attack.
92358     attack.
44129     attack.
26892     attack.
14120     attack.
78241     attack.
63724     attack.
93648     attack.
56518     attack.
118004    attack.
72669     attack.
128719    attack.
6602      attack.
101322    attack.
131964    attack.
30813     

In [21]:
labels_test = labels_test.to_numpy()
for i in range(len(labels_test)):
    if labels_test[i] == "attack.":
        labels_test[i] = 2
    elif labels_test[i] == "normal.":
        labels_test[i] = 0

In [22]:
totalZero = 0
totalTwo = 0
for i in labels_test : 
    if i == 0 : 
        totalZero = totalZero + 1
    elif i == 2 : 
        totalTwo = totalTwo + 1
print("The total number of fraud packets in the testing dataset : ", totalZero)
print("The total number of normal packets in testing dataset : ", totalTwo)

The total number of fraud packets in the testing dataset :  0
The total number of normal packets in testing dataset :  13649


In [23]:
totalZeroPred = 0
totalTwoPred = 0
for i in pred : 
    if i == 0 : 
        totalZeroPred = totalZeroPred + 1
    elif i == 2 : 
        totalTwoPred = totalTwoPred + 1
print("The total number of fraud packets in Predicted data : ", totalZeroPred)
print("The total number of normal packets in Predicted data : ", totalTwoPred)

The total number of fraud packets in Predicted data :  12229
The total number of normal packets in Predicted data :  1419


In [24]:
print("The True Positive Rate for the predicted dataset is : ", (totalTwoPred / (totalZeroPred + totalTwoPred)) * 100)

The True Positive Rate for the predicted dataset is :  10.397127784290738


In [25]:
print("The True Negative Rate for the predicted dataset is : ", (totalZeroPred / (totalZeroPred + totalTwoPred)) * 100)

The True Negative Rate for the predicted dataset is :  89.60287221570925
