In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from time import time


# I. Train Dataset

## 1. Define the column header

In [2]:
# Headers for the 42 columns of the headerless data files: 41 feature columns
# followed by the class "label". The group comments are for readability only
# and follow the KDD Cup 1999 feature description -- TODO confirm against the
# dataset's own names file.
col_names = [
    # basic connection features
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    # content features
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    # traffic features
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    # destination-host features
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    # class column
    "label",
]

## 2. Read the training dataset from a text file

In [3]:
# The file has no header row, so column names are supplied from col_names.
train_df = pd.read_csv(
    "../kddcup.data/kddcup.train.set",
    names=col_names,
    header=None,
)

## 3. Check information of the train dataset

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,...,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0
mean,48.34243,1834.621,1093.623,5.716116e-06,0.0006487792,7.961733e-06,0.01243766,3.205108e-05,0.143529,0.008088304,...,232.9811,189.2142,0.7537132,0.03071111,0.605052,0.006464107,0.1780911,0.1778859,0.0579278,0.05765941
std,723.3298,941431.1,645012.3,0.002390833,0.04285434,0.007215084,0.4689782,0.007299408,0.3506116,3.856481,...,64.02094,105.9128,0.411186,0.1085432,0.4809877,0.04125978,0.3818382,0.3821774,0.2309428,0.2309777
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,49.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,1379964000.0,1309937000.0,1.0,3.0,14.0,77.0,5.0,1.0,7479.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Index range, column count and per-column dtypes of the raw train frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898431 entries, 0 to 4898430
Data columns (total 42 columns):
duration                       int64
protocol_type                  object
service                        object
flag                           object
src_bytes                      int64
dst_bytes                      int64
land                           int64
wrong_fragment                 int64
urgent                         int64
hot                            int64
num_failed_logins              int64
logged_in                      int64
num_compromised                int64
root_shell                     int64
su_attempted                   int64
num_root                       int64
num_file_creations             int64
num_shells                     int64
num_access_files               int64
num_outbound_cmds              int64
is_host_login                  int64
is_guest_login                 int64
count                          int64
srv_count                      in

In [6]:
# Preview the first five records to confirm the headers line up with the data.
train_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


## 4. Check duplicates of the dataset records

In [7]:
# Count rows flagged as duplicates (True) versus rows not flagged (False).
train_df.duplicated().value_counts()

True     3823439
False    1074992
dtype: int64

## 5. Remove the duplicates and store the de-duplicated dataset in a new data frame

In [8]:
# Keep the first occurrence of every distinct record; train_df is not modified.
train_df_no_duplicates = train_df.drop_duplicates(keep="first")

## 6. Check information of the new data frame

Check duplicates

In [9]:
# Sanity check: after de-duplication no row should be flagged True.
train_df_no_duplicates.duplicated().value_counts()

False    1074992
dtype: int64

Check information

In [10]:
# Summary statistics of the numeric columns after de-duplication.
train_df_no_duplicates.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0,...,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0,1074992.0
mean,134.8908,5837.54,4873.73,2.418623e-05,0.002735834,3.627934e-05,0.05427947,0.0001460476,0.6301163,0.03570817,...,161.9723,159.2091,0.6700944,0.04949892,0.09335762,0.02173602,0.190658,0.1901296,0.07954099,0.07829662
std,1321.605,2009617.0,1376866.0,0.004917892,0.08835986,0.01540162,0.9974618,0.01558112,0.4827732,8.232096,...,102.2645,109.8842,0.4259518,0.1310801,0.2298291,0.05636309,0.3907621,0.3913478,0.2621995,0.260984
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.0,18.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,219.0,332.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,224.0,248.0,1.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0
75%,0.0,306.0,1721.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,255.0,1.0,0.06,0.04,0.02,0.0,0.0,0.0,0.0
max,58329.0,1379964000.0,1309937000.0,1.0,3.0,14.0,77.0,5.0,1.0,7479.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Row count, index type and per-column dtypes / non-null counts of the de-duplicated frame.
train_df_no_duplicates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1074992 entries, 0 to 4898430
Data columns (total 42 columns):
duration                       1074992 non-null int64
protocol_type                  1074992 non-null object
service                        1074992 non-null object
flag                           1074992 non-null object
src_bytes                      1074992 non-null int64
dst_bytes                      1074992 non-null int64
land                           1074992 non-null int64
wrong_fragment                 1074992 non-null int64
urgent                         1074992 non-null int64
hot                            1074992 non-null int64
num_failed_logins              1074992 non-null int64
logged_in                      1074992 non-null int64
num_compromised                1074992 non-null int64
root_shell                     1074992 non-null int64
su_attempted                   1074992 non-null int64
num_root                       1074992 non-null int64
num_file_creations  

## 7. Reset index of the new data set

In [12]:
# Renumber the rows 0..n-1; drop=True discards the old (gappy) index instead of
# keeping it as an extra column.
train_df_no_duplicates = train_df_no_duplicates.reset_index(drop=True)

## 8. Save the new train dataset to a file

In [31]:
# Write the de-duplicated train set without a header row or an index column.
train_df_no_duplicates.to_csv(
    '../kddcup.data/kddcup.train.nodups.txt',
    index=False,
    header=False,
)

## 9. Split the data set into attributes and classes

Remove the label column

In [14]:
# Attribute matrix: every column except the class label.
X_train = train_df_no_duplicates.drop(labels='label', axis='columns')

Extract the label column (the 42nd column)

In [15]:
# Class vector: "label" is the last (42nd) entry of col_names, i.e. position 41.
y_train = train_df_no_duplicates['label']

## 10. Save the attributes and classes to text files

Save the train attributes

In [17]:
# Write the train attributes without a header row or an index column.
X_train.to_csv(
    '../kddcup.data/x_train.txt',
    index=False,
    header=False,
)

Save the train classes

In [18]:
# Write the train labels, one per line, without a header row or an index column.
y_train.to_csv(
    '../kddcup.data/y_train.txt',
    index=False,
    header=False,
)

# II. Test Dataset

## 11. Read the test set

In [19]:
# The test file is read the same way as the train file: no header row, column
# names supplied from col_names.
test_df = pd.read_csv(
    "../kddcup.data/kddcup.test.set",
    names=col_names,
    header=None,
)

## 12. Check the test set

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
test_df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,...,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0
mean,17.902736,1731.702,747.9937,2.9e-05,0.000762,5.1e-05,0.014677,0.002363,0.172476,0.011243,...,235.282681,199.193914,0.793494,0.024953,0.547919,0.004566,0.058764,0.058791,0.142659,0.141693
std,407.6444,127656.7,16120.18,0.005379,0.040367,0.009821,0.312068,0.04999,0.377794,1.958325,...,60.913298,100.30647,0.38709,0.096003,0.491963,0.035773,0.231296,0.232997,0.34438,0.346573
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,105.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,244.0,0.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.01,1.0,0.0,0.0,0.0,0.0,0.0
max,57715.0,62825650.0,5203179.0,1.0,3.0,3.0,101.0,4.0,1.0,796.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Index range, column count and per-column dtypes / non-null counts of the raw test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311029 entries, 0 to 311028
Data columns (total 42 columns):
duration                       311029 non-null int64
protocol_type                  311029 non-null object
service                        311029 non-null object
flag                           311029 non-null object
src_bytes                      311029 non-null int64
dst_bytes                      311029 non-null int64
land                           311029 non-null int64
wrong_fragment                 311029 non-null int64
urgent                         311029 non-null int64
hot                            311029 non-null int64
num_failed_logins              311029 non-null int64
logged_in                      311029 non-null int64
num_compromised                311029 non-null int64
root_shell                     311029 non-null int64
su_attempted                   311029 non-null int64
num_root                       311029 non-null int64
num_file_creations             311029 

In [22]:
# Preview the first five records to confirm the headers line up with the data.
test_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
3,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
4,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.


## 13. Check duplicates of the test set

In [23]:
# Count rows flagged as duplicates (True) versus rows not flagged (False).
test_df.duplicated().value_counts()

True     233738
False     77291
dtype: int64

## 14. Remove the duplicates

In [25]:
# Keep the first occurrence of every distinct record; test_df is not modified.
test_df_no_duplicates = test_df.drop_duplicates(keep="first")

In [26]:
# Sanity check: after de-duplication no row should be flagged True.
test_df_no_duplicates.duplicated().value_counts()

False    77291
dtype: int64

## 15. Reset index of the new test set

In [27]:
# Renumber the rows 0..n-1; drop=True discards the old (gappy) index instead of
# keeping it as an extra column.
test_df_no_duplicates = test_df_no_duplicates.reset_index(drop=True)

In [28]:
# Confirm the row count and that the index is contiguous again after the reset.
test_df_no_duplicates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77291 entries, 0 to 77290
Data columns (total 42 columns):
duration                       77291 non-null int64
protocol_type                  77291 non-null object
service                        77291 non-null object
flag                           77291 non-null object
src_bytes                      77291 non-null int64
dst_bytes                      77291 non-null int64
land                           77291 non-null int64
wrong_fragment                 77291 non-null int64
urgent                         77291 non-null int64
hot                            77291 non-null int64
num_failed_logins              77291 non-null int64
logged_in                      77291 non-null int64
num_compromised                77291 non-null int64
root_shell                     77291 non-null int64
su_attempted                   77291 non-null int64
num_root                       77291 non-null int64
num_file_creations             77291 non-null int64
num_

In [29]:
# Preview the first five de-duplicated records.
test_df_no_duplicates.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
2,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
3,0,udp,private,SF,105,146,0,0,0,0,...,255,1.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
4,0,udp,domain_u,SF,29,0,0,0,0,0,...,3,0.3,0.3,0.3,0.0,0.0,0.0,0.0,0.0,normal.


## 16. Save the test set to a text file.

In [32]:
# Write the de-duplicated test set without a header row or an index column.
test_df_no_duplicates.to_csv(
    '../kddcup.data/kddcup.test.nodups.txt',
    index=False,
    header=False,
)

## 17. Split the test set into attributes and classes

In [33]:
# Attribute matrix: every column except the class label.
X_test = test_df_no_duplicates.drop(labels='label', axis='columns')

In [34]:
# Class vector for the TEST set. The labels must come from the test frame:
# taking them from train_df_no_duplicates would pair the test attributes with
# training labels and give a vector of the wrong length.
y_test = test_df_no_duplicates['label']

# Guard against attribute/label misalignment before anything is written to disk.
assert len(y_test) == len(X_test), "X_test and y_test must have the same number of rows"

## 18. Save the test attributes and classes to text files

In [35]:
# Write the test attributes without a header row or an index column.
X_test.to_csv(
    '../kddcup.data/x_test.txt',
    index=False,
    header=False,
)

In [36]:
# Write the test labels, one per line, without a header row or an index column.
y_test.to_csv(
    '../kddcup.data/y_test.txt',
    index=False,
    header=False,
)