# KDD Cup 1999 Data

http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [1]:
import os

import matplotlib.pyplot as pyplot
import pandas as pd
import sklearn.preprocessing as sp
from sklearn import datasets

# joblib is distributed as a standalone package; the copy bundled under
# sklearn.externals was removed from newer scikit-learn releases, so it is
# only used as a fallback.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

%matplotlib inline


|ファイル名|ファイル内容|
|---|---|
|kddcup.data|フルデータ|
|kddcup.data_10_percent|フルデータの10%を抽出した学習用データ|
|corrected|正常・攻撃のラベル付けがなされた評価用データ|
|kddcup.testdata.unlabeled|正常・攻撃のラベル付けがなされていないデータ|
|kddcup.testdata.unlabeled_10_percent|正常・攻撃のラベル付けがなされていないデータの10%サブセット|
|kddcup.newtestdata_10_percent_unlabeled|正常・攻撃のラベル付けがなされていないデータの10%サブセット|

In [2]:
# Names assigned to the columns of the header-less KDD Cup 1999 CSV files,
# in file order. The final entry, "label", is the class column.
col_names = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    # dst_host_* columns
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    # class column
    "label",
]

In [3]:
# Disabled: loads the full "kddcup.data" file. The notebook works with the
# 10% subset read in the following cell instead.
#kdd_data = pd.read_csv("kddcup.data", header=None, names = col_names)

In [4]:
# Load the 10% subset. header=None tells pandas the file has no header row,
# so the column names are supplied from col_names.
kdd_data_10percent = pd.read_csv(
    "kddcup.data_10_percent",
    names=col_names,
    header=None,
)


In [5]:
# Preview the first five rows of the loaded frame.
kdd_data_10percent.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


# Transform Objects Into Categories

In [6]:
# List each column's dtype to find the ones pandas read as generic `object`.
kdd_data_10percent.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [7]:
# Number of records per label value, most frequent first.
kdd_data_10percent.label.value_counts()

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: label, dtype: int64

In [8]:
# Convert the string-valued protocol_type column to pandas' category dtype.
kdd_data_10percent["protocol_type"] = kdd_data_10percent["protocol_type"].astype("category")

In [9]:
# Convert the string-valued service column to pandas' category dtype.
kdd_data_10percent["service"] = kdd_data_10percent["service"].astype("category")

In [10]:
# Convert the string-valued flag column to pandas' category dtype.
kdd_data_10percent["flag"] = kdd_data_10percent["flag"].astype("category")

In [11]:
# Convert the string-valued label column to pandas' category dtype.
kdd_data_10percent["label"] = kdd_data_10percent["label"].astype("category")

In [12]:
# Re-check the dtypes: the converted columns should now report `category`.
kdd_data_10percent.dtypes

duration                          int64
protocol_type                  category
service                        category
flag                           category
src_bytes                         int64
dst_bytes                         int64
land                              int64
wrong_fragment                    int64
urgent                            int64
hot                               int64
num_failed_logins                 int64
logged_in                         int64
num_compromised                   int64
root_shell                        int64
su_attempted                      int64
num_root                          int64
num_file_creations                int64
num_shells                        int64
num_access_files                  int64
num_outbound_cmds                 int64
is_host_login                     int64
is_guest_login                    int64
count                             int64
srv_count                         int64
serror_rate                     float64


In [13]:
# Number of records per protocol_type value.
kdd_data_10percent.protocol_type.value_counts()

icmp    283602
tcp     190065
udp      20354
Name: protocol_type, dtype: int64

In [14]:
# Number of records per service value.
kdd_data_10percent.service.value_counts()

ecr_i          281400
private        110893
http            64293
smtp             9723
other            7237
domain_u         5863
ftp_data         4721
eco_i            1642
ftp               798
finger            670
urp_i             538
telnet            513
ntp_u             380
auth              328
pop_3             202
time              157
csnet_ns          126
remote_job        120
gopher            117
imap4             117
domain            116
discard           116
systat            115
iso_tsap          115
shell             112
echo              112
rje               111
whois             110
sql_net           110
printer           109
                ...  
uucp_path         106
uucp              106
bgp               106
klogin            106
ssh               105
supdup            105
nnsp              105
hostnames         104
login             104
daytime           103
efs               103
netbios_ns        102
link              102
pop_2             101
ldap      

In [15]:
# Number of records per flag value.
kdd_data_10percent.flag.value_counts()

SF        378440
S0         87007
REJ        26875
RSTR         903
RSTO         579
SH           107
S1            57
S2            24
RSTOS0        11
S3            10
OTH            8
Name: flag, dtype: int64

# Transform Categories Into Integers

In [16]:
# Encoder used to turn protocol_type values into integer codes.
le_protocol_type = sp.LabelEncoder()

In [17]:
# Learn the set of protocol_type classes present in the training frame.
le_protocol_type.fit(kdd_data_10percent.protocol_type)

LabelEncoder()

In [37]:
# Write the fitted protocol_type encoder to disk.
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists; otherwise this cell fails on a fresh checkout.
if not os.path.isdir('dump'):
    os.makedirs('dump')
joblib.dump(le_protocol_type, 'dump/le_protocol_type.pkl')

['dump/le_protocol_type.pkl']

In [18]:
# Replace protocol_type with its integer codes. The column is overwritten in
# place, so re-running the fit cell after this one would fit on integers.
kdd_data_10percent['protocol_type'] = le_protocol_type.transform(kdd_data_10percent.protocol_type)

In [19]:
# Tally the encoded protocol_type codes.
kdd_data_10percent['protocol_type'].value_counts()

0    283602
1    190065
2     20354
Name: protocol_type, dtype: int64

In [20]:
# Encoder used to turn service values into integer codes.
le_service = sp.LabelEncoder()

In [21]:
# Learn the set of service classes present in the training frame.
le_service.fit(kdd_data_10percent.service)

LabelEncoder()

In [38]:
# Write the fitted service encoder to disk.
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists; otherwise this cell fails on a fresh checkout.
if not os.path.isdir('dump'):
    os.makedirs('dump')
joblib.dump(le_service, 'dump/le_service.pkl')

['dump/le_service.pkl']

In [22]:
# Replace service with its integer codes. The column is overwritten in place,
# so re-running the fit cell after this one would fit on integers.
kdd_data_10percent['service'] = le_service.transform(kdd_data_10percent.service)

In [23]:
# Tally the encoded service codes.
kdd_data_10percent['service'].value_counts()

14    281400
45    110893
22     64293
50      9723
40      7237
11      5863
19      4721
13      1642
18       798
17       670
61       538
56       513
39       380
3        328
43       202
59       157
6        126
47       120
20       117
24       117
10       116
9        116
55       115
25       115
49       112
12       112
48       111
65       110
51       110
44       109
       ...  
63       106
62       106
4        106
26       106
52       105
54       105
37       105
21       104
30       104
8        103
15       103
34       102
29       102
42       101
28       101
23        99
16        99
33        99
32        98
27        98
7         97
36        95
2         92
0         43
60        14
1         11
58         7
41         1
46         1
57         1
Name: service, dtype: int64

In [24]:
# Encoder used to turn flag values into integer codes.
le_flag = sp.LabelEncoder()

In [25]:
# Learn the set of flag classes present in the training frame.
le_flag.fit(kdd_data_10percent.flag)

LabelEncoder()

In [39]:
# Write the fitted flag encoder to disk.
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists; otherwise this cell fails on a fresh checkout.
if not os.path.isdir('dump'):
    os.makedirs('dump')
joblib.dump(le_flag, 'dump/le_flag.pkl')

['dump/le_flag.pkl']

In [26]:
# Replace flag with its integer codes. The column is overwritten in place,
# so re-running the fit cell after this one would fit on integers.
kdd_data_10percent['flag'] = le_flag.transform(kdd_data_10percent.flag)

In [27]:
# Tally the encoded flag codes.
kdd_data_10percent.flag.value_counts()

9     378440
5      87007
1      26875
4        903
2        579
10       107
6         57
7         24
3         11
8         10
0          8
Name: flag, dtype: int64

In [28]:
# Target vector: an independent copy of the label column. The labels stay as
# category strings; they are not integer-encoded.
train_labels = kdd_data_10percent.label.copy()

In [29]:
# Feature matrix: every column except the label. drop() returns a new frame
# and leaves kdd_data_10percent untouched.
train_features = kdd_data_10percent.drop(labels='label', axis=1)

In [30]:
# Preview the first five labels.
train_labels.head(n=5)

0    normal.
1    normal.
2    normal.
3    normal.
4    normal.
Name: label, dtype: category
Categories (23, object): [back., buffer_overflow., ftp_write., guess_passwd., ..., spy., teardrop., warezclient., warezmaster.]

In [31]:
# Preview the first five feature rows.
train_features.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,1,22,9,181,5450,0,0,0,0,...,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0
1,0,1,22,9,239,486,0,0,0,0,...,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0
2,0,1,22,9,235,1337,0,0,0,0,...,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
3,0,1,22,9,219,1337,0,0,0,0,...,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
4,0,1,22,9,217,2032,0,0,0,0,...,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0


# SVM training

In [32]:
from sklearn import svm

# Fit a support vector classifier with scikit-learn's default settings on the
# label-encoded features; no feature scaling is applied in the cells shown.
# fit() returns the estimator itself, so the construction and fit are chained
# and the trailing `clf` displays the fitted model.
clf = svm.SVC().fit(train_features, train_labels)
clf

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [33]:
# Disabled: would write the fitted classifier to dump/clf.pkl.
#joblib.dump(clf, 'dump/clf.pkl') 

['dump/clf.pkl']

In [None]:
# Disabled: would load a classifier from dump/clf.pkl in place of the one
# fitted above. Only load such files from a trusted source, because
# unpickling can execute arbitrary code.
#clf = joblib.load('dump/clf.pkl')

In [34]:
# NOTE: despite the name, these are predictions on train_features, the same
# data the classifier was fitted on, so any score computed from them is a
# training-set score and not an estimate of performance on unseen data.
test_pred = clf.predict(train_features)

In [35]:
# Display the array of predicted labels.
test_pred

array(['normal.', 'normal.', 'normal.', ..., 'normal.', 'normal.',
       'normal.'], dtype=object)

In [36]:
from sklearn.metrics import classification_report, accuracy_score

# Compare the predictions with the training labels. test_pred was produced
# from train_features, so both numbers describe the fit to the training data.
report = classification_report(y_true=train_labels, y_pred=test_pred)
accuracy = accuracy_score(y_true=train_labels, y_pred=test_pred)
print(report)
print(accuracy)

                  precision    recall  f1-score   support

           back.       1.00      1.00      1.00      2203
buffer_overflow.       1.00      1.00      1.00        30
      ftp_write.       1.00      0.75      0.86         8
   guess_passwd.       1.00      1.00      1.00        53
           imap.       1.00      0.92      0.96        12
        ipsweep.       0.96      1.00      0.98      1247
           land.       1.00      0.95      0.98        21
     loadmodule.       1.00      1.00      1.00         9
       multihop.       1.00      1.00      1.00         7
        neptune.       1.00      1.00      1.00    107201
           nmap.       0.98      0.78      0.87       231
         normal.       1.00      1.00      1.00     97278
           perl.       1.00      1.00      1.00         3
            phf.       1.00      1.00      1.00         4
            pod.       1.00      1.00      1.00       264
      portsweep.       1.00      0.93      0.96      1040
        rootk

In [40]:
# Re-extract the label column as an independent copy; this repeats the
# statement of the earlier cell that first defined train_labels.
train_labels = kdd_data_10percent.label.copy()