# KDD Cup 1999 Data

http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [2]:
import matplotlib.pyplot as pyplot
import pandas as pd
import sklearn.preprocessing as sp
from sklearn import datasets
from sklearn import svm
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, classification_report
% matplotlib inline


|ファイル名|ファイル内容|
|---|---|
|kddcup.data|フルデータ|
|kddcup.data_10_percent|フルデータの10%を抽出した学習用データ|
|corrected|正常・攻撃のラベル付けがなされた評価用データ|
|kddcup.testdata.unlabeled|正常・攻撃のラベル付けがなされていないデータ|
|kddcup.testdata.unlabeled_10_percent|正常・攻撃のラベル付けがなされていないデータの10%サブセット|
|kddcup.newtestdata_10_percent_unlabeled|正常・攻撃のラベル付けがなされていないデータの10%サブセット|

In [3]:
# Column names for the header-less KDD Cup 99 CSV files:
# 41 feature columns followed by the target column "label".
col_names = """
    duration protocol_type service flag src_bytes dst_bytes land
    wrong_fragment urgent hot num_failed_logins logged_in num_compromised
    root_shell su_attempted num_root num_file_creations num_shells
    num_access_files num_outbound_cmds is_host_login is_guest_login
    count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
    same_srv_rate diff_srv_rate srv_diff_host_rate
    dst_host_count dst_host_srv_count dst_host_same_srv_rate
    dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate
    label
""".split()

In [26]:
# Load the full dataset; the file has no header row, so names are supplied.
data = pd.read_csv("kddcup.data", names=col_names, header=None)

In [27]:
# Store the protocol names as a pandas categorical.
data["protocol_type"] = data["protocol_type"].astype("category")

In [28]:
# Store the service names as a pandas categorical.
data["service"] = data["service"].astype("category")

In [29]:
# Store the connection flags as a pandas categorical.
data["flag"] = data["flag"].astype("category")

In [30]:
# Inspect the column dtypes; the three columns cast above should show as category.
data.dtypes

duration                          int64
protocol_type                  category
service                        category
flag                           category
src_bytes                         int64
dst_bytes                         int64
land                              int64
wrong_fragment                    int64
urgent                            int64
hot                               int64
num_failed_logins                 int64
logged_in                         int64
num_compromised                   int64
root_shell                        int64
su_attempted                      int64
num_root                          int64
num_file_creations                int64
num_shells                        int64
num_access_files                  int64
num_outbound_cmds                 int64
is_host_login                     int64
is_guest_login                    int64
count                             int64
srv_count                         int64
serror_rate                     float64


In [31]:
# Encoder used to turn protocol_type strings into integer codes.
le_protocol_type=sp.LabelEncoder()

In [32]:
# Learn the protocol vocabulary from the full dataset.
le_protocol_type.fit(data["protocol_type"])

LabelEncoder()

In [33]:
#joblib.dump(le_protocol_type, 'dump/kddcup.data/le_protocol_type.pkl') 

['dump/kddcup.data/le_protocol_type.pkl']

In [34]:
# Row count per protocol before encoding.
data["protocol_type"].value_counts()

icmp    2833545
tcp     1870598
udp      194288
Name: protocol_type, dtype: int64

In [35]:
# Overwrite the string column with the encoder's integer codes.
data["protocol_type"] = le_protocol_type.transform(data["protocol_type"])

In [36]:
# Same counts after encoding; the index now holds integer codes.
data["protocol_type"].value_counts()

0    2833545
1    1870598
2     194288
Name: protocol_type, dtype: int64

In [37]:
# Encoder used to turn service strings into integer codes.
le_service=sp.LabelEncoder()

In [38]:
# Learn the service vocabulary from the full dataset.
le_service.fit(data["service"])

LabelEncoder()

In [39]:
#joblib.dump(le_service, 'dump/kddcup.data/le_service.pkl') 

['dump/kddcup.data/le_service.pkl']

In [40]:
# Row count per service before encoding.
data["service"].value_counts()

ecr_i          2811660
private        1100831
http            623091
smtp             96554
other            72653
domain_u         57782
ftp_data         40697
eco_i            16338
finger            6891
urp_i             5378
ftp               5214
telnet            4277
ntp_u             3833
auth              3382
pop_3             1981
time              1579
domain            1113
Z39_50            1078
gopher            1077
mtp               1076
ssh               1075
whois             1073
remote_job        1073
rje               1070
link              1069
imap4             1069
ctf               1068
name              1067
supdup            1060
echo              1059
                ...   
vmnet             1053
iso_tsap          1052
netbios_dgm       1052
sql_net           1052
shell             1051
csnet_ns          1051
klogin            1050
hostnames         1050
bgp               1047
exec              1045
login             1045
printer           1045
http_443   

In [41]:
# Overwrite the string column with the encoder's integer codes.
data["service"] = le_service.transform(data["service"])

In [42]:
# Same counts after encoding; the index now holds integer codes.
data["service"].value_counts()

15    2811660
49    1100831
24     623091
54      96554
44      72653
12      57782
20      40697
14      16338
18       6891
65       5378
19       5214
60       4277
43       3833
4        3382
47       1981
63       1579
11       1113
2        1078
21       1077
35       1076
56       1075
69       1073
51       1073
52       1070
33       1069
28       1069
8        1068
36       1067
58       1060
13       1059
       ...   
68       1053
29       1052
37       1052
55       1052
53       1051
7        1051
30       1050
23       1050
5        1047
17       1045
34       1045
48       1045
26       1044
16       1042
66       1041
32       1041
31       1040
41       1038
6        1021
0         521
64        148
1         135
62         12
50          9
45          5
61          3
3           2
27          2
22          2
25          1
Name: service, dtype: int64

In [43]:
# Encoder used to turn connection-flag strings into integer codes.
le_flag=sp.LabelEncoder()

In [44]:
# Learn the flag vocabulary from the full dataset.
le_flag.fit(data["flag"])

LabelEncoder()

In [46]:
#joblib.dump(le_flag, 'dump/kddcup.data/le_flag.pkl') 

['dump/kddcup.data/le_flag.pkl']

In [45]:
# Row count per flag before encoding.
data["flag"].value_counts()

SF        3744328
S0         869829
REJ        268874
RSTR         8094
RSTO         5344
SH           1040
S1            532
S2            161
RSTOS0        122
OTH            57
S3             50
Name: flag, dtype: int64

In [47]:
# Overwrite the string column with the encoder's integer codes.
data["flag"] = le_flag.transform(data["flag"])

In [48]:
# Same counts after encoding; the index now holds integer codes.
data["flag"].value_counts()

9     3744328
5      869829
1      268874
4        8094
2        5344
10       1040
6         532
7         161
3         122
0          57
8          50
Name: flag, dtype: int64

In [49]:
#joblib.dump(data, 'dump/kddcup.data/kddcup.data.00.pkl') 

['dump/kddcup.data/kddcup.data.00.pkl']

In [50]:
# Encoder for the target column (attack / normal label strings).
le_label=sp.LabelEncoder()

In [51]:
# Learn the label vocabulary from the full dataset.
le_label.fit(data["label"])
#joblib.dump(le_label, 'dump/kddcup.data/le_label.pkl') 

['dump/kddcup.data/le_label.pkl']

In [52]:
# Class distribution of the full dataset.
data["label"].value_counts()

smurf.              2807886
neptune.            1072017
normal.              972781
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: label, dtype: int64

In [53]:
# Overwrite the label strings with the encoder's integer class ids.
data["label"] = le_label.transform(data["label"])

In [54]:
#joblib.dump(data, 'dump/kddcup.data/kddcup.data.01.pkl') 

['dump/kddcup.data/kddcup.data.01.pkl']

In [55]:
# Load the 10% subset used for training; same header-less layout as the full file.
kdd_data_10percent = pd.read_csv("kddcup.data_10_percent", names=col_names, header=None)

In [57]:
# Row count per protocol in the 10% subset before encoding.
kdd_data_10percent["protocol_type"].value_counts()

icmp    283602
tcp     190065
udp      20354
Name: protocol_type, dtype: int64

In [58]:
# Reuse the encoder fitted on the full dataset so the codes agree across frames.
kdd_data_10percent["protocol_type"] = le_protocol_type.transform(
    kdd_data_10percent["protocol_type"]
)

In [59]:
# Same counts after encoding; the index now holds integer codes.
kdd_data_10percent["protocol_type"].value_counts()

0    283602
1    190065
2     20354
Name: protocol_type, dtype: int64

In [60]:
# Row count per service in the 10% subset before encoding.
kdd_data_10percent["service"].value_counts()

ecr_i          281400
private        110893
http            64293
smtp             9723
other            7237
domain_u         5863
ftp_data         4721
eco_i            1642
ftp               798
finger            670
urp_i             538
telnet            513
ntp_u             380
auth              328
pop_3             202
time              157
csnet_ns          126
remote_job        120
gopher            117
imap4             117
discard           116
domain            116
iso_tsap          115
systat            115
shell             112
echo              112
rje               111
whois             110
sql_net           110
printer           109
                ...  
uucp              106
klogin            106
vmnet             106
uucp_path         106
supdup            105
nnsp              105
ssh               105
login             104
hostnames         104
daytime           103
efs               103
netbios_ns        102
link              102
ldap              101
pop_2     

In [61]:
# Reuse the encoder fitted on the full dataset so the codes agree across frames.
kdd_data_10percent["service"] = le_service.transform(kdd_data_10percent["service"])

In [62]:
# Same counts after encoding; the index now holds integer codes.
kdd_data_10percent["service"].value_counts()

15    281400
49    110893
24     64293
54      9723
44      7237
12      5863
20      4721
14      1642
19       798
18       670
65       538
60       513
43       380
4        328
47       202
63       157
7        126
51       120
21       117
28       117
11       116
10       116
59       115
29       115
53       112
13       112
52       111
69       110
55       110
48       109
       ...  
67       106
66       106
5        106
30       106
56       105
58       105
41       105
23       104
34       104
9        103
16       103
38       102
33       102
46       101
32       101
26        99
17        99
37        99
36        98
31        98
8         97
40        95
2         92
0         43
64        14
1         11
62         7
45         1
50         1
61         1
Name: service, dtype: int64

In [63]:
# Row count per flag in the 10% subset before encoding.
kdd_data_10percent["flag"].value_counts()

SF        378440
S0         87007
REJ        26875
RSTR         903
RSTO         579
SH           107
S1            57
S2            24
RSTOS0        11
S3            10
OTH            8
Name: flag, dtype: int64

In [64]:
# Reuse the encoder fitted on the full dataset so the codes agree across frames.
kdd_data_10percent["flag"] = le_flag.transform(kdd_data_10percent["flag"])

In [65]:
# Same counts after encoding; the index now holds integer codes.
kdd_data_10percent["flag"].value_counts()

9     378440
5      87007
1      26875
4        903
2        579
10       107
6         57
7         24
3         11
8         10
0          8
Name: flag, dtype: int64

In [66]:
#joblib.dump(kdd_data_10percent, 'dump/kddcup.data/kdd_data_10percent.00.pkl') 

['dump/kddcup.data/kdd_data_10percent.00.pkl']

In [67]:
# Class distribution of the 10% subset.
kdd_data_10percent["label"].value_counts()

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: label, dtype: int64

In [68]:
# Replace the label strings with the integer class ids learned on the full dataset.
kdd_data_10percent["label"] = le_label.transform(kdd_data_10percent["label"])

In [69]:
# Class distribution after encoding; the index now holds integer class ids.
kdd_data_10percent["label"].value_counts()

18    280790
9     107201
11     97278
0       2203
17      1589
5       1247
15      1040
21      1020
20       979
14       264
10       231
3         53
1         30
6         21
22        20
4         12
16        10
7          9
2          8
8          7
13         4
12         3
19         2
Name: label, dtype: int64

In [70]:
#joblib.dump(kdd_data_10percent, 'dump/kddcup.data/kdd_data_10percent.01.pkl') 

['dump/kddcup.data/kdd_data_10percent.01.pkl']

In [71]:
# Feature matrix: every column of the 10% subset except the target.
train_features = kdd_data_10percent.drop(["label"], axis=1)

In [72]:
# Target vector, copied so later edits to the frame do not alter it.
train_labels = kdd_data_10percent.label.copy()

In [73]:
# Preview the first five feature rows.
train_features.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,1,24,9,181,5450,0,0,0,0,...,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0
1,0,1,24,9,239,486,0,0,0,0,...,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0
2,0,1,24,9,235,1337,0,0,0,0,...,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
3,0,1,24,9,219,1337,0,0,0,0,...,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
4,0,1,24,9,217,2032,0,0,0,0,...,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0


In [74]:
# Preview the first five encoded labels.
train_labels.head(5)

0    11
1    11
2    11
3    11
4    11
Name: label, dtype: int64

In [75]:
# Sanity check: decode the first five class ids back to their label strings.
le_label.inverse_transform(train_labels.iloc[:5])

array(['normal.', 'normal.', 'normal.', 'normal.', 'normal.'], dtype=object)

In [76]:
# Baseline model: SVC with library-default hyperparameters, trained on all
# columns of the 10% subset, including the integer-encoded categoricals.
# NOTE(review): no feature scaling is applied before fitting — confirm this is intended.
clf = svm.SVC()
clf.fit(train_features, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [77]:
#joblib.dump(clf, 'dump/kddcup.data/kdd_data_10percent.clf.pkl') 

['dump/kddcup.data/kdd_data_10percent.clf.pkl']

In [78]:
# NOTE: despite the name, these are predictions on the *training* features
# (the same rows clf was fitted on), not on held-out data.
test_pred = clf.predict(train_features)

In [79]:
# Per-class precision/recall/F1, then overall accuracy, for clf.
# test_pred was computed from train_features, so these are training-set scores
# and do not measure how well the model generalises.
# classification_report / accuracy_score come from the notebook's top import cell.
print(classification_report(train_labels, test_pred))
print(accuracy_score(train_labels, test_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2203
          1       1.00      1.00      1.00        30
          2       1.00      0.75      0.86         8
          3       1.00      1.00      1.00        53
          4       1.00      0.92      0.96        12
          5       0.96      1.00      0.98      1247
          6       1.00      0.95      0.98        21
          7       1.00      1.00      1.00         9
          8       1.00      1.00      1.00         7
          9       1.00      1.00      1.00    107201
         10       0.98      0.78      0.87       231
         11       1.00      1.00      1.00     97278
         12       1.00      1.00      1.00         3
         13       1.00      1.00      1.00         4
         14       1.00      1.00      1.00       264
         15       1.00      0.93      0.96      1040
         16       1.00      0.90      0.95        10
         17       1.00      0.96      0.98   

In [100]:
# Rebuild the feature matrix from the 10% subset for the second experiment.
train_features = kdd_data_10percent.drop(["label"], axis=1)

In [101]:
# Fresh copy of the target column for the second experiment.
train_labels = kdd_data_10percent.label.copy()

In [102]:
# Second experiment: train without the service column.
train_features = train_features.drop(["service"], axis=1)

In [103]:
# Same default SVC as clf, fitted on the features with the service column removed.
clf2 = svm.SVC()
clf2.fit(train_features, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [118]:
#joblib.dump(clf2, 'dump/kddcup.data/kdd_data_10percent.clf2.pkl') 

In [106]:
# NOTE: predictions on the training features again, so the scores reported
# from test_pred are training-set scores.
test_pred = clf2.predict(train_features)

In [115]:
# Per-class precision/recall/F1, then overall accuracy, for clf2 (no service column).
# test_pred was computed from train_features, so these are training-set scores.
# classification_report / accuracy_score come from the notebook's top import cell.
print(classification_report(train_labels, test_pred))
print(accuracy_score(train_labels, test_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2203
          1       1.00      1.00      1.00        30
          2       1.00      0.75      0.86         8
          3       1.00      1.00      1.00        53
          4       1.00      0.92      0.96        12
          5       0.96      1.00      0.97      1247
          6       0.95      0.95      0.95        21
          7       1.00      1.00      1.00         9
          8       1.00      1.00      1.00         7
          9       1.00      1.00      1.00    107201
         10       0.98      0.76      0.86       231
         11       1.00      1.00      1.00     97278
         12       1.00      1.00      1.00         3
         13       1.00      1.00      1.00         4
         14       1.00      1.00      1.00       264
         15       1.00      0.92      0.96      1040
         16       1.00      0.70      0.82        10
         17       0.99      0.95      0.97   

In [114]:
# Decode the integer class ids back to the original label strings so that the
# next model (and its report) works with readable class names.
# Guarded on an integer dtype: once the column holds strings, running this cell
# again would pass strings to inverse_transform, which only accepts encoded ids.
# The array returned by inverse_transform is assigned directly; wrapping it in
# list() only made an extra copy.
if kdd_data_10percent["label"].dtype.kind in "iu":
    kdd_data_10percent["label"] = le_label.inverse_transform(kdd_data_10percent["label"])

In [116]:
# Target vector for the third experiment, now holding label strings.
train_labels = kdd_data_10percent.label.copy()

In [117]:
# Third model: same default SVC and same features as clf2 (service column
# dropped), but trained on the string labels instead of integer class ids.
clf3 = svm.SVC()
clf3.fit(train_features, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [119]:
#joblib.dump(clf3, 'dump/kddcup.data/kdd_data_10percent.clf3.pkl') 

['dump/kddcup.data/kdd_data_10percent.clf3.pkl']

In [120]:
# NOTE: predictions on the training features, so the scores reported from
# test_pred are training-set scores.
test_pred = clf3.predict(train_features)

In [121]:
# Per-class precision/recall/F1, then overall accuracy, for clf3 (string labels).
# test_pred was computed from train_features, so these are training-set scores.
# classification_report / accuracy_score come from the notebook's top import cell.
print(classification_report(train_labels, test_pred))
print(accuracy_score(train_labels, test_pred))

                  precision    recall  f1-score   support

           back.       1.00      1.00      1.00      2203
buffer_overflow.       1.00      1.00      1.00        30
      ftp_write.       1.00      0.75      0.86         8
   guess_passwd.       1.00      1.00      1.00        53
           imap.       1.00      0.92      0.96        12
        ipsweep.       0.96      1.00      0.97      1247
           land.       0.95      0.95      0.95        21
     loadmodule.       1.00      1.00      1.00         9
       multihop.       1.00      1.00      1.00         7
        neptune.       1.00      1.00      1.00    107201
           nmap.       0.98      0.76      0.86       231
         normal.       1.00      1.00      1.00     97278
           perl.       1.00      1.00      1.00         3
            phf.       1.00      1.00      1.00         4
            pod.       1.00      1.00      1.00       264
      portsweep.       1.00      0.92      0.96      1040
        rootk

In [122]:
# Load the labelled evaluation file; same header-less layout as the training files.
corrected = pd.read_csv("corrected", names=col_names, header=None)

In [123]:
# Encode with the encoder fitted on the full dataset so codes match training.
corrected["protocol_type"] = le_protocol_type.transform(corrected["protocol_type"])

In [124]:
# Encode with the encoder fitted on the full dataset so codes match training.
corrected["flag"] = le_flag.transform(corrected["flag"])

In [125]:
# clf3 was trained without the service column, so drop it here together with the target.
test_features = corrected.drop(["label", "service"], axis=1)

In [126]:
# Preview the first five evaluation rows.
test_features.head(5)

Unnamed: 0,duration,protocol_type,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,2,9,105,146,0,0,0,0,0,...,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2,9,105,146,0,0,0,0,0,...,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2,9,105,146,0,0,0,0,0,...,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
3,0,2,9,105,146,0,0,0,0,0,...,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
4,0,2,9,105,146,0,0,0,0,0,...,255,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0


In [127]:
# Ground-truth label strings of the evaluation file (left unencoded, matching clf3's string classes).
test_labels = corrected["label"].copy()

In [128]:
# Predict labels for the evaluation file with clf3 (the model trained on
# string labels without the service column).
test_pred = clf3.predict(test_features)

In [129]:
# Evaluate clf3 on the labelled evaluation file.
# The evaluation labels include classes that never appear in the training
# labels (for example apache2. and mailbomb.); clf3 cannot predict them, so
# their rows are all zero. The warnings printed with the report presumably stem
# from these undefined per-class scores.
# classification_report / accuracy_score come from the notebook's top import cell.
print(classification_report(test_labels, test_pred))
print(accuracy_score(test_labels, test_pred))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


                  precision    recall  f1-score   support

        apache2.       0.00      0.00      0.00       794
           back.       1.00      0.78      0.88      1098
buffer_overflow.       0.00      0.00      0.00        22
      ftp_write.       0.00      0.00      0.00         3
   guess_passwd.       0.00      0.00      0.00      4367
     httptunnel.       0.00      0.00      0.00       158
           imap.       0.00      0.00      0.00         1
        ipsweep.       0.43      0.95      0.59       306
           land.       0.50      0.56      0.53         9
     loadmodule.       0.00      0.00      0.00         2
       mailbomb.       0.00      0.00      0.00      5000
          mscan.       0.00      0.00      0.00      1053
       multihop.       0.00      0.00      0.00        18
          named.       0.00      0.00      0.00        17
        neptune.       1.00      0.99      0.99     58001
           nmap.       0.97      0.83      0.90        84
         norm