# KDD Cup 1999 Data

http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [1]:
import sklearn
import pandas as pd
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import time
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.externals import joblib
from sklearn.utils import resample

In [2]:
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.18.1.


In [3]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
             "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
             "logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations",
             "num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count",
             "srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
             "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
             "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]

In [4]:
data = pd.read_csv("kddcup.data_10_percent", header=None, names = col_names)

# 前処理
## カテゴリ化

In [5]:
data.label.value_counts()

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: label, dtype: int64

In [6]:
data['label2'] = data.label.where(data.label.str.contains('normal'),'atack')

In [7]:
data.label2.value_counts()

atack      396743
normal.     97278
Name: label2, dtype: int64

In [8]:
data['label3'] = data.label.copy()

In [9]:
data.loc[data.label.str.contains('back|land|neptune|pod|smurf|teardrop'),'label3'] = 'DoS'

In [10]:
data.loc[data.label.str.contains('buffer_overflow|loadmodule|perl|rootkit'),'label3'] = 'U2R'

In [11]:
data.loc[data.label.str.contains('ftp_write|guess_passwd|imap|multihop|phf|spy|warezclient|warezmaster'),'label3'] = 'R2L'

In [12]:
data.loc[data.label.str.contains('ipsweep|nmap|portsweep|satan'),'label3'] = 'Probe'

In [13]:
data.label3.value_counts()

DoS        391458
normal.     97278
Probe        4107
R2L          1126
U2R            52
Name: label3, dtype: int64

## サンプリング

In [14]:
data = resample(data,n_samples=10000,random_state=0)

In [15]:
data.shape

(10000, 44)

## 数値化

In [16]:
le_protocol_type = preprocessing.LabelEncoder()

In [17]:
le_protocol_type.fit(data.protocol_type)

LabelEncoder()

In [18]:
data.protocol_type=le_protocol_type.transform(data.protocol_type)

In [19]:
le_service = preprocessing.LabelEncoder()

In [20]:
le_service.fit(data.service)

LabelEncoder()

In [21]:
data.service = le_service.transform(data.service)

In [22]:
le_flag = preprocessing.LabelEncoder()

In [23]:
le_flag.fit(data.flag)

LabelEncoder()

In [24]:
data.flag = le_flag.transform(data.flag)

In [25]:
data.describe()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,49.5968,0.4634,21.1743,4.5342,1891.576,603.5551,0.0,0.0083,0.0,0.0336,...,232.5302,189.3929,0.757081,0.031554,0.606977,0.00752,0.172559,0.172351,0.060951,0.059412
std,636.594748,0.577143,13.340889,1.186917,72677.12,7482.216333,0.0,0.154381,0.0,0.777902,...,64.700843,105.664136,0.409374,0.112929,0.480181,0.049471,0.376877,0.377355,0.235419,0.233642
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,12.0,5.0,45.0,0.0,0.0,0.0,0.0,0.0,...,255.0,56.0,0.4975,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,12.0,5.0,520.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,38.0,5.0,1032.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,18259.0,2.0,59.0,6.0,5133876.0,646195.0,0.0,3.0,0.0,28.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
data.shape

(10000, 44)

## ラベルの分離

In [27]:
y_train_1 = data.label.copy() 

In [28]:
y_train_2 = data.label2.copy()

In [29]:
y_train_3 = data.label3.copy()

In [30]:
x_train = data.drop(['label','label2','label3'],axis=1)

In [31]:
x_train.shape

(10000, 41)

In [32]:
y_train_1.shape

(10000,)

In [33]:
y_train_2.shape

(10000,)

In [34]:
y_train_3.shape

(10000,)

## 標準化

In [35]:
ss = preprocessing.StandardScaler()

In [36]:
ss.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [37]:
x_train = ss.transform(x_train)

In [38]:
col_names2 = ["duration","protocol_type","service","flag","src_bytes",
             "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
             "logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations",
             "num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count",
             "srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
             "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
             "dst_host_rerror_rate","dst_host_srv_rerror_rate"]

In [39]:
pd.DataFrame(x_train,columns=col_names2).describe()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2.154305e-16,-1.536049e-15,-2.407852e-16,-1.723688e-15,4.403779e-16,-1.968453e-16,0.0,1.044563e-15,0.0,-5.197356e-16,...,-2.226097e-15,-9.123813000000001e-17,6.381895e-16,4.970468e-16,-3.66418e-16,3.156364e-16,1.586509e-16,-3.5998980000000004e-17,7.39675e-16,8.2645e-16
std,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005,0.0,1.00005,0.0,1.00005,...,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005
min,-0.07791343,-0.8029611,-1.587252,-3.820341,-0.02602842,-0.08066932,0.0,-0.05376578,0.0,-0.04319526,...,-3.578652,-1.78303,-1.849454,-0.2794293,-1.264123,-0.1520166,-0.4578883,-0.4567573,-0.2589172,-0.2542987
25%,-0.07791343,-0.8029611,-0.6877172,0.392465,-0.02540921,-0.08066932,0.0,-0.05376578,0.0,-0.04319526,...,0.347305,-1.262487,-0.634124,-0.2794293,-1.264123,-0.1520166,-0.4578883,-0.4567573,-0.2589172,-0.2542987
50%,-0.07791343,-0.8029611,-0.6877172,0.392465,-0.01887313,-0.08066932,0.0,-0.05376578,0.0,-0.04319526,...,0.347305,0.6209333,0.5934208,-0.2794293,0.8185306,-0.1520166,-0.4578883,-0.4567573,-0.2589172,-0.2542987
75%,-0.07791343,0.9297991,1.261276,0.392465,-0.01182792,-0.08066932,0.0,-0.05376578,0.0,-0.04319526,...,0.347305,0.6209333,0.5934208,0.07479433,0.8185306,-0.1520166,-0.4578883,-0.4567573,-0.2589172,-0.2542987
max,28.60582,2.662559,2.835463,1.235026,70.61701,86.28777,0.0,19.37965,0.0,35.95285,...,0.347305,0.6209333,0.5934208,8.576162,0.8185306,20.06296,2.195629,2.1934,3.98904,4.02596


## 学習

In [40]:
pca = PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [41]:
pca.fit(x_train)

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [42]:
x_train2 = pca.transform(x_train)

In [43]:
x_train2.shape

(10000, 3)

In [44]:
clf = SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1.0, kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [45]:
t1=time.perf_counter()
clf.fit(x_train2,y_train_2)
t2=time.perf_counter()

In [46]:
print(t2-t1,"秒")

165.32065879000584 秒


## 予測

In [47]:
t1=time.perf_counter()
pred=clf.predict(x_train2)
t2=time.perf_counter()


In [48]:
print(t2-t1,"秒")

0.04402793899498647 秒


In [49]:
print(classification_report(y_train_2, pred))
print(confusion_matrix(y_train_2, pred))

             precision    recall  f1-score   support

      atack       1.00      0.99      0.99      8058
    normal.       0.95      0.99      0.97      1942

avg / total       0.99      0.99      0.99     10000

[[7958  100]
 [  14 1928]]


## 学習

In [50]:
pca2 = PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)


In [51]:
clf2 = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.17782794100389229,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [52]:
pca2.fit(x_train)

PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [53]:
x_train3 = pca.transform(x_train)

In [54]:
t1=time.perf_counter()
clf2.fit(x_train3,y_train_3)
t2=time.perf_counter()

In [55]:
print(t2-t1,"秒")

0.11625554800411919 秒


## 予測

In [56]:
t1=time.perf_counter()
pred2=clf2.predict(x_train3)
t2=time.perf_counter()


In [57]:
print(t2-t1,"秒")

0.05945428799896035 秒


In [58]:
print(classification_report(y_train_3, pred2))
print(confusion_matrix(y_train_3, pred2))

             precision    recall  f1-score   support

        DoS       1.00      1.00      1.00      7931
      Probe       1.00      0.81      0.89        98
        R2L       0.78      0.26      0.39        27
        U2R       1.00      1.00      1.00         2
    normal.       0.97      1.00      0.98      1942

avg / total       0.99      0.99      0.99     10000

[[7894    0    0    0   37]
 [  12   79    0    0    7]
 [   2    0    7    0   18]
 [   0    0    0    2    0]
 [   3    0    2    0 1937]]
