# KDD Cup 1999 Data

http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [43]:
import sklearn
import pandas as pd
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier 
from sklearn.pipeline import Pipeline
import time
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.externals import joblib
from sklearn.utils import resample

In [44]:
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.18.1.


In [45]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
             "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
             "logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations",
             "num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count",
             "srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
             "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
             "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]

In [46]:
data = pd.read_csv("data/kddcup.data_10_percent", header=None, names = col_names)

In [47]:
data.shape

(494021, 42)

# 前処理
## カテゴリ化

In [48]:
data.label.value_counts()

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: label, dtype: int64

In [49]:
data['label2'] = data.label.where(data.label.str.contains('normal'),'atack')

In [50]:
data.label2.value_counts()

atack      396743
normal.     97278
Name: label2, dtype: int64

In [51]:
data['label3'] = data.label.copy()

In [52]:
data.loc[data.label.str.contains('back|land|neptune|pod|smurf|teardrop|mailbomb|apache2|processtable|udpstorm'),'label3'] = 'DoS'

In [53]:
data.loc[data.label.str.contains('buffer_overflow|loadmodule|perl|rootkit|ps|xterm|sqlattack'),'label3'] = 'U2R'

In [54]:
data.loc[data.label.str.contains('ftp_write|guess_passwd|imap|multihop|phf|spy|warezclient|warezmaster|snmpgetattack|snmpguess|httptunnel|sendmail|named|xlock|xsnoop|worm'),'label3'] = 'R2L'

In [55]:
data.loc[data.label.str.contains('ipsweep|nmap|portsweep|satan|mscan|saint'),'label3'] = 'Probe'

In [56]:
data.label3.value_counts()

DoS        391458
normal.     97278
Probe        4107
R2L          1126
U2R            52
Name: label3, dtype: int64

In [57]:
#joblib.dump(data,'dump/20171118/corrected.pkl')

## サンプリング

In [58]:
data = resample(data,n_samples=2000,random_state=0)

In [59]:
data.shape

(2000, 44)

## 数値化

In [60]:
le_protocol_type = preprocessing.LabelEncoder()

In [61]:
le_protocol_type.fit(data.protocol_type)

LabelEncoder()

In [62]:
data.protocol_type=le_protocol_type.transform(data.protocol_type)

In [63]:
le_service = preprocessing.LabelEncoder()

In [64]:
le_service.fit(data.service)

LabelEncoder()

In [65]:
data.service = le_service.transform(data.service)

In [66]:
le_flag = preprocessing.LabelEncoder()

In [67]:
le_flag.fit(data.flag)

LabelEncoder()

In [68]:
data.flag = le_flag.transform(data.flag)

In [69]:
data.describe()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,62.2135,0.4885,12.034,3.599,3435.584,606.287,0.0,0.0075,0.0,0.012,...,231.917,189.2135,0.75588,0.032405,0.594735,0.007305,0.175355,0.17432,0.05802,0.05678
std,712.957848,0.591644,5.784857,0.953758,114836.3,5315.351494,0.0,0.14985,0.0,0.154493,...,65.6843,105.264259,0.409118,0.115545,0.482721,0.043859,0.37943,0.379119,0.230094,0.228411
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,8.0,4.0,44.0,0.0,0.0,0.0,0.0,0.0,...,255.0,66.0,0.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,8.0,4.0,520.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,19.0,4.0,1032.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,14744.0,2.0,32.0,5.0,5133876.0,202976.0,0.0,3.0,0.0,3.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [70]:
data.shape

(2000, 44)

## ラベルの分離

In [71]:
y_test_1 = data.label.copy() 

In [72]:
y_test_2 = data.label2.copy()

In [73]:
y_test_3 = data.label3.copy()

In [74]:
x_test= data.drop(['label','label2','label3'],axis=1)

In [75]:
x_test.shape

(2000, 41)

In [76]:
y_test_1.shape

(2000,)

In [77]:
y_test_2.shape

(2000,)

In [78]:
y_test_3.shape

(2000,)

## 標準化

In [79]:
ss = preprocessing.StandardScaler()

In [80]:
ss.fit(x_test)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [81]:
x_test = ss.transform(x_test)

In [82]:
col_names2 = ["duration","protocol_type","service","flag","src_bytes",
             "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
             "logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations",
             "num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count",
             "srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
             "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
             "dst_host_rerror_rate","dst_host_srv_rerror_rate"]

In [83]:
pd.DataFrame(x_test,columns=col_names2).describe()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1.274536e-16,1.367795e-16,-4.882761e-16,-2.858824e-16,3.2626680000000006e-17,3.601286e-18,0.0,2.375808e-16,0.0,-2.817746e-16,...,-1.356137e-16,-1.768585e-16,-9.126033e-17,1.248168e-16,3.029799e-16,-4.483913e-17,-1.521006e-16,1.263434e-16,1.183498e-16,2.871592e-16
std,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,0.0,1.00025,0.0,1.00025,...,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025
min,-0.08728294,-0.8258718,-2.080779,-3.774439,-0.0299247,-0.1140919,0.0,-0.05006262,0.0,-0.0776931,...,-3.516438,-1.788457,-1.848048,-0.2805248,-1.232354,-0.1665965,-0.4622693,-0.4599179,-0.2522213,-0.2486495
25%,-0.08728294,-0.8258718,-0.6975122,0.4205473,-0.02954145,-0.1140919,0.0,-0.05006262,0.0,-0.0776931,...,0.3515113,-1.170809,-0.7233959,-0.2805248,-1.232354,-0.1665965,-0.4622693,-0.4599179,-0.2522213,-0.2486495
50%,-0.08728294,-0.8258718,-0.6975122,0.4205473,-0.02539539,-0.1140919,0.0,-0.05006262,0.0,-0.0776931,...,0.3515113,0.6251215,0.5968481,-0.2805248,0.839752,-0.1665965,-0.4622693,-0.4599179,-0.2522213,-0.2486495
75%,-0.08728294,0.8647562,1.204479,0.4205473,-0.02093575,-0.1140919,0.0,-0.05006262,0.0,-0.0776931,...,0.3515113,0.6251215,0.5968481,0.06574867,0.839752,-0.1665965,-0.4622693,-0.4599179,-0.2522213,-0.2486495
max,20.59793,2.555384,3.452288,1.469294,44.68728,38.08221,0.0,19.97498,0.0,19.34558,...,0.3515113,0.6251215,0.5968481,8.376311,0.839752,22.63922,2.173922,2.178436,4.094924,4.130525


## 学習

In [84]:
#ニューラルネットワークの隠れ層の候補をいろいろ定義
parameters = {'hidden_layer_sizes' : [(100,), (100, 10), (100, 100, 10), (100, 100, 100, 10)]}

#ニューラルネットワークのベストな隠れ層を探索
clf = GridSearchCV(MLPClassifier(), parameters)
clf.fit(x_test, y_test_3)
clf.best_params_


{'hidden_layer_sizes': (100, 100, 100, 10)}

In [86]:
clf.best_estimator_

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [87]:
clf = MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [88]:
clf.fit(x_test, y_test_3)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [89]:
pred = clf.predict(x_test)

In [90]:
print(classification_report(y_test_3, pred))
print(confusion_matrix(y_test_3, pred))

             precision    recall  f1-score   support

        DoS       1.00      1.00      1.00      1555
      Probe       1.00      1.00      1.00        19
        R2L       0.80      1.00      0.89         4
    normal.       1.00      1.00      1.00       422

avg / total       1.00      1.00      1.00      2000

[[1555    0    0    0]
 [   0   19    0    0]
 [   0    0    4    0]
 [   0    0    1  421]]
