# KDD Cup 1999 Data

http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [1]:
import sklearn
import pandas as pd
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import time
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.externals import joblib
from sklearn.utils import resample

In [2]:
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.18.1.


In [3]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
             "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
             "logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations",
             "num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count",
             "srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
             "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
             "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]

In [4]:
clf = joblib.load('dump/20171118/clf.pkl')

In [5]:
data = pd.read_csv("data/corrected", header=None, names = col_names)

In [6]:
data.shape

(311029, 42)

# 前処理
## カテゴリ化

In [7]:
data.label.value_counts()

smurf.              164091
normal.              60593
neptune.             58001
snmpgetattack.        7741
mailbomb.             5000
guess_passwd.         4367
snmpguess.            2406
satan.                1633
warezmaster.          1602
back.                 1098
mscan.                1053
apache2.               794
processtable.          759
saint.                 736
portsweep.             354
ipsweep.               306
httptunnel.            158
pod.                    87
nmap.                   84
buffer_overflow.        22
multihop.               18
sendmail.               17
named.                  17
ps.                     16
xterm.                  13
rootkit.                13
teardrop.               12
land.                    9
xlock.                   9
xsnoop.                  4
ftp_write.               3
udpstorm.                2
perl.                    2
worm.                    2
loadmodule.              2
sqlattack.               2
phf.                     2
i

In [8]:
data['label2'] = data.label.where(data.label.str.contains('normal'),'atack')

In [16]:
data.label2.value_counts()

atack      250436
normal.     60593
Name: label2, dtype: int64

In [17]:
data['label3'] = data.label.copy()

In [18]:
data.loc[data.label.str.contains('back|land|neptune|pod|smurf|teardrop|mailbomb|apache2|processtable|udpstorm'),'label3'] = 'DoS'

In [19]:
data.loc[data.label.str.contains('buffer_overflow|loadmodule|perl|rootkit|ps|xterm|sqlattack'),'label3'] = 'U2R'

In [20]:
data.loc[data.label.str.contains('ftp_write|guess_passwd|imap|multihop|phf|spy|warezclient|warezmaster|snmpgetattack|snmpguess|httptunnel|sendmail|named|xlock|xsnoop|worm'),'label3'] = 'R2L'

In [21]:
data.loc[data.label.str.contains('ipsweep|nmap|portsweep|satan|mscan|saint'),'label3'] = 'Probe'

In [22]:
data.label3.value_counts()

DoS        229851
normal.     60593
R2L         16347
Probe        4166
U2R            72
Name: label3, dtype: int64

In [23]:
#joblib.dump(data,'dump/20171118/corrected.pkl')

['dump/20171118/corrected.pkl']

## サンプリング

In [19]:
#data = resample(data,n_samples=40000,random_state=0)

In [20]:
#data.shape

## 数値化

In [24]:
le_protocol_type = preprocessing.LabelEncoder()

In [25]:
le_protocol_type.fit(data.protocol_type)

LabelEncoder()

In [26]:
data.protocol_type=le_protocol_type.transform(data.protocol_type)

In [27]:
le_service = preprocessing.LabelEncoder()

In [28]:
le_service.fit(data.service)

LabelEncoder()

In [29]:
data.service = le_service.transform(data.service)

In [30]:
le_flag = preprocessing.LabelEncoder()

In [31]:
le_flag.fit(data.flag)

LabelEncoder()

In [32]:
data.flag = le_flag.transform(data.flag)

In [33]:
data.describe()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,...,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0
mean,17.902736,0.555456,25.16031,7.642901,1731.702,747.9937,2.9e-05,0.000762,5.1e-05,0.014677,...,235.282681,199.193914,0.793494,0.024953,0.547919,0.004566,0.058764,0.058791,0.142659,0.141693
std,407.6444,0.647019,14.588059,2.830005,127656.7,16120.18,0.005379,0.040367,0.009821,0.312068,...,60.913298,100.30647,0.38709,0.096003,0.491963,0.035773,0.231296,0.232997,0.34438,0.346573
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,14.0,9.0,105.0,0.0,0.0,0.0,0.0,0.0,...,255.0,244.0,0.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,14.0,9.0,520.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,46.0,9.0,1032.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.01,1.0,0.0,0.0,0.0,0.0,0.0
max,57715.0,2.0,64.0,10.0,62825650.0,5203179.0,1.0,3.0,3.0,101.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
data.shape

(311029, 44)

## ラベルの分離

In [35]:
y_test_1 = data.label.copy() 

In [36]:
y_test_2 = data.label2.copy()

In [37]:
y_test_3 = data.label3.copy()

In [38]:
x_test= data.drop(['label','label2','label3'],axis=1)

In [39]:
x_test.shape

(311029, 41)

In [40]:
y_test_1.shape

(311029,)

In [41]:
y_test_2.shape

(311029,)

In [42]:
y_test_3.shape

(311029,)

## 標準化

In [43]:
ss = preprocessing.StandardScaler()

In [44]:
ss.fit(x_test)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [45]:
x_test = ss.transform(x_test)

In [46]:
col_names2 = ["duration","protocol_type","service","flag","src_bytes",
             "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
             "logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations",
             "num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count",
             "srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
             "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
             "dst_host_rerror_rate","dst_host_srv_rerror_rate"]

In [47]:
pd.DataFrame(x_test,columns=col_names2).describe()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,...,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0
mean,-2.575908e-14,4.133219e-14,5.639779e-13,-2.563808e-13,-9.452428e-15,1.146574e-14,-2.14576e-15,-4.268431e-15,1.52939e-14,6.81076e-14,...,-5.811937e-14,4.568262e-14,3.579529e-13,1.649841e-14,5.742343e-14,-6.019605e-14,7.131952e-13,1.28953e-13,3.330266e-13,-5.024232e-13
std,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,...,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002
min,-0.0439176,-0.8584868,-1.724722,-2.700671,-0.01356532,-0.04640115,-0.005379316,-0.01887649,-0.005237992,-0.0470317,...,-3.862589,-1.985856,-2.049899,-0.2599146,-1.113742,-0.1276438,-0.2540659,-0.2523262,-0.4142494,-0.4088416
25%,-0.0439176,-0.8584868,-0.7650317,0.47954,-0.0127428,-0.04640115,-0.005379316,-0.01887649,-0.005237992,-0.0470317,...,0.3236953,0.4466926,0.455982,-0.2599146,-1.113742,-0.1276438,-0.2540659,-0.2523262,-0.4142494,-0.4088416
50%,-0.0439176,-0.8584868,-0.7650317,0.47954,-0.009491894,-0.04640115,-0.005379316,-0.01887649,-0.005237992,-0.0470317,...,0.3236953,0.5563567,0.5334835,-0.2599146,0.9189357,-0.1276438,-0.2540659,-0.2523262,-0.4142494,-0.4088416
75%,-0.0439176,0.6870657,1.428547,0.47954,-0.005481131,-0.04640115,-0.005379316,-0.01887649,-0.005237992,-0.0470317,...,0.3236953,0.5563567,0.5334835,-0.1557514,0.9189357,-0.1276438,-0.2540659,-0.2523262,-0.4142494,-0.4088416
max,141.538,2.232618,2.662435,0.8328967,492.1324,322.7284,185.8972,74.29931,305.4637,323.6004,...,0.3236953,0.5563567,0.5334835,10.15641,0.9189357,27.82622,4.069414,4.039579,2.489527,2.476556


## 学習

In [48]:
pca = PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)


In [49]:
t1=time.perf_counter()
pca.fit(x_test)
t2=time.perf_counter()

In [50]:
print(t2-t1,"秒")

1.1607541359990137 秒


In [51]:
t1=time.perf_counter()
x_test3 = pca.transform(x_test)
t2=time.perf_counter()

In [52]:
print(t2-t1,"秒")

0.054879934003110975 秒


## 予測

In [53]:
t1=time.perf_counter()
pred=clf.predict(x_test3)
t2=time.perf_counter()

In [54]:
print(t2-t1,"秒")

39.3743152140014 秒


In [55]:
print(classification_report(y_test_3, pred))
print(confusion_matrix(y_test_3, pred))

             precision    recall  f1-score   support

        DoS       0.88      0.95      0.91    229851
      Probe       0.05      0.05      0.05      4166
        R2L       0.06      0.00      0.00     16347
        U2R       0.67      0.03      0.05        72
    normal.       0.56      0.52      0.54     60593

avg / total       0.76      0.81      0.78    311029

[[218826   4739      0      0   6286]
 [  1086    225      0      0   2855]
 [  1046      5      1      1  15294]
 [    14      0      0      2     56]
 [ 29044      3     16      0  31530]]
