# KDD Cup 1999 Data(標準化、次元圧縮、SVM)

http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot
from sklearn import datasets
from numpy import logspace
import sklearn.preprocessing as sp
from sklearn.externals import joblib
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.externals import joblib
import sklearn
from sklearn.decomposition import PCA
import time
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
% matplotlib inline

In [2]:
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.18.1.



|ファイル名|ファイル内容|
|---|---|
|kddcup.data|フルデータ|
|kddcup.data_10_percent|フルデータの10%を抽出した学習用データ|
|corrected|正常・攻撃のラベル付けがなされた評価用データ|
|kddcup.testdata.unlabeled|正常・攻撃のラベル付けがなされていないデータ|
|kddcup.testdata.unlabeled_10_percent|正常・攻撃のラベル付けがなされていないデータの10%サブセット|
|kddcup.newtestdata_10_percent_unlabeled|正常・攻撃のラベル付けがなされていないデータの10%サブセット|

In [3]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
             "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
             "logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations",
             "num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count",
             "srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
             "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
             "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]

In [4]:
data = pd.read_csv("kddcup.data_10_percent", header=None, names = col_names)

In [5]:
le_protocol_type=joblib.load('dump/le_protocol_type.pkl')

In [6]:
le_flag = joblib.load('dump/le_flag.pkl')

In [7]:
data.protocol_type = le_protocol_type.transform(data.protocol_type)

In [8]:
data.flag = le_flag.transform(data.flag)

In [9]:
x = data.drop(['label','service'],axis=1)

In [10]:
y = data.label.copy()

In [11]:
y.value_counts()

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: label, dtype: int64

In [12]:
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [13]:
x = scaler.fit_transform(x)

In [14]:
x.shape

(494021, 40)

In [15]:
pca = PCA(copy=True, iterated_power='auto', n_components=7, random_state=None,svd_solver='auto', tol=0.0, whiten=False)

In [16]:
x = pca.fit_transform(x)

In [17]:
x.shape

(494021, 7)

In [18]:
svc = SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,decision_function_shape=None, degree=3, gamma=0.005623413251903491,kernel='rbf', max_iter=-1, probability=False, random_state=None,shrinking=True, tol=0.001, verbose=False)

In [19]:
t1=time.perf_counter()
clf = svc.fit(x,y)
t2=time.perf_counter()

In [20]:
print(t2-t1,"秒")

50.45493358999374 秒


In [21]:
data2 = pd.read_csv("kddcup.data", header=None, names = col_names)

In [22]:
data2.protocol_type = le_protocol_type.transform(data2.protocol_type)

In [23]:
data2.flag = le_flag.transform(data2.flag)

In [24]:
test_x = data2.drop(['label','service'],axis=1)

In [25]:
test_y = data2.label.copy()

In [26]:
test_y.value_counts()

smurf.              2807886
neptune.            1072017
normal.              972781
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: label, dtype: int64

In [27]:
test_x.head()

Unnamed: 0,duration,protocol_type,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,1,9,215,45076,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,9,162,4528,0,0,0,0,0,...,1,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,1,9,236,1228,0,0,0,0,0,...,2,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
3,0,1,9,233,2032,0,0,0,0,0,...,3,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0
4,0,1,9,239,486,0,0,0,0,0,...,4,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0


In [28]:
test_x  = scaler.fit_transform(test_x)

In [29]:
test_x = pca.fit_transform(test_x)

In [30]:
t1=time.perf_counter()
pred=clf.predict(test_x)
t2=time.perf_counter()

In [31]:
print(t2-t1,"秒")

1019.4853761630075 秒


In [32]:
print(classification_report(test_y, pred))
print(confusion_matrix(test_y, pred))

  'precision', 'predicted', average, warn_for)


                  precision    recall  f1-score   support

           back.       0.16      0.99      0.28      2203
buffer_overflow.       0.00      0.00      0.00        30
      ftp_write.       0.00      0.00      0.00         8
   guess_passwd.       0.00      0.00      0.00        53
           imap.       0.00      0.33      0.01        12
        ipsweep.       0.00      0.00      0.00     12481
           land.       0.00      0.00      0.00        21
     loadmodule.       0.00      0.00      0.00         9
       multihop.       0.00      0.00      0.00         7
        neptune.       0.91      0.81      0.86   1072017
           nmap.       0.00      0.00      0.00      2316
         normal.       0.99      0.87      0.93    972781
           perl.       0.00      0.00      0.00         3
            phf.       0.00      0.00      0.00         4
            pod.       0.01      0.30      0.01       264
      portsweep.       0.00      0.03      0.00     10413
        rootk

In [33]:
from sklearn import metrics

In [34]:
metrics.accuracy_score(test_y, pred)

0.92373415079236598

In [35]:
back
neptune
satan
smurf

NameError: name 'back' is not defined