# In [None]:
from sklearn import datasets 
import time 
from sklearn import svm
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler 
from sklearn.metrics import confusion_matrix 
import itertools 
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn import svm, datasets
from sklearn.pipeline import Pipeline

# Seeds: `rseed` is used for the train/test split below, `random_state` is
# handed to the dataset fetcher.
rseed = 93
random_state = 2
# Fetch KDD Cup 99 with percent10=True and no subset filter.
data_kdd99 = datasets.fetch_kddcup99(subset=None, percent10=True, random_state=random_state)

# Column names: 41 feature names followed by the label column name.
col_names = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment",
             "urgent","hot","num_failed_logins","logged_in","num_compromised",
             "root_shell","su_attempted","num_root","num_file_creations",
             "num_shells","num_access_files","num_outbound_cmds",
             "is_hot_login","is_guest_login","count","srv_count","serror_rate",
             "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
             "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
             "dst_host_rerror_rate","dst_host_srv_rerror_rate","target"]

# Apply the names so columns can be addressed by meaning instead of position.
# pandas raises if the number of names does not match the data width.
feature_names = col_names[:-1]
target_name = col_names[-1]
X = pd.DataFrame(data_kdd99.data, columns=feature_names)
Y = pd.DataFrame(data_kdd99.target, columns=[target_name])

print(X.shape)
print(Y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=rseed)

# These three columns sit at positions 1, 2 and 3 of the feature list; they
# are removed before the numeric scaling/PCA/SVM pipeline is fitted.
dropped_columns = ["protocol_type", "service", "flag"]
X_train_trans = X_train.drop(columns=dropped_columns)
X_test_trans = X_test.drop(columns=dropped_columns)

# Flat Python lists of labels for fitting and scoring.
train_label = y_train[target_name].tolist()
test_label = y_test[target_name].tolist()

################
# Seed for the SVM step; this rebinds the `rseed` used earlier for the split.
rseed = 5

# Pipeline: scale features to [-1, 1], project with PCA, then fit a
# linear-kernel SVC.
scalestep = MinMaxScaler(feature_range=(-1, 1))
pcastep = PCA(n_components=20)
svmstep = svm.SVC(kernel='linear', verbose=True, random_state=rseed, decision_function_shape="ovo")
steps = [('scalar', scalestep), ('pca', pcastep), ('SVM', svmstep)]

pipeline = Pipeline(steps)
# Grid keys are "<step name>__<parameter>". The previous line held a garbled,
# duplicated fragment that made the dict literal a syntax error.
# NOTE(review): scikit-learn documents gamma as the coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels, so with kernel='linear' both gamma
# values should score the same and only double the number of fits.
# Consider tuning SVM__C instead, or switching the kernel.
parameters = {'pca__n_components': [15, 20, 25, 30], 'SVM__gamma': [0.1, 0.01]}
grid = GridSearchCV(pipeline, param_grid=parameters, cv=5, n_jobs=1)

grid.fit(X_train_trans, train_label)

print("score = %3.2f" %(grid.score(X_test_trans, test_label)))
print(grid.best_params_)
print(sorted(grid.cv_results_.keys()))
################
# Start timestamp; nothing in the visible part of the file reads it.
time_start = time.time()

# confusion_matrix takes (y_true, y_pred): rows are the true labels and
# columns the predictions. The arguments were previously swapped, which
# transposed the matrix and made row-normalisation per predicted class.
# labels=grid.classes_ pins row/column order to the classes the plot uses as
# tick labels; test samples whose label was never seen in training are
# therefore left out of the matrix.
cnf_model1 = confusion_matrix(test_label, grid.predict(X_test_trans), labels=grid.classes_)

import numpy as np

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of counts (anything ``np.asarray`` accepts).
        classes: Sequence of tick labels, one per row/column of ``cm``.
        normalize: If true, divide every row by its sum before plotting and
            print cell values with two decimals; otherwise print them as
            integers.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the heat map.

    The function only draws; the caller creates the figure and calls
    ``plt.show()``.
    """
    np.set_printoptions(precision=2)

    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows that sum to zero stay all-zero instead of turning into NaN,
        # which would also poison the colour threshold below.
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half the maximum get white text, the rest black text.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
        
# Show NumPy floats with two decimals when printed.
np.set_printoptions(precision=2)

# Figure dimensions handed to plt.figure(figsize=...).
width = 12
height = 12

# One figure per variant: raw counts first, then the row-normalised matrix.
for normalize_flag, figure_title in (
    (False, "confusion matrix, without normalization"),
    (True, "confusion matrix, with normalization"),
):
    plt.figure(figsize=(width, height))
    plot_confusion_matrix(
        cnf_model1,
        classes=grid.classes_,
        normalize=normalize_flag,
        title=figure_title,
    )

plt.show()