In [None]:
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.datasets import fetch_kddcup99
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz

%matplotlib inline

In [None]:
# Load the 10% sample (percent10=True) of the KDD Cup 1999 dataset, all subsets.
# Dataset: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
# Task description: http://www.kdd.org/kdd-cup/view/kdd-cup-1999/Tasks
#
# shuffle=True reorders the rows; a fixed random_state makes that order (and
# therefore the order-dependent cross-validation folds computed later)
# reproducible across kernel restarts.
dataset = fetch_kddcup99(subset=None, shuffle=True, random_state=42, percent10=True)

In [None]:
# Split the fetched bunch into the feature matrix (X) and the label vector (y)
X, y = dataset.data, dataset.target

In [None]:
# Wrap the raw arrays in pandas objects so the columns carry readable names.
# The names follow the KDD Cup 99 feature list, in dataset column order.
# (Fixed typo: 'srv_serrer_rate' -> 'srv_serror_rate'.)
feature_cols = [
    # basic connection features
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent',
    # content features
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    # time-based traffic features
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    # host-based traffic features
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
]
X = pd.DataFrame(X, columns=feature_cols)

y = pd.Series(y)

# Preview the first rows of the feature frame
X.head()

In [None]:
# Cast every column that can be parsed as a number to float.
# Columns whose values cannot be converted raise ValueError and are
# deliberately left unchanged.
for column_name in X.columns:
    try:
        as_float = X[column_name].astype(float)
    except ValueError:
        continue
    X[column_name] = as_float

In [None]:
# One-hot encode the categorical columns (dropping the first level of each).
# The columns are named explicitly instead of relying on "whatever is still
# object dtype", so the encoding cannot silently mis-align with the prefixes.
categorical_cols = ['protocol_type', 'service', 'flag']

# Decode bytes values to text first; otherwise the generated column names embed
# the bytes repr (e.g. "b'tcp'"). Non-bytes values are passed through untouched.
for cat_col in categorical_cols:
    X[cat_col] = X[cat_col].map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )

# With `columns=` the column name is used as the prefix and '_' as separator,
# giving names such as 'protocol_type_tcp'. The previous prefixes ended in '_'
# themselves, which produced a doubled separator ('protocol_type__...').
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X.head()

In [None]:
# Tally how many rows carry each label, most frequent first
attack_counts = y.value_counts()
attack_counts

In [None]:
# Fit a depth-limited decision tree on the connection labels and report its
# mean 5-fold cross-validated accuracy.
#
# Each row has exactly one label, so the target is encoded with LabelEncoder
# (one integer class per row). MultiLabelBinarizer is meant for rows holding a
# *collection* of labels: given a single label per row it iterates over that
# label, splitting it into its individual characters/bytes and producing a
# meaningless multilabel indicator matrix.
y = pd.Series(y).map(
    # bytes labels are decoded so that label_encoder.classes_ is readable text;
    # anything that is not bytes is passed through unchanged
    lambda label: label.decode('utf-8') if isinstance(label, bytes) else label
)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# random_state fixes the tie-breaking between equally good splits so the
# fitted tree and the scores are repeatable.
treeclf = DecisionTreeClassifier(max_depth=7, random_state=42)
clf = treeclf.fit(X, y)  # full-data fit, reused below for importances/plot
scores = cross_val_score(treeclf, X, y, scoring='accuracy', cv=5)
print(np.mean(scores))

In [None]:
# Pair each feature name with the fitted tree's importance score and
# display the ten highest-scoring features
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': treeclf.feature_importances_,
})
importance_table.sort_values('importance', ascending=False).head(10)

In [None]:
# Visualizing Decision Tree
# (`tree` is imported with the other imports at the top of the notebook.)
fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(
    clf,
    feature_names=list(X.columns),  # label split nodes with column names, not x[i]
    filled=True,
    ax=ax,                          # draw on the axes created above, explicitly
)
ax.set_title('Decision tree for KDD Cup 99 connections')
plt.show()  # render the figure without echoing the list of annotation objects