In [30]:
import numpy as np 
import pandas as pd
import tensorflow as tf

In [42]:
from tensorflow.keras.utils import get_file
try:
    path = get_file('kddcup.data_10_percent.gz', origin='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz')
except:
    print('Error downloading')
    raise
    
print(path) 

Downloading data from http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz
C:\Users\ngohuy143\.keras\datasets\kddcup.data_10_percent.gz


In [45]:
df = pd.read_csv(path, header=None)

In [49]:
print("Read {} rows.". format(len(df)))
df = df.sample(frac=0.1, replace=False) # sample only 10% of the dataset

df.dropna(inplace=True, axis=1) # for now, just drop NA's (rows with missing values)

# the csv file has no column heads, just add them 

df.columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome'
]

# display 5 rows
df[0:5]

Read 4940 rows.


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
83523,0,tcp,http,SF,321,182,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
137164,0,icmp,ecr_i,SF,30,0,0,0,0,0,...,15,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
352907,0,tcp,private,S0,0,0,0,0,0,0,...,2,0.01,0.07,0.0,0.0,1.0,1.0,0.0,0.0,neptune.
235661,0,icmp,ecr_i,SF,1032,0,0,0,0,0,...,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,smurf.
275711,0,icmp,ecr_i,SF,1032,0,0,0,0,0,...,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,smurf.


## Analyzing the dataset

In [52]:
ENCODING = 'utf-8'
def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(df):
    print()
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

In [53]:
# analyze our dataset
import os
from sklearn import metrics
from scipy.stats import zscore

analyze(df)


494 rows
** duration:[0:97.77%,9672:0.2%,8639:0.2%,1925:0.2%,2350:0.2%,2630:0.2%,50:0.2%,28:0.2%,5:0.2%,3:0.2%,2:0.2%,1:0.2%]
** protocol_type:[icmp:55.47%,tcp:40.28%,udp:4.25%]
** service:[ecr_i:55.06%,private:24.49%,http:12.75%,other:2.23%,smtp:1.42%,domain_u:1.01%,nntp:0.61%,ftp_data:0.4%,time:0.4%,eco_i:0.4%,bgp:0.2%,csnet_ns:0.2%,domain:0.2%,ftp:0.2%,pop_3:0.2%,gopher:0.2%]
** flag:[SF:73.68%,S0:19.84%,REJ:5.87%,RSTR:0.61%]
** src_bytes:[1032:45.34%,0:26.52%,520:9.51%,147:1.21%,105:1.01%,146:0.61%,294:0.4%,28:0.4%,30:0.4%,226:0.4%,235:0.4%,260:0.4%,54540:0.4%,219:0.4%,306:0.4%,318:0.4%,300:0.2%,1621:0.2%,193:0.2%,191:0.2%,189:0.2%,178:0.2%,685:0.2%,170:0.2%,680:0.2%,164:0.2%,317:0.2%,4215:0.2%,321:0.2%,8:0.2%,199:0.2%,777:0.2%,46:0.2%,44:0.2%,42:0.2%,1065:0.2%,1064:0.2%,36:0.2%,327:0.2%,29:0.2%,330:0.2%,331:0.2%,334:0.2%,197:0.2%,208:0.2%,299:0.2%,310:0.2%,1321:0.2%,296:0.2%,303:0.2%,291:0.2%,282:0.2%,266:0.2%,265:0.2%,308:0.2%,254:0.2%,242:0.2%,241:0.2%,238:0.2%,234:0.2%,209:0.2

## Encoding the feature vector


In [56]:
# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd
    
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

In [66]:
encode_numeric_zscore(df, 'duration')
encode_text_dummy(df, 'protocol_type')
encode_text_dummy(df, 'service')
encode_text_dummy(df, 'flag')
encode_numeric_zscore(df, 'src_bytes')
encode_numeric_zscore(df, 'dst_bytes')
encode_text_dummy(df, 'land')
encode_numeric_zscore(df, 'wrong_fragment')
encode_numeric_zscore(df, 'urgent')
encode_numeric_zscore(df, 'hot')
encode_numeric_zscore(df, 'num_failed_logins')
encode_text_dummy(df, 'logged_in')
encode_numeric_zscore(df, 'num_compromised')
encode_numeric_zscore(df, 'root_shell')
encode_numeric_zscore(df, 'su_attempted')
encode_numeric_zscore(df, 'num_root')
encode_numeric_zscore(df, 'num_file_creations')
encode_numeric_zscore(df, 'num_shells')
encode_numeric_zscore(df, 'num_access_files')
encode_numeric_zscore(df, 'num_outbound_cmds')
encode_text_dummy(df, 'is_host_login')
encode_text_dummy(df, 'is_guest_login')
encode_numeric_zscore(df, 'count')
encode_numeric_zscore(df, 'srv_count')
encode_numeric_zscore(df, 'serror_rate')
encode_numeric_zscore(df, 'srv_serror_rate')
encode_numeric_zscore(df, 'rerror_rate')
encode_numeric_zscore(df, 'srv_rerror_rate')
encode_numeric_zscore(df, 'same_srv_rate')
encode_numeric_zscore(df, 'diff_srv_rate')
encode_numeric_zscore(df, 'srv_diff_host_rate')
encode_numeric_zscore(df, 'dst_host_count')
encode_numeric_zscore(df, 'dst_host_srv_count')
encode_numeric_zscore(df, 'dst_host_same_srv_rate')
encode_numeric_zscore(df, 'dst_host_diff_srv_rate')
encode_numeric_zscore(df, 'dst_host_same_src_port_rate')
encode_numeric_zscore(df, 'dst_host_srv_diff_host_rate')
encode_numeric_zscore(df, 'dst_host_serror_rate')
encode_numeric_zscore(df, 'dst_host_srv_serror_rate')
encode_numeric_zscore(df, 'dst_host_rerror_rate')
encode_numeric_zscore(df, 'dst_host_srv_rerror_rate')

# display 5 rows

df.dropna(inplace=True,axis=1)
df[0:5]
# This is the numeric feature vector, as it goes to the neural net

KeyError: 'protocol_type'

In [65]:
df[0:5]

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,hot,num_compromised,count,srv_count,serror_rate,srv_serror_rate,...,flag-REJ,flag-RSTR,flag-S0,flag-SF,land-0,logged_in-0,logged_in-1,is_host_login-0,is_guest_login-0,is_guest_login-1
83523,-0.08407,-0.142442,-0.112348,-0.063693,-0.052758,-0.063693,-1.450481,-1.058628,-0.498204,-0.49709,...,0,0,0,1,1,0,1,1,1,0
137164,-0.08407,-0.225535,-0.165062,-0.063693,-0.052758,-0.063693,-1.544987,-1.139231,-0.498204,-0.49709,...,0,0,0,1,1,1,0,1,1,0
352907,-0.08407,-0.234101,-0.165062,-0.063693,-0.052758,-0.063693,-0.174645,-1.090869,2.007525,2.008137,...,0,0,1,0,1,1,0,1,1,0
235661,-0.08407,0.060579,-0.165062,-0.063693,-0.052758,-0.063693,0.864924,0.916138,-0.498204,-0.49709,...,0,0,0,1,1,1,0,1,1,0
275711,-0.08407,0.060579,-0.165062,-0.063693,-0.052758,-0.063693,0.864924,0.916138,-0.498204,-0.49709,...,0,0,0,1,1,1,0,1,1,0


In [67]:
# convert to numpy - classification
x_columns = df.columns.drop('outcome')
x = df[x_columns].values
dummies = pd.get_dummies(df['outcome']) # Classification
outcomes = dummies.columns
num_classes = len(outcomes)
y = dummies.values

In [68]:
df.groupby('outcome')['outcome'].count()

outcome
back.             2
ipsweep.          1
neptune.        121
normal.          92
portsweep.        3
satan.            1
smurf.          271
teardrop.         2
warezclient.      1
Name: outcome, dtype: int64

## Training the neural network

In [69]:
import pandas as pd
import io
import requests
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping

# Create a test/train split.  25% test
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

# Create neural net
model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(50, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.add(Dense(y.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=19)

Train on 370 samples, validate on 124 samples
Epoch 1/19
370/370 - 3s - loss: 2.1898 - val_loss: 2.1812
Epoch 2/19
370/370 - 0s - loss: 2.1714 - val_loss: 2.1619
Epoch 3/19
370/370 - 0s - loss: 2.1468 - val_loss: 2.1325
Epoch 4/19
370/370 - 0s - loss: 2.1052 - val_loss: 2.0800
Epoch 5/19
370/370 - 0s - loss: 2.0278 - val_loss: 1.9826
Epoch 6/19
370/370 - 0s - loss: 1.8928 - val_loss: 1.8183
Epoch 7/19
370/370 - 0s - loss: 1.6828 - val_loss: 1.5888
Epoch 8/19
370/370 - 0s - loss: 1.4233 - val_loss: 1.3245
Epoch 9/19
370/370 - 0s - loss: 1.1377 - val_loss: 1.0514
Epoch 10/19
370/370 - 0s - loss: 0.8669 - val_loss: 0.8102
Epoch 11/19
370/370 - 0s - loss: 0.6610 - val_loss: 0.6493
Epoch 12/19
370/370 - 0s - loss: 0.5438 - val_loss: 0.5577
Epoch 13/19
370/370 - 0s - loss: 0.4853 - val_loss: 0.5170
Epoch 14/19
370/370 - 0s - loss: 0.4532 - val_loss: 0.4882
Epoch 15/19
370/370 - 0s - loss: 0.4314 - val_loss: 0.4637
Epoch 16/19
370/370 - 0s - loss: 0.4134 - val_loss: 0.4525
Epoch 17/19
370/370

<tensorflow.python.keras.callbacks.History at 0x1f3c33d6470>

In [70]:
# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_eval = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

Validation score: 0.717741935483871
