In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import keras
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = np.asarray(df[name], dtype = np.float).mean()

    if sd is None:
        sd = np.asarray(df[name], dtype = np.float).std()

    df[name] = (np.asarray(df[name], dtype = np.float) - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

In [4]:

# This file is a CSV, just no CSV extension or headers

df_ = pd.read_csv("./train_70%.csv")
df_test = pd.read_csv("./test_30%.csv")
print("Read df_ {} rows.".format(len(df_)))
print("Read df_test {} rows.".format(len(df_test)))
#print("Read {} rows.".format(len(df1)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
df_ = df_.drop(df_.columns[0], axis=1)
df_test = df_test.drop(df_test.columns[0], axis=1)

df_.dropna(inplace=True,axis=1) # For now, just drop NA's (rows with missing values)
df_test.dropna(inplace=True,axis=1)

Read df_ 1769 rows.
Read df_test 759 rows.


In [5]:
df_.head(5)

Unnamed: 0,stddev(pkt_len),fb_ratio,inter_arrival_time,pkt_count,is_tcp,outcome
0,604.976342,0.9,0.013149,19,1,skype
1,384.856909,1.0,0.025902,14,1,skype
2,0.0,-1.0,0.000186,1000,1,download
3,-1.0,0.0,-1.0,1,0,skype
4,-1.0,0.0,-1.0,1,0,tencent


In [None]:
# # # Now encode the feature vector

# # # encode_text_dummy(df, 'protocol_type')
# # encode_numeric_zscore(df, 'forw_byte')
# # encode_numeric_zscore(df, 'back_byte')
# # encode_numeric_zscore(df, 'tot_byte')

# encode_numeric_zscore(df_, ' Flow Duration')
# encode_numeric_zscore(df_, ' Total Fwd Packets')
# encode_numeric_zscore(df_, ' Total Backward Packets')
# encode_numeric_zscore(df_, 'Total Length of Fwd Packets')
# encode_numeric_zscore(df_, ' Total Length of Bwd Packets')
# encode_numeric_zscore(df_, ' Fwd Packet Length Max')
# encode_numeric_zscore(df_, ' Fwd Packet Length Min')
# encode_numeric_zscore(df_, ' Fwd Packet Length Mean')
# encode_numeric_zscore(df_, ' Fwd Packet Length Std')
# encode_numeric_zscore(df_, 'Bwd Packet Length Max')
# encode_numeric_zscore(df_, ' Bwd Packet Length Min')
# encode_numeric_zscore(df_, ' Bwd Packet Length Mean')
# encode_numeric_zscore(df_, ' Bwd Packet Length Std')
# encode_numeric_zscore(df_, ' Min Packet Length')
# encode_numeric_zscore(df_, ' Max Packet Length')
# encode_numeric_zscore(df_, ' Packet Length Mean')
# encode_numeric_zscore(df_, ' Packet Length Std')
# encode_numeric_zscore(df_, 'total pkt count')
# encode_numeric_zscore(df_, 'total byte count')

# encode_numeric_zscore(df_test, ' Flow Duration')
# encode_numeric_zscore(df_test, ' Total Fwd Packets')
# encode_numeric_zscore(df_test, ' Total Backward Packets')
# encode_numeric_zscore(df_test, 'Total Length of Fwd Packets')
# encode_numeric_zscore(df_test, ' Total Length of Bwd Packets')
# encode_numeric_zscore(df_test, ' Fwd Packet Length Max')
# encode_numeric_zscore(df_test, ' Fwd Packet Length Min')
# encode_numeric_zscore(df_test, ' Fwd Packet Length Mean')
# encode_numeric_zscore(df_test, ' Fwd Packet Length Std')
# encode_numeric_zscore(df_test, 'Bwd Packet Length Max')
# encode_numeric_zscore(df_test, ' Bwd Packet Length Min')
# encode_numeric_zscore(df_test, ' Bwd Packet Length Mean')
# encode_numeric_zscore(df_test, ' Bwd Packet Length Std')
# encode_numeric_zscore(df_test, ' Min Packet Length')
# encode_numeric_zscore(df_test, ' Max Packet Length')
# encode_numeric_zscore(df_test, ' Packet Length Mean')
# encode_numeric_zscore(df_test, ' Packet Length Std')
# encode_numeric_zscore(df_test, 'total pkt count')
# encode_numeric_zscore(df_test, 'total byte count')

# encode_text_index(df_, ' Label')
# encode_text_index(df_test, ' Label')
# # num_classes = len(outcomes)

# # # display 5 rows

# # df.dropna(inplace=True,axis=1)
# # df[0:5]
# # # This is the numeric feature vector, as it goes to the neural net

In [7]:
encode_numeric_zscore(df_, 'fb_ratio')
encode_numeric_zscore(df_, 'pkt_count')
encode_numeric_zscore(df_, 'inter_arrival_time')
encode_numeric_zscore(df_, 'stddev(pkt_len)')
encode_text_index(df_, 'outcome')

encode_numeric_zscore(df_test, 'fb_ratio')
encode_numeric_zscore(df_test, 'pkt_count')
encode_numeric_zscore(df_test, 'inter_arrival_time')
encode_numeric_zscore(df_test, 'stddev(pkt_len)')
encode_text_index(df_test, 'outcome')

array(['LoL', 'bili', 'download', 'netease', 'skype', 'tencent', 'youtube'], dtype=object)

In [8]:
x_train, y_train = to_xy(df_, 'outcome')
x_test, y_test = to_xy(df_test, 'outcome')
# x, y = to_xy(df_, ' Label')

In [9]:
df_.head(5)

Unnamed: 0,stddev(pkt_len),fb_ratio,inter_arrival_time,pkt_count,is_tcp,outcome
0,1.976651,0.826805,-0.114249,-0.054208,1,4
1,0.954096,0.932452,-0.113765,-0.054785,1,4
2,-0.83374,-1.18048,-0.114741,0.059085,1,2
3,-0.838385,-0.124014,-0.152726,-0.056287,0,4
4,-0.838385,-0.124014,-0.152726,-0.056287,0,5


In [37]:
import keras.backend as K
K.clear_session()

In [38]:
# Create neural net
model = Sequential()

# Used relu for activation function
# model.add(Dense(4, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu'))
# model.add(Dense(8, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu'))
# model.add(Dense(4, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu'))

# model.add(Dense(5, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(16, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu'))
# model.add(Dense(32, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(y_train.shape[1],activation='sigmoid'))

In [39]:
print(x_train.shape[1])

5


In [40]:
# Print the model summary
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 16)                96        
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 119       
Total params: 215
Trainable params: 215
Non-trainable params: 0
_________________________________________________________________


In [41]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
batch_size = 10
epochs = 1000
model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=1,batch_size = batch_size, epochs=epochs)

Train on 1769 samples, validate on 759 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 00017: early stopping


<keras.callbacks.History at 0x120c6ada0>

In [42]:
# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_eval = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

Validation score: 0.5375494071146245


In [None]:
print(y_eval[:100])

In [None]:
print(pred[:100])

In [None]:
iff = np.ones(272618, dtype=int)

In [None]:
iff = 3*iff

In [43]:
from sklearn.metrics import classification_report,confusion_matrix
Cm = confusion_matrix(y_eval,pred)
C = np.sum(Cm)
Cm = Cm/C
print('Confusion Matrix:')
print(np.array_str(Cm, precision=4, suppress_small=True))

Confusion Matrix:
[[ 0.0158  0.0013  0.0264  0.      0.      0.      0.    ]
 [ 0.      0.      0.0448  0.      0.0382  0.      0.    ]
 [ 0.      0.      0.1212  0.      0.0013  0.      0.    ]
 [ 0.      0.0013  0.      0.      0.0382  0.0119  0.    ]
 [ 0.      0.      0.      0.      0.3715  0.004   0.    ]
 [ 0.      0.0105  0.0105  0.      0.1528  0.029   0.    ]
 [ 0.      0.      0.0184  0.      0.0988  0.004   0.    ]]


In [None]:
# from sklearn.metrics import precision_recall_fscore_support
# print('DDos precision & recall & f1_score & support:')
# typ = [ "precision", "recall", "f1_score", "support"]
# ddos_prfs = precision_recall_fscore_support(y_eval, pred)
# ddos_list = [x for x,_ in ddos_prfs]
# for f, b in zip(typ, ddos_list):
#     print( f, b)

In [None]:
print('Normal precision & recall & f1_score & support:')
typ = [ "precision", "recall", "f1_score", "support"]
ddos_prfs = precision_recall_fscore_support(y_eval, pred)
ddos_list = [_ for x,_ in ddos_prfs]
for f, b in zip(typ, ddos_list):
    print( f, b)

In [None]:
valAcc = history_cb.val_acc
valLoss = history_cb.val_loss
epoch_it = np.arange(1,16)
plt.subplot(121)
plt.plot(epoch_it, valAcc)
plt.grid()
plt.xlabel('Epoch')
plt.ylabel('Validation accuracy')
plt.tight_layout()
plt.subplot(122)
plt.plot(epoch_it, valLoss)
plt.grid()
plt.xlabel('Epoch')
plt.ylabel('Validation loss')
plt.tight_layout()
plt.show()

In [None]:
Ls = history_cb.losses
Acc = history_cb.accs
ntr = np.shape(x_train)[0]
epochNum = []
for i in range(5040):
    epochNum.append(i*100/ntr)

plt.subplot(122)
matplotlib.pyplot.semilogy(epochNum, Ls)
plt.grid()
plt.xlabel('Epoch')
plt.ylabel('Training Loss')
plt.tight_layout()
# plt.show()
plt.subplot(121)
matplotlib.pyplot.semilogy(epochNum, Acc)
# plt.plot(epochNum, Acc)
plt.grid()
plt.xlabel('Epoch')
plt.ylabel('Training Accuracy')
plt.tight_layout()
# plt.axis([0.3,3,0.96,1])
plt.show()

In [None]:
matplotlib.pyplot.semilogy(epochNum, Acc)
# plt.plot(epochNum, Acc)
plt.grid()
plt.xlabel('Epoch')
plt.ylabel('Training Accuracy')
plt.tight_layout()
# plt.axis([0.3,3,0.96,1])
plt.show()



In [None]:
from keras.models import load_model

# model.save('model_trained_nosampled_sampling@100%.h5')  # creates a HDF5 file 'my_model.h5'
# del model  # deletes the existing model

# # returns a compiled model
# # identical to the previous one
# model = load_model('my_model.h5')

In [None]:
# hist = model.fit(x, y, validation_split=0.2)
# print(hist.history)

In [None]:
# np.shape(hist.history['acc'])

In [None]:
# model2 = load_model('model_trained_sampling@100%.h5')

In [None]:
# # Measure accuracy
# pred = model2.predict(x_test)
# pred = np.argmax(pred,axis=1)
# y_eval = np.argmax(y_test,axis=1)
# score = metrics.accuracy_score(y_eval, pred)
# print("Validation score: {}".format(score))

In [None]:
/