In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import keras
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = np.asarray(df[name], dtype = np.float).mean()

    if sd is None:
        sd = np.asarray(df[name], dtype = np.float).std()

    df[name] = (np.asarray(df[name], dtype = np.float) - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

In [4]:

# This file is a CSV, just no CSV extension or headers

df_ = pd.read_csv("./train_50%_v4.csv")
df_test = pd.read_csv("./test_50%_v4.csv")
print("Read df_ {} rows.".format(len(df_)))
print("Read df_test {} rows.".format(len(df_test)))
#print("Read {} rows.".format(len(df1)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
df_ = df_.drop([df_.columns[0], df_.columns[8], df_.columns[9], df_.columns[10]], axis=1)
df_test = df_test.drop([df_test.columns[0], df_test.columns[8], df_test.columns[9], df_test.columns[10]], axis=1)

df_.dropna(inplace=True,axis=1) # For now, just drop NA's (rows with missing values)
df_test.dropna(inplace=True,axis=1)

Read df_ 999 rows.
Read df_test 999 rows.


In [5]:
df_.head(5)

Unnamed: 0,avg(pkt_len),stddev(pkt_len),fb_ratio,inter_arrival_time,pkt_count,duration,is_tcp,outcome
0,73.2,20.571825,1.859375,0.017486,5,0.069943,1,download
1,70.92381,11.51228,-1.0,0.002947,105,0.306517,1,game
2,193.84,364.80996,0.429077,0.0075,25,0.179999,1,download
3,347.0,488.311807,0.454012,0.266814,20,5.069462,1,voip
4,195.933333,371.472834,0.271744,0.093039,15,1.30254,1,voip


In [6]:
df_test.head(5)

Unnamed: 0,avg(pkt_len),stddev(pkt_len),fb_ratio,inter_arrival_time,pkt_count,duration,is_tcp,outcome
0,95.0,70.710678,0.310345,0.22253,2,0.22253,0,streaming
1,355.071429,519.826105,0.192086,1.134777,28,30.638989,1,game
2,73.2,20.571825,1.859375,0.000913,5,0.003653,1,streaming
3,195.933333,371.472834,0.271744,0.012241,15,0.171377,1,voip
4,1352.0,0.0,-1.0,9.3e-05,775,0.071769,1,download


In [7]:
encode_numeric_zscore(df_, 'fb_ratio')
encode_numeric_zscore(df_, 'pkt_count')
encode_numeric_zscore(df_, 'inter_arrival_time')
encode_numeric_zscore(df_, 'stddev(pkt_len)')
encode_numeric_zscore(df_, 'avg(pkt_len)')
# encode_numeric_zscore(df_, 'pkt_len')
encode_numeric_zscore(df_, 'duration')
encode_text_index(df_, 'outcome')

encode_numeric_zscore(df_test, 'fb_ratio')
encode_numeric_zscore(df_test, 'pkt_count')
encode_numeric_zscore(df_test, 'inter_arrival_time')
encode_numeric_zscore(df_test, 'stddev(pkt_len)')
encode_numeric_zscore(df_test, 'avg(pkt_len)')
# encode_numeric_zscore(df_test, 'pkt_len')
encode_numeric_zscore(df_test, 'duration')
encode_text_index(df_test, 'outcome')

array(['download', 'game', 'streaming', 'voip'], dtype=object)

In [8]:
x_train, y_train = to_xy(df_, 'outcome')
x_test, y_test = to_xy(df_test, 'outcome')
# x, y = to_xy(df_, ' Label')

In [9]:
df_.head(5)

Unnamed: 0,avg(pkt_len),stddev(pkt_len),fb_ratio,inter_arrival_time,pkt_count,duration,is_tcp,outcome
0,-0.842605,-0.857108,0.297072,-0.09354,-0.381792,-0.211227,1,0
1,-0.847032,-0.900791,-0.252081,-0.094069,-0.273562,-0.209695,1,1
2,-0.607948,0.802716,0.022378,-0.093903,-0.360146,-0.210514,1,0
3,-0.310037,1.39821,0.027167,-0.084476,-0.365558,-0.178858,1,3
4,-0.603877,0.834843,-0.007838,-0.090794,-0.370969,-0.203246,1,3


In [10]:
import keras.backend as K
K.clear_session()

In [11]:
# Create neural net
model = Sequential()

model.add(Dense(16, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(y_train.shape[1],activation='sigmoid'))

In [12]:
# Print the model summary
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 16)                128       
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 68        
Total params: 196
Trainable params: 196
Non-trainable params: 0
_________________________________________________________________


In [13]:
class LossHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        # TODO:  Create two empty lists, self.loss and self.val_acc
        self.losses = []
        self.accs = []
        self.val_acc = []
        self.val_loss = []
    def on_batch_end(self, batch, logs={}):
        # TODO:  This is called at the end of each batch.  
        # Add the loss in logs.get('loss') to the loss list
        loss = logs.get('loss')
        acc = logs.get('acc')
        self.losses.append(loss)
        self.accs.append(acc)
    def on_epoch_end(self, epoch, logs):
        # TODO:  This is called at the end of each epoch.  
        # Add the test accuracy in logs.get('loss') to the val_acc list
        val_acc = logs.get('val_acc')
        val_loss = logs.get('val_loss')
        self.val_acc.append(val_acc)
        self.val_loss.append(val_loss)

# Create an instance of the history callback
history_cb = LossHistory()

In [14]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-5, patience=20, verbose=1, mode='auto')
# batch_size = 10
epochs = 300
model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=1, epochs=epochs)

Train on 999 samples, validate on 999 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300


Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 00116: early stopping


<keras.callbacks.History at 0x1199f69b0>

In [15]:
# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_eval = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

Validation score: 0.6366366366366366


In [16]:
from sklearn.metrics import classification_report,confusion_matrix
Cm = confusion_matrix(y_eval,pred)
C = np.sum(Cm)
Cm = Cm/C
print('Confusion Matrix:')
print(np.array_str(Cm, precision=4, suppress_small=True))

Confusion Matrix:
[[ 0.1592  0.013   0.011   0.0651]
 [ 0.041   0.1712  0.038   0.017 ]
 [ 0.039   0.0501  0.1311  0.016 ]
 [ 0.011   0.038   0.024   0.1752]]


In [17]:
# valAcc = history_cb.val_acc
# valLoss = history_cb.val_loss
# epoch_it = np.arange(1,301)
# plt.subplot(121)
# plt.plot(epoch_it, valAcc)
# plt.grid()
# plt.xlabel('Epoch')
# plt.ylabel('Validation accuracy')
# plt.tight_layout()
# plt.subplot(122)
# plt.plot(epoch_it, valLoss)
# plt.grid()
# plt.xlabel('Epoch')
# plt.ylabel('Validation loss')
# plt.tight_layout()
# plt.show()

In [18]:
# Ls = history_cb.losses
# Acc = history_cb.accs
# ntr = np.shape(x_train)[0]
# epochNum = []
# for i in range(33000):
#     epochNum.append(i*10/ntr)

# plt.subplot(121)
# matplotlib.pyplot.semilogy(epochNum, Acc)
# # plt.plot(epochNum, Acc)
# plt.grid()
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.tight_layout()

# plt.subplot(122)
# matplotlib.pyplot.semilogy(epochNum, Ls)
# plt.grid()
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.tight_layout()
# # plt.show()

# # plt.axis([0.3,3,0.96,1])
# plt.show()

In [19]:
# print(Cm[0])
nd = [None]*4
for i,nd in enumerate (Cm):
    for j in range (len(nd)):
        
        print(i,j,"%0.2f"%(nd[j]/np.sum(nd)))
    print("---")

0 0 0.64
0 1 0.05
0 2 0.04
0 3 0.26
---
1 0 0.15
1 1 0.64
1 2 0.14
1 3 0.06
---
2 0 0.17
2 1 0.21
2 2 0.56
2 3 0.07
---
3 0 0.04
3 1 0.15
3 2 0.10
3 3 0.71
---
