In [1]:
import tensorflow as tf
import keras as kr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
from functools import reduce

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Conv2D, Input, MaxPooling2D, AveragePooling2D

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Number of digits in one phone number.
NUMBER_SIZE = 15
# Threshold used when sizing some of the experimental models below.
THRESHOLD_COUNT = 5

# Phone numbers per sample, and the resulting digit count of one flattened sample.
BATCH_NUMBER_COUNT_50 = 50
SINGLE_SAMPLE_SIZE_50 = NUMBER_SIZE * BATCH_NUMBER_COUNT_50

# Label values for the binary outcome column.
CLASSES = dict(none=0, attack=1)

In [3]:
def create_single_sample_from_array(numbers):
  """Flatten a sequence of numbers into one list of single-character strings.

  Every element of `numbers` is converted with str() and the results are
  concatenated, so the whole batch reads as one long row of digits (much
  like a greyscale image row).

  Returns:
    A list with one single-character string per character of the
    concatenated text.
  """
  flattened = ''.join(map(str, numbers))
  # Splitting the string yields the image-like array of individual digits
  return [char for char in flattened]

def create_single_sample_from_dataframe(dataframe_row):
  """Convert one dataframe row to a numpy array and flatten it into digits.

  Returns whatever create_single_sample_from_array returns for the row values.
  """
  row_values = np.array(dataframe_row)
  return create_single_sample_from_array(row_values)

def combine_dfs(dfs, outcome_values=None, shuffle=True):
  """Merge several dataframes into one labelled (and optionally shuffled) frame.

  Each row of each input dataframe is flattened into a single list of digit
  characters and paired with the outcome value of the dataframe it came from.

  Args:
    dfs: iterable of dataframes; every row becomes one sample.
    outcome_values: optional sequence giving the outcome for each dataframe
      (by position). When omitted or empty, the dataframe's position in
      `dfs` is used as its outcome.
    shuffle: when True the rows of the result are randomly reordered.

  Returns:
    A dataframe with the columns "sample" and "outcome".
  """
  labelled_rows = []
  for idx, df in enumerate(dfs):
    outcome_value = outcome_values[idx] if outcome_values else idx

    for _, series in df.iterrows():
      # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
      series_value = [v for _, v in series.items()]
      labelled_rows.append((create_single_sample_from_array(series_value), outcome_value))

    # DEBUG
    print("outcome_value:%s of all possible outcome_values:%s" % (outcome_value, outcome_values))

  # Merge the samples of every input dataframe into one frame
  dataframe = pd.DataFrame(labelled_rows, columns=["sample","outcome"])
  # frac=1 keeps every row, so sample() acts as a pure shuffle
  df_random = dataframe.sample(frac=1) if shuffle else dataframe
  return df_random

def extract_sample_and_outcome(df, sample_col_name='sample', outcome_col_name='outcome'):
  """Pull the sample and outcome columns out of `df` as two plain lists.

  Returns:
    A tuple (samples, outcomes), each a list in the dataframe's row order.
  """
  samples = list(df[sample_col_name])
  outcomes = list(df[outcome_col_name])
  return samples, outcomes

def load_data(filename, expected_shape_tuple):
  """Read a header-less CSV and verify that it has the expected shape.

  Args:
    filename: path (or anything pandas.read_csv accepts) of the CSV to read.
    expected_shape_tuple: (rows, columns) the loaded data must have.

  Returns:
    The loaded dataframe.

  Raises:
    ValueError: if the loaded shape differs from `expected_shape_tuple`.
  """
  frame = pd.read_csv(filename, header=None)
  actual_shape = frame.shape
  print("data file:%s, data.shape (should be %s): %s" % (filename, expected_shape_tuple, actual_shape))
  if actual_shape == expected_shape_tuple:
    return frame
  raise ValueError("data.shape:%s does not match excpected shape:%s" % (actual_shape, expected_shape_tuple))
    
def prepare_data(filenames, expected_shape_tuples, outcome_values=None, shuffle=True):
  """Load several CSV files and combine them into one labelled dataframe.

  Args:
    filenames: CSV files to load, one per class.
    expected_shape_tuples: expected (rows, columns) of each file, in the
      same order as `filenames`.
    outcome_values: optional outcome label for each file; forwarded to
      combine_dfs.
    shuffle: forwarded to combine_dfs.

  Returns:
    The combined dataframe produced by combine_dfs.

  Raises:
    ValueError: if a file does not have its expected shape, or if the
      combined frame does not contain exactly one row per input row.
  """
  dfs = [
    load_data(filename, expected_shape_tuples[idx])
    for idx, filename in enumerate(filenames)
  ]

  df_all = combine_dfs(dfs, outcome_values, shuffle)
  expected_row_count = sum(df.shape[0] for df in dfs)
  if df_all.shape[0] != expected_row_count:
    # The message only uses names that exist here; the previous version formatted
    # undefined variables and therefore raised NameError instead of ValueError.
    raise ValueError("There is a problem with combine_dfs. df_all.shape:%s does not match the total row count:%d of the input shapes:%s" % (df_all.shape, expected_row_count, [df.shape for df in dfs]))
  return df_all

def prepare_train_data(dataframe, single_sample_size):
  """Turn a combined dataframe into CNN-ready input and label arrays.

  Args:
    dataframe: frame with 'sample' and 'outcome' columns.
    single_sample_size: number of digits in every sample.

  Returns:
    A tuple (X_train, Y_train). X_train is a float32 array of shape
    (rows, 1, single_sample_size, 1); Y_train has shape (rows,).

  Raises:
    ValueError: if the samples cannot be converted to numbers or reshaped
      to the requested size.
  """
  df_row_count = dataframe.shape[0]
  (X, Y) = extract_sample_and_outcome(dataframe)
  # Samples are lists of digit characters. Without an explicit dtype numpy builds a
  # unicode-string array, which only works if the consumer casts it implicitly.
  X_train = np.array(X, dtype=np.float32).reshape(df_row_count, 1, single_sample_size, 1)
  Y_train = np.array(Y).reshape(df_row_count)

  # Guard the diagnostics so an empty dataframe does not raise IndexError
  first_sample_len = len(X[0]) if X else 0
  print("type(X):%s, len(X):%d, len(X[0]):%d" % (type(X), len(X), first_sample_len))
  print("df_row_count: %d" % df_row_count)
  print("single_sample_size_bit:%d" % single_sample_size)
  print("X_train.shape:%s, Y_train.shape:%s" % (X_train.shape, Y_train.shape))
  print("\n")

  if Y:
    print("type(Y[0]):%s, Y[0]:%s" % (type(Y[0]), Y[0]))

  return (X_train, Y_train)

In [4]:
# Load the 'none' and 'attack' CSV files of each dataset size into one combined dataframe

df_random_2040_2040_combo_sample_number_50 = prepare_data(
    filenames=[
        'data_no_sequence_2040_sample_number_50.csv',
        'data_sequence_sparse_head_heavy_tail_heavy_ooo_mid_combo_2040_sample_number_50.csv',
    ],
    expected_shape_tuples=[(2040, 50), (2040, 50)],
    outcome_values=[CLASSES["none"], CLASSES["attack"]],
)

df_random_10200_10200_combo_sample_number_50 = prepare_data(
    filenames=[
        'data_no_sequence_10200_sample_number_50.csv',
        'data_sequence_sparse_head_heavy_tail_heavy_ooo_mid_combo_10200_sample_number_50.csv',
    ],
    expected_shape_tuples=[(10200, 50), (10200, 50)],
    outcome_values=[CLASSES["none"], CLASSES["attack"]],
)

data file:data_no_sequence_2040_sample_number_50.csv, data.shape (should be (2040, 50)): (2040, 50)
data file:data_sequence_sparse_head_heavy_tail_heavy_ooo_mid_combo_2040_sample_number_50.csv, data.shape (should be (2040, 50)): (2040, 50)
outcome_value:0 of all possible outcome_values:[0, 1]
outcome_value:1 of all possible outcome_values:[0, 1]
data file:data_no_sequence_10200_sample_number_50.csv, data.shape (should be (10200, 50)): (10200, 50)
data file:data_sequence_sparse_head_heavy_tail_heavy_ooo_mid_combo_10200_sample_number_50.csv, data.shape (should be (10200, 50)): (10200, 50)
outcome_value:0 of all possible outcome_values:[0, 1]
outcome_value:1 of all possible outcome_values:[0, 1]


In [5]:
# SANITY CHECK #1

# There should be 'sample' and 'outcome' columns.
# 'sample' holds all phone numbers of one period, broken down into individual
# digits and concatenated into a single list.
# 'outcome' is 0 or 1 (binary label).
# The 0s and 1s should be interleaved because the samples have been shuffled.
df_random_2040_2040_combo_sample_number_50.tail()

Unnamed: 0,sample,outcome
522,"[9, 5, 1, 0, 0, 0, 6, 6, 6, 8, 9, 6, 3, 3, 2, ...",0
3208,"[4, 8, 3, 7, 3, 7, 8, 4, 8, 1, 6, 7, 4, 4, 5, ...",1
122,"[9, 2, 8, 5, 1, 4, 0, 2, 3, 9, 6, 9, 1, 9, 8, ...",0
319,"[4, 4, 4, 8, 8, 4, 5, 1, 5, 6, 3, 1, 4, 7, 5, ...",0
72,"[2, 8, 1, 6, 7, 1, 7, 9, 6, 4, 8, 2, 5, 6, 0, ...",0


In [6]:
# SANITY CHECK #2

# Half of the rows (2040 of 4080) should have outcome 1, so counting the
# non-zero outcomes should give 2040.
np.count_nonzero(df_random_2040_2040_combo_sample_number_50['outcome'])

2040

In [7]:
# SANITY CHECK #3

# The length of one 'sample' should equal
# SINGLE_SAMPLE_SIZE_50 = BATCH_NUMBER_COUNT_50 * NUMBER_SIZE.
# NOTE: [1] selects the row whose index label is 1, not the second row of the
# shuffled frame.
len(df_random_2040_2040_combo_sample_number_50['sample'][1])

750

In [8]:
# Split each combined dataframe into the model input array (X) and the outcome array (Y)

X_2040_2040_combo_sample_number_50_train, Y_2040_2040_combo_sample_number_50_train = prepare_train_data(
    df_random_2040_2040_combo_sample_number_50,
    SINGLE_SAMPLE_SIZE_50,
)

X_10200_10200_combo_sample_number_50_train, Y_10200_10200_combo_sample_number_50_train = prepare_train_data(
    df_random_10200_10200_combo_sample_number_50,
    SINGLE_SAMPLE_SIZE_50,
)

type(X):<class 'list'>, len(X):4080, len(X[0]):750
df_row_count: 4080
single_sample_size_bit:750
X_train.shape:(4080, 1, 750, 1), Y_train.shape:(4080,)


type(Y[0]):<class 'numpy.int64'>, Y[0]:0
type(X):<class 'list'>, len(X):20400, len(X[0]):750
df_row_count: 20400
single_sample_size_bit:750
X_train.shape:(20400, 1, 750, 1), Y_train.shape:(20400,)


type(Y[0]):<class 'numpy.int64'>, Y[0]:1


In [9]:
# For the CNN the input dimensions are (samples, height, width, channels):
#   samples:  total number of 'attack' and 'none' samples
#   height:   1
#   width:    SINGLE_SAMPLE_SIZE_50
#   channels: 1 (a single feature map)
X_2040_2040_combo_sample_number_50_train.shape

(4080, 1, 750, 1)

In [10]:
# The labels are a flat array with one outcome per sample: (samples,)
Y_2040_2040_combo_sample_number_50_train.shape

(4080,)

In [11]:
# Various CNN architectures tested; each is compiled as a binary classifier (sigmoid output, binary cross-entropy)

def create_cnn_model_G_50():
  """Build and compile the 'G_50' binary CNN.

  Two convolutions along the digit axis (a fine one with stride 1, then one
  that steps a whole phone number at a time) feed a dense layer and a single
  sigmoid output unit.

  Returns:
    The compiled Sequential model (binary cross-entropy loss, adam optimizer,
    accuracy metric).
  """
  # Max number of digits that may differ between numbers still within threshold
  digit_diff_size = 2
  digit_same_size = NUMBER_SIZE - digit_diff_size
  # If the seq is in the middle then the part that is the same is split in two
  smallest_digit_same_size = math.floor(digit_same_size/2)

  # Create model
  model = Sequential()
  # First kernel is as wide as the smallest run of identical digits
  model.add(Conv2D(250, kernel_size=(1,smallest_digit_same_size), strides=(1,1), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_50,1)))
  # Second kernel spans, and strides by, one whole phone number
  model.add(Conv2D(50, kernel_size=(1,NUMBER_SIZE), strides=(1,NUMBER_SIZE), activation='relu'))
  model.add(Flatten())
  model.add(Dense(BATCH_NUMBER_COUNT_50, activation="relu"))
  model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_coarse():
  """Build and compile the 'coarse' binary CNN.

  The first convolution looks at exactly one phone number per step; a 1x1
  convolution, a dense layer and a single sigmoid unit follow.

  Returns:
    The compiled Sequential model.
  """
  one_number = (1, NUMBER_SIZE)
  layers = [
    # Kernel and stride both cover one whole phone number
    Conv2D(5, kernel_size=one_number, strides=one_number, activation='relu', input_shape=(1, SINGLE_SAMPLE_SIZE_50, 1)),
    Conv2D(2, kernel_size=(1, 1), strides=(1, 1), activation='relu'),
    Flatten(),
    Dense(BATCH_NUMBER_COUNT_50, activation="relu"),
    Dense(1, kernel_initializer='normal', activation='sigmoid'),
  ]
  model = Sequential(layers)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_mod():
  """Build and compile a bit-level variant of the G_50 binary CNN.

  NOTE(review): this function reads BIT_PER_DIGIT, NUMBER_SIZE_BIT and
  SINGLE_SAMPLE_SIZE_BIT_50, none of which are defined in the visible cells
  of this notebook. Calling it on a fresh kernel presumably raises NameError;
  confirm where these constants are meant to come from.

  Returns:
    The compiled Sequential model (binary cross-entropy loss, adam optimizer,
    accuracy metric).
  """
  # Create model
  model = Sequential()
  # Max number of bits that may differ between numbers still within threshold
  bit_diff_size = BIT_PER_DIGIT*2
  bit_same_size = NUMBER_SIZE_BIT - bit_diff_size
  # If the seq is in the middle then the part that is the same is split in two;
  # the division by BIT_PER_DIGIT presumably converts the bit count to digits - TODO confirm
  smallest_bit_same_size = int(math.floor(bit_same_size/2) / BIT_PER_DIGIT)
  # add model layers
  # NOTE(review): number_bit_diff_size is assigned but never used below
  number_bit_diff_size = math.floor(NUMBER_SIZE_BIT/bit_diff_size)
  model.add(Conv2D(150, kernel_size=(1,smallest_bit_same_size), strides=(1,BIT_PER_DIGIT), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_BIT_50,1)))
  model.add(Conv2D(5, kernel_size=(1,NUMBER_SIZE_BIT ), strides=(1,NUMBER_SIZE_BIT), activation='relu'))
  # Disabled experiment: an extra convolution with THRESHOLD_COUNT filters
  #model.add(Conv2D(THRESHOLD_COUNT, kernel_size=(1,NUMBER_SIZE_BIT), activation='relu'))
  model.add(Flatten())
  model.add(Dense(10 - THRESHOLD_COUNT, activation="relu"))
  # Single sigmoid unit: binary output
  model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_maxpool():
  """Build and compile a bit-level binary CNN that uses max pooling.

  NOTE(review): this function reads BIT_PER_DIGIT, NUMBER_SIZE_BIT and
  SINGLE_SAMPLE_SIZE_BIT_50, none of which are defined in the visible cells
  of this notebook; confirm where these constants are meant to come from.

  Returns:
    The compiled Sequential model (binary cross-entropy loss, adam optimizer,
    accuracy metric).
  """
  # Max number of bits that may differ between numbers still within threshold.
  # This assignment was commented out although the name is used right below,
  # which raised NameError; restored to match create_cnn_model_G_50_mod.
  bit_diff_size = BIT_PER_DIGIT*2
  bit_same_size = NUMBER_SIZE_BIT - bit_diff_size
  # If the seq is in the middle then the part that is the same is split in two
  smallest_bit_same_size = math.floor(bit_same_size/2)

  # Create model
  model = Sequential()
  model.add(Conv2D(150, kernel_size=(1,smallest_bit_same_size), strides=(1,BIT_PER_DIGIT), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_BIT_50,1)))
  model.add(MaxPooling2D(pool_size=(1,smallest_bit_same_size), strides=(1,math.floor(smallest_bit_same_size/BIT_PER_DIGIT))))
  model.add(Conv2D(5, kernel_size=(1,1), strides=(1,1), activation='relu'))
  model.add(Flatten())
  model.add(Dense(10 - THRESHOLD_COUNT, activation="relu"))
  # Single sigmoid unit: binary output
  model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_avepool():
  """Build and compile a binary CNN that average-pools per phone number.

  A stride-1 convolution along the digit axis is followed by average pooling
  whose window and stride both equal NUMBER_SIZE, then a dense layer and a
  single sigmoid output unit.

  Returns:
    The compiled Sequential model (binary cross-entropy loss, adam optimizer,
    accuracy metric).
  """
  # Max number of digits that may differ between numbers still within threshold
  digit_diff_size = 2
  digit_same_size = NUMBER_SIZE - digit_diff_size
  # If the seq is in the middle then the part that is the same is split in two
  smallest_digit_same_size = math.floor(digit_same_size/2)

  # Create model
  model = Sequential()
  model.add(Conv2D(150, kernel_size=(1,smallest_digit_same_size), strides=(1,1), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_50,1)))
  # Pool window and stride both cover NUMBER_SIZE positions
  model.add(AveragePooling2D(pool_size=(1,NUMBER_SIZE), strides=(1,NUMBER_SIZE)))
  model.add(Flatten())
  model.add(Dense(BATCH_NUMBER_COUNT_50, activation="relu"))
  model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_coarse_3():
  """Build and compile the 'coarse_3' binary CNN.

  A convolution that covers one phone number per step and a 1x1 convolution
  are each followed by a max-pooling layer with a 1x1 window; a dense layer
  and a single sigmoid unit produce the output.

  Returns:
    The compiled Sequential model.
  """
  one_number = (1, NUMBER_SIZE)
  unit = (1, 1)
  layers = [
    # Kernel and stride both cover one whole phone number
    Conv2D(20, kernel_size=one_number, strides=one_number, activation='relu', input_shape=(1, SINGLE_SAMPLE_SIZE_50, 1)),
    MaxPooling2D(pool_size=unit, strides=unit),
    Conv2D(10, kernel_size=unit, strides=unit, activation='relu'),
    MaxPooling2D(pool_size=unit, strides=unit),
    Flatten(),
    Dense(BATCH_NUMBER_COUNT_50, activation="relu"),
    Dense(1, kernel_initializer='normal', activation='sigmoid'),
  ]
  model = Sequential(layers)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [12]:
# create_cnn_model_G_50_coarse_3 was the best architecture for multi-class, so it is reused here for the binary-class CNN

cnn_model_G_50_coarse_3 = create_cnn_model_G_50_coarse_3()

# Train on the larger (10200 + 10200) dataset, holding out 20% for validation
cnn_model_G_50_coarse_3.fit(
    x=X_10200_10200_combo_sample_number_50_train,
    y=Y_10200_10200_combo_sample_number_50_train,
    validation_split=0.2,
    batch_size=10,
    epochs=50,
)

Train on 16320 samples, validate on 4080 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x116563f60>

In [13]:
# Sequential.predict_classes() no longer exists in current Keras releases. For a single
# sigmoid output unit it thresholded the predicted probability at 0.5, so do that explicitly.
predict_arr = (cnn_model_G_50_coarse_3.predict(X_2040_2040_combo_sample_number_50_train) > 0.5).astype("int32")
# Each element in 'a' is True where the prediction matches the expected outcome
# (the labels are reshaped to a column so they line up with the (samples, 1) predictions)
a = (predict_arr == np.asarray(Y_2040_2040_combo_sample_number_50_train).reshape(-1, 1))
# Fraction of correct predictions
np.count_nonzero(a)/np.size(a)



0.6056372549019607

In [14]:
# Alternative evaluation: stratified k-fold cross-validation of the same architecture

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Evaluate baseline model
seed = 7
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
estimator = KerasClassifier(
    build_fn=create_cnn_model_G_50_coarse_3,
    epochs=10,
    batch_size=5,
    verbose=1,
)
results = cross_val_score(
    estimator,
    X_10200_10200_combo_sample_number_50_train,
    Y_10200_10200_combo_sample_number_50_train,
    cv=kfold,
)

# Mean accuracy over the folds, with its standard deviation in parentheses
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
