In [1]:
import tensorflow as tf
import keras as kr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
from functools import reduce

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Conv2D, Input, MaxPooling2D, AveragePooling2D

%matplotlib inline

Using TensorFlow backend.


In [2]:
NUMBER_SIZE = 15
THRESHOLD_COUNT = 5

BATCH_NUMBER_COUNT_50 = 50
SINGLE_SAMPLE_SIZE_50 = BATCH_NUMBER_COUNT_50 * NUMBER_SIZE

CLASSES = { "none": [1,0,0,0], "sparse": [0,1,0,0], "head-heavy": [0,0,1,0], "tail-heavy": [0,0,0,1] }

In [3]:
def create_single_sample_from_array(numbers):
  # Treat the list of phone numbers as a single huge list digits
  # pretty much like greyscale images, etc.
  # We then specify the number boundaries in the conv net
  digit_str = ''.join([str(i) for i in numbers])
  # Now create an image-like array
  return list(digit_str)

def create_single_sample_from_dataframe(dataframe_row):
  return create_single_sample_from_array(np.array(dataframe_row))

def combine_dfs(dfs, outcome_values=None, shuffle=True):# Combine data with sequence with outcome
  df_list = []
  for idx, df in enumerate(dfs):
    data_arr = []
    for index, series in df.iterrows():
      series_value = [v for _, v in series.iteritems()]
      combined_series = create_single_sample_from_array(series_value)
      data_arr.append(combined_series)
    
    df_arr_row_count = len(data_arr)
            
    outcome_value = outcome_values[idx] if outcome_values else idx
    data_outcome_tuples = zip(data_arr, [outcome_value] * df_arr_row_count)

    # DEBUG
    print("outcome_value:%s, outcome_values:%s" % (outcome_value, outcome_values))

    df_list += data_outcome_tuples   
    
  # Merge data with sequence and no sequence
  dataframe = pd.DataFrame(df_list, columns=["sample","outcome"])
  # Shuffle (or rather randomly select samples) but 1.0 means all
  df_random = dataframe.sample(frac=1) if shuffle else dataframe
  return df_random

def extract_sample_and_outcome(df, sample_col_name='sample', outcome_col_name='outcome'):
  X = [i for i in df[sample_col_name]]
  # Y = [int(i) for i in df[outcome_col_name]]
  Y = [i for i in df[outcome_col_name]]
  return (X, Y)

def load_data(filename, expected_shape_tuple):
  data = pd.read_csv(filename, header=None)
  print("data file:%s, data.shape (should be %s): %s" % (filename, expected_shape_tuple, data.shape))
  if data.shape != expected_shape_tuple:
    raise ValueError("data.shape:%s does not match excpected shape:%s" % (data.shape, expected_shape_tuple))
  return data
    
def prepare_data(filenames, expected_shape_tuples, outcome_values=None, shuffle=True):
  dfs = []
  for idx, filename in enumerate(filenames):
    dfs.append(load_data(filename, expected_shape_tuples[idx]))

  df_all = combine_dfs(dfs, outcome_values, shuffle)
  if (df_all.shape[0] != reduce((lambda m, i: m + i.shape[0]), dfs,0)):
    raise ValueError("There is a problem with combine_seq_no_seq. df_all.shape:%s does not match the sum of df_seq:%s and df_no_seq:%s" % (df_all.shape, df_seq.shape, df_no_seq.shape))
  return df_all

def prepare_train_data(dataframe, single_sample_size):
  df_row_count = dataframe.shape[0]
  (X, Y) = extract_sample_and_outcome(dataframe)
  X_train = np.array(X).reshape(df_row_count, 1, single_sample_size, 1)
  Y_train = np.array(Y).reshape(df_row_count, len(CLASSES.keys()))

  print("type(X):%s, len(X):%d, len(X[0]):%d" % (type(X), len(X), len(X[0])))
  print("df_row_count: %d" % df_row_count)
  print("single_sample_size_bit:%d" % single_sample_size)
  print("X_train.shape:%s, Y_train.shape:%s" % (X_train.shape, Y_train.shape))
  print("\n")

  print("type(Y[0]):%s, Y[0]:%s" % (type(Y[0]), Y[0]))

  return (X_train, Y_train)

In [4]:
# Load data from files into dataframe

df_random_with_mid_ooo_2000_2000_2000_2000_sample_number_50 = prepare_data(['data_no_sequence_2000_sample_number_50.csv', 'data_sequence_sparse_ooo_mid_combo_2000_sample_number_50.csv', 'data_sequence_head_heavy_ooo_mid_combo_2000_sample_number_50.csv', 'data_sequence_tail_heavy_ooo_mid_combo_2000_sample_number_50.csv'], [(2000,50), (2000,50), (2000,50), (2000,50)], [CLASSES["none"], CLASSES["sparse"], CLASSES["head-heavy"], CLASSES["tail-heavy"]])

df_random_with_mid_ooo_10000_10000_10000_10000_sample_number_50 = prepare_data(['data_no_sequence_10000_sample_number_50.csv', 'data_sequence_sparse_ooo_mid_combo_10000_sample_number_50.csv', 'data_sequence_head_heavy_ooo_mid_combo_10000_sample_number_50.csv', 'data_sequence_tail_heavy_ooo_mid_combo_10000_sample_number_50.csv'], [(10000,50), (10000,50), (10000,50), (10000,50)], [CLASSES["none"], CLASSES["sparse"], CLASSES["head-heavy"], CLASSES["tail-heavy"]])

data file:data_no_sequence_2000_sample_number_50.csv, data.shape (should be (2000, 50)): (2000, 50)
data file:data_sequence_sparse_ooo_mid_combo_2000_sample_number_50.csv, data.shape (should be (2000, 50)): (2000, 50)
data file:data_sequence_head_heavy_ooo_mid_combo_2000_sample_number_50.csv, data.shape (should be (2000, 50)): (2000, 50)
data file:data_sequence_tail_heavy_ooo_mid_combo_2000_sample_number_50.csv, data.shape (should be (2000, 50)): (2000, 50)
outcome_value:[1, 0, 0, 0], outcome_values:[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
outcome_value:[0, 1, 0, 0], outcome_values:[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
outcome_value:[0, 0, 1, 0], outcome_values:[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
outcome_value:[0, 0, 0, 1], outcome_values:[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
data file:data_no_sequence_10000_sample_number_50.csv, data.shape (should be (10000, 50)): (10000, 50)
data file:data_sequence_sparse_ooo_

In [5]:
# SANITY CHECK #1

# There should be 'sample' and 'outcome' columns
# 'sample' contains all phone numbers in period broken down into individual digits
# and combined
# 'outcome' is 0 or 1 for binary
# Also there should be mixed 0 and 1 since the samples have been randomize

df_random_with_mid_ooo_2000_2000_2000_2000_sample_number_50.head(20)

Unnamed: 0,sample,outcome
27,"[8, 5, 3, 9, 6, 8, 4, 7, 7, 0, 3, 4, 3, 9, 8, ...","[1, 0, 0, 0]"
6812,"[4, 6, 4, 0, 3, 3, 0, 3, 8, 1, 8, 2, 0, 2, 0, ...","[0, 0, 0, 1]"
5641,"[3, 6, 3, 2, 2, 1, 7, 6, 3, 0, 8, 0, 3, 2, 8, ...","[0, 0, 1, 0]"
7227,"[3, 2, 7, 5, 2, 9, 2, 3, 6, 5, 6, 7, 6, 2, 8, ...","[0, 0, 0, 1]"
2843,"[7, 4, 5, 5, 6, 4, 1, 5, 2, 7, 3, 2, 3, 9, 9, ...","[0, 1, 0, 0]"
962,"[9, 0, 7, 2, 6, 4, 5, 5, 1, 4, 2, 4, 3, 3, 2, ...","[1, 0, 0, 0]"
6709,"[1, 4, 5, 7, 6, 1, 1, 4, 1, 4, 5, 7, 4, 9, 4, ...","[0, 0, 0, 1]"
4621,"[6, 7, 3, 4, 9, 0, 1, 7, 7, 5, 4, 0, 9, 4, 8, ...","[0, 0, 1, 0]"
7357,"[7, 9, 5, 4, 4, 2, 9, 6, 6, 9, 7, 0, 2, 1, 1, ...","[0, 0, 0, 1]"
6472,"[2, 0, 9, 6, 5, 0, 1, 6, 7, 3, 6, 7, 1, 4, 1, ...","[0, 0, 0, 1]"


In [6]:
# SANITY CHECK #2

# 25% or 2000 samples should be 0,0,0,1
np.count_nonzero(df_random_with_mid_ooo_2000_2000_2000_2000_sample_number_50['outcome'].apply(lambda x: x == [0,0,0,1]))

2000

In [7]:
# SANITY CHECK #3

# The 'sample' length should be same as SINGLE_SAMPLE_SIZE_50 = BATCH_NUMBER_COUNT_50 * NUMBER_SIZE
len(df_random_with_mid_ooo_2000_2000_2000_2000_sample_number_50['sample'][1])

750

In [8]:
# Split out dataframe containing both samples and outcome

(X_2000_2000_2000_2000_with_mid_ooo_sample_number_50_train, Y_2000_2000_2000_2000_with_mid_ooo_sample_number_50_train) = prepare_train_data(df_random_with_mid_ooo_2000_2000_2000_2000_sample_number_50, SINGLE_SAMPLE_SIZE_50)

(X_10000_10000_10000_10000_with_mid_ooo_sample_number_50_train, Y_10000_10000_10000_10000_with_mid_ooo_sample_number_50_train) = prepare_train_data(df_random_with_mid_ooo_10000_10000_10000_10000_sample_number_50, SINGLE_SAMPLE_SIZE_50)

type(X):<class 'list'>, len(X):8000, len(X[0]):750
df_row_count: 8000
single_sample_size_bit:750
X_train.shape:(8000, 1, 750, 1), Y_train.shape:(8000, 4)


type(Y[0]):<class 'list'>, Y[0]:[1, 0, 0, 0]
type(X):<class 'list'>, len(X):40000, len(X[0]):750
df_row_count: 40000
single_sample_size_bit:750
X_train.shape:(40000, 1, 750, 1), Y_train.shape:(40000, 4)


type(Y[0]):<class 'list'>, Y[0]:[1, 0, 0, 0]


In [9]:
# For CNN, the dimensions are number of samples, height, width, channel/feature maps
# Number of samples: sum of 'attack' and 'no attack' samples
# Height: 1
# Width: SINGLE_SAMPLE_SIZE_50
# Feature maps/Channels: 1
X_2000_2000_2000_2000_with_mid_ooo_sample_number_50_train.shape

(8000, 1, 750, 1)

In [10]:
# SANITY CHECK #5

# The total rows should be attack and non-attack rows
# The wide should be array of 4, since it can be [0,0,0,0], [0,0,0,1], etc.
Y_2000_2000_2000_2000_with_mid_ooo_sample_number_50_train.shape

(8000, 4)

In [11]:
# Various multi-class CNN models tested

def create_cnn_model_G_50():
  # Create model
  model = Sequential()
  # Max number of digits are different in number yet still can be within threshold
  digit_diff_size = 2
  digit_same_size = NUMBER_SIZE - digit_diff_size
  # If the seq is in the middle then parts that are same is split in two
  smallest_digit_same_size = math.floor(digit_same_size/2)
  # add model layers
  number_digit_diff_size = math.floor(NUMBER_SIZE/digit_diff_size)
  model.add(Conv2D(50, kernel_size=(1,smallest_digit_same_size), strides=(1,1), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_50,1)))
  model.add(Conv2D(20, kernel_size=(1,NUMBER_SIZE), strides=(1,NUMBER_SIZE), activation='relu'))
  model.add(Flatten())
  model.add(Dense(BATCH_NUMBER_COUNT_50, activation="relu"))
  model.add(Dense(4, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_coarse():
  # Create model
  model = Sequential()
  model.add(Conv2D(50, kernel_size=(1,NUMBER_SIZE), strides=(1,NUMBER_SIZE), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_50,1)))
  model.add(Conv2D(20, kernel_size=(1,1), strides=(1,1), activation='relu'))
  model.add(Flatten())
  model.add(Dense(BATCH_NUMBER_COUNT_50, activation="relu"))
  model.add(Dense(4, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_coarse_2():
  # Create model
  model = Sequential()
  model.add(Conv2D(40, kernel_size=(1,NUMBER_SIZE), strides=(1,NUMBER_SIZE), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_50,1)))
  model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
  model.add(Conv2D(20, kernel_size=(1,1), strides=(1,1), activation='relu'))
  model.add(Flatten())
  model.add(Dense(BATCH_NUMBER_COUNT_50, activation="relu"))
  model.add(Dense(4, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_coarse_3():
  # Create model
  model = Sequential()
  model.add(Conv2D(50, kernel_size=(1,NUMBER_SIZE), strides=(1,NUMBER_SIZE), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_50,1)))
  # MaxPooling2D 1x1 theoretically does nothing but without it the trained model is 5% less accurate
  model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
  model.add(Conv2D(30, kernel_size=(1,1), strides=(1,1), activation='relu'))
  # MaxPooling2D 1x1 theoretically does nothing but without it the trained model is 5% less accurate
  model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
  model.add(Flatten())
  model.add(Dense(BATCH_NUMBER_COUNT_50, activation="relu"))
  model.add(Dense(4, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_coarse_4():
  # Create model
  model = Sequential()
  model.add(Conv2D(50, kernel_size=(1,NUMBER_SIZE), strides=(1,NUMBER_SIZE), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_50,1)))
  model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
  model.add(Conv2D(30, kernel_size=(1,1), strides=(1,1), activation='relu'))
  model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
  model.add(Flatten())
  model.add(Dense(int(BATCH_NUMBER_COUNT_50), activation="relu"))
  model.add(Dense(4, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_mod():
  # Create model
  model = Sequential()
  # Max number of bits are different in number yet still can be within threshold
  bit_diff_size = BIT_PER_DIGIT*2
  bit_same_size = NUMBER_SIZE_BIT - bit_diff_size
  # If the seq is in the middle then parts that are same is split in two
  smallest_bit_same_size = int(math.floor(bit_same_size/2) / BIT_PER_DIGIT)
  # add model layers
  number_bit_diff_size = math.floor(NUMBER_SIZE_BIT/bit_diff_size)
  model.add(Conv2D(150, kernel_size=(1,smallest_bit_same_size), strides=(1,BIT_PER_DIGIT), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_BIT_50,1)))
  model.add(Conv2D(5, kernel_size=(1,NUMBER_SIZE_BIT ), strides=(1,NUMBER_SIZE_BIT), activation='relu'))
  model.add(Flatten())
  model.add(Dense(10 - THRESHOLD_COUNT, activation="relu"))
  model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_maxpool():
  # Create model
  model = Sequential()
  # Max number of bits are different in number yet still can be within threshold
  digit_diff_size = 2
  digit_same_size = NUMBER_SIZE - digit_diff_size
  # If the seq is in the middle then parts that are same is split in two
  smallest_digit_same_size = math.floor(digit_same_size/2)
  # add model layers
  number_digit_diff_size = math.floor(NUMBER_SIZE/digit_diff_size)
  model.add(Conv2D(50, kernel_size=(1,smallest_digit_same_size), strides=(1,1), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_50,1)))
  model.add(Conv2D(20, kernel_size=(1,NUMBER_SIZE), strides=(1,NUMBER_SIZE), activation='relu'))
  model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
  model.add(Flatten())
  model.add(Dense(BATCH_NUMBER_COUNT_50, activation="relu"))
  model.add(Dense(4, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_cnn_model_G_50_avepool():
  # Create model
  model = Sequential()
  # Max number of bits are different in number yet still can be within threshold
  digit_diff_size = 2
  digit_same_size = NUMBER_SIZE - digit_diff_size
  # If the seq is in the middle then parts that are same is split in two
  smallest_digit_same_size = math.floor(digit_same_size/2)
  # add model layers
  number_digit_diff_size = math.floor(NUMBER_SIZE/digit_diff_size)
  model.add(Conv2D(50, kernel_size=(1,smallest_digit_same_size), strides=(1,1), activation='relu', input_shape=(1,SINGLE_SAMPLE_SIZE_50,1)))
  model.add(AveragePooling2D(pool_size=(1,NUMBER_SIZE), strides=(1,NUMBER_SIZE)))
  model.add(Flatten())
  model.add(Dense(BATCH_NUMBER_COUNT_50, activation="relu"))
  model.add(Dense(4, kernel_initializer='normal', activation='sigmoid'))
  # Compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model


In [31]:
# The best is perhaps: create_cnn_model_G_50_coarse_3

cnn_model_G_50_coarse_3 = create_cnn_model_G_50_coarse_3()

cnn_model_G_50_coarse_3.fit(X_10000_10000_10000_10000_with_mid_ooo_sample_number_50_train, Y_10000_10000_10000_10000_with_mid_ooo_sample_number_50_train, validation_split=0.2, batch_size=10, epochs=15)

Train on 32000 samples, validate on 8000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1374d05f8>

In [33]:
predict_arr = cnn_model_G_50_coarse_3.predict_classes(X_2000_2000_2000_2000_with_mid_ooo_sample_number_50_train)
# Each element in 'a' will contain 1/True if prediction matches expected outcome
expected_arr = [np.where(r==1)[0][0] for r in Y_2000_2000_2000_2000_with_mid_ooo_sample_number_50_train]
a = (predict_arr==expected_arr)
# % of correct predictions
np.count_nonzero(a)/np.size(a)



0.679

In [None]:
# NOTE: We cannot use StratifiedKFold for multi-class unless we do some transformation
# https://stackoverflow.com/questions/48508036/sklearn-stratifiedkfold-valueerror-supported-target-types-are-binary-mul

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

seed = 7
estimator = KerasClassifier(build_fn=create_cnn_model_G_50_coarse_3, epochs=20, batch_size=10, verbose=1)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_10000_10000_10000_10000_with_mid_ooo_sample_number_50_train, Y_10000_10000_10000_10000_with_mid_ooo_sample_number_50_train, cv=kfold)

print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [34]:
# IMPORTANT: This is ONLY for saving models
model_json = cnn_model_G_50_coarse_3.to_json()
with open("cnn_model_G_50_coarse-multi_class_non_binary_sparse-tail_heavy-head_heavy_train_mid_ooo_10000-sample_71-pct-20190518.json", "w") as json_file:
    json_file.write(model_json)
cnn_model_G_50_coarse_3.save('cnn_model_G_50_coarse-multi_class_non_binary_sparse-tail_heavy-head_heavy_train_mid_ooo_10000-sample_71-pct-20190518.h5')

In [24]:
# Additional measurements to ensure that the accuracy is not skewed by being able to
# detect attacks very well of which there are 3 classes (75%)
# but not detect non-attacks which is 1 class (25%)
# So we measure accurary with data set where 50% is non attack, and 16.67% for
# each of the 3 attack classes

In [18]:
# Load saved model
cnn_model_G_50_coarse_3 = create_cnn_model_G_50_coarse_3()
cnn_model_G_50_coarse_3.load_weights('cnn_model_G_50_coarse-multi_class_non_binary_sparse-tail_heavy-head_heavy_train_mid_ooo_10000-sample_71-pct-20190518.h5')

In [22]:
df_random_with_mid_ooo_6000_2000_2000_2000_sample_number_50 = prepare_data(['data_no_sequence_6000_sample_number_50.csv', 'data_sequence_sparse_ooo_mid_combo_2000_sample_number_50.csv', 'data_sequence_head_heavy_ooo_mid_combo_2000_sample_number_50.csv', 'data_sequence_tail_heavy_ooo_mid_combo_2000_sample_number_50.csv'], [(6000,50), (2000,50), (2000,50), (2000,50)], [CLASSES["none"], CLASSES["sparse"], CLASSES["head-heavy"], CLASSES["tail-heavy"]])

data file:data_no_sequence_6000_sample_number_50.csv, data.shape (should be (6000, 50)): (6000, 50)
data file:data_sequence_sparse_ooo_mid_combo_2000_sample_number_50.csv, data.shape (should be (2000, 50)): (2000, 50)
data file:data_sequence_head_heavy_ooo_mid_combo_2000_sample_number_50.csv, data.shape (should be (2000, 50)): (2000, 50)
data file:data_sequence_tail_heavy_ooo_mid_combo_2000_sample_number_50.csv, data.shape (should be (2000, 50)): (2000, 50)
outcome_value:[1, 0, 0, 0], outcome_values:[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
outcome_value:[0, 1, 0, 0], outcome_values:[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
outcome_value:[0, 0, 1, 0], outcome_values:[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
outcome_value:[0, 0, 0, 1], outcome_values:[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]


In [23]:
# SANITY CHECK #1

# There should be 'sample' and 'outcome' columns
# 'sample' contains all phone numbers in period broken down into individual digits
# and combined
# 'outcome' is 0 or 1 for binary
# Also there should be mixed 0 and 1 since the samples have been randomize

df_random_with_mid_ooo_6000_2000_2000_2000_sample_number_50.head(20)

Unnamed: 0,sample,outcome
10306,"[9, 4, 6, 8, 2, 6, 2, 5, 7, 7, 5, 9, 0, 9, 6, ...","[0, 0, 0, 1]"
10143,"[7, 3, 8, 5, 7, 1, 5, 3, 4, 7, 6, 0, 1, 2, 0, ...","[0, 0, 0, 1]"
5207,"[1, 3, 6, 1, 0, 3, 4, 7, 0, 8, 3, 8, 7, 8, 1, ...","[1, 0, 0, 0]"
11756,"[9, 1, 8, 7, 1, 6, 2, 0, 3, 5, 5, 5, 1, 5, 6, ...","[0, 0, 0, 1]"
7343,"[7, 3, 9, 7, 9, 4, 0, 0, 0, 9, 0, 1, 8, 5, 2, ...","[0, 1, 0, 0]"
1471,"[7, 2, 5, 7, 9, 3, 9, 7, 0, 9, 9, 1, 6, 2, 0, ...","[1, 0, 0, 0]"
1663,"[3, 0, 4, 2, 4, 7, 9, 8, 9, 6, 6, 4, 5, 6, 7, ...","[1, 0, 0, 0]"
2000,"[3, 5, 9, 0, 0, 5, 4, 6, 3, 2, 5, 6, 5, 8, 7, ...","[1, 0, 0, 0]"
2881,"[5, 1, 5, 1, 9, 6, 4, 1, 9, 2, 1, 1, 4, 5, 7, ...","[1, 0, 0, 0]"
425,"[7, 0, 3, 3, 4, 2, 6, 0, 6, 6, 0, 9, 7, 1, 7, ...","[1, 0, 0, 0]"


In [24]:
# SANITY CHECK #2a

# 50% or 6000 samples should be 1,0,0,0
np.count_nonzero(df_random_with_mid_ooo_6000_2000_2000_2000_sample_number_50['outcome'].apply(lambda x: x == [1,0,0,0]))

6000

In [25]:
# SANITY CHECK #2b

# 2000 samples should be 0,0,0,1, i.e., other classes
np.count_nonzero(df_random_with_mid_ooo_6000_2000_2000_2000_sample_number_50['outcome'].apply(lambda x: x == [0,0,0,1]))

2000

In [26]:
# SANITY CHECK #3

# The 'sample' length should be same as SINGLE_SAMPLE_SIZE_50 = BATCH_NUMBER_COUNT_50 * NUMBER_SIZE
len(df_random_with_mid_ooo_6000_2000_2000_2000_sample_number_50['sample'][1])

750

In [27]:
(X_6000_2000_2000_2000_with_mid_ooo_sample_number_50_train, Y_6000_2000_2000_2000_with_mid_ooo_sample_number_50_train) = prepare_train_data(df_random_with_mid_ooo_6000_2000_2000_2000_sample_number_50, SINGLE_SAMPLE_SIZE_50)

type(X):<class 'list'>, len(X):12000, len(X[0]):750
df_row_count: 12000
single_sample_size_bit:750
X_train.shape:(12000, 1, 750, 1), Y_train.shape:(12000, 4)


type(Y[0]):<class 'list'>, Y[0]:[0, 0, 0, 1]


In [28]:
# For CNN, the dimensions are number of samples, height, width, channel/feature maps
# Number of samples: sum of 'attack' and 'no attack' samples
# Height: 1
# Width: SINGLE_SAMPLE_SIZE_50
# Feature maps/Channels: 1
X_6000_2000_2000_2000_with_mid_ooo_sample_number_50_train.shape

(12000, 1, 750, 1)

In [29]:
# SANITY CHECK #5

# The total rows should be attack and non-attack rows
# The wide should be array of 4, since it can be [0,0,0,0], [0,0,0,1], etc.
Y_6000_2000_2000_2000_with_mid_ooo_sample_number_50_train.shape

(12000, 4)

In [32]:
predict_arr = cnn_model_G_50_coarse_3.predict_classes(X_6000_2000_2000_2000_with_mid_ooo_sample_number_50_train)
# Each element in 'a' will contain 1/True if prediction matches expected outcome
expected_arr = [np.where(r==1)[0][0] for r in Y_6000_2000_2000_2000_with_mid_ooo_sample_number_50_train]
a = (predict_arr==expected_arr)
# % of correct predictions
np.count_nonzero(a)/np.size(a)



0.6503333333333333