In [1]:
import importlib
import numpy as np
import matplotlib.pyplot as plt
from keras.optimizers import Adam

import preprocess
import split_train_test
import pair
import features_and_labels
import model
importlib.reload(preprocess)
importlib.reload(split_train_test)
importlib.reload(pair)
importlib.reload(features_and_labels)
importlib.reload(model)

from preprocess import (prepare_dataset,
                        one_hot_encode, )

from split_train_test import (id_split_train_test,
                              print_dataset_details,
                              get_train_test_split_data,
                              print_train_test_value_counts,)

from pair import (get_positive_pairs,
                  validate_positive_pairs,
                  get_negative_pairs,
                  validate_negative_pairs,        
                  balance_negative_pairs_equal_to_positive_pairs,
                  validate_positive_and_negative_pairs,
                  delete_redundant_columns,)

from features_and_labels import(create_labels,
                                reshape_X,
                                reshape_y,
                                split_train_validation,)

from model import (create_model,)

# Path to the merged CSV that feeds the whole notebook.
dataset_path = 'dataset/tested_merged_data1.csv'

# Keywords handed to prepare_dataset together with the path
# (presumably used to pick the columns to keep -- confirm in preprocess.py).
keywords = ['mean', 'expected_result', 'user_id', 'sensors']
keywords += [f'inputs_dd_{idx:03d}' for idx in range(5)]

# Build the preprocessed dataset (prepare_dataset lives in preprocess.py).
dataset = prepare_dataset(dataset_path, keywords)


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])



Original Shape : (18926, 211)

After pre-processing : (11807, 29)


### SPLIT TRAIN TEST DATA

    # from split_train_test.py

In [2]:
# Record-count bounds handed to the user split; a later cell reuses them
# as the per-user number of train (36) and test (10) records.
min_range_of_records = 10
max_range_of_records = 36

# Split the user IDs into common / train / test groups (split_train_test.py).
id_groups = id_split_train_test(dataset, max_range_of_records, min_range_of_records)
common_user_ids, train_user_ids, test_user_ids = id_groups

# Report dataset statistics for the selected users.
print_dataset_details(dataset, common_user_ids)



Total Unique USER IDS in dataset :  196

**** USERS SELECTION IS BASED ON EQUAL NUMBER OF GENUINE & FRAUD RECORDS ****

Total Unique USER IDS having both fraud & genuine records :  123

Users in TRAIN :  58

Users in TEST: 13

Users DROPPED: 52

GENUINE & FRAUD RECORD COUNTS VIEW : 

 [(253, 253), (182, 182), (170, 170), (152, 152), (142, 142), (134, 134), (114, 114), (96, 96), (89, 89), (84, 84), (77, 77), (74, 74), (71, 71), (70, 70), (68, 68), (65, 65), (65, 65), (62, 62), (59, 59), (58, 58), (56, 56), (53, 53), (52, 52), (51, 51), (46, 46), (46, 46), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (44, 44), (43, 43), (42, 42), (42, 42), (42, 42), (42, 42), (42, 42), (42, 42), (42, 42), (42, 42), (41, 41), (41, 41), (41, 41)

### GET TRAIN TEST DATAFRAMES
        # from split_train_test.py

In [3]:
# Per-user record counts reuse the bounds chosen for the ID split: (36, 10).
number_of_train_records, number_of_test_records = max_range_of_records, min_range_of_records

# Build the train and test frames for the selected users (split_train_test.py).
train_df, test_df = get_train_test_split_data(
    dataset,
    train_user_ids,
    test_user_ids,
    number_of_train_records,
    number_of_test_records,
)


**** DATASETS VALUE ****

TRAIN DF :  (4176, 28)
TEST DF :  (260, 28) 



<font size=4 color='red'>Unique users in training & testing </font>

<font size=4 color='blue'> Train: 36 genuine + 36 fraud records per user (72 in total) </font>

<font size=4 color='blue'> Test: 10 genuine + 10 fraud records per user (20 in total) </font>

    # from split_train_test.py

In [4]:
# Print the per-user record counts of the train and test frames (helper from split_train_test.py).
print_train_test_value_counts(train_df, test_df, train_user_ids ,test_user_ids)



TRAIN :

 d2ebd2fb-0183-4d93-a7c8-6a162d1d    72
19df8afe-3262-42b7-871c-f3d87e9d    72
47551081-7af1-4c62-842f-b3f28b68    72
4c499df0-ff3b-4231-bd07-7ff74dae    72
4ff5b449-16b8-4ce8-9f1c-edb35b76    72
27693542-8b55-4b16-a909-331b862e    72
ea03edd8-0ca6-4302-a0a6-399ab3b9    72
66cf099e-59fa-42be-890f-ed4fcd03    72
8ad31046-f734-4008-b351-e1d744ed    72
d4b18daa-0f06-4920-9dbe-8c25bcbe    72
81279401-4a1c-4dd0-a6ba-dac4aeac    72
578defd9-4912-4a5e-8549-35a6c891    72
47c3ff02-141b-410c-a3a4-9cf03e51    72
e8bcffff-8db4-4376-b3d0-f6063074    72
4e631425-a08c-4352-817b-040320b3    72
31d90bdf-7d73-4389-9532-c6174817    72
a0848573-b8e7-46ba-9157-c89edaaf    72
dd4fc622-42aa-467c-94c1-928dfcac    72
86a7d333-beae-47a7-af1a-11fe981c    72
9ae71631-31b2-4473-bdd6-31840bc7    72
770b7817-ba15-45cc-8f2e-1ce49d3f    72
b86146f6-5fc1-4b8b-bf72-ef3ede41    72
f61f8475-1dec-4236-b132-d4660c34    72
b63045cb-6fe0-4833-95df-8a15ee16    72
218ee191-308e-4bae-a8f9-2d924d65    72
3b94592f-8632

### ONE HOT ENCODE TRAIN & TEST DATAFRAMES - (user_id)
    # from preprocess.py

In [5]:
# Encode the 'user_id' column of each frame with preprocess.one_hot_encode.
# NOTE(review): train and test are encoded by two independent calls, and the frames
# displayed afterwards show user_id as a plain integer (e.g. 0) -- confirm which
# encoding the helper actually applies and whether the two frames share a mapping.
# NOTE(review): both names are overwritten in place, so re-running this cell feeds
# already-encoded frames back into the helper.
train_df = one_hot_encode(train_df, 'user_id')
test_df = one_hot_encode(test_df, 'user_id')

### SEPARATE POSITIVE & NEGATIVE DATA  IN TRAIN & TEST

In [6]:
# Row filters on the label column: 1 selects the "positive" records, -1 the "negative" ones.
positive_filter = '(expected_result == 1)'
negative_filter = '(expected_result == -1)'

# Apply both filters to the training frame, then to the test frame.
train_positives_df, train_negatives_df = train_df.query(positive_filter), train_df.query(negative_filter)
test_positives_df, test_negatives_df = test_df.query(positive_filter), test_df.query(negative_filter)

# Report the size of every subset.
print(f"\nTRAIN POSITIVE RECORDS : {train_positives_df.shape}")
print(f"TRAIN NEGATIVE RECORDS : {train_negatives_df.shape}")
print(f"\nTEST POSITIVE RECORDS : {test_positives_df.shape}")
print(f"TEST NEGATIVE RECORDS : {test_negatives_df.shape}")


TRAIN POSITIVE RECORDS : (2088, 28)
TRAIN NEGATIVE RECORDS : (2088, 28)

TEST POSITIVE RECORDS : (130, 28)
TEST NEGATIVE RECORDS : (130, 28)


In [7]:
# Preview the first two positive training records.
train_positives_df.head(2)

Unnamed: 0,expected_result,inputs_dd_000,inputs_dd_001,inputs_dd_002,inputs_dd_003,inputs_dd_004,motion_A_0_mean,motion_A_1_mean,motion_A_2_mean,motion_G_0_mean,...,motion_R_0_mean,motion_R_1_mean,motion_R_2_mean,motion_Y_0_mean,motion_Y_1_mean,motion_Y_2_mean,motion_a_0_mean,motion_a_1_mean,motion_a_2_mean,user_id
2143,1,0.143996,0.197712,0.175011,0.27082,0.0,0.093804,5.617165,8.314385,0.029451,...,0.220718,0.196895,0.638908,-0.007516,-0.029335,-0.00432,0.070737,0.075397,0.207277,0
2188,1,0.220325,0.132894,0.252969,0.256306,0.0,-0.089522,7.012722,7.003768,-0.100165,...,0.302824,0.232009,0.550533,0.002607,-0.008654,-0.000132,0.010643,0.096375,0.11603,0


In [8]:
# Preview the first two negative training records.
train_negatives_df.head(2)

Unnamed: 0,expected_result,inputs_dd_000,inputs_dd_001,inputs_dd_002,inputs_dd_003,inputs_dd_004,motion_A_0_mean,motion_A_1_mean,motion_A_2_mean,motion_G_0_mean,...,motion_R_0_mean,motion_R_1_mean,motion_R_2_mean,motion_Y_0_mean,motion_Y_1_mean,motion_Y_2_mean,motion_a_0_mean,motion_a_1_mean,motion_a_2_mean,user_id
2101,-1,0.245908,0.375753,0.706719,0.18553,0.0,0.617978,6.48432,7.58921,0.634965,...,0.242845,0.252376,0.733069,-0.025758,-0.007999,-0.022977,0.002904,0.113257,0.187677,0
2100,-1,0.16754,0.124143,0.33175,0.19819,0.0,0.776021,6.126912,7.927501,0.803202,...,0.307191,0.090773,0.375153,-0.059509,-0.171004,-0.320601,-0.101103,0.082505,0.281851,0


### GET POSITIVE & NEGATIVE  PAIRS : <font color='blue'> TRAINING </font>
    # from pair.py

In [9]:
# --- Positive pairs: built from the positive training records only (pair.py) ---
train_positive_user_ids = train_positives_df['user_id'].unique()
train_positive_pairs = get_positive_pairs(train_positives_df, train_positive_user_ids)
print(f"\nTRAIN POSITIVE PAIRS : {train_positive_pairs.shape} using {train_positives_df.shape} train positive records.")

# --- Negative pairs: built from the negative and the positive training records (pair.py) ---
train_negative_user_ids = train_negatives_df['user_id'].unique()
train_negative_pairs = get_negative_pairs(train_negatives_df, train_positives_df, train_negative_user_ids)
print(f"TRAIN NEGATIVE PAIRS : {train_negative_pairs.shape} using {train_negatives_df.shape} train negative records.")

# Optional balancing step (currently disabled): trim the negative pairs to the
# number of positive pairs and validate the result.
# balanced_negative_pairs = balance_negative_pairs_equal_to_positive_pairs(train_negative_pairs)
# validate_positive_and_negative_pairs(len(train_positive_pairs), len(balanced_negative_pairs))

  user_data = user_data_df.as_matrix()



TRAIN POSITIVE PAIRS : (36540, 56) using (2088, 28) train positive records.


  user_positive_data = positive_user_data_df.as_matrix()
  user_negative_data = negative_user_data_df.as_matrix()


TRAIN NEGATIVE PAIRS : (75168, 56) using (2088, 28) train negative records.


In [10]:
# Show the first row of each pair array as a sanity check of the pair layout.
print(f"Sample of POSITIVE PAIR ROW :\n{train_positive_pairs[0]}")
print(f"\nSample of NEGATIVE PAIR ROW :\n{train_negative_pairs[0]}")

Sample of POSITIVE PAIR ROW :
[ 1.00000000e+00  1.68116000e-01  2.28227300e-01  2.48105100e-01
  3.12209600e-01  0.00000000e+00 -4.40755720e-01  5.11401500e+00
  8.56745600e+00 -2.87455440e-01  5.08279280e+00  8.32782900e+00
  3.62908630e+01  1.90586650e+00 -5.17275670e+00 -1.42763960e+00
 -5.46674200e-01  5.00124300e-02  1.92246650e-01  1.89653340e-01
  6.35622260e-01 -1.95395440e-02  1.27240690e-01  9.31144950e-02
 -1.24985576e-01  8.31490600e-02  2.45967400e-01  0.00000000e+00
  1.00000000e+00  1.91728600e-01  1.96550600e-01  1.96769300e-01
  2.58984500e-01  0.00000000e+00 -6.47445900e-02  5.59892600e+00
  8.03305300e+00  8.03189050e-02  5.46342230e+00  8.07887500e+00
 -1.95008530e+00  7.30764900e+00 -2.79142670e+01  9.26725100e-02
 -6.08467600e-01  6.67863340e-03  2.70560530e-01 -3.24646690e-03
 -4.19465100e-02  8.05368500e-02  2.30216100e-01  3.46195640e-01
 -1.19300574e-01  1.20531045e-01 -3.63791300e-02  0.00000000e+00]

Sample of NEGATIVE PAIR ROW :
[ 1.00000000e+00  1.68116000

### REMOVE REDUNDANCY

In [11]:
# Drop the columns that are redundant inside each pair row.
#
# Each pair row is two 28-column records side by side (56 columns). The indices
# are applied one after another, so each one refers to the array as left by the
# previous removal (presumably delete_redundant_columns drops exactly that one
# column -- the 56 -> 53 shape change is consistent with this):
#    0 -> first column of the first record (expected_result in the frames above)
#   27 -> first column of the second record, shifted left by the removal above
#   -1 -> last column of the second record (user_id in the frames above)
#
# NOTE(review): the first record's last column (user_id) stays in place at
# index 26 -- confirm that keeping it is intended.
# NOTE(review): the arrays are overwritten in place, so re-running this cell
# removes three further columns.
for column_index in (0, 27, -1):
    train_positive_pairs = delete_redundant_columns(train_positive_pairs, column_index)
    train_negative_pairs = delete_redundant_columns(train_negative_pairs, column_index)

# The shapes were printed twice by an identical duplicated statement; once is enough.
print(train_positive_pairs.shape, train_negative_pairs.shape)

(36540, 53) (75168, 53)
(36540, 53) (75168, 53)


### CREATE LABELS

    # from features_and_labels.py

In [17]:
# --- Create labels and assemble the training matrix (features_and_labels.py) ---

# False -> only positive pairs are labelled and used;
# True  -> positives plus an equally sized leading slice of the negative pairs.
pos_neg_flag = False

if pos_neg_flag:
    balanced_negatives = train_negative_pairs[:len(train_positive_pairs)]
    labels = create_labels(train_positive_pairs, balanced_negatives, pos_neg_flag)
    pair_rows = np.vstack([train_positive_pairs, balanced_negatives])
else:
    labels = create_labels(train_positive_pairs)
    pair_rows = np.vstack([train_positive_pairs])

# Append the labels as the last column of every pair row.
train_data = np.hstack([pair_rows, labels])

print(f"Unique labels : {np.unique(labels)}\nLength of labels : {len(labels)}\nTraining data shape : {train_data.shape}\nSample training row :\n{train_data[0]}")

# Shuffle the rows in place. No seed is set, so the order differs between runs.
np.random.shuffle(train_data)

Unique labels : [1.]
Length of labels : 36540
Training data shape : (36540, 54)
Sample training row :
[ 1.68116000e-01  2.28227300e-01  2.48105100e-01  3.12209600e-01
  0.00000000e+00 -4.40755720e-01  5.11401500e+00  8.56745600e+00
 -2.87455440e-01  5.08279280e+00  8.32782900e+00  3.62908630e+01
  1.90586650e+00 -5.17275670e+00 -1.42763960e+00 -5.46674200e-01
  5.00124300e-02  1.92246650e-01  1.89653340e-01  6.35622260e-01
 -1.95395440e-02  1.27240690e-01  9.31144950e-02 -1.24985576e-01
  8.31490600e-02  2.45967400e-01  0.00000000e+00  1.91728600e-01
  1.96550600e-01  1.96769300e-01  2.58984500e-01  0.00000000e+00
 -6.47445900e-02  5.59892600e+00  8.03305300e+00  8.03189050e-02
  5.46342230e+00  8.07887500e+00 -1.95008530e+00  7.30764900e+00
 -2.79142670e+01  9.26725100e-02 -6.08467600e-01  6.67863340e-03
  2.70560530e-01 -3.24646690e-03 -4.19465100e-02  8.05368500e-02
  2.30216100e-01  3.46195640e-01 -1.19300574e-01  1.20531045e-01
 -3.63791300e-02  1.00000000e+00]


### FEATURES & LABELS
    # from features_and_labels.py

In [19]:
importlib.reload(features_and_labels)
from features_and_labels import(create_labels,
                                reshape_X,
                                reshape_y,
                                split_train_validation,)

# Dimensions of the assembled training matrix
# (rows = pair samples, columns = pair features plus the trailing label column).
nrows, ncols = train_data.shape

# Fraction handed to split_train_validation further down.
VALIDATION_SPLIT = 0.1



In [20]:
# Quick dimension check: columns per row minus the label column, the matrix shape
# (listed twice), and half of the feature-column count (integer division).
train_data[0].shape[0]-1, train_data.shape, train_data.shape, (train_data[0].shape[0]-1)//2

(53, (36540, 54), (36540, 54), 26)

In [None]:
# Split features & labels: every column except the last is a feature, the last
# column is the label appended when train_data was assembled.
#
# The name `train_data_t` used here before is never defined in this notebook
# (NameError on a fresh kernel), so the features are sliced from train_data directly.
# NOTE(review): if the model expects a reshaped / sequence-shaped tensor, derive it
# from X at this point (a reshape_X helper is imported from features_and_labels.py) -- confirm.
X, y = train_data[:, :-1], train_data[:, -1]
X[0], y[0]

In [None]:
''' Reshape y '''
# reshape_y comes from features_and_labels.py; it is given the label vector and the row count.
y_ = reshape_y(y, nrows)
# Display the reshaped labels.
y_

In [None]:
# Split features and reshaped labels with split_train_validation (features_and_labels.py),
# presumably holding out a VALIDATION_SPLIT share of the samples -- confirm in the helper.
# Note: the held-out part is called x_test / y_test throughout this notebook.
validation_parts = split_train_validation(X, y_, VALIDATION_SPLIT)
x_train, y_train, x_test, y_test = validation_parts

# Display the four resulting shapes.
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

In [None]:
# Inspect the first training sample and label, the full arrays, and their shapes.
# NOTE(review): this echoes the entire x_train / y_train arrays; consider showing a slice only.
x_train[0], y_train[0], x_train,y_train, x_train.shape, y_train.shape

### MODEL
    # from model.py

In [25]:
import model
importlib.reload(model)
from keras.optimizers import Adam
from model import (create_model,)

# --- Model initializations ---

INPUT_SHAPE = [None, 2]
BATCH_SIZE = 1
EPOCHS = 1

# Input shape (X), taken from the dimensions of the training matrix.
# NOTE(review): nrows is the number of samples and ncols still counts the label
# column -- confirm these are the values create_model expects for Tx / n_in.
Tx = nrows
n_in = ncols

# "number of columns in 2 input rows" (original note); also the width of a0 / c0 below.
n_a = 27

# Number of outputs.
n_out = 1

reshape_row = 1
reshape_col = 4

# Zero-filled arrays with one row per training sample, passed to the model next to
# x_train (presumably the initial hidden / cell state of a recurrent layer -- confirm in model.py).
initial_state_shape = (x_train.shape[0], n_a)
a0 = np.zeros(initial_state_shape)
c0 = np.zeros(initial_state_shape)

optimizer = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)


NameError: name 'x_train' is not defined

In [25]:
# Build the network (model.py) and configure it for training.
# Note: this rebinds `model`, which until now referred to the imported module.
model = create_model(Tx, n_in, n_a, n_out, reshape_row, reshape_col)

# NOTE(review): categorical cross-entropy is used although n_out is 1 -- confirm the
# loss matches the output layer defined in model.py.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

ValueError: total size of new array must be unchanged

In [None]:
# Shape of the training features handed to model.fit below.
x_train.shape

In [None]:
# Train on the features plus the zero-filled a0 / c0 arrays; the labels are passed as a list.
# NOTE(review): BATCH_SIZE is defined in the initialization cell but not passed here,
# so the framework's default batch size applies -- confirm which one is intended.
model.fit([x_train, a0, c0], list(y_train), epochs=EPOCHS)

### EVALUATE

In [None]:
# Zero-filled arrays with one row per held-out sample, mirroring a0 / c0 used for training.
eval_state_shape = (x_test.shape[0], n_a)
a0_t = np.zeros(eval_state_shape)
c0_t = np.zeros(eval_state_shape)

# Peek at the first held-out sample and its label, together with their shapes.
first_x, first_y = x_test[0], y_test[0]
first_x, first_y, first_x.shape, first_y.shape

In [None]:
# Evaluate on the held-out split, using the same input layout as model.fit
# (features plus the zero-filled a0_t / c0_t arrays; labels as a list).
evaluation = model.evaluate([x_test, a0_t, c0_t], list(y_test))

In [None]:
# List the metric names so the entries of `evaluation` can be matched to them.
print(model.metrics_names)

In [None]:
# Print the tail of the evaluation results.
# NOTE(review): the slice start 16 is a magic number; presumably it skips the loss
# entries that precede the accuracy values in model.metrics_names -- confirm.
print("ACCURACY :\n", evaluation[16:])

In [None]:
# Persist the trained weights to model.h5 in the working directory.
model.save_weights('model.h5')

In [None]:
# model.history.history

In [None]:
# X = train_positive_pairs 
# y = labels
# val_split = 10

# # X = train_positive_pairs[:,1:] 
# # y = train_positive_pairs[:,0]
# # val_size = 1000

# # split_train_validation(X, y, val_split)

# # X[:2], y, np.unique(y)

# split_train_validation(X, y, val_split)

# X[:2], y, np.unique(y)

In [None]:
# X[:val_size], y[:, :val_size, :], X[val_size:], y[:,val_size:, :]

In [None]:
# all_data = np.hstack([np.vstack([train_positive_pairs]), labels])
# all_data_t = np.zeros((all_data.shape[0], 15, 4))


# ctr = 0
# for i, j in zip(range(0, 30, 2), range(30, 60, 2)):
#     all_data_t[:, ctr, :] = np.hstack([all_data[:, i:i+2], all_data[:, j:j+2]])
#     ctr += 1