# Keystroke Dynamics Using RNNs
This notebook goes from the process of cleaning and preparing data to train and test a RNN for the job of Keystroke Dynamics. The Authentication Procedure comprises of using the timing of key strike, key hold, key change and so on to reach a conclusion whether the subject under observation is authentic or not.

## Data Preparation &amp; Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv('./datasets/data.csv')

df.head()

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


In [3]:
subjects = df['subject'].unique()

In [4]:
subjects_to_int = {subject: i  for i, subject in enumerate(subjects)}
int_to_subjects = {i: subject for i, subject in enumerate(subjects)}

In [5]:
df = df.replace(subjects_to_int)

In [6]:
df.head()

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,0,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,0,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,0,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,0,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,0,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


In [7]:
data = df.as_matrix()

In [8]:
def generate_positives(data, n_pos_per_subject=150):
    n_subjects = np.unique(data[:, 0]).shape[0]
    poss = []
    for i in range(n_subjects):
        temp_d = data[data[:, 0] == i]
        first_half = temp_d[np.random.choice(400, n_pos_per_subject), 3:-1]
        second_half = temp_d[np.random.choice(400, n_pos_per_subject), 3:-1]
        poss.append(np.hstack([first_half, second_half]))
    return np.vstack(poss)

In [9]:
poss = generate_positives(data)
poss.shape

(7650, 60)

In [10]:
def generate_negatives(data, n_neg_per_subject=150):
    n_subjects = np.unique(data[:, 0]).shape[0]
    negs = []
    for i in range(n_subjects):
        temp_d = data[data[:, 0] == i]
        temp_not_d = data[data[:, 0] != i]
        first_half = temp_d[np.random.choice(400, n_neg_per_subject), 3:-1]
        second_half = temp_not_d[np.random.choice(400, n_neg_per_subject), 3:-1]
        negs.append(np.hstack([first_half, second_half]))
    return np.vstack(negs)

In [11]:
negs = generate_negatives(data)
negs.shape

(7650, 60)

In [12]:
labels = np.zeros(poss.shape[0] + negs.shape[0])
labels[:poss.shape[0]] = 1
labels = np.expand_dims(labels, axis=1)
labels.shape

(15300, 1)

In [13]:
all_data = np.hstack([np.vstack([poss, negs]), labels])
all_data

array([[ 0.1024,  0.1622,  0.0598, ...,  0.3598,  0.2524,  1.    ],
       [ 0.1074,  0.1525,  0.0451, ...,  0.2532,  0.1466,  1.    ],
       [ 0.0953,  0.1032,  0.0079, ...,  0.2263,  0.1413,  1.    ],
       ..., 
       [ 0.0765,  0.1231,  0.0466, ...,  0.192 ,  0.1073,  0.    ],
       [ 0.1031,  0.1049,  0.0018, ...,  0.2313,  0.1252,  0.    ],
       [ 0.1055,  0.1089,  0.0034, ...,  0.2089,  0.1279,  0.    ]])

In [14]:
np.random.shuffle(all_data)

In [15]:
all_data_t = np.zeros((all_data.shape[0], 15, 4))

ctr = 0
for i, j in zip(range(0, 30, 2), range(30, 60, 2)):
    all_data_t[:, ctr, :] = np.hstack([all_data[:, i:i+2], all_data[:, j:j+2]])
    ctr += 1

In [16]:
X, y = all_data_t, all_data[:, -1]

## Training Phase
This Phase defines parameters of the model, the model itself along with its training to produce a simple RNN that can predict whether a person is the genuine holder of account or not.

### Some Params and HyperParams

In [17]:
from keras.layers import Dense, Activation, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector, Concatenate
from keras.initializers import glorot_uniform
from keras.utils import to_categorical
from keras.models import Model
from keras.optimizers import Adam

Using TensorFlow backend.


In [18]:
VALIDATION_SPLIT = 0.1

INPUT_SHAPE = [None, 2]

BATCH_SIZE = 32

EPOCHS = 20

In [19]:
def train_dev_split(x, y, val_split=0.1):
    m = x.shape[0]
    val_size = int(0.1 * m)
    return x[:-val_size], y[:, :-val_size, :], x[-val_size:], y[:, -val_size:, :]

In [20]:
y = to_categorical(y)

y_ = np.zeros((15, y.shape[0], y.shape[1]))

for i in range(15):
    y_[i, :, :] = y

In [21]:
y_[0, 0, :], y[0]

(array([ 0.,  1.]), array([ 0.,  1.]))

In [22]:
x_train, y_train, x_test, y_test = train_dev_split(X, y_)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((13770, 15, 4), (15, 13770, 2), (1530, 15, 4), (15, 1530, 2))

In [23]:
n_a = 10
n_out = 2

In [24]:
reshapor = Reshape((1, 4))
LSTM_cell = LSTM(n_a, return_state = True)
densor = Dense(n_out, activation='softmax')

In [25]:
def keystroke_model(Tx, n_in, n_a, n_out):
    X = Input(shape=(Tx, n_in))
    
    a0 = Input(shape=(n_a,), name='a0')
    c0 = Input(shape=(n_a,), name='c0')
    a = a0
    c = c0
    
    outputs = []

    for t in range(Tx):
         
        x = Lambda(lambda x: X[:, t, :])(X)
        x = reshapor(x)

        a, _, c = LSTM_cell(x, initial_state=[a, c])

        out = densor(a)

        outputs.append(out)

    model = Model(inputs=[X, a0, c0], outputs=outputs)
    
    return model

In [26]:
model = keystroke_model(15, 4, n_a, n_out)

In [27]:
opt = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)

model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [28]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 15, 4)        0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 4)            0           input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 1, 4)         0           lambda_1[0][0]                   
                                                                 lambda_2[0][0]                   
                                                                 lambda_3[0][0]                   
                                                                 lambda_4[0][0]                   
          

In [29]:
m = x_train.shape[0]
a0 = np.zeros((m, n_a))
c0 = np.zeros((m, n_a))

In [31]:
model.fit([x_train, a0, c0], list(y_train), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




<keras.callbacks.History at 0x18a8d1009b0>

In [32]:
m_t = x_test.shape[0]
a0_t = np.zeros((m_t, n_a))
c0_t = np.zeros((m_t, n_a))

In [34]:
model.evaluate([x_test, a0_t, c0_t], list(y_test))



[8.1394441193225333,
 0.68649219843297227,
 0.66842293131585218,
 0.6474493969499675,
 0.62604353209726171,
 0.60595984139473613,
 0.55427271182241,
 0.53408347352657448,
 0.51291458419725011,
 0.50179924154593269,
 0.48426854883144105,
 0.47200996883554397,
 0.467594226980521,
 0.45899881054373348,
 0.45738684663585588,
 0.46174781665303349,
 0.5294117646279678,
 0.63333333372290612,
 0.65163398677227546,
 0.65947712457257934,
 0.67450980423322693,
 0.73464052264207325,
 0.76143790880839035,
 0.76928104551789023,
 0.78169934663897245,
 0.78888888912263255,
 0.79607843160629277,
 0.79411764674716523,
 0.79411764674716523,
 0.80196078400206716,
 0.80326797354455093]

In [36]:
from keras.utils import plot_model
plot_model(model, to_file='model.png')

![](model.png)

In [45]:
model.save_weights('model.h5')