Implementing a Simple Recurrent Neural Network
----

In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
sys.path.append("../src")
import utils

In [10]:
# load balanced training data and test data
training_data = pd.read_csv(utils.prepend_dir('tx_train_set.csv'), index_col=0)
testing_data = pd.read_csv(utils.prepend_dir('tx_test.csv'), index_col=0)

mcc_converter = utils.MCCRates()
dummy_converter = utils.MakeDummies('errors')

# y_train = training_data.is_fraud.copy()
# X_train = mcc_converter.fit_transform(training_data.drop(columns='is_fraud'))
X_train = mcc_converter.fit_transform(training_data)
X_train = dummy_converter.fit_transform(X_train)

# y_test = testing_data.is_fraud.copy()
# X_test = mcc_converter.transform(testing_data.drop(columns='is_fraud'))
# X_test = dummy_converter.transform(X_test)
# X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [12]:
X_train.head().T

Unnamed: 0,2001,2005,2018,2028,2031
user,22,22,22,22,22
card,0,0,0,0,0
amount,149.42,2.32,61.4,41.87,12.59
is_fraud,True,True,True,True,True
has_chip,True,True,True,True,True
cards_issued,2,2,2,2,2
credit_limit,4488.0,4488.0,4488.0,4488.0,4488.0
latitude,41.47,41.47,41.47,41.47,41.47
longitude,-81.67,-81.67,-81.67,-81.67,-81.67
per_capita_income_zipcode,19524.0,19524.0,19524.0,19524.0,19524.0


In [17]:
max_length = X_train.groupby('user')['amount'].agg('count').max()
D = X_train.shape[1]
train_users = training_data.user.unique()
print(len(train_users), max_length, D)

217 47363 38


Keras RNNs take 3-d arrays of shape $(N, L, D)$, where $L$ is a fixed sequence length. Observations with shorter sequences should be "padded" to the right with 0 vectors. These can then be processed by a `Mask(mask_value=0.0, input_shape=(L,D))` layer, which will output a boolean matrix of size $(N,L)$ indicating which sequence entries are real observations. There is a utility function for padding, but only seems to work with 1d sequences.

In [25]:
X_seq_train = np.zeros((len(train_users), max_length, D-2))
y_seq_train = np.zeros((len(train_users), max_length))
X_seq_train.shape

(217, 47363, 36)

In [26]:
for i, user in enumerate(train_users):
    seq_length = len(X_train[X_train.user==user])
    X_seq_train[i,:seq_length,:] = X_train[X_train.user==user].drop(columns=['user', 'is_fraud']).copy()
    y_seq_train[i,:seq_length] = X_train[X_train.user==user].is_fraud.copy()

In [29]:
seq_length, col_nums = X_seq_train.shape[1:]

This actually needs to have a decoder...

In [36]:
rnn = keras.models.Sequential()
rnn.add(keras.layers.Masking(input_shape=(seq_length, col_nums)))
rnn.add(keras.layers.LSTM(25, return_sequences=True))
# rnn.add(keras.layers.Dense(50, activation='relu'))
rnn.add(keras.layers.SimpleRNN(1, return_sequences=True))
rnn.compile(
    optimizer='adam',
    loss='binary_crossentropy'
)
rnn.fit(X_seq_train, y_seq_train)

ValueError: in user code:

    File "c:\Users\marks\.virtualenvs\CreditCardFraud-UbjN0nMT\lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\marks\.virtualenvs\CreditCardFraud-UbjN0nMT\lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\marks\.virtualenvs\CreditCardFraud-UbjN0nMT\lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\marks\.virtualenvs\CreditCardFraud-UbjN0nMT\lib\site-packages\keras\src\engine\training.py", line 1081, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\Users\marks\.virtualenvs\CreditCardFraud-UbjN0nMT\lib\site-packages\keras\src\engine\training.py", line 1139, in compute_loss
        return self.compiled_loss(
    File "c:\Users\marks\.virtualenvs\CreditCardFraud-UbjN0nMT\lib\site-packages\keras\src\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\marks\.virtualenvs\CreditCardFraud-UbjN0nMT\lib\site-packages\keras\src\losses.py", line 160, in __call__
        return losses_utils.compute_weighted_loss(
    File "c:\Users\marks\.virtualenvs\CreditCardFraud-UbjN0nMT\lib\site-packages\keras\src\utils\losses_utils.py", line 350, in compute_weighted_loss
        ) = squeeze_or_expand_dimensions(losses, None, sample_weight)
    File "c:\Users\marks\.virtualenvs\CreditCardFraud-UbjN0nMT\lib\site-packages\keras\src\utils\losses_utils.py", line 224, in squeeze_or_expand_dimensions
        sample_weight = tf.squeeze(sample_weight, [-1])

    ValueError: Can not squeeze dim[1], expected a dimension of 1, got 47363 for '{{node binary_crossentropy/weighted_loss/Squeeze}} = Squeeze[T=DT_FLOAT, squeeze_dims=[-1]](binary_crossentropy/mul_3)' with input shapes: [?,47363].


In [None]:
from sklearn.metrics import classification_report
y_pred = rnn.predict(X_seq_train)



ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous targets

In [None]:
print(y_pred.shape)

(217, 1)
